/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 *
 * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
 */

#include <string.h>
#include <stdint.h>
#include <wchar.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "yaksuri_cudai_base.h"
#include "yaksuri_cudai_pup.h"

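/*
 * All kernels in this file follow one pattern: each thread copies a single
 * wchar_t element.  The flat thread index `idx` is decomposed into one
 * coordinate per nesting level of the datatype (x0 for the outer count,
 * then x1, x2, ... for the inner levels) by repeated division by the
 * number of elements contained in each level, and the strided offset is
 * rebuilt from the extents, strides, and displacements stored in the
 * metadata tree `md`.
 */
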
/* pack kernel: each thread gathers one wchar_t element from the strided
 * hvector (resized) layout into the contiguous output buffer */
__global__ void yaksuri_cudai_kernel_pack_hvector_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* peel one coordinate off the flat index per nesting level */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res;

    intptr_t stride1 = md->u.hvector.stride;
    uintptr_t extent2 = md->u.hvector.child->extent;
    *((wchar_t *) (void *) (dbuf + idx * sizeof(wchar_t))) = *((const wchar_t *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2));
}

void yaksuri_cudai_pack_hvector_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_resized_wchar_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
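
/*
 * Illustrative only, not part of the generated code: callers are expected
 * to size the launch with one thread per element.  A minimal sketch,
 * assuming a block size of 256 threads and a 1-D grid:
 *
 *     uintptr_t n_elements = count * md->num_elements;
 *     int n_threads = 256;   // assumed block size, not mandated here
 *     int n_blocks = (int) ((n_elements + n_threads - 1) / n_threads);
 *     yaksuri_cudai_pack_hvector_resized_wchar_t(inbuf, outbuf, count, md,
 *                                                n_threads, n_blocks, 1, 1,
 *                                                device);
 *
 * Surplus threads in the last block are discarded by the kernel's
 * `idx >= count * inner_elements` bounds check.
 */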

/* unpack kernel: the inverse of the pack kernel above; the contiguous
 * buffer is now the source and the strided layout the destination */
__global__ void yaksuri_cudai_kernel_unpack_hvector_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res;

    intptr_t stride1 = md->u.hvector.stride;
    uintptr_t extent2 = md->u.hvector.child->extent;
    *((wchar_t *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2)) = *((const wchar_t *) (const void *) (sbuf + idx * sizeof(wchar_t)));
}

void yaksuri_cudai_unpack_hvector_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_resized_wchar_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

/* two nested hvector levels: the flat index decomposes into five
 * coordinates (x0 for the outer count, x1/x2 for the outer vector,
 * x3/x4 for the inner one) */
__global__ void yaksuri_cudai_kernel_pack_hvector_hvector_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hvector.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hvector.blocklength;
    uintptr_t x4 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t stride2 = md->u.hvector.child->u.hvector.stride;
    uintptr_t extent2 = md->u.hvector.child->extent;
    uintptr_t extent3 = md->u.hvector.child->u.hvector.child->extent;
    *((wchar_t *) (void *) (dbuf + idx * sizeof(wchar_t))) = *((const wchar_t *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + x3 * stride2 + x4 * extent3));
}

void yaksuri_cudai_pack_hvector_hvector_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_hvector_resized_wchar_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_hvector_hvector_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hvector.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hvector.blocklength;
    uintptr_t x4 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t stride2 = md->u.hvector.child->u.hvector.stride;
    uintptr_t extent2 = md->u.hvector.child->extent;
    uintptr_t extent3 = md->u.hvector.child->u.hvector.child->extent;
    *((wchar_t *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + x3 * stride2 + x4 * extent3)) = *((const wchar_t *) (const void *) (sbuf + idx * sizeof(wchar_t)));
}

void yaksuri_cudai_unpack_hvector_hvector_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_hvector_resized_wchar_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

/* blkhindx parent: the per-block displacement is looked up in
 * array_of_displs instead of being computed from a uniform stride */
__global__ void yaksuri_cudai_kernel_pack_blkhindx_hvector_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hvector.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hvector.blocklength;
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t stride2 = md->u.blkhindx.child->u.hvector.stride;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    uintptr_t extent3 = md->u.blkhindx.child->u.hvector.child->extent;
    *((wchar_t *) (void *) (dbuf + idx * sizeof(wchar_t))) = *((const wchar_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + x4 * extent3));
}

void yaksuri_cudai_pack_blkhindx_hvector_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_hvector_resized_wchar_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_blkhindx_hvector_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hvector.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hvector.blocklength;
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t stride2 = md->u.blkhindx.child->u.hvector.stride;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    uintptr_t extent3 = md->u.blkhindx.child->u.hvector.child->extent;
    *((wchar_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + x4 * extent3)) = *((const wchar_t *) (const void *) (sbuf + idx * sizeof(wchar_t)));
}

void yaksuri_cudai_unpack_blkhindx_hvector_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_hvector_resized_wchar_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

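/*
 * The hindexed kernels below differ from the blkhindx kernels in one step:
 * each hindexed block may have a different blocklength, so the x1
 * coordinate cannot be obtained with a single division.  Instead, the
 * element count of each block (array_of_blocklengths[i] times the child's
 * num_elements) is subtracted from `res` until the containing block is
 * found.
 */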
__global__ void yaksuri_cudai_kernel_pack_hindexed_hvector_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* x1 is always assigned by the scan: the bounds check above
     * guarantees that res falls inside some block */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hvector.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hvector.blocklength;
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t stride2 = md->u.hindexed.child->u.hvector.stride;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    uintptr_t extent3 = md->u.hindexed.child->u.hvector.child->extent;
    *((wchar_t *) (void *) (dbuf + idx * sizeof(wchar_t))) = *((const wchar_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + x4 * extent3));
}

void yaksuri_cudai_pack_hindexed_hvector_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_hvector_resized_wchar_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_hindexed_hvector_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hvector.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hvector.blocklength;
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t stride2 = md->u.hindexed.child->u.hvector.stride;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    uintptr_t extent3 = md->u.hindexed.child->u.hvector.child->extent;
    *((wchar_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + x4 * extent3)) = *((const wchar_t *) (const void *) (sbuf + idx * sizeof(wchar_t)));
}

void yaksuri_cudai_unpack_hindexed_hvector_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_hvector_resized_wchar_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

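/*
 * For a contig parent, consecutive children are laid out back to back, so
 * the generated code below reuses the child's extent as `stride1`.
 */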
__global__ void yaksuri_cudai_kernel_pack_contig_hvector_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hvector.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hvector.blocklength;
    uintptr_t x3 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t stride2 = md->u.contig.child->u.hvector.stride;
    uintptr_t extent3 = md->u.contig.child->u.hvector.child->extent;
    *((wchar_t *) (void *) (dbuf + idx * sizeof(wchar_t))) = *((const wchar_t *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * stride2 + x3 * extent3));
}

void yaksuri_cudai_pack_contig_hvector_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_hvector_resized_wchar_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_contig_hvector_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hvector.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hvector.blocklength;
    uintptr_t x3 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t stride2 = md->u.contig.child->u.hvector.stride;
    uintptr_t extent3 = md->u.contig.child->u.hvector.child->extent;
    *((wchar_t *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * stride2 + x3 * extent3)) = *((const wchar_t *) (const void *) (sbuf + idx * sizeof(wchar_t)));
}

void yaksuri_cudai_unpack_contig_hvector_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_hvector_resized_wchar_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

/* resized parent: the resized level contributes no coordinate of its own;
 * it only sets the extent used for the outer x0 step */
__global__ void yaksuri_cudai_kernel_pack_resized_hvector_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hvector.blocklength;
    uintptr_t x2 = res;

    intptr_t stride2 = md->u.resized.child->u.hvector.stride;
    uintptr_t extent3 = md->u.resized.child->u.hvector.child->extent;
    *((wchar_t *) (void *) (dbuf + idx * sizeof(wchar_t))) = *((const wchar_t *) (const void *) (sbuf + x0 * extent + x1 * stride2 + x2 * extent3));
}

void yaksuri_cudai_pack_resized_hvector_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_hvector_resized_wchar_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_resized_hvector_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hvector.blocklength;
    uintptr_t x2 = res;

    intptr_t stride2 = md->u.resized.child->u.hvector.stride;
    uintptr_t extent3 = md->u.resized.child->u.hvector.child->extent;
    *((wchar_t *) (void *) (dbuf + x0 * extent + x1 * stride2 + x2 * extent3)) = *((const wchar_t *) (const void *) (sbuf + idx * sizeof(wchar_t)));
}

void yaksuri_cudai_unpack_resized_hvector_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_hvector_resized_wchar_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}