/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 *
 * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
 */

#include <string.h>
#include <stdint.h>
#include <wchar.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "yaksuri_cudai_base.h"
#include "yaksuri_cudai_pup.h"

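/*
 * Pack/unpack kernels for a wchar_t basic type wrapped in two nested
 * "resized" layers (and, below, in hvector/blkhindx/hindexed/contig
 * layers on top of those).  Each kernel assigns one thread per wchar_t
 * element: pack gathers elements from the strided source layout
 * described by the metadata tree (md) into a contiguous output buffer;
 * unpack scatters a contiguous input buffer back into that layout.
 */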
__global__ void yaksuri_cudai_kernel_pack_resized_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res;

    *((wchar_t *) (void *) (dbuf + idx * sizeof(wchar_t))) = *((const wchar_t *) (const void *) (sbuf + x0 * extent));
}

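/*
 * Host-side launch wrapper: kernel arguments are passed by address, as
 * cudaLaunchKernel expects, and the kernel is enqueued on the per-device
 * stream held in yaksuri_cudai_global.  The caller supplies the grid
 * (n_blocks_x/y/z) and block (n_threads) dimensions; the kernel's bounds
 * check makes over-provisioned launches harmless.  All wrappers below
 * follow this same pattern.
 */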
void yaksuri_cudai_pack_resized_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_resized_wchar_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_resized_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res;

    *((wchar_t *) (void *) (dbuf + x0 * extent)) = *((const wchar_t *) (const void *) (sbuf + idx * sizeof(wchar_t)));
}

void yaksuri_cudai_unpack_resized_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_resized_wchar_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

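/*
 * hvector layout: the linear element index idx is decomposed into
 * (x0, x1, x2) by successive division, outermost to innermost:
 *   x0 = top-level count index            (byte stride: extent)
 *   x1 = vector block index               (byte stride: u.hvector.stride)
 *   x2 = element index within the block   (byte stride: child extent)
 * E.g., with illustrative values count = 2, u.hvector.count = 3,
 * u.hvector.blocklength = 4 (so num_elements = 12), idx = 17 decomposes
 * to x0 = 1, x1 = 1, x2 = 1.
 */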
__global__ void yaksuri_cudai_kernel_pack_hvector_resized_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res;

    intptr_t stride1 = md->u.hvector.stride;
    uintptr_t extent2 = md->u.hvector.child->extent;
    *((wchar_t *) (void *) (dbuf + idx * sizeof(wchar_t))) = *((const wchar_t *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2));
}

void yaksuri_cudai_pack_hvector_resized_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_resized_resized_wchar_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_hvector_resized_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res;

    intptr_t stride1 = md->u.hvector.stride;
    uintptr_t extent2 = md->u.hvector.child->extent;
    *((wchar_t *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2)) = *((const wchar_t *) (const void *) (sbuf + idx * sizeof(wchar_t)));
}

void yaksuri_cudai_unpack_hvector_resized_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_resized_resized_wchar_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

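/*
 * blkhindx (block-hindexed) layout: same (x0, x1, x2) decomposition as
 * hvector above, except each block x1 lives at an explicit byte
 * displacement array_of_displs[x1] instead of a uniform stride multiple.
 */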
__global__ void yaksuri_cudai_kernel_pack_blkhindx_resized_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    *((wchar_t *) (void *) (dbuf + idx * sizeof(wchar_t))) = *((const wchar_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2));
}

void yaksuri_cudai_pack_blkhindx_resized_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_resized_resized_wchar_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_blkhindx_resized_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    *((wchar_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2)) = *((const wchar_t *) (const void *) (sbuf + idx * sizeof(wchar_t)));
}

void yaksuri_cudai_unpack_blkhindx_resized_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_resized_resized_wchar_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

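/*
 * hindexed layout: blocks have individual lengths, so the block index x1
 * cannot be found with a single division.  The loop below walks the
 * blocks, subtracting each block's element count from res until the
 * remainder falls inside block i.  The bounds check on idx guarantees
 * that some block matches, so x1 is always assigned before use.
 */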
__global__ void yaksuri_cudai_kernel_pack_hindexed_resized_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x2 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    *((wchar_t *) (void *) (dbuf + idx * sizeof(wchar_t))) = *((const wchar_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2));
}

void yaksuri_cudai_pack_hindexed_resized_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_resized_resized_wchar_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_hindexed_resized_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x2 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    *((wchar_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2)) = *((const wchar_t *) (const void *) (sbuf + idx * sizeof(wchar_t)));
}

void yaksuri_cudai_unpack_hindexed_resized_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_resized_resized_wchar_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

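/*
 * contig layout: elements within one top-level count are spaced by the
 * child type's extent, so the inner offset is a single multiply.
 */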
__global__ void yaksuri_cudai_kernel_pack_contig_resized_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    *((wchar_t *) (void *) (dbuf + idx * sizeof(wchar_t))) = *((const wchar_t *) (const void *) (sbuf + x0 * extent + x1 * stride1));
}

void yaksuri_cudai_pack_contig_resized_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_resized_resized_wchar_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_contig_resized_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    *((wchar_t *) (void *) (dbuf + x0 * extent + x1 * stride1)) = *((const wchar_t *) (const void *) (sbuf + idx * sizeof(wchar_t)));
}

void yaksuri_cudai_unpack_contig_resized_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_resized_resized_wchar_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

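/*
 * A resized wrapper changes only the extent, so a chain of three resized
 * layers still reduces to a single extent-strided copy: only md->extent
 * (the outermost extent) enters the address computation.
 */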
__global__ void yaksuri_cudai_kernel_pack_resized_resized_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res;

    *((wchar_t *) (void *) (dbuf + idx * sizeof(wchar_t))) = *((const wchar_t *) (const void *) (sbuf + x0 * extent));
}

void yaksuri_cudai_pack_resized_resized_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_resized_resized_wchar_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_resized_resized_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res;

    *((wchar_t *) (void *) (dbuf + x0 * extent)) = *((const wchar_t *) (const void *) (sbuf + idx * sizeof(wchar_t)));
}

void yaksuri_cudai_unpack_resized_resized_resized_wchar_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_resized_resized_wchar_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}