1 /*
2 * Copyright (C) by Argonne National Laboratory
3 * See COPYRIGHT in top-level directory
4 *
5 * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
6 */
7
8 #include <string.h>
9 #include <stdint.h>
10 #include <wchar.h>
11 #include <assert.h>
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 #include "yaksuri_cudai_base.h"
15 #include "yaksuri_cudai_pup.h"
16
/* Pack contig(contig(int16_t)) into a contiguous output buffer.
 * One int16_t element per thread; launch a 1-D grid covering
 * count * md->num_elements threads (excess threads exit at the guard). */
__global__ void yaksuri_cudai_kernel_pack_contig_contig_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* decompose the flat index into per-level coordinates (x0, x1, x2) */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.count;

    uintptr_t x2 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t stride2 = md->u.contig.child->u.contig.child->extent;
    *((int16_t *) (void *) (dbuf + idx * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * stride2));
}
43
/* Host wrapper: launch the contig_contig pack kernel on the per-device
 * stream; launch errors are reported via YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_contig_contig_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_contig_int16_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
51
/* Unpack a contiguous buffer into a contig(contig(int16_t)) layout.
 * Mirror of the pack kernel: one element per thread, 1-D grid covering
 * count * md->num_elements threads. */
__global__ void yaksuri_cudai_kernel_unpack_contig_contig_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* decompose the flat index into per-level coordinates (x0, x1, x2) */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.count;

    uintptr_t x2 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t stride2 = md->u.contig.child->u.contig.child->extent;
    *((int16_t *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * stride2)) = *((const int16_t *) (const void *) (sbuf + idx * sizeof(int16_t)));
}
78
/* Host wrapper: launch the contig_contig unpack kernel on the per-device
 * stream; launch errors are reported via YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_contig_contig_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_contig_int16_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
86
/* Pack hvector(contig(contig(int16_t))) into a contiguous output buffer.
 * One int16_t element per thread; launch a 1-D grid covering
 * count * md->num_elements threads (excess threads exit at the guard). */
__global__ void yaksuri_cudai_kernel_pack_hvector_contig_contig_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* decompose the flat index into per-level coordinates (x0 .. x4) */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.child->u.contig.count;

    uintptr_t x4 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t stride2 = md->u.hvector.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t stride3 = md->u.hvector.child->u.contig.child->u.contig.child->extent;
    *((int16_t *) (void *) (dbuf + idx * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + x3 * stride2 + x4 * stride3));
}
122
/* Host wrapper: launch the hvector_contig_contig pack kernel on the
 * per-device stream; errors reported via YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_hvector_contig_contig_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_contig_contig_int16_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
130
/* Unpack a contiguous buffer into an hvector(contig(contig(int16_t)))
 * layout. Mirror of the pack kernel: one element per thread, 1-D grid
 * covering count * md->num_elements threads. */
__global__ void yaksuri_cudai_kernel_unpack_hvector_contig_contig_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* decompose the flat index into per-level coordinates (x0 .. x4) */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.child->u.contig.count;

    uintptr_t x4 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t stride2 = md->u.hvector.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t stride3 = md->u.hvector.child->u.contig.child->u.contig.child->extent;
    *((int16_t *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + x3 * stride2 + x4 * stride3)) = *((const int16_t *) (const void *) (sbuf + idx * sizeof(int16_t)));
}
166
/* Host wrapper: launch the hvector_contig_contig unpack kernel on the
 * per-device stream; errors reported via YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_hvector_contig_contig_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_contig_contig_int16_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
174
/* Pack blkhindx(contig(contig(int16_t))) into a contiguous output buffer.
 * One int16_t element per thread; launch a 1-D grid covering
 * count * md->num_elements threads (excess threads exit at the guard). */
__global__ void yaksuri_cudai_kernel_pack_blkhindx_contig_contig_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* decompose the flat index into per-level coordinates (x0 .. x4) */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.child->u.contig.count;

    uintptr_t x4 = res;

    /* per-block byte displacements replace the hvector stride at level 1 */
    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t stride2 = md->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t stride3 = md->u.blkhindx.child->u.contig.child->u.contig.child->extent;
    *((int16_t *) (void *) (dbuf + idx * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + x4 * stride3));
}
210
/* Host wrapper: launch the blkhindx_contig_contig pack kernel on the
 * per-device stream; errors reported via YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_blkhindx_contig_contig_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_contig_contig_int16_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
218
/* Unpack a contiguous buffer into a blkhindx(contig(contig(int16_t)))
 * layout. Mirror of the pack kernel: one element per thread, 1-D grid
 * covering count * md->num_elements threads. */
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_contig_contig_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* decompose the flat index into per-level coordinates (x0 .. x4) */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.child->u.contig.count;

    uintptr_t x4 = res;

    /* per-block byte displacements replace the hvector stride at level 1 */
    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t stride2 = md->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t stride3 = md->u.blkhindx.child->u.contig.child->u.contig.child->extent;
    *((int16_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + x4 * stride3)) = *((const int16_t *) (const void *) (sbuf + idx * sizeof(int16_t)));
}
254
/* Host wrapper: launch the blkhindx_contig_contig unpack kernel on the
 * per-device stream; errors reported via YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_blkhindx_contig_contig_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_contig_contig_int16_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
262
/* Pack hindexed(contig(contig(int16_t))) into a contiguous output buffer.
 * One int16_t element per thread; launch a 1-D grid covering
 * count * md->num_elements threads (excess threads exit at the guard).
 * Because hindexed blocks have variable lengths, level-1 coordinates are
 * found by a linear scan over the blocklength array. */
__global__ void yaksuri_cudai_kernel_pack_hindexed_contig_contig_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* scan for the hindexed block containing res; the bounds guard above
     * ensures the loop always breaks, the init only silences
     * maybe-uninitialized warnings */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.child->u.contig.count;

    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t stride2 = md->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t stride3 = md->u.hindexed.child->u.contig.child->u.contig.child->extent;
    *((int16_t *) (void *) (dbuf + idx * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + x4 * stride3));
}
309
/* Host wrapper: launch the hindexed_contig_contig pack kernel on the
 * per-device stream; errors reported via YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_hindexed_contig_contig_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_contig_contig_int16_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
317
/* Unpack a contiguous buffer into an hindexed(contig(contig(int16_t)))
 * layout. Mirror of the pack kernel: one element per thread, 1-D grid
 * covering count * md->num_elements threads; level-1 coordinates are
 * found by a linear scan over the variable blocklengths. */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_contig_contig_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* scan for the hindexed block containing res; the bounds guard above
     * ensures the loop always breaks, the init only silences
     * maybe-uninitialized warnings */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.child->u.contig.count;

    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t stride2 = md->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t stride3 = md->u.hindexed.child->u.contig.child->u.contig.child->extent;
    *((int16_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + x4 * stride3)) = *((const int16_t *) (const void *) (sbuf + idx * sizeof(int16_t)));
}
364
/* Host wrapper: launch the hindexed_contig_contig unpack kernel on the
 * per-device stream; errors reported via YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_hindexed_contig_contig_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_contig_contig_int16_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
372
/* Pack contig(contig(contig(int16_t))) into a contiguous output buffer.
 * One int16_t element per thread; launch a 1-D grid covering
 * count * md->num_elements threads (excess threads exit at the guard). */
__global__ void yaksuri_cudai_kernel_pack_contig_contig_contig_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* decompose the flat index into per-level coordinates (x0 .. x3) */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.child->u.contig.count;

    uintptr_t x3 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t stride2 = md->u.contig.child->u.contig.child->extent;
    intptr_t stride3 = md->u.contig.child->u.contig.child->u.contig.child->extent;
    *((int16_t *) (void *) (dbuf + idx * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * stride2 + x3 * stride3));
}
404
/* Host wrapper: launch the contig_contig_contig pack kernel on the
 * per-device stream; errors reported via YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_contig_contig_contig_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_contig_contig_int16_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
412
/* Unpack a contiguous buffer into a contig(contig(contig(int16_t)))
 * layout. Mirror of the pack kernel: one element per thread, 1-D grid
 * covering count * md->num_elements threads. */
__global__ void yaksuri_cudai_kernel_unpack_contig_contig_contig_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* decompose the flat index into per-level coordinates (x0 .. x3) */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.child->u.contig.count;

    uintptr_t x3 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t stride2 = md->u.contig.child->u.contig.child->extent;
    intptr_t stride3 = md->u.contig.child->u.contig.child->u.contig.child->extent;
    *((int16_t *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * stride2 + x3 * stride3)) = *((const int16_t *) (const void *) (sbuf + idx * sizeof(int16_t)));
}
444
/* Host wrapper: launch the contig_contig_contig unpack kernel on the
 * per-device stream; errors reported via YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_contig_contig_contig_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_contig_contig_int16_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
452
/* Pack resized(contig(contig(int16_t))) into a contiguous output buffer.
 * One int16_t element per thread; launch a 1-D grid covering
 * count * md->num_elements threads (excess threads exit at the guard).
 * The resized level only changes the outer extent, so it contributes
 * no coordinate of its own. */
__global__ void yaksuri_cudai_kernel_pack_resized_contig_contig_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* decompose the flat index into per-level coordinates (x0 .. x2) */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.child->u.contig.count;

    uintptr_t x2 = res;

    intptr_t stride2 = md->u.resized.child->u.contig.child->extent;
    intptr_t stride3 = md->u.resized.child->u.contig.child->u.contig.child->extent;
    *((int16_t *) (void *) (dbuf + idx * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + x0 * extent + x1 * stride2 + x2 * stride3));
}
479
/* Host wrapper: launch the resized_contig_contig pack kernel on the
 * per-device stream; errors reported via YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_resized_contig_contig_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_contig_contig_int16_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
487
/* Unpack a contiguous buffer into a resized(contig(contig(int16_t)))
 * layout. Mirror of the pack kernel: one element per thread, 1-D grid
 * covering count * md->num_elements threads; the resized level only
 * changes the outer extent and contributes no coordinate. */
__global__ void yaksuri_cudai_kernel_unpack_resized_contig_contig_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* decompose the flat index into per-level coordinates (x0 .. x2) */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.child->u.contig.count;

    uintptr_t x2 = res;

    intptr_t stride2 = md->u.resized.child->u.contig.child->extent;
    intptr_t stride3 = md->u.resized.child->u.contig.child->u.contig.child->extent;
    *((int16_t *) (void *) (dbuf + x0 * extent + x1 * stride2 + x2 * stride3)) = *((const int16_t *) (const void *) (sbuf + idx * sizeof(int16_t)));
}
514
/* Host wrapper: launch the resized_contig_contig unpack kernel on the
 * per-device stream; errors reported via YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_resized_contig_contig_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_contig_contig_int16_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
522
523