1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  *
5  * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
6  */
7 
8 #include <string.h>
9 #include <stdint.h>
10 #include <wchar.h>
11 #include <assert.h>
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 #include "yaksuri_cudai_base.h"
15 #include "yaksuri_cudai_pup.h"
16 
/*
 * Pack kernel for an hindexed layout of int16_t elements.
 *
 * Each thread copies exactly one int16_t from the (non-contiguous) input
 * buffer into slot `idx` of the contiguous output buffer, where `idx` is the
 * thread's flat index derived from blockIdx.x, blockDim.x and threadIdx.x.
 *
 *   inbuf  - base address of the source data
 *   outbuf - base address of the packed destination
 *   count  - number of top-level datatype instances to pack
 *   md     - layout metadata; it is dereferenced in device code, so it must
 *            be readable from the device
 *
 * Threads whose index is >= count * md->num_elements do nothing.
 *
 * NOTE(review): only the x dimension of the grid contributes to the index,
 * while the host launcher passes a 3-D grid -- confirm that the y/z block
 * counts are always 1 for this kernel.
 */
__global__ void yaksuri_cudai_kernel_pack_hindexed_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Widen before multiplying: blockIdx.x and blockDim.x are 32-bit unsigned,
     * so the product would otherwise wrap around for very large launches. */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: which top-level instance; res: element offset inside that instance */
    uintptr_t x0 = idx / inner_elements;
    uintptr_t res = idx % inner_elements;

    /* x1: index of the hindexed block containing element `res`.  Initialized
     * defensively so it is never read uninitialized; the loop is expected to
     * assign it (assumes md->num_elements matches the sum of the per-block
     * element counts -- TODO confirm against the metadata producer). */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            break;
        }
        /* not in this block: skip past its elements */
        res -= in_elems;
    }

    /* x2: element offset inside block x1 */
    uintptr_t x2 = res;

    const intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    *((int16_t *) (void *) (dbuf + idx * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * sizeof(int16_t)));
}
52 
/*
 * Host-side launcher for yaksuri_cudai_kernel_pack_hindexed_int16_t.
 *
 * Launches the kernel with a grid of (n_blocks_x, n_blocks_y, n_blocks_z)
 * blocks and n_threads threads per block, no dynamic shared memory, on the
 * stream stored in yaksuri_cudai_global.stream[device].  The status returned
 * by cudaLaunchKernel is handed to YAKSURI_CUDAI_CUDA_ERR_CHECK.
 */
void yaksuri_cudai_pack_hindexed_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* cudaLaunchKernel expects one pointer per kernel parameter, in order */
    void *kernel_args[4];
    kernel_args[0] = &inbuf;
    kernel_args[1] = &outbuf;
    kernel_args[2] = &count;
    kernel_args[3] = &md;

    const dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    const dim3 block_dims(n_threads);

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_int16_t,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
60 
/*
 * Unpack kernel for an hindexed layout of int16_t elements.
 *
 * Each thread copies exactly one int16_t from slot `idx` of the contiguous
 * input buffer to its (non-contiguous) position in the output buffer, where
 * `idx` is the thread's flat index derived from blockIdx.x, blockDim.x and
 * threadIdx.x.
 *
 *   inbuf  - base address of the packed source
 *   outbuf - base address of the destination data
 *   count  - number of top-level datatype instances to unpack
 *   md     - layout metadata; it is dereferenced in device code, so it must
 *            be readable from the device
 *
 * Threads whose index is >= count * md->num_elements do nothing.
 *
 * NOTE(review): only the x dimension of the grid contributes to the index,
 * while the host launcher passes a 3-D grid -- confirm that the y/z block
 * counts are always 1 for this kernel.
 */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Widen before multiplying: blockIdx.x and blockDim.x are 32-bit unsigned,
     * so the product would otherwise wrap around for very large launches. */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: which top-level instance; res: element offset inside that instance */
    uintptr_t x0 = idx / inner_elements;
    uintptr_t res = idx % inner_elements;

    /* x1: index of the hindexed block containing element `res`.  Initialized
     * defensively so it is never read uninitialized; the loop is expected to
     * assign it (assumes md->num_elements matches the sum of the per-block
     * element counts -- TODO confirm against the metadata producer). */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            break;
        }
        /* not in this block: skip past its elements */
        res -= in_elems;
    }

    /* x2: element offset inside block x1 */
    uintptr_t x2 = res;

    const intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    *((int16_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + idx * sizeof(int16_t)));
}
96 
/*
 * Host-side launcher for yaksuri_cudai_kernel_unpack_hindexed_int16_t.
 *
 * Launches the kernel with a grid of (n_blocks_x, n_blocks_y, n_blocks_z)
 * blocks and n_threads threads per block, no dynamic shared memory, on the
 * stream stored in yaksuri_cudai_global.stream[device].  The status returned
 * by cudaLaunchKernel is handed to YAKSURI_CUDAI_CUDA_ERR_CHECK.
 */
void yaksuri_cudai_unpack_hindexed_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* cudaLaunchKernel expects one pointer per kernel parameter, in order */
    void *kernel_args[4];
    kernel_args[0] = &inbuf;
    kernel_args[1] = &outbuf;
    kernel_args[2] = &count;
    kernel_args[3] = &md;

    const dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    const dim3 block_dims(n_threads);

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_int16_t,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
104 
105