1 /*
2 * Copyright (C) by Argonne National Laboratory
3 * See COPYRIGHT in top-level directory
4 *
5 * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
6 */
7
8 #include <string.h>
9 #include <stdint.h>
10 #include <wchar.h>
11 #include <assert.h>
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 #include "yaksuri_cudai_base.h"
15 #include "yaksuri_cudai_pup.h"
16
/*
 * Pack kernel for an hindexed layout whose leaf element is int32_t.
 *
 * Each thread copies exactly one int32_t from the (non-contiguous) input
 * buffer into slot `idx` of the contiguous output buffer.  The flat thread
 * index is decomposed into three coordinates to locate the source element:
 *   x0 - index of the top-level element; consecutive ones are md->extent
 *        bytes apart in the input buffer
 *   x1 - index of the hindexed block; selects the byte offset
 *        md->u.hindexed.array_of_displs[x1]
 *   x2 - element offset inside that block (scaled by sizeof(int32_t))
 *
 * inbuf   source buffer
 * outbuf  destination buffer, written densely at idx * sizeof(int32_t)
 * count   number of top-level elements to pack
 * md      metadata describing the hindexed layout (read by every thread)
 *
 * Only blockIdx.x, blockDim.x and threadIdx.x take part in the index
 * computation; blockIdx.y/z are not consulted, so blocks that differ only
 * in y/z compute the same idx.  Threads whose idx is past
 * count * md->num_elements return without touching memory.
 */
__global__ void yaksuri_cudai_kernel_pack_hindexed_int32_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;

    /* Widen before multiplying: blockIdx.x * blockDim.x would otherwise be
     * computed in 32-bit unsigned arithmetic and wrap for large grids. */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* Split off the top-level element index; res becomes the offset
     * within a single top-level element. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;

    /* Walk the blocks, subtracting each block's element count until the
     * block containing res is found.  x1 starts at 0 so that it is never
     * read uninitialized if no block matches. */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
            md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            break;
        }
        res -= in_elems;
    }

    /* What is left of res is the element offset inside block x1. */
    uintptr_t x2 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    *((int32_t *) (void *) (dbuf + idx * sizeof(int32_t))) =
        *((const int32_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * sizeof(int32_t)));
}
52
/*
 * Host-side launcher for yaksuri_cudai_kernel_pack_hindexed_int32_t.
 *
 * Launches the kernel with a grid of (n_blocks_x, n_blocks_y, n_blocks_z)
 * blocks of n_threads threads each, no dynamic shared memory, on
 * yaksuri_cudai_global.stream[device].  The kernel arguments are inbuf,
 * outbuf, count and md, in that order.  The status returned by
 * cudaLaunchKernel is handed to YAKSURI_CUDAI_CUDA_ERR_CHECK.
 */
void yaksuri_cudai_pack_hindexed_int32_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    const dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    const dim3 block_dims(n_threads);

    /* cudaLaunchKernel takes an array of pointers to the argument values. */
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_int32_t,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
60
/*
 * Unpack kernel for an hindexed layout whose leaf element is int32_t.
 *
 * Each thread copies exactly one int32_t from slot `idx` of the contiguous
 * input buffer into its (non-contiguous) position in the output buffer.
 * The flat thread index is decomposed into three coordinates to locate the
 * destination element:
 *   x0 - index of the top-level element; consecutive ones are md->extent
 *        bytes apart in the output buffer
 *   x1 - index of the hindexed block; selects the byte offset
 *        md->u.hindexed.array_of_displs[x1]
 *   x2 - element offset inside that block (scaled by sizeof(int32_t))
 *
 * inbuf   source buffer, read densely at idx * sizeof(int32_t)
 * outbuf  destination buffer
 * count   number of top-level elements to unpack
 * md      metadata describing the hindexed layout (read by every thread)
 *
 * Only blockIdx.x, blockDim.x and threadIdx.x take part in the index
 * computation; blockIdx.y/z are not consulted, so blocks that differ only
 * in y/z compute the same idx.  Threads whose idx is past
 * count * md->num_elements return without touching memory.
 */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_int32_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;

    /* Widen before multiplying: blockIdx.x * blockDim.x would otherwise be
     * computed in 32-bit unsigned arithmetic and wrap for large grids. */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* Split off the top-level element index; res becomes the offset
     * within a single top-level element. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;

    /* Walk the blocks, subtracting each block's element count until the
     * block containing res is found.  x1 starts at 0 so that it is never
     * read uninitialized if no block matches. */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
            md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            break;
        }
        res -= in_elems;
    }

    /* What is left of res is the element offset inside block x1. */
    uintptr_t x2 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    *((int32_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * sizeof(int32_t))) =
        *((const int32_t *) (const void *) (sbuf + idx * sizeof(int32_t)));
}
96
/*
 * Host-side launcher for yaksuri_cudai_kernel_unpack_hindexed_int32_t.
 *
 * Launches the kernel with a grid of (n_blocks_x, n_blocks_y, n_blocks_z)
 * blocks of n_threads threads each, no dynamic shared memory, on
 * yaksuri_cudai_global.stream[device].  The kernel arguments are inbuf,
 * outbuf, count and md, in that order.  The status returned by
 * cudaLaunchKernel is handed to YAKSURI_CUDAI_CUDA_ERR_CHECK.
 */
void yaksuri_cudai_unpack_hindexed_int32_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    const dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    const dim3 block_dims(n_threads);

    /* cudaLaunchKernel takes an array of pointers to the argument values. */
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_int32_t,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
104
105