1 #include <cuda_runtime.h>
2 #include <cuComplex.h>
3 #include <complex.h>
4 #include <stdio.h>
5 #include <omp.h>
6
7 #define type double
8
9 #define STR1(X) #X
10 #define STR(X) STR1(X)
11 #define STRINGIFY(X,Y) X ## Y
12 #define CON(X,Y) STRINGIFY(X,Y)
13
14 #define KDir kernels
15
16 #include "includes/ourmacros.h"
17
18 #define FNAME fvimatchg32_blocking.h
19 #include "includes/macro.h"
20 #undef FNAME
21
22
/*
 * Dispatches the fvimatchg32 blocking transpose kernel for a tensor of rank
 * `ndim`.
 *
 * The launch itself is generated by the project macro EXPANDDIMS (defined in
 * one of the included project headers, not visible here); it is handed the
 * kernel-name prefix, the grid dimensions, `numthreads`, a literal 0 and the
 * kernel argument list. NOTE(review): presumably it expands to one `case`
 * label per supported rank, with `numthreads` as the block size and 0 as the
 * dynamic shared-memory size -- confirm against includes/macro.h.
 *
 * Parameters:
 *   ndim                      rank used to select the kernel specialisation
 *   A, B                      input / output buffers forwarded to the kernel
 *   size0, size1, size2       extents forwarded to the kernel
 *   param0                    blocking parameter forwarded to the kernel
 *   numblocks                 number of thread blocks (x dimension of the grid)
 *   numthreads                forwarded to EXPANDDIMS as the launch's thread count
 *   lda_s, ldb_s, idx_s       index/stride tables forwarded to the kernel
 *   lda_kernel1, ldb_kernel1  strides forwarded to the kernel
 *   alpha, beta               scaling factors forwarded to the kernel
 *
 * A rank that matches none of the generated cases is reported on stderr
 * instead of being ignored silently: without the message the caller would
 * see an untouched B and no CUDA error.
 */
void fvimatchg32_blockingCallerWrapper(int ndim, const type * A, type * B,const int size0, const int size1, const int size2, const int param0, const int numblocks, const int numthreads
		, const int * __restrict__ lda_s, const int* __restrict__ ldb_s, const int* __restrict__ idx_s
		, const int lda_kernel1, const int ldb_kernel1, type alpha, type beta)
{
	// The grid is one-dimensional: all blocks are laid out along x.
	dim3 thread_blocks(numblocks, 1, 1);

	switch(ndim)
	{
		EXPANDDIMS(fvimatchg32_blocking_kernel_, thread_blocks, numthreads,0, ( A, B, size0,size1, size2, param0, lda_s,ldb_s,idx_s, lda_kernel1, ldb_kernel1, alpha, beta))
		default:
		{
			// No kernel was launched for this rank; make that visible.
			fprintf(stderr, "\nfvimatchg32_blockingCallerWrapper: no kernel available for ndim = %d, nothing launched\n", ndim);
		}
	}
}
// Declared here, defined elsewhere in the project.
// NOTE(review): presumably exchanges array[ind1] and array[ind2] -- confirm.
void swap(int array[], int ind1, int ind2);


/*
 * Host entry point for the fvimatchg32 "blocking" transpose.
 *
 * Builds three per-dimension tables on the host, uploads them to the device,
 * launches the kernel through fvimatchg32_blockingCallerWrapper and releases
 * the device tables again.
 *
 * Parameters:
 *   ndim    number of dimensions; must be in [2, 20] because lda[1]/ldb[1]
 *           are read unconditionally and the host tables hold 20 entries
 *   A, B    input / output buffers, forwarded to the kernel untouched
 *   lda     per-dimension values of A; their running products form lda_s
 *   ldb     per-dimension values of B; their running products form ldb_s
 *   params  tuning parameters; the entries read here are
 *             params[0]  block size used for the ceil-divisions (blockA)
 *             params[2]  forwarded as the wrapper's `numthreads`
 *             params[3]  index into lda_s that selects lda_kernel1
 *             params[4]  index into ldb_s that selects ldb_kernel1; also the
 *                        dimension (besides 1) whose extent is divided by blockA
 *             params[6]  number of thread blocks
 *   perm    permutation used to gather lda_s into the table sent as d_lda_s
 *   alpha, beta  scaling factors forwarded to the kernel
 *
 * Invalid arguments are reported on stderr and the function returns without
 * allocating or launching anything.
 */
extern "C"
void fvimatchg32_blocking_transpose_kernel(int ndim, const type *A, type *B, const int *lda, const int *ldb, const int* params, const int * perm, type alpha, type beta)
{
#ifdef printd
	printf("\nA Dims: %d \t %d \t %d\t %d\t %d\n", lda[0], lda[1], lda[2], lda[3], lda[4]);
	printf("\nParams: %d \t %d \t %d\t %d\t %d\t %d\t %d\n", params[0], params[1], params[2], params[3], params[4], params[5], params[6]);
	printf("\nB Dims: %d \t %d \t %d\t %d\t %d\n", ldb[0], ldb[1], ldb[2], ldb[3], ldb[4]);
	printf("\nR Perm: %d \t %d \t %d\t %d\t %d\n", perm[0], perm[1], perm[2], perm[3], perm[4]);
#endif

	// Capacity of the host-side tables below.
	enum { kMaxDims = 20 };

	// lda[1] and ldb[1] are always read, and the tables are fixed-size stack
	// arrays, so reject ranks that would read or write out of bounds.
	if(ndim < 2 || ndim > kMaxDims)
	{
		fprintf(stderr, "\nfvimatchg32_blocking_transpose_kernel: unsupported ndim %d (expected 2..%d)\n", ndim, (int)kMaxDims);
		return;
	}

	const int blockA = params[0];
	// blockA is a divisor in the ceil-divisions below.
	if(blockA <= 0)
	{
		fprintf(stderr, "\nfvimatchg32_blocking_transpose_kernel: invalid block size %d\n", blockA);
		return;
	}

	const int strideIdxA = params[3];
	const int strideIdxB = params[4];
	// Both values index the fixed-size tables; keep them inside the arrays.
	if(strideIdxA < 0 || strideIdxA >= kMaxDims || strideIdxB < 0 || strideIdxB >= kMaxDims)
	{
		fprintf(stderr, "\nfvimatchg32_blocking_transpose_kernel: stride indices out of range (%d, %d)\n", strideIdxA, strideIdxB);
		return;
	}

	int numBlocks = params[6];//((size[1] + 8 -1)/8) * size[2] * ((size[3] + 8 -1)/8) * size[4] ;

	int *d_lda_s = NULL, *d_ldb_s = NULL, *d_idx_s = NULL;

	// Zero-initialised so that slots never written below (index 0 of idx_s and
	// temp, and anything past ndim touched by the swaps) hold a defined value
	// when they are read or copied to the device.
	int lda_s[kMaxDims] = {0}, ldb_s[kMaxDims] = {0}, idx_s[kMaxDims] = {0}, temp[kMaxDims] = {0};
	int i;

	// lda_s / ldb_s are the running products of lda / ldb.
	lda_s[0] = 1;
	ldb_s[0] = 1;
	// Dimension 1 of B is always counted in blocks of blockA (ceil-division).
	idx_s[1] = (ldb[1] + blockA - 1) / blockA;
	lda_s[1] = lda_s[0] * lda[0];
	ldb_s[1] = ldb_s[0] * ldb[0];
	for(i = 2; i < ndim; i++)
	{
		if(i == strideIdxB)
		{
			// The dimension selected by params[4] is blocked as well.
			idx_s[i] = (ldb[i] + blockA - 1)/blockA;
		}
		else
		{
			idx_s[i] = ldb[i];
		}
		lda_s[i] = lda_s[i-1] * lda[i-1];
		ldb_s[i] = ldb_s[i-1] * ldb[i-1];
	}

	// Gather lda_s through perm; the result is what gets uploaded as d_lda_s.
	for(i = 1; i < ndim; i++)
	{
#ifdef printd
		printf("%d ", idx_s[i]);
#endif
		if(perm[i] < 0 || perm[i] >= kMaxDims)
		{
			fprintf(stderr, "\nfvimatchg32_blocking_transpose_kernel: perm[%d] = %d is out of range\n", i, perm[i]);
			return;
		}
		temp[i] = lda_s[perm[i]];
	}
#ifdef printd
	printf("\n");
#endif

	// Must be captured before the tables are reordered below.
	const int lda_kernel1 = lda_s[strideIdxA];
	const int ldb_kernel1 = ldb_s[strideIdxB];

	/* ldb_s[1] *= blockA;
	   ldb_s[params[4]] *= blockA;
	   temp[1] *= blockA;
	   temp[params[4]] *= blockA;
	 */
	// Bring the entry for dimension params[4] to slot 2 of every uploaded table.
	if(strideIdxB != 2)
	{
		swap(idx_s, 2, strideIdxB);
		swap(ldb_s, 2, strideIdxB);
		swap(temp, 2, strideIdxB);
	}

	// Device copies of the three tables (ndim ints each).
	SAFECUDAMALLOC(&d_lda_s,ndim*sizeof(int));
	SAFECUDAMALLOC(&d_ldb_s,ndim*sizeof(int));
	SAFECUDAMALLOC(&d_idx_s,ndim*sizeof(int));
	SAFECUDAMEMCPY(d_idx_s, idx_s,ndim*sizeof(int), cudaMemcpyHostToDevice);
	SAFECUDAMEMCPY(d_lda_s, temp,ndim*sizeof(int), cudaMemcpyHostToDevice);
	SAFECUDAMEMCPY(d_ldb_s, ldb_s,ndim*sizeof(int), cudaMemcpyHostToDevice);

#ifdef NOHTIME
#include "includes/nohtimestart.h"
#endif

	fvimatchg32_blockingCallerWrapper(ndim, A, B, lda[0], lda[1], ldb[1],params[0],
			numBlocks, params[2]
			, d_lda_s, d_ldb_s,d_idx_s
			, lda_kernel1, ldb_kernel1, alpha, beta);

#ifdef NOHTIME
#include "includes/nohtimestop.h"
#endif

	// Report (but do not abort on) any error recorded by the launch.
	{cudaError_t err = cudaGetLastError();
		if(err != cudaSuccess){
			printf("\nKernel ERROR in dCuKernel %s (line: %d)\n", cudaGetErrorString(err), __LINE__);
			//exit(-1);
		}}

	cudaFree(d_lda_s);
	cudaFree(d_ldb_s);
	cudaFree(d_idx_s);
}
141
142
143
144