1 #include <cuda_runtime.h>
2 #include <cuComplex.h>
3 #include <complex.h>
4 #include <stdio.h>
5 #include <omp.h>
6 
7 #define type double
8 
9 #define STR1(X) #X
10 #define STR(X) STR1(X)
11 #define STRINGIFY(X,Y) X ## Y
12 #define CON(X,Y) STRINGIFY(X,Y)
13 
14 #define KDir kernels
15 
16 #include "includes/ourmacros.h"
17 
18 #define FNAME fvimatchg32_blocking.h
19 #include "includes/macro.h"
20 #undef FNAME
21 
22 
/*
 * Host-side launcher for the fvimatchg32_blocking kernel family.
 *
 * Builds a launch grid of `numblocks` blocks laid out along x only and hands
 * the launch to the project's EXPANDDIMS macro inside a switch on `ndim`.
 * Any `ndim` for which EXPANDDIMS provides no case falls to the empty default
 * and nothing is launched.
 *
 * NOTE(review): EXPANDDIMS is defined in the project's macro headers and is
 * not visible here; presumably it emits one `case` per supported ndim that
 * launches fvimatchg32_blocking_kernel_<ndim> with this grid, `numthreads`
 * threads per block and 0 bytes of dynamic shared memory -- confirm.
 *
 * All remaining arguments are forwarded to the kernel unchanged. No error
 * checking is done here; the caller is expected to query the CUDA error state.
 */
void fvimatchg32_blockingCallerWrapper(int ndim, const type * A, type * B,
		const int size0, const int size1, const int size2, const int param0,
		const int numblocks, const int numthreads,
		const int * __restrict__ lda_s, const int * __restrict__ ldb_s, const int * __restrict__ idx_s,
		const int lda_kernel1, const int ldb_kernel1, type alpha, type beta)
{
	// One-dimensional grid: y and z extents are fixed at 1.
	dim3 thread_blocks(numblocks, 1, 1);

	switch(ndim)
	{
		EXPANDDIMS(fvimatchg32_blocking_kernel_, thread_blocks, numthreads,0, ( A,  B, size0,size1, size2, param0, lda_s,ldb_s,idx_s, lda_kernel1, ldb_kernel1, alpha, beta))
		default:
			break;	// unsupported dimensionality: nothing to launch
	}
}
41 void swap(int array[], int ind1, int ind2);
42 
43 
/*
 * Host driver for the fvimatchg32_blocking transpose kernels.
 *
 * Derives per-dimension stride and block-count tables from the tensor
 * extents, uploads them to the device, launches the kernel through
 * fvimatchg32_blockingCallerWrapper, reports any pending CUDA error and
 * releases the temporary device tables.
 *
 * Parameters (as used by the code below):
 *   ndim    number of tensor dimensions; must be in [1, 20] because the host
 *           scratch tables have 20 entries. Other values are rejected with a
 *           message and nothing is launched.
 *   A, B    forwarded untouched to the launcher
 *           (presumably device pointers to the input/output tensors -- confirm).
 *   lda     extents of A; running products of lda[] form the A strides.
 *   ldb     extents of B; running products of ldb[] form the B strides.
 *   params  params[0] = blocking factor, params[2] = threads per block,
 *           params[3] = index selecting lda_kernel1 from the A strides,
 *           params[4] = index of the second blocked dimension (also selects
 *           ldb_kernel1 from the B strides), params[6] = number of blocks.
 *           params[1] and params[5] are not read here (except by debug output).
 *   perm    permutation; entry i of the uploaded A-stride table is the A
 *           stride of dimension perm[i].
 *   alpha, beta  forwarded untouched to the kernel.
 *
 * NOTE: when `printd` is defined the debug output reads five entries of
 * lda/ldb/perm and seven of params regardless of ndim.
 */
extern "C"
void  fvimatchg32_blocking_transpose_kernel(int ndim, const type *A, type *B,  const int *lda, const int *ldb, const int* params, const int * perm, type alpha, type beta)
{
	// Capacity of the fixed-size host scratch tables declared below.
	const int kMaxTransposeDims = 20;

	// Reject dimensionalities that would overflow the scratch tables or
	// produce a nonsensical device allocation size.
	if(ndim < 1 || ndim > kMaxTransposeDims)
	{
		printf("\nERROR in fvimatchg32_blocking_transpose_kernel: unsupported ndim %d (must be between 1 and %d)\n", ndim, kMaxTransposeDims);
		return;
	}

#ifdef printd
	printf("\nA Dims: %d \t %d \t %d\t %d\t %d\n", lda[0], lda[1], lda[2], lda[3], lda[4]);
	printf("\nParams: %d \t %d \t %d\t %d\t %d\t %d\t %d\n", params[0], params[1], params[2], params[3], params[4], params[5], params[6]);
	printf("\nB Dims: %d \t %d \t %d\t %d\t %d\n", ldb[0], ldb[1], ldb[2], ldb[3], ldb[4]);
	printf("\nR Perm: %d \t %d \t %d\t %d\t %d\n", perm[0], perm[1], perm[2], perm[3], perm[4]);
#endif

	// Grid size is precomputed by the caller.
	int numBlocks = params[6];

	int *d_lda_s, *d_ldb_s,  *d_idx_s;

	// Zero-initialised so that entries never assigned below (notably
	// idx_s[0] and temp[0]) are uploaded as 0 rather than stack garbage.
	int lda_s[kMaxTransposeDims] = {0};
	int ldb_s[kMaxTransposeDims] = {0};
	int idx_s[kMaxTransposeDims] = {0};
	int temp[kMaxTransposeDims] = {0};

	lda_s[0] = 1;
	ldb_s[0] = 1;
	int i, blockA=params[0];

	// Dimension 1 is always blocked: number of blockA-sized tiles (rounded up).
	idx_s[1] = (ldb[1] + blockA - 1) / blockA;
	lda_s[1] = lda_s[0] * lda[0];
	ldb_s[1] = ldb_s[0] * ldb[0];
	for(i = 2; i < ndim; i++)
	{
		if(i == params[4])
		{
			// Second blocked dimension: tile count, rounded up.
			idx_s[i] = (ldb[i] + blockA - 1)/blockA;
		}
		else
		{
			// Unblocked dimension: full extent.
			idx_s[i] = ldb[i];
		}
		// Stride of dimension i = product of all lower extents.
		lda_s[i] = lda_s[i-1] * lda[i-1];
		ldb_s[i] = ldb_s[i-1] * ldb[i-1];
	}

	// Reorder the A strides according to perm.
	for(i = 1; i < ndim; i++)
	{
#ifdef printd
		printf("%d ", idx_s[i]);
#endif
		temp[i] = lda_s[perm[i]];
	}
#ifdef printd
	printf("\n");
#endif

	// Captured before the swap below so they refer to the original ordering.
	const int lda_kernel1 = lda_s[params[3]];
	const  int ldb_kernel1 = ldb_s[params[4]];

	// Move the second blocked dimension into slot 2 of every uploaded table.
	// NOTE(review): swap() is defined elsewhere; presumably it exchanges
	// array[ind1] and array[ind2] -- confirm.
	if(params[4] != 2)
	{
		swap(idx_s, 2, params[4]);
		swap(ldb_s, 2, params[4]);
		swap(temp, 2, params[4]);
	}

	// Upload the three tables; the device copy named d_lda_s holds the
	// permuted A strides (temp), not lda_s itself.
	SAFECUDAMALLOC(&d_lda_s,ndim*sizeof(int));
	SAFECUDAMALLOC(&d_ldb_s,ndim*sizeof(int));
	SAFECUDAMALLOC(&d_idx_s,ndim*sizeof(int));
	SAFECUDAMEMCPY(d_idx_s, idx_s,ndim*sizeof(int), cudaMemcpyHostToDevice);
	SAFECUDAMEMCPY(d_lda_s, temp,ndim*sizeof(int), cudaMemcpyHostToDevice);
	SAFECUDAMEMCPY(d_ldb_s, ldb_s,ndim*sizeof(int), cudaMemcpyHostToDevice);


#ifdef NOHTIME
#include "includes/nohtimestart.h"
#endif

	fvimatchg32_blockingCallerWrapper(ndim, A,  B, lda[0], lda[1], ldb[1],params[0],
			numBlocks, params[2]
			, d_lda_s, d_ldb_s,d_idx_s
			, lda_kernel1, ldb_kernel1, alpha, beta);

#ifdef NOHTIME
#include "includes/nohtimestop.h"
#endif

	// Report (but do not abort on) any error left pending by the launch.
	{cudaError_t err = cudaGetLastError();
		if(err != cudaSuccess){
			printf("\nKernel ERROR in dCuKernel %s (line: %d)\n", cudaGetErrorString(err), __LINE__);
			//exit(-1);
		}}

	cudaFree(d_lda_s);
	cudaFree(d_ldb_s);
	cudaFree(d_idx_s);
}
141 
142 
143 
144