1 /*
2 
3    matmul.c : Matrix Multiplication with tiling for openmp4 example
4 
5 */
6 
7 #include <stdlib.h>
8 #include <math.h>
9 
10 #define BLOCK_SIZE 16
11 /*
12   #define BLOCK_SIZE 32
13 */
14 #define NSECPERSEC 1000000000L
15 
16 typedef struct {
17    int width;
18    int height;
19    int stride;
20    int hpad;
21    float* elements;
22 } Matrix;
23 
24 /* Correctly extract the number of nanoseconds from the two time structures */
get_nanosecs(struct timespec start_time,struct timespec end_time)25 long int get_nanosecs( struct timespec start_time, struct timespec end_time) {
26    long int nanosecs;
27    if ((end_time.tv_nsec-start_time.tv_nsec)<0) nanosecs =
28       ((((long int) end_time.tv_sec- (long int) start_time.tv_sec )-1)*NSECPERSEC ) +
29       ( NSECPERSEC + (long int) end_time.tv_nsec - (long int) start_time.tv_nsec) ;
30    else nanosecs =
31       (((long int) end_time.tv_sec- (long int) start_time.tv_sec )*NSECPERSEC ) +
32       ( (long int) end_time.tv_nsec - (long int) start_time.tv_nsec );
33    return nanosecs;
34 }
35 
36 void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
37      const float* B,const int LDB, const float beta,float* C, const int LDC) ;
38 void simple_sgemm_tn(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
39      const float* B,const int LDB, const float beta,float* C, const int LDC) ;
40 void  tiled_sgemm_tt(const int M,const int N,const int K,const float alpha, const float*A, const int LDA,
41      const float* B,const int LDB, const float beta,float* C, const int LDC) ;
42 
verify(float * v_res,float * v_ref,int len)43 int verify(float* v_res, float* v_ref, int len) {
44     int passed = 1;
45     int i;
46     for (i = 0; i < len; ++i) {
47         if (fabs(v_res[i] - v_ref[i]) > 0.001*v_ref[i]) {
48 	  __builtin_abort ();
49         }
50     }
51     return passed;
52 }
53 
54 
main(int argc,char * argv[])55 int main(int argc, char* argv[]){
56 
57    Matrix A,B,Bt,C,Cref;
58    int a1,a2,a3,i,j;
59    struct timespec start_time1, end_time1;
60    struct timespec start_time2, end_time2;
61    long int nanosecs,total_ops;
62    float gflopsTiled,gflopsCPU;
63 
64    a1 = 35;
65    a2 = 28;
66    a3 = 47;
67 
68    A.height = a1;
69    A.width = a2;
70    A.stride = (((A.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
71    A.hpad = (((A.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
72    A.elements = (float*)malloc(A.stride * A.hpad* sizeof(float));
73 
74    B.height = a2;
75    B.width = a3;
76    B.stride = (((B.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
77    B.hpad = (((B.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
78    B.elements = (float*)malloc(B.stride * B.hpad * sizeof(float));
79 
80    /* Bt is same as B but stored in column-major order */
81    Bt.height = B.height;
82    Bt.width = B.width;
83    Bt.stride = B.stride;
84    Bt.hpad = B.hpad;
85    Bt.elements = (float*)malloc(Bt.stride * Bt.hpad * sizeof(float));
86 
87    C.height = a1;
88    C.width = a3;
89    C.stride = (((C.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
90    C.hpad = (((C.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
91    C.elements = (float*)malloc(C.stride * C.hpad * sizeof(float));
92 
93    Cref.height = a1;
94    Cref.width = a3;
95    Cref.stride = (((Cref.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
96    Cref.hpad = (((Cref.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
97    Cref.elements = (float*)malloc(Cref.stride * Cref.hpad * sizeof(float));
98 
99    for(i = 0; i < A.hpad ; i++)
100       for(j = 0; j < A.stride; j++) {
101          if (( j<A.width ) && (i<A.height)) {
102             A.elements[i*A.stride + j] = (i % 3);
103          } else {
104             A.elements[i*A.stride + j] = 0.0;
105          }
106       }
107 
108    /*  Initialize B and Bt */
109    for(i = 0; i < B.hpad ; i++)
110       for(j = 0; j < B.stride; j++) {
111          if (( j<B.width ) && (i<B.height)) {
112             B.elements[i*B.stride+j] = (j % 2);
113             Bt.elements[j*Bt.stride+i] = B.elements[i*B.stride+j] ;
114          } else {
115             B.elements[i*B.stride+j] = 0.0;
116             Bt.elements[j*Bt.stride+i] = 0.0;
117          }
118       }
119 
120    /* zero C, and Cref */
121    for(i = 0; i < C.hpad; i++)
122       for(j = 0; j < C.stride; j++) {
123          C.elements[i*C.stride+j] = 0.0;
124          Cref.elements[i*Cref.stride+j] = 0.0;
125       }
126 
127    simple_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,Cref.elements,Cref.stride);
128    tiled_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,C.elements,C.stride);
129 
130    verify(C.elements, Cref.elements, C.height * C.stride);
131    return 0;
132 }
133 
simple_sgemm_tt(const int M,const int N,const int K,const float alpha,const float * A,const int LDA,const float * B,const int LDB,const float beta,float * C,const int LDC)134 void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
135 const float* B,const int LDB, const float beta,float* C, const int LDC) {
136    /*  A,B, and C  are in row-major order */
137    int c_row,c_col,inner;
138    float sum;
139    for (c_col  = 0 ;  c_col<N; c_col++ ) {
140       for (c_row = 0 ; c_row<M; c_row++ ) {
141          sum = 0.0 ;
142          for (inner = 0 ; inner<K; inner++ ) {
143             sum += A[c_row*LDA + inner] * B[inner*LDB + c_col] ;
144          }
145          C[c_row*LDC + c_col] = alpha*sum + beta*C[ c_row*LDC + c_col] ;
146       }
147    }
148 }
149 
150 /***************************
151 
152    tiled_sgemm_tt:  Tiled matrix multiplication:
153 
154 ***************************/
155 
tiled_sgemm_tt(const int M,const int N,const int K,const float alpha,const float * A,const int LDA,const float * B,const int LDB,const float beta,float * C,const int LDC)156 void tiled_sgemm_tt(const int M, const int N, const int K, const float alpha, const float*A, const int LDA,
157    const float*B, const int LDB, const float beta, float*C, const int LDC){
158 
159 #pragma omp target teams map(to:A[M*K],B[K*N]) map(from:C[M*N])
160 #pragma omp distribute collapse(2)
161    for (int C_row_start=0 ; C_row_start < M ; C_row_start+=BLOCK_SIZE)
162       for (int C_col_start=0 ; C_col_start < N ; C_col_start+=BLOCK_SIZE)
163 	{
164 //       Each team has a local copy of these mini matrices
165          float As[BLOCK_SIZE][BLOCK_SIZE];
166          float Bs[BLOCK_SIZE][BLOCK_SIZE];
167 #pragma omp parallel
168 	 {
169          int C_row, C_col;
170          float Cval = 0.0;
171 
172          for (int kblock = 0; kblock  < K ; kblock += BLOCK_SIZE )
173 	   {
174 #pragma omp for collapse(2)
175 	     for (int row=0 ; row < BLOCK_SIZE ; row++)
176                for (int col=0 ; col < BLOCK_SIZE ; col++)
177 		 {
178 		   C_row = C_row_start + row;
179 		   C_col = C_col_start + col;
180 		   if ((C_row < M) && (kblock + col < K))
181 		     As[row][col] = A[(C_row*LDA)+ kblock + col];
182 		   else
183 		     As[row][col] = 0;
184 		   if ((kblock + row < K) && C_col < N)
185 		     Bs[row][col] = B[((kblock+row)*LDB)+ C_col];
186 		   else
187 		     Bs[row][col] = 0;
188 		 }
189 
190 #pragma omp for collapse(2)
191 	     for (int row=0 ; row < BLOCK_SIZE ; row++)
192 	       for (int col=0 ; col < BLOCK_SIZE ; col++)
193 		 {
194 		   for (int e = 0; e < BLOCK_SIZE; ++e)
195                      Cval += As[row][e] * Bs[e][col];
196 		 }
197 	   }  /* End for kblock .. */
198 
199 
200 #pragma omp for collapse(2)
201          for (int row=0 ; row < BLOCK_SIZE ; row++)
202 	   for (int col=0 ; col < BLOCK_SIZE ; col++)
203 	     {
204                C_row = C_row_start + row;
205                C_col = C_col_start + col;
206 	       if ((C_row < M) && (C_col < N))
207 		 C[(C_row*LDC)+C_col] = alpha*Cval + beta*C[(C_row*LDC)+C_col];
208 
209 	     }
210          } /* end parallel */
211       }	   /* end target teams distribute */
212 }
213