1 /* 2 3 matmul.c : Matrix Multiplication with tiling for openmp4 example 4 5 */ 6 7 #include <stdlib.h> 8 #include <math.h> 9 10 #define BLOCK_SIZE 16 11 /* TestMethod()12 #define BLOCK_SIZE 32 13 */ 14 #define NSECPERSEC 1000000000L 15 16 typedef struct { 17 int width; 18 int height; 19 int stride; 20 int hpad; 21 float* elements; 22 } Matrix; 23 24 /* Correctly extract the number of nanoseconds from the two time structures */ 25 long int get_nanosecs( struct timespec start_time, struct timespec end_time) { 26 long int nanosecs; 27 if ((end_time.tv_nsec-start_time.tv_nsec)<0) nanosecs = 28 ((((long int) end_time.tv_sec- (long int) start_time.tv_sec )-1)*NSECPERSEC ) + 29 ( NSECPERSEC + (long int) end_time.tv_nsec - (long int) start_time.tv_nsec) ; 30 else nanosecs = 31 (((long int) end_time.tv_sec- (long int) start_time.tv_sec )*NSECPERSEC ) + 32 ( (long int) end_time.tv_nsec - (long int) start_time.tv_nsec ); 33 return nanosecs; 34 } 35 36 void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA, 37 const float* B,const int LDB, const float beta,float* C, const int LDC) ; 38 void simple_sgemm_tn(const int M,const int N,const int K,const float alpha, const float* A,const int LDA, 39 const float* B,const int LDB, const float beta,float* C, const int LDC) ; 40 void tiled_sgemm_tt(const int M,const int N,const int K,const float alpha, const float*A, const int LDA, 41 const float* B,const int LDB, const float beta,float* C, const int LDC) ; 42 43 int verify(float* v_res, float* v_ref, int len) { 44 int passed = 1; 45 int i; 46 for (i = 0; i < len; ++i) { 47 if (fabs(v_res[i] - v_ref[i]) > 0.001*v_ref[i]) { 48 __builtin_abort (); 49 } 50 } 51 return passed; 52 } 53 54 55 int main(int argc, char* argv[]){ 56 57 Matrix A,B,Bt,C,Cref; 58 int a1,a2,a3,i,j; 59 struct timespec start_time1, end_time1; 60 struct timespec start_time2, end_time2; 61 long int nanosecs,total_ops; 62 float gflopsTiled,gflopsCPU; 63 64 a1 = 35; 65 a2 = 28; 66 a3 = 47; 67 68 A.height = a1; 69 A.width = a2; 70 A.stride = (((A.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; 71 A.hpad = (((A.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; 72 A.elements = (float*)malloc(A.stride * A.hpad* sizeof(float)); 73 74 B.height = a2; 75 B.width = a3; 76 B.stride = (((B.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; 77 B.hpad = (((B.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; 78 B.elements = (float*)malloc(B.stride * B.hpad * sizeof(float)); 79 80 /* Bt is same as B but stored in column-major order */ 81 Bt.height = B.height; 82 Bt.width = B.width; 83 Bt.stride = B.stride; 84 Bt.hpad = B.hpad; 85 Bt.elements = (float*)malloc(Bt.stride * Bt.hpad * sizeof(float)); 86 87 C.height = a1; 88 C.width = a3; 89 C.stride = (((C.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; 90 C.hpad = (((C.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; 91 C.elements = (float*)malloc(C.stride * C.hpad * sizeof(float)); 92 93 Cref.height = a1; 94 Cref.width = a3; 95 Cref.stride = (((Cref.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; 96 Cref.hpad = (((Cref.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; 97 Cref.elements = (float*)malloc(Cref.stride * Cref.hpad * sizeof(float)); 98 99 for(i = 0; i < A.hpad ; i++) 100 for(j = 0; j < A.stride; j++) { 101 if (( j<A.width ) && (i<A.height)) { 102 A.elements[i*A.stride + j] = (i % 3); 103 } else { 104 A.elements[i*A.stride + j] = 0.0; 105 } 106 } 107 108 /* Initialize B and Bt */ 109 for(i = 0; i < B.hpad ; i++) 110 for(j = 0; j < B.stride; j++) { 111 if (( j<B.width ) && (i<B.height)) { 112 B.elements[i*B.stride+j] = (j % 2); 113 Bt.elements[j*Bt.stride+i] = B.elements[i*B.stride+j] ; 114 } else { 115 B.elements[i*B.stride+j] = 0.0; 116 Bt.elements[j*Bt.stride+i] = 0.0; 117 } 118 } 119 120 /* zero C, and Cref */ 121 for(i = 0; i < C.hpad; i++) 122 for(j = 0; j < C.stride; j++) { 123 C.elements[i*C.stride+j] = 0.0; 124 Cref.elements[i*Cref.stride+j] = 0.0; 125 } 126 127 simple_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,Cref.elements,Cref.stride); 128 tiled_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,C.elements,C.stride); 129 130 verify(C.elements, Cref.elements, C.height * C.stride); 131 return 0; 132 } 133 134 void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA, 135 const float* B,const int LDB, const float beta,float* C, const int LDC) { 136 /* A,B, and C are in row-major order */ 137 int c_row,c_col,inner; 138 float sum; 139 for (c_col = 0 ; c_col<N; c_col++ ) { 140 for (c_row = 0 ; c_row<M; c_row++ ) { 141 sum = 0.0 ; 142 for (inner = 0 ; inner<K; inner++ ) { 143 sum += A[c_row*LDA + inner] * B[inner*LDB + c_col] ; 144 } 145 C[c_row*LDC + c_col] = alpha*sum + beta*C[ c_row*LDC + c_col] ; 146 } 147 } 148 } 149 150 /*************************** 151 152 tiled_sgemm_tt: Tiled matrix multiplication: 153 154 ***************************/ 155 156 void tiled_sgemm_tt(const int M, const int N, const int K, const float alpha, const float*A, const int LDA, 157 const float*B, const int LDB, const float beta, float*C, const int LDC){ 158 159 #pragma omp target teams map(to:A[M*K],B[K*N]) map(from:C[M*N]) 160 #pragma omp distribute collapse(2) 161 for (int C_row_start=0 ; C_row_start < M ; C_row_start+=BLOCK_SIZE) { 162 for (int C_col_start=0 ; C_col_start < N ; C_col_start+=BLOCK_SIZE) { 163 164 // We now have M/BLOCK_SIZE * N/BLOCK_SIZE teams = (M*N)/(BLOCK_SIZE*BLOCK_SIZE) 165 // The grid global dimensions are M,N,1 166 // The grid local dimensions are BLOCK_SIZE,BLOCK_SIZE,1 167 168 // ------------------------------------------------------------------- 169 // The rest of this code forms the HSAIL kernel with the 170 // pairs of "parallel for collapse(2)" loops replaced with a barrier. 171 // The kernel initializes these values 172 // C_row_start = get_group_id(0) * BLOCK_SIZE 173 // C_col_start = get_group_id(1) * BLOCK_SIZE 174 // row=get_local_id(0) 175 // col=get_local_id(1) 176 // ------------------------------------------------------------------- 177 178 // Each team has a local copy of these mini matrices 179 float As[BLOCK_SIZE][BLOCK_SIZE]; 180 float Bs[BLOCK_SIZE][BLOCK_SIZE]; 181 float Cs[BLOCK_SIZE][BLOCK_SIZE]; 182 int C_row, C_col; 183 184 /* Zero Cs for this BLOCK */ 185 // - - - - - - - - - - - - - - - - - - - - 186 // REPLACE NEXT THREE LINES WITH A BARRIER 187 #pragma omp parallel for collapse(2) 188 for (int row=0 ; row < BLOCK_SIZE ; row++) { 189 for (int col=0 ; col < BLOCK_SIZE ; col++) { 190 // END BARRIER 191 // - - - - - - - - - - - - - - - - - - - - 192 Cs[row][col] = 0.0; 193 } 194 } 195 196 // This kblock loop is run on the master thread of each team 197 for (int kblock = 0; kblock < K ; kblock += BLOCK_SIZE ) { 198 199 // Copy global memory values to local memory 200 // - - - - - - - - - - - - - - - - - - - - 201 // REPLACE NEXT THREE LINES WITH A BARRIER 202 #pragma omp parallel for collapse(2) 203 for (int row=0 ; row < BLOCK_SIZE ; row++) { 204 for (int col=0 ; col < BLOCK_SIZE ; col++) { 205 // END BARRIER 206 // - - - - - - - - - - - - - - - - - - - - 207 C_row = C_row_start + row; 208 C_col = C_col_start + col; 209 if ((C_row < M) && (kblock + col < K)) 210 As[row][col] = A[(C_row*LDA)+ kblock + col]; 211 else 212 As[row][col] = 0; 213 if ((kblock + row < K) && C_col < N) 214 Bs[row][col] = B[((kblock+row)*LDB)+ C_col]; 215 else 216 Bs[row][col] = 0; 217 } 218 } 219 220 // Calculate Cs <- Sum(As X Bs) across all kblocks 221 // - - - - - - - - - - - - - - - - - - - - 222 // REPLACE NEXT THREE LINES WITH A BARRIER 223 #pragma omp parallel for collapse(2) 224 for (int row=0 ; row < BLOCK_SIZE ; row++) { 225 for (int col=0 ; col < BLOCK_SIZE ; col++) { 226 // END BARRIER 227 // - - - - - - - - - - - - - - - - - - - - 228 for (int e = 0; e < BLOCK_SIZE; ++e) 229 Cs[row][col] += As[row][e] * Bs[e][col]; 230 } 231 } 232 233 } /* End for kblock .. */ 234 235 236 // Scale Update actual C from Cs 237 // - - - - - - - - - - - - - - - - - - - - 238 // REPLACE NEXT THREE LINES WITH A BARRIER 239 #pragma omp parallel for collapse(2) 240 for (int row=0 ; row < BLOCK_SIZE ; row++) { 241 for (int col=0 ; col < BLOCK_SIZE ; col++) { 242 // END BARRIER 243 // - - - - - - - - - - - - - - - - - - - - 244 C_row = C_row_start + row; 245 C_col = C_col_start + col; 246 if ((C_row < M) && (C_col < N)) { 247 C[(C_row*LDC)+C_col] = alpha*Cs[row][col] + beta*C[(C_row*LDC)+C_col]; 248 } 249 } 250 } 251 252 // ------------------------------------------------------------------- 253 // This is the end of the kernel 254 255 } 256 } 257 258 } 259