/* matmul.c : Matrix Multiplication with tiling for openmp4 example */ #include #include #define BLOCK_SIZE 16 /* #define BLOCK_SIZE 32 */ #define NSECPERSEC 1000000000L typedef struct { int width; int height; int stride; int hpad; float* elements; } Matrix; /* Correctly extract the number of nanoseconds from the two time structures */ long int get_nanosecs( struct timespec start_time, struct timespec end_time) { long int nanosecs; if ((end_time.tv_nsec-start_time.tv_nsec)<0) nanosecs = ((((long int) end_time.tv_sec- (long int) start_time.tv_sec )-1)*NSECPERSEC ) + ( NSECPERSEC + (long int) end_time.tv_nsec - (long int) start_time.tv_nsec) ; else nanosecs = (((long int) end_time.tv_sec- (long int) start_time.tv_sec )*NSECPERSEC ) + ( (long int) end_time.tv_nsec - (long int) start_time.tv_nsec ); return nanosecs; } void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA, const float* B,const int LDB, const float beta,float* C, const int LDC) ; void simple_sgemm_tn(const int M,const int N,const int K,const float alpha, const float* A,const int LDA, const float* B,const int LDB, const float beta,float* C, const int LDC) ; void tiled_sgemm_tt(const int M,const int N,const int K,const float alpha, const float*A, const int LDA, const float* B,const int LDB, const float beta,float* C, const int LDC) ; int verify(float* v_res, float* v_ref, int len) { int passed = 1; int i; for (i = 0; i < len; ++i) { if (fabs(v_res[i] - v_ref[i]) > 0.001*v_ref[i]) { __builtin_abort (); } } return passed; } int main(int argc, char* argv[]){ Matrix A,B,Bt,C,Cref; int a1,a2,a3,i,j; struct timespec start_time1, end_time1; struct timespec start_time2, end_time2; long int nanosecs,total_ops; float gflopsTiled,gflopsCPU; a1 = 35; a2 = 28; a3 = 47; A.height = a1; A.width = a2; A.stride = (((A.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; A.hpad = (((A.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; A.elements = (float*)malloc(A.stride * A.hpad* sizeof(float)); B.height = a2; B.width = a3; B.stride = (((B.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; B.hpad = (((B.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; B.elements = (float*)malloc(B.stride * B.hpad * sizeof(float)); /* Bt is same as B but stored in column-major order */ Bt.height = B.height; Bt.width = B.width; Bt.stride = B.stride; Bt.hpad = B.hpad; Bt.elements = (float*)malloc(Bt.stride * Bt.hpad * sizeof(float)); C.height = a1; C.width = a3; C.stride = (((C.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; C.hpad = (((C.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; C.elements = (float*)malloc(C.stride * C.hpad * sizeof(float)); Cref.height = a1; Cref.width = a3; Cref.stride = (((Cref.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; Cref.hpad = (((Cref.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; Cref.elements = (float*)malloc(Cref.stride * Cref.hpad * sizeof(float)); for(i = 0; i < A.hpad ; i++) for(j = 0; j < A.stride; j++) { if (( j