/*
 * kernel_partial_8_dp.cl
 *
 *  Created on: Jul 5, 2011
 *      Author: Matthew Wezowicz
 */

#define BLOCK_SIZE 16
#pragma OPENCL EXTENSION cl_khr_fp64 : enable

/*
 * Tiled double-precision matrix product C = (A * B) reduced with fmod().
 *
 * Each work-group produces one BLOCK_SIZE x BLOCK_SIZE tile of C, and each
 * work-item produces one element of that tile. Tiles of A and B are staged
 * in __local memory. The running sum is reduced with fmod(sum, mod) after
 * every BLOCK_SIZE / 2 (= 8) multiply-adds.
 *
 * Parameters:
 *   C      - output matrix, indexed row-major with row stride widthB
 *   A      - left operand, indexed row-major with row stride widthA
 *   B      - right operand, indexed row-major with row stride widthB
 *   widthA - number of columns of A (the shared dimension of the product)
 *   widthB - number of columns of B and of C
 *   mod    - modulus passed to fmod()
 *
 * Preconditions (nothing here is bounds-checked):
 *   - The local work size must be BLOCK_SIZE x BLOCK_SIZE: every work-item
 *     loads exactly one element of each tile, and the inner product then
 *     reads the complete tile.
 *   - NOTE(review): the indexing assumes all matrix dimensions are multiples
 *     of BLOCK_SIZE (i.e. the host pads the matrices) -- confirm at the
 *     call site.
 *   - NOTE(review): reducing every 8 products is presumably meant to keep the
 *     partial sum exactly representable in a double for the moduli in use;
 *     the admissible range of mod is not visible here.
 */
__kernel void matrixMulKernelModular8DP(__global double* C,
                                        __global const double* A,
                                        __global const double* B,
                                        const int widthA,
                                        const int widthB,
                                        const double mod){
	//Work-group coordinates select the tile of C being computed
	const int bx = get_group_id(0);
	const int by = get_group_id(1);

	//Local coordinates select the element inside that tile
	const int tx = get_local_id(0);
	const int ty = get_local_id(1);

	//Range of indices for the strip of tiles of A (tile row `by`)
	const int aBegin = widthA * BLOCK_SIZE * by;
	const int aEnd   = aBegin + widthA - 1;
	const int aStep  = BLOCK_SIZE;

	//Range of indices for the strip of tiles of B (tile column `bx`)
	const int bBegin = BLOCK_SIZE * bx;
	const int bStep  = BLOCK_SIZE * widthB;

	//Local storage for the current tiles of A and B
	__local double As[BLOCK_SIZE][BLOCK_SIZE];
	__local double Bs[BLOCK_SIZE][BLOCK_SIZE];

	//Running value of this work-item's element of C
	double Csub = 0;

	//Walk the tiles of A left-to-right and the tiles of B top-to-bottom
	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){
		//Each work-item copies one element of each tile into local memory
		As[ty][tx] = A[a + widthA * ty + tx];
		Bs[ty][tx] = B[b + widthB * ty + tx];

		//Both tiles must be fully loaded before any work-item reads them
		barrier(CLK_LOCAL_MEM_FENCE);

		//First half of the tile inner product, then reduce
		for(int i = 0; i < BLOCK_SIZE / 2; i++){
			Csub += As[ty][i] * Bs[i][tx];
		}
		Csub = fmod(Csub, mod);

		//Second half of the tile inner product, then reduce again
		for(int i = BLOCK_SIZE / 2; i < BLOCK_SIZE; i++){
			Csub += As[ty][i] * Bs[i][tx];
		}
		Csub = fmod(Csub, mod);

		//All reads must finish before the next iteration overwrites the tiles
		barrier(CLK_LOCAL_MEM_FENCE);
	}

	//Offset of this work-group's tile in C; each work-item stores its element
	const int c = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
	C[c + ty * widthB + tx] = Csub;
}