/*
 * kernel_partial_16_sp.cl
 *
 *  Created on: Jul 5, 2011
 *      Author: Matthew Wezowicz
 */

#define BLOCK_SIZE 16

/*
 * Tiled single-precision matrix product with a modular reduction.
 *
 * Each work-item produces one element of C. A work-group walks across a
 * BLOCK_SIZE-row band of A and down a BLOCK_SIZE-column band of B one
 * BLOCK_SIZE x BLOCK_SIZE tile at a time, staging both tiles in __local
 * memory. After every tile the running sum is reduced with fmod(sum, mod).
 *
 * C      - output matrix, indexed with a row stride of widthB
 * A      - left operand, indexed with a row stride of widthA
 * B      - right operand, indexed with a row stride of widthB
 * widthA - row stride of A; also bounds the walk along the shared dimension
 * widthB - row stride of B and C
 * mod    - modulus handed to fmod() after each tile
 *
 * NOTE(review): no bounds checks are performed on any load or store, so this
 * presumably expects matrix dimensions that are multiples of BLOCK_SIZE and a
 * BLOCK_SIZE x BLOCK_SIZE work-group -- confirm against the host code.
 */
__kernel void matrixMulKernelModular16SP(__global float* C, __global float* A, __global float* B,
		const int widthA, const int widthB, const float mod){
	//Position of this work-group within the grid of result tiles
	const int groupCol = get_group_id(0);
	const int groupRow = get_group_id(1);

	//Position of this work-item within its tile
	const int col = get_local_id(0);
	const int row = get_local_id(1);

	//Work-group staging buffers for the current tiles of A and B
	__local float tileA[BLOCK_SIZE][BLOCK_SIZE];
	__local float tileB[BLOCK_SIZE][BLOCK_SIZE];

	//First tile of A for this work-group and the bound that ends the walk
	const int aFirst = widthA * BLOCK_SIZE * groupRow;
	const int aLimit = aFirst + widthA - 1;

	//Distance between vertically consecutive tiles of B
	const int bStride = BLOCK_SIZE * widthB;

	//Offset of this work-item's element inside a tile of A / of B
	const int aLocal = widthA * row + col;
	const int bLocal = widthB * row + col;

	//Running (partially reduced) dot product
	float acc = 0;

	//A advances one tile to the right while B advances one tile down
	int aPos = aFirst;
	int bPos = BLOCK_SIZE * groupCol;
	while(aPos < aLimit){
		//Every work-item copies one element of each tile into local memory
		tileA[row][col] = A[aPos + aLocal];
		tileB[row][col] = B[bPos + bLocal];

		//Both tiles must be completely loaded before anyone reads them
		barrier(CLK_LOCAL_MEM_FENCE);

		//Partial dot product of one tile row with one tile column
		for(int k = 0; k < BLOCK_SIZE; ++k){
			acc += tileA[row][k] * tileB[k][col];
		}
		acc = fmod(acc, mod);

		//Nobody may overwrite the tiles until all reads are finished
		barrier(CLK_LOCAL_MEM_FENCE);

		aPos += BLOCK_SIZE;
		bPos += bStride;
	}

	//Top-left corner of this work-group's tile in C, then this work-item's slot
	const int cBase = widthB * BLOCK_SIZE * groupRow + BLOCK_SIZE * groupCol;
	C[cBase + row * widthB + col] = acc;
}