/*
 * kernel_modulus_sp.cl
 *
 *  Created on: Jul 5, 2011
 *      Author: Matthew Wezowicz
 */

#define BLOCK_SIZE 16

/*
 * Tiled single-precision modular matrix multiply-add.
 *
 * Each work-item produces one element of
 *     D = (alpha * (A * B) + beta * C) mod `mod`
 * and each work-group produces one BLOCK_SIZE x BLOCK_SIZE tile of D.
 *
 * Parameters:
 *   D      - output matrix, indexed with a row stride of widthB
 *   alpha  - scalar applied to the reduced product A * B
 *   A      - left operand, indexed with a row stride of widthA
 *   B      - right operand, indexed with a row stride of widthB
 *   beta   - scalar applied to C
 *   C      - addend matrix, indexed with a row stride of widthB
 *   widthA - row stride of A; also the length of the dot products
 *   widthB - row stride of B, C and D
 *   mod    - modulus passed to fmod
 *
 * The running dot product is reduced with fmod after every single
 * multiply-add. The two scaled terms are each reduced, and shifted up by
 * `mod` if the residue is negative, before they are added and reduced once
 * more.
 *
 * NOTE(review): there are no bounds checks, so this appears to assume a
 * BLOCK_SIZE x BLOCK_SIZE work-group and matrix dimensions padded to
 * multiples of BLOCK_SIZE -- confirm against the host code.
 */
__kernel void matrixMuladdKernelModular1SP(__global float* D, float alpha, __global float* A, __global float* B,
		float beta, __global float* C, const int widthA, const int widthB, const float mod){
	//Position of this work-group in the grid and of this work-item in the group
	const int groupCol = get_group_id(0);
	const int groupRow = get_group_id(1);
	const int localCol = get_local_id(0);
	const int localRow = get_local_id(1);

	//Work-group local copies of the current tiles of A and B
	__local float tileA[BLOCK_SIZE][BLOCK_SIZE];
	__local float tileB[BLOCK_SIZE][BLOCK_SIZE];

	//Index range covered in A: start of the first tile and the last
	//index of the first row of this block of rows
	const int firstA = widthA * BLOCK_SIZE * groupRow;
	const int lastA = firstA + widthA - 1;

	//Distance between vertically consecutive tiles of B
	const int strideB = BLOCK_SIZE * widthB;

	//Offset inside a tile of the element this work-item loads
	const int offsetA = widthA * localRow + localCol;
	const int offsetB = widthB * localRow + localCol;

	//Running, partially reduced dot product
	float acc = 0;

	//Walk the tiles of A left to right and the tiles of B top to bottom
	int posA = firstA;
	int posB = BLOCK_SIZE * groupCol;
	while(posA < lastA){
		//Every work-item copies one element of each tile into local memory
		tileA[localRow][localCol] = A[posA + offsetA];
		tileB[localRow][localCol] = B[posB + offsetB];

		//Both tiles must be completely loaded before anyone reads them
		barrier(CLK_LOCAL_MEM_FENCE);

		//Dot product of one tile row with one tile column, reduced
		//with fmod after every step
		for(int k = 0; k < BLOCK_SIZE; k++){
			acc += tileA[localRow][k] * tileB[k][localCol];
			acc = fmod(acc, mod);
		}

		//Nobody may overwrite the tiles while they are still being read
		barrier(CLK_LOCAL_MEM_FENCE);

		posA += BLOCK_SIZE;
		posB += strideB;
	}

	//Index of this work-item's element in C and D
	const int outIdx = widthB * BLOCK_SIZE * groupRow + BLOCK_SIZE * groupCol
			+ localRow * widthB + localCol;

	//Scale the product by alpha and move a negative residue up by mod
	float scaledAB = fmod(alpha * acc, mod);
	if(scaledAB < 0){
		scaledAB = mod + scaledAB;
	}

	//Scale the element of C by beta and move a negative residue up by mod
	float scaledC = fmod(beta * C[outIdx], mod);
	if(scaledC < 0){
		scaledC = mod + scaledC;
	}

	//Add both terms, reduce once more and store the result
	D[outIdx] = fmod(scaledAB + scaledC, mod);
}