/*
 * kernel_partial_16_sp.cl
 *
 *  Created on: Jul 5, 2011
 *      Author: Matthew Wezowicz
 */

//Edge length of the square tiles staged in local memory. The kernel indexes
//those tiles with the local ids, so both local dimensions must stay below it.
#define BLOCK_SIZE 16

/*
 * Tiled single-precision matrix product C = (A * B) reduced with fmod().
 *
 * Each work-group walks a band of A and a band of B one
 * BLOCK_SIZE x BLOCK_SIZE tile at a time. Each work-item accumulates one
 * dot product and stores a single element of C. The running sum is reduced
 * with fmod(sum, mod) once per tile rather than once per multiply-add.
 *
 * C      - output buffer, indexed with a row stride of widthB
 * A      - left operand, indexed with a row stride of widthA
 * B      - right operand, indexed with a row stride of widthB
 * widthA - row stride of A; also bounds the walk over the tiles of A
 * widthB - row stride of B and of C
 * mod    - modulus handed to fmod()
 *
 * NOTE(review): no bounds checks are performed, so the buffers presumably
 * have to be padded to multiples of BLOCK_SIZE - confirm against the host.
 */
__kernel void matrixMulKernelModular16SP(__global float* C, __global float* A, __global float* B,
		const int widthA, const int widthB, const float mod){
	//Where this work-group sits in the grid of work-groups
	const int groupCol = get_group_id(0);
	const int groupRow = get_group_id(1);

	//Where this work-item sits inside its work-group
	const int localCol = get_local_id(0);
	const int localRow = get_local_id(1);

	//Local-memory staging area for the current tile of A and of B
	__local float tileA[BLOCK_SIZE][BLOCK_SIZE];
	__local float tileB[BLOCK_SIZE][BLOCK_SIZE];

	//The band of A starts at aFirst; the walk stops once aLimit is reached
	const int aFirst = widthA * BLOCK_SIZE * groupRow;
	const int aLimit = aFirst + widthA - 1;

	//Per-tile advance: A moves by BLOCK_SIZE elements, B by BLOCK_SIZE rows
	const int aStride = BLOCK_SIZE;
	const int bStride = BLOCK_SIZE * widthB;

	//Offsets of the current tile inside A and B
	int aTile = aFirst;
	int bTile = BLOCK_SIZE * groupCol;

	//Running (partially reduced) dot product owned by this work-item
	float acc = 0;

	while(aTile < aLimit){
		//Every work-item copies exactly one element of each tile
		tileA[localRow][localCol] = A[aTile + widthA * localRow + localCol];
		tileB[localRow][localCol] = B[bTile + widthB * localRow + localCol];

		//Both tiles must be completely written before anyone reads them
		barrier(CLK_LOCAL_MEM_FENCE);

		//Row of tileA times column of tileB
		for(int k = 0; k < BLOCK_SIZE; k++){
			acc += tileA[localRow][k] * tileB[k][localCol];
		}

		//Reduce once per tile
		acc = fmod(acc, mod);

		//Nobody may overwrite the tiles while they are still being read
		barrier(CLK_LOCAL_MEM_FENCE);

		aTile += aStride;
		bTile += bStride;
	}

	//Offset of this work-group's tile in C, then this work-item's element
	const int cTile = widthB * BLOCK_SIZE * groupRow + BLOCK_SIZE * groupCol;
	C[cTile + localRow * widthB + localCol] = acc;
}
62