1/*
2 * kernel_modulus_sp.cl
3 *
4 *  Created on: Jul 5, 2011
5 *      Author: Matthew Wezowicz
6 */
7
//Edge length of the square tiles that the kernel in this file stages in local memory.
//The kernel indexes those tiles directly with its local IDs, so it presumably has to be
//launched with BLOCK_SIZE x BLOCK_SIZE work-groups -- TODO confirm against the host code.
#define BLOCK_SIZE 16
9
/*
 * Single-precision tiled kernel computing, for one output element per work-item,
 *
 *     D = ( (alpha * (A*B reduced by mod)) + (beta * C) ) reduced by mod
 *
 * where every reduction is done with fmod().
 *
 *  D      - output matrix, addressed with row stride widthB
 *  alpha  - scalar applied to the reduced dot product
 *  A      - left operand, addressed with row stride widthA
 *  B      - right operand, addressed with row stride widthB
 *  beta   - scalar applied to the element read from C
 *  C      - matrix added to the scaled product, addressed with row stride widthB
 *  widthA - row stride of A; also bounds the walk over the tiles of A
 *  widthB - row stride of B, C and D
 *  mod    - modulus handed to every fmod() call
 *
 * The kernel does no bounds checking. It presumably expects the matrix dimensions to be
 * multiples of BLOCK_SIZE and a BLOCK_SIZE x BLOCK_SIZE work-group -- TODO confirm
 * against the host code.
 */
__kernel void matrixMuladdKernelModular1SP(__global float* D, float alpha, __global float* A, __global float* B,
		float beta, __global float* C, const int widthA, const int widthB, const float mod){
	//Position of this work-group in the grid of result tiles
	const int groupCol = get_group_id(0);
	const int groupRow = get_group_id(1);

	//Position of this work-item inside its tile
	const int col = get_local_id(0);
	const int row = get_local_id(1);

	//Local-memory staging areas for the current tile of A and of B
	__local float tileA[BLOCK_SIZE][BLOCK_SIZE];
	__local float tileB[BLOCK_SIZE][BLOCK_SIZE];

	//The walk over A starts at the first element of this group's tile row and
	//stops before the last element of that row
	const int aFirst = widthA * BLOCK_SIZE * groupRow;
	const int aLimit = aFirst + widthA - 1;

	//Moving down one tile in B skips BLOCK_SIZE full rows
	const int bStride = BLOCK_SIZE * widthB;

	//Index of the element of C that is read and of D that is written by this work-item
	const int outIdx = widthB * BLOCK_SIZE * groupRow + BLOCK_SIZE * groupCol + row * widthB + col;

	//Running dot product, kept reduced by mod
	float acc = 0.0f;

	//Tiles of A are visited left to right, tiles of B top to bottom
	int aTile = aFirst;
	int bTile = BLOCK_SIZE * groupCol;
	while(aTile < aLimit){
		//Every work-item copies one element of each tile into local memory
		tileA[row][col] = A[aTile + widthA * row + col];
		tileB[row][col] = B[bTile + widthB * row + col];

		//Both tiles must be completely loaded before anyone reads them
		barrier(CLK_LOCAL_MEM_FENCE);

		//Row of the A tile times column of the B tile; the partial sum is
		//reduced with fmod after every single multiply-add
		for(int k = 0; k < BLOCK_SIZE; ++k){
			acc += tileA[row][k] * tileB[k][col];
			acc = fmod(acc, mod);
		}

		//Nobody may overwrite the tiles while others are still reading them
		barrier(CLK_LOCAL_MEM_FENCE);

		aTile += BLOCK_SIZE;
		bTile += bStride;
	}

	//alpha * (A*B), reduced and shifted up by mod when fmod left it negative
	float scaledProd = fmod(alpha * acc, mod);
	if(scaledProd < 0){
		scaledProd = mod + scaledProd;
	}

	//beta * C, reduced and shifted up by mod when fmod left it negative
	float scaledC = fmod(beta * C[outIdx], mod);
	if(scaledC < 0){
		scaledC = mod + scaledC;
	}

	//Sum of the two terms, reduced once more, goes to the result matrix
	D[outIdx] = fmod(scaledProd + scaledC, mod);
}
82