1/*
2 * kernel_partial_8_dp.cl
3 *
4 *  Created on: Jul 5, 2011
5 *      Author: Matthew Wezowicz
6 */
7
8#define BLOCK_SIZE 16
9#pragma OPENCL EXTENSION cl_khr_fp64 : enable
10
/*
 * Tiled modular matrix product in double precision: every work-item
 * accumulates one element of C = (A * B) reduced with fmod() by `mod`.
 *
 * C, A, B - flat arrays in global memory; A is indexed with a row stride of
 *           widthA, B and C with a row stride of widthB.
 * widthA  - row stride of A, which is also the length of each dot product.
 * widthB  - row stride of B and of C.
 * mod     - modulus handed to fmod().
 *
 * The running sum is reduced with fmod() after each half of a tile, i.e.
 * after every BLOCK_SIZE / 2 multiply-adds.
 *
 * NOTE(review): no bounds checks are performed. This appears to assume a
 * BLOCK_SIZE x BLOCK_SIZE work-group and matrix dimensions that are multiples
 * of BLOCK_SIZE -- confirm against the host code.
 */
__kernel void matrixMulKernelModular8DP(__global double* C, __global double* A, __global double* B,
		const int widthA, const int widthB, const double mod){
	//Tile coordinates of this work-group
	const int groupCol = get_group_id(0);
	const int groupRow = get_group_id(1);

	//Position of this work-item inside its tile
	const int col = get_local_id(0);
	const int row = get_local_id(1);

	//Tiles of A and B staged in local memory
	__local double tileA[BLOCK_SIZE][BLOCK_SIZE];
	__local double tileB[BLOCK_SIZE][BLOCK_SIZE];

	//Index of the first tile of A for this work-group, and the index of the
	//last element in that tile's top row, which bounds the walk along A
	int offsetA = widthA * BLOCK_SIZE * groupRow;
	const int lastA = offsetA + widthA - 1;

	//Index of the first tile of B for this work-group
	int offsetB = BLOCK_SIZE * groupCol;

	//Where the tile is split for the intermediate reduction
	const int split = BLOCK_SIZE / 2;

	//Running sum for this work-item's element of C
	double acc = 0;

	//Walk the tiles of A left to right and the tiles of B top to bottom
	while(offsetA < lastA){
		//Each work-item copies one element of each tile into local memory
		tileA[row][col] = A[offsetA + widthA * row + col];
		tileB[row][col] = B[offsetB + widthB * row + col];

		//Both tiles must be completely loaded before anyone reads them
		barrier(CLK_LOCAL_MEM_FENCE);

		//Accumulate the tile product one half at a time, reducing the
		//running sum after each half
		for(int half = 0; half < 2; half++){
			const int first = half * split;
			const int limit = (half == 0) ? split : BLOCK_SIZE;
			for(int k = first; k < limit; k++){
				acc += tileA[row][k] * tileB[k][col];
			}
			acc = fmod(acc, mod);
		}

		//Everyone must be done reading before the tiles are overwritten
		barrier(CLK_LOCAL_MEM_FENCE);

		offsetA += BLOCK_SIZE;
		offsetB += BLOCK_SIZE * widthB;
	}

	//Index of this work-group's tile in C; each work-item stores its own
	//element of that tile
	const int offsetC = widthB * BLOCK_SIZE * groupRow + BLOCK_SIZE * groupCol;
	C[offsetC + row * widthB + col] = acc;
}
68