1 /*
2  * ParameterTuner.cpp
3  *
4  *  Created on: Jan 30, 2017
5  */
6 
7 #include "ParameterTuner.h"
8 #include "ourproj.h"
9 #include <math.h>
10 #include "Parameters.h"
11 #include "BlockingCase.h"
12 #include "TransposeSpec.h"
13 #include <iostream>
14 
15 #include "ourinclude.h"
16 using namespace std;
17 
/*
 * Selects kernel launch parameters (tile sizes, thread-block size, shared
 * memory shape, ...) for a tensor transposition described by a TransposeSpec.
 * The limits below are assigned in the constructor; the values quoted in the
 * comments are the ones the constructor currently uses.
 */
class ParameterTuner {
	int sharedMemLimitPerSM;// constructor sets 6144, i.e. (48 * 1024)/8 words per SM
	int numThreadsLimitPerSM;// constructor sets 2048 (per SM)
	int numThreadBlocksLimitPerSM;// constructor sets 16 (per SM); not read by any method in this class
	int threadBlocKSizeLimit;// constructor sets 1024
	int numSMs;// constructor sets 15
	int blockFactor;// constructor sets 6; divides the volume when the search bound nlimit is derived
	BlockingCase caseId; // set by tune() and its helpers; NOTE(review): not initialised by the constructor

	public:
getTBSize(unsigned int shm)28 	unsigned int getTBSize(unsigned int shm)
29 	{
30 		return (unsigned) ceil((double)(numThreadsLimitPerSM/floor(32.0*(unsigned)(sharedMemLimitPerSM/ shm)))) * 32;
31 	}
getCaseId()32 	BlockingCase getCaseId()
33 	{
34 		return caseId;
35 	}
tune(TransposeSpec & spec)36 	Parameters& tune(TransposeSpec &spec) {
37 		int *sizes = spec.getSizes();
38 		if((spec.getPermutation()[0] != 0) || (sizes[0] * sizes[1] < 32) || (sizes[spec.getPermutation()[0]] * sizes[spec.getPermutation()[1]] < 32) )
39 		{
40 			return tuneFastestVaryingNotMatchingCase(spec);
41 		}
42 		else if ( spec.getSizes()[0] >= 32)
43 		{
44 			caseId = BlockingCase::FVI_MATCH_AND_GREATERT32;
45 			return tuneFastestVaryingMatchingCaseWithoutBlocking(spec);
46 		}
47 		else
48 		{
49 			caseId = BlockingCase::FVI_MATCH_AND_LESST32;
50 			return tuneFastestVaryingMatchingCaseWithBlocking(spec);
51 		}
52 	}
53 
tuneFastestVaryingNotMatchG32Case(TransposeSpec & spec)54 	Parameters& tuneFastestVaryingNotMatchG32Case(TransposeSpec &spec){
55 		Parameters *parameters = new Parameters();
56 		int *sizes = spec.getSizes();
57 		int* permutation = spec.getPermutation();
58 		unsigned numElements = 32*32;
59 		unsigned sharedMemSize = (32 *33);
60 		unsigned paddingSize = 1;
61 		unsigned tbSize = 256;
62 		unsigned numBlocksPerSM = sharedMemLimitPerSM/sharedMemSize;
63 		parameters->setNumElementsProcessedPerBlock(numElements);
64 		parameters->setPaddingSize(paddingSize);
65 		parameters->setOccupancy(100.0f);
66 		parameters->setTbSize(tbSize);
67 		parameters->setNumBlocksPerSM(numBlocksPerSM);
68 		parameters->setTileSize(32);
69 		parameters->setSharedMemSize1(32);
70 		parameters->setSharedMemSize2(33);
71 		double eff = getEfficiency_nomatchg32(sizes[0], sizes[permutation[0]]);
72 		double bandwidth = getBW_nomatchg32(eff);
73 		parameters -> setBW(bandwidth);
74 		unsigned long vol = spec.getVolume();
75 
76 		//cout <<"cc0= "<< spec.getVolume() << " eff = "<<eff<<" bw = "<<bandwidth<< " time = "<<getTime(bandwidth, spec.getVolume())<<"\t";
77 		//cout <<"cc1= "<< vol << " eff = "<<eff<<" bw = "<<bandwidth<< " time = "<<getTime(bandwidth, vol)<<"\t";
78 		parameters -> setTime(getTime(bandwidth, spec.getVolume()));
79 		parameters->setWarpEfficiency(eff);
80 		return *parameters;
81 		//parametersList.add(parameters);
82 	}
tuneConflictCase(TransposeSpec & spec)83 	Parameters& tuneConflictCase(TransposeSpec &spec){
84 		Parameters *parameters = new Parameters();
85 		int *sizes = spec.getSizes();
86 		int* permutation = spec.getPermutation();
87 		int blockA = 1, blockB = 1;
88 		int sharedMemSize = 1;
89 		int sharedMemSize1 = 1;
90 		int sharedMemSize2 = 1;
91 		int tbSize = 32;
92 		int numElements = 0;
93 		int paddingSize= 0;
94 		int csize, asize, bsize, bonlysize, pad;//sizes[0];
95 		int repeat = 0, rlimit, alimit, blimit,count = 0;
96 		unsigned SHMLIMIT = 1056;
97 		//unsigned SHMLIMIT = 1400;
98 		const int limit = 32;//starts from 32 and goes 64, 128...
99 		int limiti, limito, nlimit;
100 		unsigned long int volume = spec.getVolume();
101 		int minnumblocks = numSMs * sharedMemLimitPerSM/(33*32);
102 		nlimit = sqrt(volume/(blockFactor * minnumblocks* 32*32));
103 		if(nlimit == 0) nlimit = 1;
104 		double besteff = 0;
105 		for(int limiti = 0; limiti < nlimit; limiti++)
106 		{
107 
108 			rlimit = 32 + 32*limiti;
109 			int i;
110 			blockA = 1, blockB = 1, csize = 1;
111 			for(i = 0; i <= spec.getNdim(); i++)
112 			{
113 				if(csize == rlimit)
114 				{
115 					break;
116 				}
117 				if(csize > rlimit)
118 				{
119 					csize/=sizes[i-1];
120 					//blockA = (rlimit+csize-1)/csize;
121 						blockA = (rlimit)/csize;
122 					//cyyout << "blockA = "<<blockA;
123 					if(blockA != 1){
124 						//csize/=sizes[i-1];
125 						csize*=blockA;
126 					}
127 					else i--;
128 					if(blockA == sizes[i-1]) { blockA = 1;}
129 					break;
130 				}
131 			if(i < spec.getNdim())
132 				csize*= sizes[i];
133 			}
134 				if(i == spec.getNdim() + 1)
135 					i--;
136 			//if(blockA == 1 && i < spec.getNdim()) i++;
137 			alimit = i-1;
138 			asize = csize;
139 #ifdef printd
140 			cout <<  "asize == "<<asize<<" ablock = "<<blockA<<" rlimit= "<<rlimit<<"\n";
141 #endif
142 			for(int limito = 0; limito < nlimit; limito++)
143 			{
144 				i = 0;
145 				bonlysize = 1;
146 				int	limit = 32 + limito * 32;//1024/(asize*2);
147 				//int	limit = 32;//rlimit;//1024/(asize*2);
148 				csize = 1;//sizes[permutation[i]];
149 
150 				for(; i <= spec.getNdim(); i++)
151 				{
152 					if(csize == limit)
153 					{
154 						break;
155 					}
156 					if(csize > limit)
157 					{
158 						if(i > 0){
159 							csize/=sizes[permutation[i-1]];
160 						//	blockB =  (limit+csize-1)/csize;
161 								blockB =  (limit)/csize;
162 #ifdef printd
163 							cout << "\ncsize = "<<csize;
164 							cout << "\nblockB = "<<blockB;
165 #endif
166 							if( permutation[i-1] > alimit)
167 							{
168 								bonlysize /= sizes[permutation[i-1]];
169 								if(blockB != 1){
170 									bonlysize*= blockB;
171 
172 								}
173 							}
174 							if(blockB != 1){
175 								csize*= blockB;
176 								if(blockB == sizes[permutation[i-1]]) {blockB = 1;}
177 
178 							}
179 							else i--;
180 
181 
182 						}
183 
184 						break;
185 					}
186 					if(i == spec.getNdim()) break;
187 					//if(permutation[i] < alimit) continue;
188 					//if(i > 0 && ((blockA != 1) && (permutation[i] == alimit)))
189 					if(i < spec.getNdim())
190 					csize*= sizes[permutation[i]];
191 					if(permutation[i] > alimit)
192 					{
193 						bonlysize *= sizes[permutation[i]];
194 					}
195 #ifdef printd
196 					cout<<"\ni = "<<i<<"bsize = "<<csize<<"\n";
197 #endif
198 				}
199 #ifdef printd
200 				cout <<  "\nbsize == "<<csize<<"\n";
201 #endif
202 				if(i == spec.getNdim() + 1)
203 					i--;
204 				//if((blockB == 1) && (i < spec.getNdim())) i++;
205 				blimit = i-1;
206 				//cout<<" alimitp = "<<alimit<<" blimitp = "<<blimit<<"\n";
207 				bsize = csize;
208 				int n = spec.getNdim();
209 				int rperm[20];
210 				for(int i = 0; i < n; i++)
211 				{
212 					for(int j = 0; j < n; j++)
213 					{
214 						if(permutation[i] == j)
215 						{
216 							rperm[j] = i;
217 						}
218 					}
219 				}
220 
221 
222 
223 				if(blockA > 1)//checking for inner dimensions in output which gets blocked in input
224 				{
225 					for(int i = 0; i < blimit; i++)
226 					{
227 						if(permutation[i] == alimit)
228 						{
229 							asize /= blockA;
230 							asize *= sizes[permutation[i]];
231 							blockA = 1;
232 						}
233 					}
234 				}
235 				//	cout <<" blockA = "<<blockA <<" blockB = "<<blockB;
236 				if(blockB > 1)//checking for inner dimensions in input which gets blocked in output
237 				{
238 
239 					//	cout <<"A smaller in B\n";
240 					for(int i = 0; i < alimit; i++)
241 					{
242 						if(rperm[i] == blimit)
243 						{
244 							bsize /= blockB;
245 							bsize *= sizes[i];
246 							//	cout <<"\nNew bsize "<<bsize<<"\n";
247 							blockB = 1;
248 						}
249 					}
250 				}
251 
252 				//we need to change the blocksize in case alimit and blimit are same dimension
253 				if((alimit == permutation[blimit]) && (blockA > 1 || blockB > 1))
254 				{
255 #ifdef printd
256 					cout <<"Blocking dimensions same, bloackA = "<<blockA<<" blockB = "<<blockB<<"\n";
257 #endif
258 					if(((blockA > blockB) && (blockB != 1)) || (blockA == 1))// > bsize)
259 					{
260 						if(blockA == 1)
261 							bsize = (bsize/blockB)*sizes[alimit], blockB = blockA;
262 						else
263 							bsize = (bsize/blockB)*blockA, blockB = blockA;
264 
265 					}
266 					else
267 					{
268 						if(blockB == 1)
269 							asize = (asize/blockA) * sizes[permutation[blimit]], blockA = blockB;
270 						else
271 							asize = (asize/blockA) * blockB, blockA = blockB;
272 					}
273 				}
274 				pad =  ((asize %2)+1)%2;
275 				sharedMemSize1 = asize + pad;
276 				sharedMemSize2 = bonlysize;
277 				sharedMemSize = sharedMemSize1 * sharedMemSize2;//csize;//blockB * sizes[permutation[0]];
278 				//tbSize = max(64, (sharedMemSize/64)*32);
279 				//tbSize = 320;// max(64, (sharedMemSize/64)*32);
280 				tbSize = getTBSize(sharedMemSize);
281 				if(sharedMemSize > SHMLIMIT && besteff > 0)
282 				{
283 				//	limiti = nlimit;
284 				       	break;
285 				}
286 				double 	eff = getEfficiency_overlap(asize, bsize, sizes[alimit], sizes[permutation[blimit]], blockA, blockB);
287 				//cout <<"\t"<<eff<<"\t"<<asize<<"\t"<<bsize<<"\n";
288 				if(eff > besteff)
289 				{
290 					parameters->setNumElementsProcessedPerBlock(asize);
291 					parameters->setNumElementsProcessedPerBlock1(bsize);
292 					parameters->setPaddingSize(rlimit);
293 					parameters->setTbSize(tbSize);
294 					parameters->setTileSize(blockA);
295 					parameters->setSharedMemSize1(sharedMemSize1);
296 					parameters->setSharedMemSize2(sharedMemSize2);
297 					parameters->setTileSize1(blockB);
298 					parameters -> setBlockAIndex(alimit);
299 					parameters -> setBlockBIndex(blimit);
300 					besteff = eff;
301 
302 				}
303 
304 #ifdef printd
305 				cout<<" alimit = "<<alimit<<" blimit = "<<blimit<<"\n";
306 				cout<<" asize = "<<asize<<" bsize = "<<bsize<<"\n";
307 				cout<<" blockA = "<<blockA<<" blockB = "<<blockB<<"\n";
308 				cout<<" SM1 = "<<sharedMemSize<<"\n";// blockB = "<<blockB<<"\n";
309 				cout <<" TBSize = "<<tbSize<<"\n";
310 #endif
311 				count++;
312 				repeat++;
313 			}
314 		}// while(sharedMemSize <= 512);
315 		//double eff = getEfficiency_overlap(asize, bsize, sizes[alimit], sizes[permutation[blimit]], blockA, blockB );
316 		double bandwidth = getBW_overlap(besteff);
317 		parameters -> setBW(bandwidth);
318 		parameters -> setTime(getTime(bandwidth, spec.getVolume()));
319 		parameters->setWarpEfficiency(besteff);
320 		return *parameters;
321 
322 	}
tuneFastestVaryingNotMatchingCase(TransposeSpec & spec)323 	Parameters& tuneFastestVaryingNotMatchingCase(TransposeSpec &spec){
324 		int *sizes = spec.getSizes();
325 		int* permutation = spec.getPermutation();
326 		TensorType mytype = spec.getDataType();
327 		int blockAIndex = 0;
328 		int blockBIndex = 0;
329 		Parameters &parameters1 = tuneFastestVaryingNotMatchG32Case(spec);
330 		Parameters &parameters2 = tuneNonConflictCase(spec);
331 		Parameters &parameters3 = tuneConflictCase(spec);
332 		double b1, b2, b3;
333 		b1 = getBW_nomatchg32(parameters1 . getWarpEfficiency());
334 		//	if(b1 < 0.7)
335 		//		parameters1.setTbSize(256);
336 		double eff2 = parameters2 . getWarpEfficiency();
337 		//if(eff2 < 0.7) eff2*=0.9;
338 		b2 = getBW_nooverlap(eff2);
339 		b3 = getBW_overlap(parameters3 . getWarpEfficiency());
340 		//if(parameters1 . getWarpEfficiency() > parameters2 . getWarpEfficiency())
341 #ifdef printd
342 		cout<<"\t"<<parameters1 . getWarpEfficiency()<<"\t"<<parameters2 . getWarpEfficiency()<<"\t"<<parameters3 . getWarpEfficiency();
343 		cout <<"\t"<<b1<<"\t"<<b2<<"\t"<<b3<<"\t";
344 #endif
345 
346 		if(b1 >= b2)
347 		{
348 			//if(parameters1 . getWarpEfficiency() > parameters3 . getWarpEfficiency())
349 			if(b1 >= b3)
350 			{
351 				caseId = BlockingCase::FVI_NOMATCH_AND_GREATERT32;
352 				return parameters1;
353 			}
354 			else
355 			{
356 				caseId = BlockingCase::FVI_NOMATCH_GENERAL_OVERLAP;
357 				return parameters3;
358 			}
359 		}
360 		else if(b2 >= b3)
361 		{
362 			caseId = BlockingCase::FVI_NOMATCH_GENERAL;
363 			return parameters2;
364 		}
365 		else
366 		{
367 			caseId = BlockingCase::FVI_NOMATCH_GENERAL_OVERLAP;
368 			return parameters3;
369 		}
370 
371 
372 	}
tuneNonConflictCase(TransposeSpec & spec)373 	Parameters& tuneNonConflictCase(TransposeSpec &spec){
374 		Parameters *parameters = new Parameters();
375 		int *sizes = spec.getSizes();
376 		int* permutation = spec.getPermutation();
377 		int sharedMemSize = 1;
378 		int sharedMemSize1 = 1;
379 		int sharedMemSize2 = 1;
380 		int numElements = 0;
381 		int paddingSize= 0;
382 		int n = spec.getNdim();
383 		int rperm[20];
384 		for(int i = 0; i < n; i++)
385 		{
386 			for(int j = 0; j < n; j++)
387 			{
388 				if(permutation[i] == j)
389 				{
390 					rperm[j] = i;
391 				}
392 			}
393 		}
394 		int csize = 1, asize, bsize;//sizes[0];
395 		int rlimit = 32;
396 		//int tbSize = 512;
397 		int tbSize = 352;
398 		int alimit = 0, blimit = 0, irlimit = 0;
399 		int i, blockA = 1, blockB = 1;
400 		irlimit += 32;
401 		int limiti, limito, nlimit;
402 		unsigned long int volume = spec.getVolume();
403 		int minnumblocks = numSMs * sharedMemLimitPerSM/(33*32);
404 		nlimit = sqrt(volume/(blockFactor * minnumblocks* 32*32));
405 		//nlimit = 4;
406 		if(nlimit == 0) nlimit = 1;
407 #ifdef printd
408 		cout <<"\nnlimit = "<<nlimit;
409 #endif
410 		double besteff = 0;
411 
412 		for(limiti = 0; limiti <  nlimit; limiti++)
413 		{
414 			int limitir;
415 				limitir	= 32+ limiti*32;
416 			for(limito = 0; limito < nlimit; limito++)
417 			{
418 				int limitor;
419 					limitor = 32+ limito*32;
420 				bool conflict = false;
421 				csize = 1, blockA = 1, blockB = 1;
422 				for(i = 0; i < spec.getNdim(); i++)
423 				{
424 					if(csize == limitir)
425 					{
426 						break;
427 					}
428 					if(csize > limitir)
429 					{
430 						csize/=sizes[i-1];
431 						//blockA = (limitir+csize-1)/csize;
432 						blockA = (limitir)/csize;
433 						//cyyout << "blockA = "<<blockA;
434 						if(blockA != 1){
435 							//csize/=sizes[i-1];
436 							csize*=blockA;
437 						}
438 						else i--;
439 						if(blockA == sizes[i-1]) { blockA = 1;}
440 						break;
441 					}
442 					csize*= sizes[i];
443 				}
444 				if(i == spec.getNdim() + 1)
445 					i--;
446 				//if(blockA == 1 && i < spec.getNdim()) i++;
447 				alimit = i-1;
448 				asize = csize;
449 				i = 0;
450 				int conflicti = -1;
451 				//rlimit = 64;
452 				csize = 1;//sizes[permutation[i]];
453 				for(; i <= spec.getNdim(); i++)
454 				{
455 					if (i > 0 && permutation[i-1] < alimit)
456 					{
457 						limito = nlimit;
458 						conflict = true;
459 #ifdef printd
460 						cout <<"caseid changed "<< i<<"\n";
461 #endif
462 						//caseId = BlockingCase::FVI_NOMATCH_GENERAL_OVERLAP;
463 						//break;
464 
465 					}
466 					if(csize == limitor)
467 					{
468 						break;
469 					}
470 					if(csize> limitor)
471 					{
472 
473 						csize/=sizes[permutation[i-1]];
474 						//blockB =  (limitor+csize-1)/csize;
475 						blockB =  (limitor)/csize;
476 #ifdef printd
477 						cout << "\ncsize = "<<csize;
478 						cout << "\nblockB = "<<blockB;
479 #endif
480 						if(blockB != 1){
481 							//if (permutation[i-1] < alimit || ((blockA != 1) && (permutation[i] == alimit)))
482 							if (i > 0 && permutation[i-1] <= alimit )
483 							{
484 								limito = nlimit;
485 								conflict = true;
486 #ifdef printd
487 								cout <<"caseid changed "<< i<<"\n";
488 #endif
489 								//		caseId = BlockingCase::FVI_NOMATCH_GENERAL_OVERLAP;
490 								//		break;
491 							}
492 
493 							//csize/=sizes[permutation[i-1]];
494 							csize*= blockB;
495 							if(blockB == sizes[permutation[i-1]]) { blockB = 1;}
496 
497 						}
498 						else{ i--;
499 							if(conflicti == i)
500 								conflict = false;
501 						}
502 						break;
503 					}
504 					if(i == spec.getNdim()) break;
505 					//if(permutation[i] < alimit) continue;
506 					//if(i > 0 && ((blockA != 1) && (permutation[i] == alimit)))
507 					if((permutation[i] <= alimit))
508 					{
509 						limito = nlimit;
510 #ifdef printd
511 						cout <<"caseid changed "<< i<<"\n";
512 #endif
513 						conflict = true;
514 						if(conflicti == -1)
515 							conflicti = i;
516 						//caseId = BlockingCase::FVI_NOMATCH_GENERAL_OVERLAP;
517 						//break;
518 					}
519 					if(i < spec.getNdim())
520 					csize*= sizes[permutation[i]];
521 #ifdef printd
522 					cout<<"\ni = "<<i<<"bsize "<<csize<<"\n";
523 #endif
524 				}
525 				if(i == spec.getNdim() + 1)
526 					i--;
527 				//if((blockB == 1) && (i < spec.getNdim())) i++;
528 				blimit = i-1;
529 				bsize = csize;
530 #ifdef printd
531 				cout <<"\nAsize = "<<asize<<" Bsize = "<<bsize<<" alimit = "<<alimit<<" blimit = "<<blimit<<" blockA = "<<blockA<<" blockB = "<<blockB;
532 
533 #endif
534 				//double eff = getEfficiency_nooverlap(asize, bsize, sizes[alimit], sizes[permutation[blimit]], blockA, blockB );
535 				double eff = 0;
536 				if(!conflict)
537 					eff       = getEfficiency_nooverlap(asize, bsize, sizes[alimit], sizes[permutation[blimit]], blockA, blockB );
538 #ifdef printd
539 				cout <<"\nEff = "<<eff<<"\n";
540 #endif
541 				if(eff >= besteff)
542 				{
543 					parameters->setNumElementsProcessedPerBlock(asize);
544 					parameters->setNumElementsProcessedPerBlock1(bsize);
545 					parameters -> setBlockAIndex(alimit);
546 					parameters -> setBlockBIndex(blimit);
547 					parameters->setTileSize(blockA);
548 					parameters->setTileSize1(blockB);
549 					besteff = eff;
550 				}
551 			}//while(rlimit <= maxlimit);
552 		}
553 		sharedMemSize1 = 33;
554 		sharedMemSize2 = 32;
555 		sharedMemSize = sharedMemSize1 * sharedMemSize2;//csize;//blockB * sizes[permutation[0]];
556 		paddingSize = 32;
557 		parameters->setPaddingSize(paddingSize);
558 		parameters->setWarpEfficiency(besteff);
559 		double bandwidth = getBW_overlap(besteff);
560 		parameters -> setBW(bandwidth);
561 		parameters -> setTime(getTime(bandwidth, spec.getVolume()));
562 		tbSize = getTBSize(sharedMemSize);
563 		parameters->setTbSize(tbSize);
564 		parameters->setSharedMemSize1(sharedMemSize1);
565 		parameters->setSharedMemSize2(sharedMemSize2);
566 		return *parameters;
567 
568 
569 	}
570 
571 
572 
tuneFastestVaryingMatchingCaseWithoutBlocking(TransposeSpec & spec)573 	Parameters& tuneFastestVaryingMatchingCaseWithoutBlocking(TransposeSpec &spec){
574 		Parameters *parameters = new Parameters();
575 		// warp efficiency
576 		float maxWarpEfficiency = 0.0f;
577 		int tbSize = 32;
578 		int numMoves = spec.getSizes()[0];
579 		for(int i= 32; i < 2048 && i <= spec.getSizes()[0]; i+=32){
580 			double totalThreadsinActiveWarps =  ceil((float)numMoves/(float)32) * 32;
581 			float warpEfficiency = (float) ((float)spec.getSizes()[0]/(float)(totalThreadsinActiveWarps))*100;
582 			if(warpEfficiency > maxWarpEfficiency){
583 				maxWarpEfficiency = warpEfficiency;
584 				tbSize = (int)totalThreadsinActiveWarps;
585 			}
586 			//			std::cout<<"warpefficiency : tbsize = "<<warpEfficiency<< " : "<<tbSize;
587 		}
588 		tbSize= 128; // fixing TODO optimal value?
589 		parameters->setWarpEfficiency(maxWarpEfficiency);
590 		parameters->setTbSize(tbSize);
591 		if(spec.getSizes()[0] < 1024 && spec.getSizes()[0] > 256){
592 			parameters->setTileSize(2);
593 		}else if(spec.getSizes()[0] <= 256){
594 			parameters->setTileSize(4);
595 		}
596 		double bandwidth = getBW_matchg32();
597 		parameters -> setBW(bandwidth);
598 		parameters -> setTime(getTime(bandwidth, spec.getVolume()));
599 		//cout <<"\nMatching >= 32\n";
600 		//HashSet<Parameters> returnVal = new HashSet<Parameters>();
601 		//returnVal.add(parameters);
602 		parameters->setTileSize(1);
603 		return *parameters;//returnVal;
604 	}
605 
tuneFastestVaryingMatchingCaseWithBlocking(TransposeSpec & spec)606 	Parameters& tuneFastestVaryingMatchingCaseWithBlocking(TransposeSpec &spec){
607 		//warp efficiency
608 		// occupancy
609 		// Indexing overhead
610 		int *sizes = spec.getSizes();
611 		int* permutation = spec.getPermutation();
612 		int blockA;
613 		blockA  = (32+sizes[0]-1)/sizes[0];
614 		int sharedMemSize = 1;
615 		int tbSize = 32;
616 		Parameters *parameters = new Parameters();
617 		int planeSize, numElements, paddingSize;
618 		//for(i = 1; i < sizes.length; i += 2){ //TODO the blocking happens only at 1 and 2 indices (starting from 0)
619 		double mintime = 0;
620 		float occupancy, warpEfficiency;
621 		int maxPossibleBlocksPerSM;
622 		float best = 0; int bblock = 1;
623 		double bf;
624 		/*do {
625 		  numElements = blockA*blockA*sizes[0];
626 		  planeSize = blockA * sizes[0];
627 		  paddingSize = (32 - (planeSize % 32) + sizes[0])%32;
628 		//cout <<"here "<<paddingSize<<"\n";
629 		sharedMemSize = (planeSize + paddingSize)* blockA;
630 		if(sharedMemSize > sharedMemLimitPerSM/6)
631 
632 		break;
633 
634 		maxPossibleBlocksPerSM = sharedMemLimitPerSM/sharedMemSize;
635 
636 		//if(maxPossibleBlocksPerSM > numThreadBlocksLimitPerSM)
637 		//	continue;
638 
639 		// occupancy
640 		// warp efficiency
641 
642 		tbSize = blockA * min(32, blockA * sizes[0]); //blockA warps
643 		if(tbSize <= threadBlocKSizeLimit){
644 		if (numThreadsLimitPerSM/tbSize > maxPossibleBlocksPerSM) //which ever is minimum, use that : numthreadLimit or sharedMemLimit
645 		occupancy = ((float)(tbSize * maxPossibleBlocksPerSM) / (float) numThreadsLimitPerSM) * 100;
646 		else
647 		occupancy = ((float)(tbSize * (numThreadsLimitPerSM/tbSize)) /(float) numThreadsLimitPerSM) * 100;
648 		double index =  ceil((float)planeSize/(float)32) ;
649 		double totalThreadsinActiveWarps =  index * 32;
650 		const int remainder1 = sizes[1] % blockA;
651 		const int remainder2 = sizes[permutation[1]] % blockA;
652 
653 		const int ilimit = remainder1 * sizes[0];
654 		const int olimit = remainder2 * sizes[0];
655 		const int plain = blockA * sizes[0];
656 		double f1, f2, f3, f4, f;
657 		int minlimit = min(ilimit, olimit);
658 		f1 =  ((plain/32)  + (double)(plain%32) /32)/ (int)((plain+31)/32);
659 		f2 =  ((ilimit/32)  + (double)(ilimit%32) /32)/ (int)(max(1,(plain+31)/32));
660 		f3 =  ((olimit/32)  + (double)(olimit%32) /32)/ (int)(max(1,(plain+31)/32));
661 		f4 =  ((minlimit/32)  + (double)(minlimit%32) /32)/ (int)(max(1,(plain+31)/32));
662 		//printf("\tf1=%lf\t", f1 =  ((plain/32)  + (double)(plain%32) /32)/ (int)((plain+31)/32));
663 		//	printf("\tf2=%lf\t", f2 =  ((ilimit/32)  + (double)(ilimit%32) /32)/ (int)(max(1,(plain+31)/32)));
664 		//	printf("\tf3=%lf\t", f3 =  ((olimit/32)  + (double)(olimit%32) /32)/ (int)(max(1,(plain+31)/32)));
665 		//	printf("\tf4=%lf\t", f4 =  ((minlimit/32)  + (double)(minlimit%32) /32)/ (int)(max(1,(plain+31)/32)));
666 		int asize = sizes[1];
667 		int bsize = sizes[permutation[1]];
668 		//	printf("\t%d\t%d\t%d\t%d\t", asize/blockA, asize%blockA, bsize/blockA,bsize%blockA );
669 		//int amax = min(blockA, 32);
670 		//int bmax = min(blockB, 32);
671 		int amax = blockA;
672 		int bmax = blockA;
673 		//printf("\tf=%lf\t", f = ((asize/amax) * (bsize/bmax) *f1 + (double)(asize/amax) * (bsize%bmax > 0) *f3+ (double)(asize%amax>0) * (bsize/bmax)*f2 + (double)(asize%amax > 0) * (bsize%bmax > 0) *f4 )/ (int)(((asize+amax-1)/amax) * ((bsize+bmax-1)/bmax)));
674 		f = ((asize/amax) * (bsize/bmax) *f1 + (double)(asize/amax) * (bsize%bmax > 0) *f3+ (double)(asize%amax>0) * (bsize/bmax)*f2 + (double)(asize%amax > 0) * (bsize%bmax > 0) *f4 )/ (int)(((asize+amax-1)/amax) * ((bsize+bmax-1)/bmax));
675 		//cout <<"f = "<<f<<" blbock = "<<blockA<<" ";
676 
677 		warpEfficiency = f;//((float)planeSize / (float)totalThreadsinActiveWarps)*100;
678 		if(warpEfficiency >= best) {best = warpEfficiency; bblock = blockA; bf = f;}
679 		}
680 		blockA++;
681 		}
682 		while((blockA < sizes[1]) && (blockA < sizes[permutation[1]]));
683 		*/	bblock = 8;
684 		int mul;
685 		if(sizes[0] <=8) mul = 16;
686 		else if(sizes[0] <= 16) mul = 8;
687 		else mul = 4;
688 		bblock = min(min(mul, sizes[1]), sizes[permutation[1]]);
689 		tbSize = bblock * min(32, bblock * sizes[0]); //blockA warps
690 		//cout<<"\t"<<bf << "\t"<<bblock<<"\t"<<tbSize<<"\t";
691 		numElements = bblock*bblock*sizes[0];
692 		planeSize = bblock * sizes[0];
693 		paddingSize = (32 - (planeSize % 32) + sizes[0])%32;
694 		//tbSize = bblock * 32; //blockA warps
695 		if (numThreadsLimitPerSM/tbSize > maxPossibleBlocksPerSM) //which ever is minimum, use that : numthreadLimit or sharedMemLimit
696 			occupancy = ((float)(tbSize * maxPossibleBlocksPerSM) / (float) numThreadsLimitPerSM) * 100;
697 		else
698 			occupancy = ((float)(tbSize * (numThreadsLimitPerSM/tbSize)) /(float) numThreadsLimitPerSM) * 100;
699 		parameters->setNumElementsProcessedPerBlock(numElements);
700 		int rem = (sizes[permutation[1]] % bblock) * (sizes[1] % bblock) * sizes[0];
701 		sharedMemSize = (planeSize + paddingSize)* bblock;
702 		parameters->setRemElements(rem);
703 		parameters->setPaddingSize(paddingSize);
704 		parameters->setOccupancy(occupancy);
705 		parameters->setWarpEfficiency(best);
706 		parameters->setTbSize(tbSize);
707 		parameters->setNumBlocksPerSM(maxPossibleBlocksPerSM);
708 		parameters->setTileSize(bblock);
709 		parameters->setSharedMemSize1(sharedMemSize);
710 		double eff = getEfficiency_matchl32(sizes[0], sizes[1],sizes[permutation[1]], bblock);
711 		//cout <<" Eff = "<<eff <<"\t";
712 		double bandwidth = getBW_matchl32(eff, bblock);
713 		parameters -> setBW(bandwidth);
714 		parameters -> setTime(getTime(bandwidth, spec.getVolume()));
715 
716 
717 		return *parameters;
718 	}
719 
ParameterTuner()720 	ParameterTuner() {
721 		// TODO Auto-generated constructor stub
722 		sharedMemLimitPerSM = 6144; // (48 * 1024)/8 words per
723 		numThreadsLimitPerSM =  2048; //per SM
724 		numThreadBlocksLimitPerSM =  16; //per SM
725 		threadBlocKSizeLimit = 1024;
726 		numSMs = 15;
727 		blockFactor = 6;
728 
729 	}
730 
~ParameterTuner()731 	~ParameterTuner() {
732 		// TODO Auto-generated destructor stub
733 	}
734 	};
735 
736