1 /* 2 * ParameterTuner.cpp 3 * 4 * Created on: Jan 30, 2017 5 */ 6 7 #include "ParameterTuner.h" 8 #include "ourproj.h" 9 #include <math.h> 10 #include "Parameters.h" 11 #include "BlockingCase.h" 12 #include "TransposeSpec.h" 13 #include <iostream> 14 15 #include "ourinclude.h" 16 using namespace std; 17 18 class ParameterTuner { 19 int sharedMemLimitPerSM;// = 6144; // (48 * 1024)/8 words per SM 20 int numThreadsLimitPerSM;// = 2048; //per SM 21 int numThreadBlocksLimitPerSM;// = 16; //per SM 22 int threadBlocKSizeLimit;// = 1024; 23 int numSMs;// = 15; 24 int blockFactor;// = 4 25 BlockingCase caseId; 26 27 public: getTBSize(unsigned int shm)28 unsigned int getTBSize(unsigned int shm) 29 { 30 return (unsigned) ceil((double)(numThreadsLimitPerSM/floor(32.0*(unsigned)(sharedMemLimitPerSM/ shm)))) * 32; 31 } getCaseId()32 BlockingCase getCaseId() 33 { 34 return caseId; 35 } tune(TransposeSpec & spec)36 Parameters& tune(TransposeSpec &spec) { 37 int *sizes = spec.getSizes(); 38 if((spec.getPermutation()[0] != 0) || (sizes[0] * sizes[1] < 32) || (sizes[spec.getPermutation()[0]] * sizes[spec.getPermutation()[1]] < 32) ) 39 { 40 return tuneFastestVaryingNotMatchingCase(spec); 41 } 42 else if ( spec.getSizes()[0] >= 32) 43 { 44 caseId = BlockingCase::FVI_MATCH_AND_GREATERT32; 45 return tuneFastestVaryingMatchingCaseWithoutBlocking(spec); 46 } 47 else 48 { 49 caseId = BlockingCase::FVI_MATCH_AND_LESST32; 50 return tuneFastestVaryingMatchingCaseWithBlocking(spec); 51 } 52 } 53 tuneFastestVaryingNotMatchG32Case(TransposeSpec & spec)54 Parameters& tuneFastestVaryingNotMatchG32Case(TransposeSpec &spec){ 55 Parameters *parameters = new Parameters(); 56 int *sizes = spec.getSizes(); 57 int* permutation = spec.getPermutation(); 58 unsigned numElements = 32*32; 59 unsigned sharedMemSize = (32 *33); 60 unsigned paddingSize = 1; 61 unsigned tbSize = 256; 62 unsigned numBlocksPerSM = sharedMemLimitPerSM/sharedMemSize; 63 parameters->setNumElementsProcessedPerBlock(numElements); 64 parameters->setPaddingSize(paddingSize); 65 parameters->setOccupancy(100.0f); 66 parameters->setTbSize(tbSize); 67 parameters->setNumBlocksPerSM(numBlocksPerSM); 68 parameters->setTileSize(32); 69 parameters->setSharedMemSize1(32); 70 parameters->setSharedMemSize2(33); 71 double eff = getEfficiency_nomatchg32(sizes[0], sizes[permutation[0]]); 72 double bandwidth = getBW_nomatchg32(eff); 73 parameters -> setBW(bandwidth); 74 unsigned long vol = spec.getVolume(); 75 76 //cout <<"cc0= "<< spec.getVolume() << " eff = "<<eff<<" bw = "<<bandwidth<< " time = "<<getTime(bandwidth, spec.getVolume())<<"\t"; 77 //cout <<"cc1= "<< vol << " eff = "<<eff<<" bw = "<<bandwidth<< " time = "<<getTime(bandwidth, vol)<<"\t"; 78 parameters -> setTime(getTime(bandwidth, spec.getVolume())); 79 parameters->setWarpEfficiency(eff); 80 return *parameters; 81 //parametersList.add(parameters); 82 } tuneConflictCase(TransposeSpec & spec)83 Parameters& tuneConflictCase(TransposeSpec &spec){ 84 Parameters *parameters = new Parameters(); 85 int *sizes = spec.getSizes(); 86 int* permutation = spec.getPermutation(); 87 int blockA = 1, blockB = 1; 88 int sharedMemSize = 1; 89 int sharedMemSize1 = 1; 90 int sharedMemSize2 = 1; 91 int tbSize = 32; 92 int numElements = 0; 93 int paddingSize= 0; 94 int csize, asize, bsize, bonlysize, pad;//sizes[0]; 95 int repeat = 0, rlimit, alimit, blimit,count = 0; 96 unsigned SHMLIMIT = 1056; 97 //unsigned SHMLIMIT = 1400; 98 const int limit = 32;//starts from 32 and goes 64, 128... 99 int limiti, limito, nlimit; 100 unsigned long int volume = spec.getVolume(); 101 int minnumblocks = numSMs * sharedMemLimitPerSM/(33*32); 102 nlimit = sqrt(volume/(blockFactor * minnumblocks* 32*32)); 103 if(nlimit == 0) nlimit = 1; 104 double besteff = 0; 105 for(int limiti = 0; limiti < nlimit; limiti++) 106 { 107 108 rlimit = 32 + 32*limiti; 109 int i; 110 blockA = 1, blockB = 1, csize = 1; 111 for(i = 0; i <= spec.getNdim(); i++) 112 { 113 if(csize == rlimit) 114 { 115 break; 116 } 117 if(csize > rlimit) 118 { 119 csize/=sizes[i-1]; 120 //blockA = (rlimit+csize-1)/csize; 121 blockA = (rlimit)/csize; 122 //cyyout << "blockA = "<<blockA; 123 if(blockA != 1){ 124 //csize/=sizes[i-1]; 125 csize*=blockA; 126 } 127 else i--; 128 if(blockA == sizes[i-1]) { blockA = 1;} 129 break; 130 } 131 if(i < spec.getNdim()) 132 csize*= sizes[i]; 133 } 134 if(i == spec.getNdim() + 1) 135 i--; 136 //if(blockA == 1 && i < spec.getNdim()) i++; 137 alimit = i-1; 138 asize = csize; 139 #ifdef printd 140 cout << "asize == "<<asize<<" ablock = "<<blockA<<" rlimit= "<<rlimit<<"\n"; 141 #endif 142 for(int limito = 0; limito < nlimit; limito++) 143 { 144 i = 0; 145 bonlysize = 1; 146 int limit = 32 + limito * 32;//1024/(asize*2); 147 //int limit = 32;//rlimit;//1024/(asize*2); 148 csize = 1;//sizes[permutation[i]]; 149 150 for(; i <= spec.getNdim(); i++) 151 { 152 if(csize == limit) 153 { 154 break; 155 } 156 if(csize > limit) 157 { 158 if(i > 0){ 159 csize/=sizes[permutation[i-1]]; 160 // blockB = (limit+csize-1)/csize; 161 blockB = (limit)/csize; 162 #ifdef printd 163 cout << "\ncsize = "<<csize; 164 cout << "\nblockB = "<<blockB; 165 #endif 166 if( permutation[i-1] > alimit) 167 { 168 bonlysize /= sizes[permutation[i-1]]; 169 if(blockB != 1){ 170 bonlysize*= blockB; 171 172 } 173 } 174 if(blockB != 1){ 175 csize*= blockB; 176 if(blockB == sizes[permutation[i-1]]) {blockB = 1;} 177 178 } 179 else i--; 180 181 182 } 183 184 break; 185 } 186 if(i == spec.getNdim()) break; 187 //if(permutation[i] < alimit) continue; 188 //if(i > 0 && ((blockA != 1) && (permutation[i] == alimit))) 189 if(i < spec.getNdim()) 190 csize*= sizes[permutation[i]]; 191 if(permutation[i] > alimit) 192 { 193 bonlysize *= sizes[permutation[i]]; 194 } 195 #ifdef printd 196 cout<<"\ni = "<<i<<"bsize = "<<csize<<"\n"; 197 #endif 198 } 199 #ifdef printd 200 cout << "\nbsize == "<<csize<<"\n"; 201 #endif 202 if(i == spec.getNdim() + 1) 203 i--; 204 //if((blockB == 1) && (i < spec.getNdim())) i++; 205 blimit = i-1; 206 //cout<<" alimitp = "<<alimit<<" blimitp = "<<blimit<<"\n"; 207 bsize = csize; 208 int n = spec.getNdim(); 209 int rperm[20]; 210 for(int i = 0; i < n; i++) 211 { 212 for(int j = 0; j < n; j++) 213 { 214 if(permutation[i] == j) 215 { 216 rperm[j] = i; 217 } 218 } 219 } 220 221 222 223 if(blockA > 1)//checking for inner dimensions in output which gets blocked in input 224 { 225 for(int i = 0; i < blimit; i++) 226 { 227 if(permutation[i] == alimit) 228 { 229 asize /= blockA; 230 asize *= sizes[permutation[i]]; 231 blockA = 1; 232 } 233 } 234 } 235 // cout <<" blockA = "<<blockA <<" blockB = "<<blockB; 236 if(blockB > 1)//checking for inner dimensions in input which gets blocked in output 237 { 238 239 // cout <<"A smaller in B\n"; 240 for(int i = 0; i < alimit; i++) 241 { 242 if(rperm[i] == blimit) 243 { 244 bsize /= blockB; 245 bsize *= sizes[i]; 246 // cout <<"\nNew bsize "<<bsize<<"\n"; 247 blockB = 1; 248 } 249 } 250 } 251 252 //we need to change the blocksize in case alimit and blimit are same dimension 253 if((alimit == permutation[blimit]) && (blockA > 1 || blockB > 1)) 254 { 255 #ifdef printd 256 cout <<"Blocking dimensions same, bloackA = "<<blockA<<" blockB = "<<blockB<<"\n"; 257 #endif 258 if(((blockA > blockB) && (blockB != 1)) || (blockA == 1))// > bsize) 259 { 260 if(blockA == 1) 261 bsize = (bsize/blockB)*sizes[alimit], blockB = blockA; 262 else 263 bsize = (bsize/blockB)*blockA, blockB = blockA; 264 265 } 266 else 267 { 268 if(blockB == 1) 269 asize = (asize/blockA) * sizes[permutation[blimit]], blockA = blockB; 270 else 271 asize = (asize/blockA) * blockB, blockA = blockB; 272 } 273 } 274 pad = ((asize %2)+1)%2; 275 sharedMemSize1 = asize + pad; 276 sharedMemSize2 = bonlysize; 277 sharedMemSize = sharedMemSize1 * sharedMemSize2;//csize;//blockB * sizes[permutation[0]]; 278 //tbSize = max(64, (sharedMemSize/64)*32); 279 //tbSize = 320;// max(64, (sharedMemSize/64)*32); 280 tbSize = getTBSize(sharedMemSize); 281 if(sharedMemSize > SHMLIMIT && besteff > 0) 282 { 283 // limiti = nlimit; 284 break; 285 } 286 double eff = getEfficiency_overlap(asize, bsize, sizes[alimit], sizes[permutation[blimit]], blockA, blockB); 287 //cout <<"\t"<<eff<<"\t"<<asize<<"\t"<<bsize<<"\n"; 288 if(eff > besteff) 289 { 290 parameters->setNumElementsProcessedPerBlock(asize); 291 parameters->setNumElementsProcessedPerBlock1(bsize); 292 parameters->setPaddingSize(rlimit); 293 parameters->setTbSize(tbSize); 294 parameters->setTileSize(blockA); 295 parameters->setSharedMemSize1(sharedMemSize1); 296 parameters->setSharedMemSize2(sharedMemSize2); 297 parameters->setTileSize1(blockB); 298 parameters -> setBlockAIndex(alimit); 299 parameters -> setBlockBIndex(blimit); 300 besteff = eff; 301 302 } 303 304 #ifdef printd 305 cout<<" alimit = "<<alimit<<" blimit = "<<blimit<<"\n"; 306 cout<<" asize = "<<asize<<" bsize = "<<bsize<<"\n"; 307 cout<<" blockA = "<<blockA<<" blockB = "<<blockB<<"\n"; 308 cout<<" SM1 = "<<sharedMemSize<<"\n";// blockB = "<<blockB<<"\n"; 309 cout <<" TBSize = "<<tbSize<<"\n"; 310 #endif 311 count++; 312 repeat++; 313 } 314 }// while(sharedMemSize <= 512); 315 //double eff = getEfficiency_overlap(asize, bsize, sizes[alimit], sizes[permutation[blimit]], blockA, blockB ); 316 double bandwidth = getBW_overlap(besteff); 317 parameters -> setBW(bandwidth); 318 parameters -> setTime(getTime(bandwidth, spec.getVolume())); 319 parameters->setWarpEfficiency(besteff); 320 return *parameters; 321 322 } tuneFastestVaryingNotMatchingCase(TransposeSpec & spec)323 Parameters& tuneFastestVaryingNotMatchingCase(TransposeSpec &spec){ 324 int *sizes = spec.getSizes(); 325 int* permutation = spec.getPermutation(); 326 TensorType mytype = spec.getDataType(); 327 int blockAIndex = 0; 328 int blockBIndex = 0; 329 Parameters ¶meters1 = tuneFastestVaryingNotMatchG32Case(spec); 330 Parameters ¶meters2 = tuneNonConflictCase(spec); 331 Parameters ¶meters3 = tuneConflictCase(spec); 332 double b1, b2, b3; 333 b1 = getBW_nomatchg32(parameters1 . getWarpEfficiency()); 334 // if(b1 < 0.7) 335 // parameters1.setTbSize(256); 336 double eff2 = parameters2 . getWarpEfficiency(); 337 //if(eff2 < 0.7) eff2*=0.9; 338 b2 = getBW_nooverlap(eff2); 339 b3 = getBW_overlap(parameters3 . getWarpEfficiency()); 340 //if(parameters1 . getWarpEfficiency() > parameters2 . getWarpEfficiency()) 341 #ifdef printd 342 cout<<"\t"<<parameters1 . getWarpEfficiency()<<"\t"<<parameters2 . getWarpEfficiency()<<"\t"<<parameters3 . getWarpEfficiency(); 343 cout <<"\t"<<b1<<"\t"<<b2<<"\t"<<b3<<"\t"; 344 #endif 345 346 if(b1 >= b2) 347 { 348 //if(parameters1 . getWarpEfficiency() > parameters3 . getWarpEfficiency()) 349 if(b1 >= b3) 350 { 351 caseId = BlockingCase::FVI_NOMATCH_AND_GREATERT32; 352 return parameters1; 353 } 354 else 355 { 356 caseId = BlockingCase::FVI_NOMATCH_GENERAL_OVERLAP; 357 return parameters3; 358 } 359 } 360 else if(b2 >= b3) 361 { 362 caseId = BlockingCase::FVI_NOMATCH_GENERAL; 363 return parameters2; 364 } 365 else 366 { 367 caseId = BlockingCase::FVI_NOMATCH_GENERAL_OVERLAP; 368 return parameters3; 369 } 370 371 372 } tuneNonConflictCase(TransposeSpec & spec)373 Parameters& tuneNonConflictCase(TransposeSpec &spec){ 374 Parameters *parameters = new Parameters(); 375 int *sizes = spec.getSizes(); 376 int* permutation = spec.getPermutation(); 377 int sharedMemSize = 1; 378 int sharedMemSize1 = 1; 379 int sharedMemSize2 = 1; 380 int numElements = 0; 381 int paddingSize= 0; 382 int n = spec.getNdim(); 383 int rperm[20]; 384 for(int i = 0; i < n; i++) 385 { 386 for(int j = 0; j < n; j++) 387 { 388 if(permutation[i] == j) 389 { 390 rperm[j] = i; 391 } 392 } 393 } 394 int csize = 1, asize, bsize;//sizes[0]; 395 int rlimit = 32; 396 //int tbSize = 512; 397 int tbSize = 352; 398 int alimit = 0, blimit = 0, irlimit = 0; 399 int i, blockA = 1, blockB = 1; 400 irlimit += 32; 401 int limiti, limito, nlimit; 402 unsigned long int volume = spec.getVolume(); 403 int minnumblocks = numSMs * sharedMemLimitPerSM/(33*32); 404 nlimit = sqrt(volume/(blockFactor * minnumblocks* 32*32)); 405 //nlimit = 4; 406 if(nlimit == 0) nlimit = 1; 407 #ifdef printd 408 cout <<"\nnlimit = "<<nlimit; 409 #endif 410 double besteff = 0; 411 412 for(limiti = 0; limiti < nlimit; limiti++) 413 { 414 int limitir; 415 limitir = 32+ limiti*32; 416 for(limito = 0; limito < nlimit; limito++) 417 { 418 int limitor; 419 limitor = 32+ limito*32; 420 bool conflict = false; 421 csize = 1, blockA = 1, blockB = 1; 422 for(i = 0; i < spec.getNdim(); i++) 423 { 424 if(csize == limitir) 425 { 426 break; 427 } 428 if(csize > limitir) 429 { 430 csize/=sizes[i-1]; 431 //blockA = (limitir+csize-1)/csize; 432 blockA = (limitir)/csize; 433 //cyyout << "blockA = "<<blockA; 434 if(blockA != 1){ 435 //csize/=sizes[i-1]; 436 csize*=blockA; 437 } 438 else i--; 439 if(blockA == sizes[i-1]) { blockA = 1;} 440 break; 441 } 442 csize*= sizes[i]; 443 } 444 if(i == spec.getNdim() + 1) 445 i--; 446 //if(blockA == 1 && i < spec.getNdim()) i++; 447 alimit = i-1; 448 asize = csize; 449 i = 0; 450 int conflicti = -1; 451 //rlimit = 64; 452 csize = 1;//sizes[permutation[i]]; 453 for(; i <= spec.getNdim(); i++) 454 { 455 if (i > 0 && permutation[i-1] < alimit) 456 { 457 limito = nlimit; 458 conflict = true; 459 #ifdef printd 460 cout <<"caseid changed "<< i<<"\n"; 461 #endif 462 //caseId = BlockingCase::FVI_NOMATCH_GENERAL_OVERLAP; 463 //break; 464 465 } 466 if(csize == limitor) 467 { 468 break; 469 } 470 if(csize> limitor) 471 { 472 473 csize/=sizes[permutation[i-1]]; 474 //blockB = (limitor+csize-1)/csize; 475 blockB = (limitor)/csize; 476 #ifdef printd 477 cout << "\ncsize = "<<csize; 478 cout << "\nblockB = "<<blockB; 479 #endif 480 if(blockB != 1){ 481 //if (permutation[i-1] < alimit || ((blockA != 1) && (permutation[i] == alimit))) 482 if (i > 0 && permutation[i-1] <= alimit ) 483 { 484 limito = nlimit; 485 conflict = true; 486 #ifdef printd 487 cout <<"caseid changed "<< i<<"\n"; 488 #endif 489 // caseId = BlockingCase::FVI_NOMATCH_GENERAL_OVERLAP; 490 // break; 491 } 492 493 //csize/=sizes[permutation[i-1]]; 494 csize*= blockB; 495 if(blockB == sizes[permutation[i-1]]) { blockB = 1;} 496 497 } 498 else{ i--; 499 if(conflicti == i) 500 conflict = false; 501 } 502 break; 503 } 504 if(i == spec.getNdim()) break; 505 //if(permutation[i] < alimit) continue; 506 //if(i > 0 && ((blockA != 1) && (permutation[i] == alimit))) 507 if((permutation[i] <= alimit)) 508 { 509 limito = nlimit; 510 #ifdef printd 511 cout <<"caseid changed "<< i<<"\n"; 512 #endif 513 conflict = true; 514 if(conflicti == -1) 515 conflicti = i; 516 //caseId = BlockingCase::FVI_NOMATCH_GENERAL_OVERLAP; 517 //break; 518 } 519 if(i < spec.getNdim()) 520 csize*= sizes[permutation[i]]; 521 #ifdef printd 522 cout<<"\ni = "<<i<<"bsize "<<csize<<"\n"; 523 #endif 524 } 525 if(i == spec.getNdim() + 1) 526 i--; 527 //if((blockB == 1) && (i < spec.getNdim())) i++; 528 blimit = i-1; 529 bsize = csize; 530 #ifdef printd 531 cout <<"\nAsize = "<<asize<<" Bsize = "<<bsize<<" alimit = "<<alimit<<" blimit = "<<blimit<<" blockA = "<<blockA<<" blockB = "<<blockB; 532 533 #endif 534 //double eff = getEfficiency_nooverlap(asize, bsize, sizes[alimit], sizes[permutation[blimit]], blockA, blockB ); 535 double eff = 0; 536 if(!conflict) 537 eff = getEfficiency_nooverlap(asize, bsize, sizes[alimit], sizes[permutation[blimit]], blockA, blockB ); 538 #ifdef printd 539 cout <<"\nEff = "<<eff<<"\n"; 540 #endif 541 if(eff >= besteff) 542 { 543 parameters->setNumElementsProcessedPerBlock(asize); 544 parameters->setNumElementsProcessedPerBlock1(bsize); 545 parameters -> setBlockAIndex(alimit); 546 parameters -> setBlockBIndex(blimit); 547 parameters->setTileSize(blockA); 548 parameters->setTileSize1(blockB); 549 besteff = eff; 550 } 551 }//while(rlimit <= maxlimit); 552 } 553 sharedMemSize1 = 33; 554 sharedMemSize2 = 32; 555 sharedMemSize = sharedMemSize1 * sharedMemSize2;//csize;//blockB * sizes[permutation[0]]; 556 paddingSize = 32; 557 parameters->setPaddingSize(paddingSize); 558 parameters->setWarpEfficiency(besteff); 559 double bandwidth = getBW_overlap(besteff); 560 parameters -> setBW(bandwidth); 561 parameters -> setTime(getTime(bandwidth, spec.getVolume())); 562 tbSize = getTBSize(sharedMemSize); 563 parameters->setTbSize(tbSize); 564 parameters->setSharedMemSize1(sharedMemSize1); 565 parameters->setSharedMemSize2(sharedMemSize2); 566 return *parameters; 567 568 569 } 570 571 572 tuneFastestVaryingMatchingCaseWithoutBlocking(TransposeSpec & spec)573 Parameters& tuneFastestVaryingMatchingCaseWithoutBlocking(TransposeSpec &spec){ 574 Parameters *parameters = new Parameters(); 575 // warp efficiency 576 float maxWarpEfficiency = 0.0f; 577 int tbSize = 32; 578 int numMoves = spec.getSizes()[0]; 579 for(int i= 32; i < 2048 && i <= spec.getSizes()[0]; i+=32){ 580 double totalThreadsinActiveWarps = ceil((float)numMoves/(float)32) * 32; 581 float warpEfficiency = (float) ((float)spec.getSizes()[0]/(float)(totalThreadsinActiveWarps))*100; 582 if(warpEfficiency > maxWarpEfficiency){ 583 maxWarpEfficiency = warpEfficiency; 584 tbSize = (int)totalThreadsinActiveWarps; 585 } 586 // std::cout<<"warpefficiency : tbsize = "<<warpEfficiency<< " : "<<tbSize; 587 } 588 tbSize= 128; // fixing TODO optimal value? 589 parameters->setWarpEfficiency(maxWarpEfficiency); 590 parameters->setTbSize(tbSize); 591 if(spec.getSizes()[0] < 1024 && spec.getSizes()[0] > 256){ 592 parameters->setTileSize(2); 593 }else if(spec.getSizes()[0] <= 256){ 594 parameters->setTileSize(4); 595 } 596 double bandwidth = getBW_matchg32(); 597 parameters -> setBW(bandwidth); 598 parameters -> setTime(getTime(bandwidth, spec.getVolume())); 599 //cout <<"\nMatching >= 32\n"; 600 //HashSet<Parameters> returnVal = new HashSet<Parameters>(); 601 //returnVal.add(parameters); 602 parameters->setTileSize(1); 603 return *parameters;//returnVal; 604 } 605 tuneFastestVaryingMatchingCaseWithBlocking(TransposeSpec & spec)606 Parameters& tuneFastestVaryingMatchingCaseWithBlocking(TransposeSpec &spec){ 607 //warp efficiency 608 // occupancy 609 // Indexing overhead 610 int *sizes = spec.getSizes(); 611 int* permutation = spec.getPermutation(); 612 int blockA; 613 blockA = (32+sizes[0]-1)/sizes[0]; 614 int sharedMemSize = 1; 615 int tbSize = 32; 616 Parameters *parameters = new Parameters(); 617 int planeSize, numElements, paddingSize; 618 //for(i = 1; i < sizes.length; i += 2){ //TODO the blocking happens only at 1 and 2 indices (starting from 0) 619 double mintime = 0; 620 float occupancy, warpEfficiency; 621 int maxPossibleBlocksPerSM; 622 float best = 0; int bblock = 1; 623 double bf; 624 /*do { 625 numElements = blockA*blockA*sizes[0]; 626 planeSize = blockA * sizes[0]; 627 paddingSize = (32 - (planeSize % 32) + sizes[0])%32; 628 //cout <<"here "<<paddingSize<<"\n"; 629 sharedMemSize = (planeSize + paddingSize)* blockA; 630 if(sharedMemSize > sharedMemLimitPerSM/6) 631 632 break; 633 634 maxPossibleBlocksPerSM = sharedMemLimitPerSM/sharedMemSize; 635 636 //if(maxPossibleBlocksPerSM > numThreadBlocksLimitPerSM) 637 // continue; 638 639 // occupancy 640 // warp efficiency 641 642 tbSize = blockA * min(32, blockA * sizes[0]); //blockA warps 643 if(tbSize <= threadBlocKSizeLimit){ 644 if (numThreadsLimitPerSM/tbSize > maxPossibleBlocksPerSM) //which ever is minimum, use that : numthreadLimit or sharedMemLimit 645 occupancy = ((float)(tbSize * maxPossibleBlocksPerSM) / (float) numThreadsLimitPerSM) * 100; 646 else 647 occupancy = ((float)(tbSize * (numThreadsLimitPerSM/tbSize)) /(float) numThreadsLimitPerSM) * 100; 648 double index = ceil((float)planeSize/(float)32) ; 649 double totalThreadsinActiveWarps = index * 32; 650 const int remainder1 = sizes[1] % blockA; 651 const int remainder2 = sizes[permutation[1]] % blockA; 652 653 const int ilimit = remainder1 * sizes[0]; 654 const int olimit = remainder2 * sizes[0]; 655 const int plain = blockA * sizes[0]; 656 double f1, f2, f3, f4, f; 657 int minlimit = min(ilimit, olimit); 658 f1 = ((plain/32) + (double)(plain%32) /32)/ (int)((plain+31)/32); 659 f2 = ((ilimit/32) + (double)(ilimit%32) /32)/ (int)(max(1,(plain+31)/32)); 660 f3 = ((olimit/32) + (double)(olimit%32) /32)/ (int)(max(1,(plain+31)/32)); 661 f4 = ((minlimit/32) + (double)(minlimit%32) /32)/ (int)(max(1,(plain+31)/32)); 662 //printf("\tf1=%lf\t", f1 = ((plain/32) + (double)(plain%32) /32)/ (int)((plain+31)/32)); 663 // printf("\tf2=%lf\t", f2 = ((ilimit/32) + (double)(ilimit%32) /32)/ (int)(max(1,(plain+31)/32))); 664 // printf("\tf3=%lf\t", f3 = ((olimit/32) + (double)(olimit%32) /32)/ (int)(max(1,(plain+31)/32))); 665 // printf("\tf4=%lf\t", f4 = ((minlimit/32) + (double)(minlimit%32) /32)/ (int)(max(1,(plain+31)/32))); 666 int asize = sizes[1]; 667 int bsize = sizes[permutation[1]]; 668 // printf("\t%d\t%d\t%d\t%d\t", asize/blockA, asize%blockA, bsize/blockA,bsize%blockA ); 669 //int amax = min(blockA, 32); 670 //int bmax = min(blockB, 32); 671 int amax = blockA; 672 int bmax = blockA; 673 //printf("\tf=%lf\t", f = ((asize/amax) * (bsize/bmax) *f1 + (double)(asize/amax) * (bsize%bmax > 0) *f3+ (double)(asize%amax>0) * (bsize/bmax)*f2 + (double)(asize%amax > 0) * (bsize%bmax > 0) *f4 )/ (int)(((asize+amax-1)/amax) * ((bsize+bmax-1)/bmax))); 674 f = ((asize/amax) * (bsize/bmax) *f1 + (double)(asize/amax) * (bsize%bmax > 0) *f3+ (double)(asize%amax>0) * (bsize/bmax)*f2 + (double)(asize%amax > 0) * (bsize%bmax > 0) *f4 )/ (int)(((asize+amax-1)/amax) * ((bsize+bmax-1)/bmax)); 675 //cout <<"f = "<<f<<" blbock = "<<blockA<<" "; 676 677 warpEfficiency = f;//((float)planeSize / (float)totalThreadsinActiveWarps)*100; 678 if(warpEfficiency >= best) {best = warpEfficiency; bblock = blockA; bf = f;} 679 } 680 blockA++; 681 } 682 while((blockA < sizes[1]) && (blockA < sizes[permutation[1]])); 683 */ bblock = 8; 684 int mul; 685 if(sizes[0] <=8) mul = 16; 686 else if(sizes[0] <= 16) mul = 8; 687 else mul = 4; 688 bblock = min(min(mul, sizes[1]), sizes[permutation[1]]); 689 tbSize = bblock * min(32, bblock * sizes[0]); //blockA warps 690 //cout<<"\t"<<bf << "\t"<<bblock<<"\t"<<tbSize<<"\t"; 691 numElements = bblock*bblock*sizes[0]; 692 planeSize = bblock * sizes[0]; 693 paddingSize = (32 - (planeSize % 32) + sizes[0])%32; 694 //tbSize = bblock * 32; //blockA warps 695 if (numThreadsLimitPerSM/tbSize > maxPossibleBlocksPerSM) //which ever is minimum, use that : numthreadLimit or sharedMemLimit 696 occupancy = ((float)(tbSize * maxPossibleBlocksPerSM) / (float) numThreadsLimitPerSM) * 100; 697 else 698 occupancy = ((float)(tbSize * (numThreadsLimitPerSM/tbSize)) /(float) numThreadsLimitPerSM) * 100; 699 parameters->setNumElementsProcessedPerBlock(numElements); 700 int rem = (sizes[permutation[1]] % bblock) * (sizes[1] % bblock) * sizes[0]; 701 sharedMemSize = (planeSize + paddingSize)* bblock; 702 parameters->setRemElements(rem); 703 parameters->setPaddingSize(paddingSize); 704 parameters->setOccupancy(occupancy); 705 parameters->setWarpEfficiency(best); 706 parameters->setTbSize(tbSize); 707 parameters->setNumBlocksPerSM(maxPossibleBlocksPerSM); 708 parameters->setTileSize(bblock); 709 parameters->setSharedMemSize1(sharedMemSize); 710 double eff = getEfficiency_matchl32(sizes[0], sizes[1],sizes[permutation[1]], bblock); 711 //cout <<" Eff = "<<eff <<"\t"; 712 double bandwidth = getBW_matchl32(eff, bblock); 713 parameters -> setBW(bandwidth); 714 parameters -> setTime(getTime(bandwidth, spec.getVolume())); 715 716 717 return *parameters; 718 } 719 ParameterTuner()720 ParameterTuner() { 721 // TODO Auto-generated constructor stub 722 sharedMemLimitPerSM = 6144; // (48 * 1024)/8 words per 723 numThreadsLimitPerSM = 2048; //per SM 724 numThreadBlocksLimitPerSM = 16; //per SM 725 threadBlocKSizeLimit = 1024; 726 numSMs = 15; 727 blockFactor = 6; 728 729 } 730 ~ParameterTuner()731 ~ParameterTuner() { 732 // TODO Auto-generated destructor stub 733 } 734 }; 735 736