1dnl 2dnl @author: Michele Martone 3dnl 4/*! 5 @file 6 @brief 7 Performance kernels dispatching code, for each type, submatrix size, operation. 8 But for block compressed sparse stripes format. 9 Kernels unrolled, with no loops, for only user-specified blockings. 10 */ 11dnl 12include(`rsb_misc.m4')dnl 13RSB_M4_HEADER_MESSAGE()dnl 14RSB_M4_HEADER_EXTRA_DECLARATIONS()dnl 15include(`rsb_krnl_bcss_macros.m4')dnl 16include(`rsb_krnl_vb_macros.m4')dnl FIXME : RSB_M4_KERNEL_DIRECT_DISPATCHER_FUNCTION_NAME 17dnl 18dnl 19dnl 20dnl 21dnl 22dnl 23define(`RSB_M4_BCOO_SPMV_KERNELS',`dnl 24dnl 25pushdef(`unrollings',$1)dnl 26dnl 27dnl FIXED BLOCK SIZE KERNELS : 28dnl 29foreach(`type',RSB_M4_MATRIX_TYPES,`dnl 30foreach(`mop',RSB_M4_MATRIX_OPS,`dnl 31foreach(`matrix_storage',RSB_M4_BCOO_FORMATS,`dnl 32foreach(`unrolling',unrollings,`dnl 33dnl ifelse(RSB_M4_IS_SPMV_KERNEL_MOP(mop), 34ifelse(1,1,`dnl 35foreach(`diagonal',RSB_M4_MATRIX_DIAGONAL_TYPES,`dnl 36foreach(`rowsu',RSB_M4_ROWS_UNROLL,`dnl 37foreach(`colsu',RSB_M4_COLUMNS_UNROLL,`dnl 38foreach(`symmetry',RSB_M4_MATRIX_SYMMETRY,`dnl 39foreach(`transposition',RSB_M4_MATRIX_TRANSPOSITIONS,`dnl 40foreach(`citype',RSB_M4_MATRIX_COORDINATE_TYPES,`dnl 41foreach(`uplo',RSB_M4_MATRIX_UPLO_TYPES,`dnl 42RSB_M4_BCOO_KERNEL_FUNCTION(`all',type,matrix_storage,transposition,symmetry,rowsu,colsu,unrolling,mop,citype,diagonal,uplo) 43')dnl 44')dnl 45')dnl 46')dnl 47')dnl 48')dnl 49')dnl 50')dnl 51')dnl 52')dnl 53')dnl 54')dnl 55dnl 56dnl FIXED BLOCK SIZE DISPATCHERS : 57dnl 58foreach(`type',RSB_M4_MATRIX_TYPES,`dnl 59foreach(`mop',RSB_M4_MATRIX_OPS,`dnl 60dnl ifelse(RSB_M4_IS_SPMV_KERNEL_MOP(mop),1,.. 61ifelse(1,1,`dnl 62foreach(`matrix_storage',RSB_M4_BCOO_FORMATS,`dnl 63foreach(`unrolling',unrollings,`dnl 64foreach(`symmetry',RSB_M4_MATRIX_SYMMETRY,`dnl 65foreach(`transposition',RSB_M4_MATRIX_TRANSPOSITIONS,`dnl 66foreach(`citype',RSB_M4_MATRIX_COORDINATE_TYPES,`dnl 67foreach(`diagonal',RSB_M4_MATRIX_DIAGONAL_TYPES,`dnl 68foreach(`uplo',RSB_M4_MATRIX_UPLO_TYPES,`dnl 69RSB_M4_BCOO_KERNEL_SIZE_DISPATCH_FUNCTION(`all',type,matrix_storage,transposition,symmetry,unrolling,,,mop,citype,diagonal,uplo) 70')dnl 71')dnl 72')dnl 73')dnl 74')dnl 75')dnl 76')dnl 77')dnl 78')dnl 79')dnl 80dnl 81dnl 82popdef(`unrollings')dnl 83dnl 84')dnl 85dnl 86dnl 87dnl 88dnl 89define(`RSB_M4_BCOO_KERNEL_FUNCTION',`dnl 90dnl 91dnl 92pushdef(`want_what',$1)dnl 93pushdef(`mtype',$2)dnl 94pushdef(`matrix_storage',$3)dnl 95pushdef(`transposition',$4)dnl 96pushdef(`symmetry',$5)dnl 97pushdef(`b_rows',$6)dnl block rows 98pushdef(`b_columns',$7)dnl block columns 99pushdef(`itype',`rsb_coo_idx_t ')dnl integer type (for indices) 100pushdef(`unrolling',$8)dnl 101pushdef(`mop',$9)dnl 102pushdef(`citype',$10)dnl 103pushdef(`diagonal',$11)dnl 104pushdef(`uplo',$12)dnl 105dnl 106pushdef(`total_columns',ifelse(RSB_M4_IS_FORMAT_COLUMN_MAJOR(matrix_storage),1,`Mdim',`mdim'))dnl 107pushdef(`total_rows',ifelse(RSB_M4_IS_FORMAT_COLUMN_MAJOR(matrix_storage),1,`mdim',`Mdim'))dnl 108pushdef(`out_dim',ifelse(transposition,RSB_M4_TRANS_N,total_rows,total_columns))dnl 109pushdef(`fid',RSB_M4_KERNEL_DIRECT_DISPATCHER_FUNCTION_NAME(mtype,matrix_storage,transposition,symmetry,b_rows,b_columns,unrolling,mop,citype,diagonal,uplo))dnl 110dnl 111ifelse(RSB_M4_IS_FORMAT_COLUMN_MAJOR(matrix_storage),1,`dnl 112pushdef(`mi',`i')dnl 113pushdef(`Mi',`j')dnl 114')dnl 115ifelse(RSB_M4_IS_FORMAT_ROW_MAJOR(matrix_storage),1,`dnl 116pushdef(`mi',`j')dnl 117pushdef(`Mi',`i')dnl 118')dnl 119dnl 120ifelse(RSB_M4_SAME(transposition,RSB_M4_TRANS_N),1,`dnl 121pushdef(`tmi',mi)dnl 122pushdef(`tMi',Mi)dnl 123')dnl 124ifelse(RSB_M4_NOT(RSB_M4_SAME(transposition,RSB_M4_TRANS_N)),1,`dnl 125pushdef(`tmi',Mi)dnl 126pushdef(`tMi',mi)dnl 127')dnl 128dnl 129ifelse(RSB_M4_IS_SPXX_OP_SCALING_KERNEL_MOP(mop),1,`dnl 130pushdef(`postmult',`(alpha)*')dnl 131',`dnl 132dnl 133ifelse(RSB_M4_IS_SPMX_OP_NEGATING_KERNEL_MOP(mop),1,`dnl 134pushdef(`postmult',`(-1)*')dnl 135',`dnl 136pushdef(`postmult',`')dnl 137')dnl 138dnl 139')dnl 140dnl 141pushdef(`ttransposition',`RSB_M4_TRANSPOSE_TRANSPOSITION(transposition)')dnl 142pushdef(`htransposition',`ifelse(symmetry,RSB_M4_SYMBOL_HERMITIAN,`RSB_M4_H2T_TRANSPOSITION(transposition)',transposition)')dnl 143dnl 144pushdef(`tsymmetry',`ifelse(symmetry,RSB_M4_SYMBOL_HERMITIAN,`RSB_M4_TRANSPOSE_SYMMETRY(symmetry)',symmetry)')dnl 145dnl 146pushdef(`toskipbecauseofsymmetry',`RSB_M4_AND(RSB_M4_IS_SPMX_KERNEL_MOP(mop),RSB_M4_NOT(RSB_M4_IS_COMPLEX_TYPE(mtype)),RSB_M4_IS_NOT_UNSYMMETRIC(symmetry),RSB_M4_NOT(RSB_M4_SAME(transposition,RSB_M4_TRANS_N)))')dnl 147dnl 148dnl 149ifelse(RSB_M4_ARE_KERNEL_GENERATION_PARMS_ALLOWED(want_what,mtype,matrix_storage,transposition,symmetry,unrolling,,,mop,citype,diagonal,uplo),`1',`dnl 150dnl 151ifelse(want_what,`DOC',`dnl 152 /* TODO */ 153')dnl 154ifelse(want_what,`all',`dnl 155dnl 156ifelse(RSB_M4_IS_FORMAT_BCOO(matrix_storage),1,`dnl 157rsb_err_t fid`'dnl 158RSB_M4_BCOO_KERNEL_FUNCTION(`ARGS',mtype,matrix_storage,transposition,symmetry,b_rows,b_columns,unrolling,mop,citype,diagonal,uplo)dnl 159')dnl 160ifdef(`ONLY_WANT_HEADERS',`; 161',` 162dnl /* begin of fid function */ 163RSB_M4_BCOO_KERNEL_FUNCTION(`BODY',mtype,matrix_storage,transposition,symmetry,b_rows,b_columns,unrolling,mop,citype,diagonal,uplo)dnl 164')dnl 165')dnl 166dnl 167ifelse(want_what,`ID',`dnl 168fid`'dnl 169')dnl 170dnl 171ifelse(want_what,`ARGS',`dnl 172RSB_M4_BCOO_KERNEL_SIZE_DISPATCH_FUNCTION(`ARGS',mtype,matrix_storage,transposition,symmetry,unrolling,,,mop,citype,diagonal,uplo)`'dnl 173')dnl 174dnl 175dnl 176ifelse(want_what,`BODY',`dnl 177dnl 178{ 179ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),`0',`dnl 180pushdef(`incx',`1')dnl 181pushdef(`incy',`1')dnl 182')dnl 183RSB_M4_BXXX_KERNEL_FUNCTION_HELP($@) 184dnl 185ifelse(RSB_M4_AND(RSB_M4_IS_SPMX_KERNEL_MOP(mop),RSB_M4_IS_DIAGONAL_IMPLICIT(diagonal)),1,`dnl 186 RSB_M4_FAKE_DIAG_IMPLICIT_MSG 187')dnl 188dnl 189ifelse(toskipbecauseofsymmetry,1,`dnl 190dnl 191 /* Symmetric `transposed' reverts to symmetric `not transposed' */ 192 return RSB_M4_BCOO_KERNEL_FUNCTION(`ID',mtype,matrix_storage,RSB_M4_TRANS_N,symmetry,b_rows,b_columns,unrolling,mop,citype,diagonal,uplo)dnl 193(RSB_M4_ARGS_TO_ACTUAL_ARGS(RSB_M4_BCOO_KERNEL_FUNCTION(`ARGS',mtype,matrix_storage,RSB_M4_TRANS_N,symmetry,b_rows,b_columns,unrolling,mop,citype,diagonal,uplo))); 194dnl 195')dnl 196dnl 197ifelse(toskipbecauseofsymmetry,0,`dnl 198dnl 199dnl the i,j type has to be the same as the arrays one. 200dnl if not, mismatch on the copied bytes will occur. 201ifelse(RSB_M4_AND(RSB_M4_NOT(RSB_M4_IS_RC_BIASED_KERNEL_MOP(mop)),RSB_M4_NOT(RSB_M4_AND(RSB_M4_IS_ACC_WRITING_KERNEL_MOP(mop),RSB_M4_NOT(RSB_M4_IS_NOT_UNSYMMETRIC(symmetry))))),`1',`dnl 202ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),1,`dnl 203 register rsb_coo_idx_t i=0,j=0; 204',`dnl 205 register citype i=0,j=0; 206dnl 20110227 if declaring short indices, we should care about proper conversion 207')dnl 208 const citype *IA=(const citype*)bpntr, *JA=(const citype*)bindx; 209dnl 210',`dnl 211dnl 212ifelse(RSB_M4_SAME(transposition,RSB_M4_TRANS_N),`0',`dnl 213 const citype *JA=(const citype*)bindx; 214 register citype j=0; 215',`dnl 216 const citype *IA=(const citype*)bpntr; 217 register citype i=0; 218')dnl 219')dnl 220dnl ifelse(mop,`scale',`',`dnl 221dnl ')dnl 20121005 shall change this condition when enabling transpose scale as well 222 register rsb_nnz_idx_t n=0; 223ifelse(RSB_M4_IS_OP_SCALING_KERNEL_MOP(mop),`1',`dnl 224 const mtype alpha=*alphap;`'dnl 225')dnl 226ifelse(RSB_M4_IS_SCALING_KERNEL_MOP(mop),`1',`dnl 227 const mtype beta=*betap;`'dnl 228')dnl 229dnl 230ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),`0',`dnl 231 dnl const rsb_coo_idx_t incx=1,incy=1;`' 232')dnl 233dnl 234dnl 235ifelse(RSB_M4_AND(RSB_M4_IS_SPMX_KERNEL_MOP(mop)),1,`dnl 236dnl 237 238dnl 239dnl 240dnl 241ifelse(RSB_M4_IS_NOT_UNSYMMETRIC(symmetry),1,`dnl 242 const mtype *trhs = rhs+incx*(roff-coff);`'// symmetry 243 mtype *tout=out+incy*(coff-roff);`' 244 245')dnl 246dnl 247ifelse(RSB_M4_IS_ZEROING_KERNEL_MOP(mop),1,`dnl 248 rsb__cblas_Xscal(RSB_M4_NUMERICAL_TYPE_PREPROCESSOR_SYMBOL(mtype),out_dim,NULL,out,incy); 249')dnl 250dnl 251ifelse(RSB_M4_IS_SCALING_KERNEL_MOP(mop),1,`dnl 252ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),1,`dnl 253 if(beta!=1)rsb__cblas_Xscal(RSB_M4_NUMERICAL_TYPE_PREPROCESSOR_SYMBOL(mtype),out_dim,&beta,out,ystride); 254',`dnl 255 rsb__cblas_Xscal(RSB_M4_NUMERICAL_TYPE_PREPROCESSOR_SYMBOL(mtype), out_dim,&beta, out, 1); 256')dnl 257')dnl 258dnl 259ifelse(transposition,RSB_M4_TRANS_N,`dnl 260',`dnl 261ifelse(RSB_M4_IS_UNSYMMETRIC(symmetry),1,`dnl 262 rhs=(rhs-coff*(incx))+roff*(incx); 263 out=(out-roff*(incy))+coff*(incy); 264')dnl 265')dnl 266dnl 267ifelse(RSB_M4_IS_NOT_UNSYMMETRIC(symmetry),1,`dnl 268 if(roff==coff) 269')dnl 270dnl 271dnl 272ifelse(RSB_M4_IS_UNSYMMETRIC(symmetry),1,`dnl 273dnl 274ifelse(1,1,`dnl 275dnl 276dnl RSB_M4_SIMPLE_LOOP_UNROLL.. 277 RSB_M4_SIMPLE_LOOP_UNROLL_5S(`n',`LI',`0',`nnz',`dnl 278',`dnl 279 i=IA[n+LI]; j=JA[n+LI]; 280 out[tMi*incy]+=`'postmult`'RSB_M4_CONJ(VA[n+LI],mtype,transposition,RSB_M4_SYMBOL_UNSYMMETRIC)*rhs[tmi*incx]; 281dnl 282',`',`',`RSB_M4_EARLY_EVICT_INSTRUCTION((IA+n,JA+n,VA+n))`'dnl 283',RSB_M4_SIMPLE_LOOP_UNROLL_DEFAULT_FACTOR_SMALL) 284dnl 285',`dnl 286dnl 287RSB_M4_SIMPLE_LOOP_UNROLL_5S(`n',`LI',`0',`nnz',`dnl 288dnl 289dnl 290',`dnl 291dnl 292 `const rsb_coo_idx_t' `i_'``''LI`'=IA[n+LI]; 293 `const rsb_coo_idx_t' `j_'``''LI`'=JA[n+LI]; 294 `const mtype b_'``''LI`'=rhs[tmi``_''LI`'*incx]; 295 `const mtype a_'``''LI`'=VA[n+LI]; 296dnl 297',`dnl 298 if(tMi``_''0`'== tMi``_''eval(RSB_M4_SIMPLE_LOOP_UNROLL_DEFAULT_FACTOR_MEDIUM-1)`') 299 { 300 mtype cacc = RSB_M4_ZERO(mtype); 301forloop(`_LI_',0,decr(RSB_M4_SIMPLE_LOOP_UNROLL_DEFAULT_FACTOR_MEDIUM),`dnl 302 cacc+=`'postmult`'RSB_M4_CONJ(`a_'``''_LI_,mtype,transposition,RSB_M4_SYMBOL_UNSYMMETRIC)`*b_'``''_LI_; 303')dnl 304 out[tMi``_''0`'*incy]+=cacc; 305`'dnl 306 } 307 else 308 { 309',`dnl 310 out[tMi``_''LI`'*incy]+=`'postmult`RSB_M4_CONJ(a``_''``''LI`',mtype,transposition,RSB_M4_SYMBOL_UNSYMMETRIC)'`*b_'``''LI; 311',`dnl 312 } 313',RSB_M4_SIMPLE_LOOP_UNROLL_DEFAULT_FACTOR_MEDIUM) 314dnl 315')dnl 316dnl 317',`dnl 318dnl 319 for(n=0;RSB_LIKELY(n<nnz);++n) 320 { 321 i=IA[n]; 322 j=JA[n]; 323dnl assert(i< Mdim); 324dnl assert(j< mdim); 325 out[tMi*incy]+=`'postmult`'RSB_M4_UIM_CONJ(VA[n],mtype,transposition,symmetry)*rhs[tmi*incx]; 326dnl 327ifelse(RSB_M4_IS_NOT_UNSYMMETRIC(symmetry),1,`dnl 328 if(RSB_LIKELY(tMi!=tmi)) 329 out[tmi*incy]+=`'postmult`'RSB_M4_CIM_CONJ(VA[n],mtype,transposition,symmetry)*rhs[tMi*incx]; 330')dnl 331dnl 332 } 333dnl 334')dnl 335dnl 336ifelse(RSB_M4_IS_NOT_UNSYMMETRIC(symmetry),1,`dnl 337 if(roff!=coff) 338 RSB_M4_SIMPLE_LOOP_UNROLL(`n',`LI',`0',`nnz',`dnl 339 i=IA[n+LI]; 340 j=JA[n+LI]; 341dnl assert(i< Mdim); 342dnl assert(j< mdim); 343ifelse(transposition,RSB_M4_TRANS_N,`dnl 344 out[Mi*incy]+=`'postmult`'RSB_M4_UIM_CONJ(VA[n+LI],mtype,transposition,symmetry)*rhs[mi*incx]; 345 tout[mi*incy]+=`'postmult`'RSB_M4_CIM_CONJ(VA[n+LI],mtype,transposition,symmetry)*trhs[Mi*incx]; 346',`dnl 347 tout[tMi*incy]+=`'postmult`'RSB_M4_UIM_CONJ(VA[n+LI],mtype,transposition,symmetry)*trhs[tmi*incx]; 348 out[tmi*incy]+=`'postmult`'RSB_M4_CIM_CONJ(VA[n+LI],mtype,transposition,symmetry)*rhs[tMi*incx]; 349')dnl 350dnl 351 ',RSB_M4_SIMPLE_LOOP_UNROLL_DEFAULT_FACTOR_SMALL) 352')dnl 353dnl 354 return RSB_ERR_NO_ERROR; 355')dnl 356')dnl 357dnl 358ifelse(RSB_M4_AND(RSB_M4_IS_SPSX_KERNEL_MOP(mop)),1,`dnl 359dnl 360dnl FIXME: and roff and coff ? 361dnl 362dnl 363pushdef(`is_an_externally_backward_kernel',`RSB_M4_AND(RSB_M4_IS_SPSX_KERNEL_MOP(mop),RSB_M4_XOR(RSB_M4_NOT(RSB_M4_SAME(transposition,RSB_M4_TRANS_N)),RSB_M4_SAME(uplo,`u')))')dnl 364pushdef(`is_vector_updating_spsv',RSB_M4_NOT(RSB_M4_SAME(transposition,RSB_M4_TRANS_N)))dnl 365dnl 366 rsb_coo_idx_t ii; 367ifelse(is_an_externally_backward_kernel,1,` 368 for(n=nnz-1,ii=Mdim-1;RSB_LIKELY(ii+1>0) ;--ii) 369',`dnl 370 for(n=0,ii=0;RSB_LIKELY(ii<Mdim);++ii) 371')dnl 372 { 373 mtype ax; 374ifelse(is_vector_updating_spsv,1,`dnl 375ifelse(RSB_M4_IS_DIAGONAL_IMPLICIT(diagonal),1,`dnl 376dnl .. 377',`dnl 378dnl const mtype aa; 379 mtype aa; 380ifelse(RSB_M4_WANT_SPSM_DIAG_CHECK,1,`dnl 381 if(n>=nnz)return RSB_ERR_INVALID_NUMERICAL_DATA; 382')dnl 383 aa=VA[n]; 384ifelse(RSB_M4_WANT_SPSM_DIAG_CHECK,1,`dnl 385 if(VA[n]==RSB_M4_ZERO(mtype))return RSB_ERR_INVALID_NUMERICAL_DATA; 386')dnl 387ifelse(is_an_externally_backward_kernel,1,` 388 n--; 389',`dnl 390 n++; 391')dnl 392 out[ii*incy]/=aa; 393')dnl 394 ax=out[ii*incy]; 395',`dnl 396 ax=0; 397')dnl 398ifelse(is_an_externally_backward_kernel,1,` 399 for(;RSB_LIKELY(n+1>0);--n) 400',`dnl 401 for(;RSB_LIKELY(n<nnz);++n) 402')dnl 403 { 404 i=IA[n]; 405 j=JA[n]; 406ifelse(is_vector_updating_spsv,1,`dnl 407 if(RSB_UNLIKELY(!(i==ii ))) 408',`dnl 409 if(RSB_UNLIKELY(!(i==ii && j!=i))) 410')dnl 411 break; 412ifelse(is_vector_updating_spsv,1,`dnl 413 out[j*incy]-=RSB_M4_CONJ(VA[n],mtype,transposition,symmetry)*ax; 414',`dnl 415 ax += RSB_M4_CONJ(VA[n],mtype,transposition,symmetry)*out[j*incy]; 416')dnl 417 } 418 419ifelse(RSB_M4_IS_DIAGONAL_IMPLICIT(diagonal),1,`dnl 420ifelse(is_vector_updating_spsv,1,`dnl 421 out[ii*incy]=(`'postmult`'out[ii*incy]); 422',`dnl 423 out[ii*incy]=(`'postmult`'out[ii*incy]-ax); 424')dnl 425',`dnl 426dnl 427dnl FIXME: goto err is illegal for nnz=0 ... 428dnl 429dnl if(!(i==ii && i==j)) 430dnl goto err; 431ifelse(is_vector_updating_spsv,1,`dnl 432 out[ii*incy]=(`'postmult`'out[ii*incy]); 433',`dnl 434ifelse(RSB_M4_WANT_SPSM_DIAG_CHECK,1,`dnl 435 if(n==nnz || VA[n]==RSB_M4_ZERO(mtype))return RSB_ERR_INVALID_NUMERICAL_DATA; 436')dnl 437 out[ii*incy]=(`'postmult`'out[ii*incy]-ax)/VA[n]; 438ifelse(is_an_externally_backward_kernel,1,`dnl 439 --n; 440',`dnl 441 ++n; 442')dnl 443')dnl 444')dnl 445 } 446 return RSB_ERR_NO_ERROR; 447dnl err: 448dnl return RSB_ERR_BADARGS; 449dnl 450popdef(`is_an_externally_backward_kernel')dnl 451popdef(`is_vector_updating_spsv')dnl 452dnl 453')dnl 454dnl 455dnl ifelse(RSB_M4_NOT(RSB_M4_IS_SPXX_TWO_VECTORS_OPERATING_KERNEL_MOP(mop)),1,`dnl 456dnl return RSB_ERR_UNIMPLEMENTED_YET; 457dnl ')dnl 458dnl 459ifelse(mop,`scale',`dnl 460 for(n=0;RSB_LIKELY(n<nnz);++n) 461 { 462 dnl 463dnl FIXME: what about hermitian ? 464dnl 465ifelse(RSB_M4_SAME(transposition,RSB_M4_TRANS_N),1,`dnl 466 i=IA[n]; 467 VA[n]*=scale_factors[i]; 468',`dnl 469 j=JA[n]; 470dnl i=IA[n]; 471dnl VA[n]*=scale_factors[i]; 472 VA[n]*=scale_factors[j]; 473')dnl 474dnl 475 } 476 return RSB_ERR_NO_ERROR; 477')dnl 478dnl 479ifelse(RSB_M4_IS_ACC_WRITING_KERNEL_MOP(mop),`1',`dnl 480 dnl 481 dnl TODO: do we need vector blank ? 482 dnl 483 for(n=0;RSB_LIKELY(n<nnz);++n) 484 { 485dnl 486ifelse(RSB_M4_IS_UNSYMMETRIC(symmetry),1,`dnl 487dnl 488ifelse(RSB_M4_SAME(transposition,RSB_M4_TRANS_N),1,`dnl 489 i=IA[n]; 490ifelse(mop,`infty_norm',`dnl 491 row_sums[roff+i]+=RSB_M4_ABS(mtype,VA[n]); 492')dnl 493ifelse(mop,`rowssums',`dnl 494 row_sums[roff+i]+=VA[n]; 495')dnl 496',`dnl 497 j=JA[n]; 498ifelse(mop,`infty_norm',`dnl 499 row_sums[coff+j]+=RSB_M4_ABS(mtype,VA[n]); 500')dnl 501ifelse(mop,`rowssums',`dnl 502 row_sums[coff+j]+=VA[n]; 503')dnl 504')dnl 505')dnl 506dnl 507dnl 508ifelse(RSB_M4_IS_UNSYMMETRIC(symmetry),0,`dnl 509dnl 510 i=IA[n]; 511 j=JA[n]; 512dnl 513ifelse(mop,`infty_norm',`dnl 514 row_sums[roff+i]+=RSB_M4_ABS(mtype,VA[n]); 515')dnl 516ifelse(mop,`rowssums',`dnl 517 row_sums[roff+i]+=VA[n]; 518')dnl 519 if( roff+i != coff+j ) 520ifelse(mop,`infty_norm',`dnl 521 row_sums[coff+j]+=RSB_M4_ABS(mtype,VA[n]); 522')dnl 523ifelse(mop,`rowssums',`dnl 524 row_sums[coff+j]+=VA[n]; 525')dnl 526')dnl 527dnl 528 } 529 return RSB_ERR_NO_ERROR; 530')dnl 531dnl 532ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),`0',`dnl 533popdef(`incx')dnl 534popdef(`incy')dnl 535')dnl 536dnl 537} 538dnl } /* end of fid function */ 539dnl 540')dnl 541dnl 542')dnl 543dnl 544 545popdef(`toskipbecauseofsymmetry')dnl 546popdef(`htransposition')dnl 547popdef(`ttransposition')dnl 548popdef(`tsymmetry')dnl 549popdef(`postmult')dnl 550popdef(`tmi')dnl 551popdef(`tMi')dnl 552popdef(`mi')dnl 553popdef(`Mi')dnl 554popdef(`total_columns')dnl 555popdef(`total_rows')dnl 556popdef(`out_dim')dnl 557popdef(`fid')dnl 558dnl 559popdef(`uplo')dnl 560popdef(`diagonal')dnl 561popdef(`citype')dnl 562popdef(`mop')dnl 563popdef(`unrolling')dnl 564popdef(`itype')dnl 565popdef(`b_columns')dnl 566popdef(`b_rows')dnl 567popdef(`symmetry')dnl 568popdef(`transposition')dnl 569popdef(`matrix_storage')dnl 570popdef(`mtype')dnl 571popdef(`want_what')dnl 572')dnl 573dnl 574dnl 575define(`RSB_M4_BCOO_KERNEL_SIZE_DISPATCH_FUNCTION',`dnl 576dnl 577pushdef(`want_what',$1)dnl 578pushdef(`mtype',$2)dnl 579pushdef(`matrix_storage',$3)dnl 580pushdef(`transposition',$4)dnl 581pushdef(`symmetry',$5)dnl 582pushdef(`unrolling',$6)dnl 583dnl pushdef(`b_rows',$7)dnl block rows 584dnl pushdef(`b_columns',$8)dnl block columns 585pushdef(`itype',`rsb_coo_idx_t ')dnl integer type (for indices) 586pushdef(`mop',`$9')dnl 587pushdef(`citype',`$10')dnl 588pushdef(`diagonal',`$11')dnl 589pushdef(`uplo',$12)dnl 590dnl 591dnl 592dnl 593ifelse(RSB_M4_ARE_KERNEL_GENERATION_PARMS_ALLOWED(want_what,mtype,matrix_storage,transposition,symmetry,unrolling,,,mop,citype,diagonal,uplo),`1',`dnl 594dnl 595ifelse(want_what,`DOC',`dnl 596 /* TODO */ 597')dnl 598dnl 599ifelse(want_what,`all',`dnl 600dnl `/* This code is intended for a block compressed sparse stripe matrix. */' 601ifdef(`ONLY_WANT_HEADERS',`dnl 602RSB_M4_BCOO_KERNEL_SIZE_DISPATCH_FUNCTION(`function_declaration',mtype,matrix_storage,transposition,symmetry,unrolling,,,mop,citype,diagonal,uplo) 603',`dnl 604RSB_M4_BCOO_KERNEL_SIZE_DISPATCH_FUNCTION(`function_definition',mtype,matrix_storage,transposition,symmetry,unrolling,,,mop,citype,diagonal,uplo) 605')dnl 606dnl 607dnl 608dnl 609')dnl 610dnl 611ifelse(want_what,`function_definition',`dnl 612rsb_err_t RSB_M4_KERNEL_SIZE_DISPATCH_FUNCTION_NAME(mtype,matrix_storage,transposition,symmetry,unrolling,mop,citype,diagonal,uplo)dnl 613RSB_M4_BCOO_KERNEL_SIZE_DISPATCH_FUNCTION(`ARGS',mtype,matrix_storage,transposition,symmetry,unrolling,,,mop,citype,diagonal,uplo) 614RSB_M4_BCOO_KERNEL_SIZE_DISPATCH_FUNCTION(`BODY',mtype,matrix_storage,transposition,symmetry,unrolling,,,mop,citype,diagonal,uplo) 615')dnl 616dnl 617ifelse(want_what,`function_declaration',`dnl 618rsb_err_t RSB_M4_KERNEL_SIZE_DISPATCH_FUNCTION_NAME(mtype,matrix_storage,transposition,symmetry,unrolling,mop,citype,diagonal,uplo)dnl 619RSB_M4_BCOO_KERNEL_SIZE_DISPATCH_FUNCTION(`ARGS',mtype,matrix_storage,transposition,symmetry,unrolling,,,mop,citype,diagonal,uplo);dnl 620')dnl 621dnl 622ifelse(want_what,`ARGS',`dnl 623dnl 624dnl 625pushdef(`matrix_structs',`const itype Mdim,const itype mdim,const citype * RSB_M4_RESTRICT bindx,const rsb_nnz_idx_t * RSB_M4_RESTRICT bpntr,const rsb_nnz_idx_t *RSB_M4_RESTRICT indptr,const rsb_coo_idx_t * RSB_M4_RESTRICT rpntr,const rsb_coo_idx_t * RSB_M4_RESTRICT cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const rsb_nnz_idx_t nnz')dnl 626(`'dnl 627ifelse(RSB_M4_IS_SPXX_TWO_VECTORS_OPERATING_KERNEL_MOP(mop),1,`dnl 628dnl 629dnl no restrict on aliasing ops 630dnl 631ifelse(RSB_M4_IS_ALLOWING_ALIASING_KERNEL_MOP(mop),1,`dnl 632const mtype * RSB_M4_RESTRICT VA, const mtype * rhs, mtype * out, matrix_structs`'dnl 633',`dnl 634const mtype * RSB_M4_RESTRICT VA, const mtype * RSB_M4_RESTRICT rhs, mtype * RSB_M4_RESTRICT out, matrix_structs`'dnl 635')dnl 636')dnl 637ifelse(RSB_M4_IS_OP_SCALING_KERNEL_MOP(mop),`1',`dnl 638,const mtype * RSB_M4_RESTRICT alphap`'dnl 639')dnl 640ifelse(RSB_M4_IS_SCALING_KERNEL_MOP(mop),`1',`dnl 641,const mtype * RSB_M4_RESTRICT betap`'dnl 642')dnl 643ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),`1',`dnl 644,rsb_coo_idx_t incx, rsb_coo_idx_t incy`'dnl 645')dnl 646ifelse(mop,`spmm_az',`dnl 647dnl 648dnl FIXME 649dnl 650const itype bstride, const itype cstride, const itype nrhs`'dnl 651')dnl 652ifelse(mop,`scale',`dnl 653mtype * VA, matrix_structs, const mtype *scale_factors`'dnl 654')dnl 655ifelse(RSB_M4_IS_ACC_WRITING_KERNEL_MOP(mop),`1',`dnl 656const mtype * VA, mtype * row_sums, matrix_structs`'dnl 657')dnl 658ifelse(mop,`negation',`dnl 659mtype * VA, matrix_structs`'dnl 660')dnl 661)dnl 662dnl 663')dnl 664dnl 665dnl 666ifelse(want_what,`BODY',`dnl 667dnl 668dnl 669{ 670 RSB_M4_DEBUGINFO(``$0'')dnl 671dnl /*! \ingroup rsb_doc_kernels 672 /* 673 * This function will dispatch the specialized looped kernel function for 674 * performing the desired matrix operation ("mop") for the current fixed 675 * block size. 676 * 677 * \return \rsb_errval_inp_param_msg 678ifelse(RSB_M4_IS_FORMAT_BCOO(matrix_storage),1,`dnl 679 * 680 * Since this is strictly blocked code, you should allow the rhs and the out 681 * vector to accept a small overflow not bigger, respectively, than 682 * mod(blockrows-mod(matrixrows,blockrows),blockrows) 683 * and 684 * mod(blockcols-mod(matrixcols,blockcols),blockcols) 685dnl * 686dnl * Note: We assume this quantity is the same for each block. 687dnl * 688dnl * WARNING : EXPERIMENTAL FUNCTION 689dnl * for block bigger than ~12x12 it seems that inline matrix multiplication code slows down the whole thing 690')dnl 691 */ 692 register rsb_coo_idx_t columns,rows; 693 rsb_err_t errval = RSB_ERR_NO_ERROR; 694 if(cpntr && rpntr) 695 { 696 columns=cpntr[1]-cpntr[0]; 697 rows =rpntr[1]-rpntr[0]; 698 } 699 else 700dnl #if RSB_EXPERIMENTAL_WANT_PURE_BCOO 701ifelse(RSB_M4_WANT_20110206_BOUNDED_BOX_PATCH,1,`dnl 702dnl 20110206 set the following 703 columns = rows=1; /* experimental, for the bounded box patch */ 704',`dnl 705dnl 20110206 and commented the following 706 columns=bc,rows=br; 707')dnl 708dnl #else 709dnl columns = rows=1; 710dnl #endif 711 712ifelse(RSB_M4_IS_FORMAT_BCOO(matrix_storage),1,`dnl 713pushdef(`args',`RSB_M4_ARGS_TO_ACTUAL_ARGS(RSB_M4_BCOO_KERNEL_SIZE_DISPATCH_FUNCTION(`ARGS',mtype,matrix_storage,transposition,symmetry,unrolling,,,mop,citype,diagonal,uplo))')dnl 714switch(rows) 715{ 716foreach(`rowsu',RSB_M4_ROWS_UNROLL,`dnl 717 case rowsu: 718 {switch(columns) 719 { 720foreach(`colsu',RSB_M4_COLUMNS_UNROLL,`dnl 721 case colsu:/* rowsu colsu matrix_storage */ 722 errval = RSB_M4_BCOO_KERNEL_FUNCTION(`ID',mtype,matrix_storage,transposition,symmetry,rowsu,colsu,unrolling,mop,citype,diagonal,uplo)( args ); 723 break; 724')dnl 725 default: 726#ifdef RSB_WANT_LOOPING_KERNELS 727 errval = RSB_M4_BCOO_KERNEL_FUNCTION(`ID',mtype,matrix_storage,transposition,symmetry,rowsu,RSB_M4_COLUMNS_FALLBACK_UNROLL,`l',mop,citype,diagonal,uplo)( args ); 728#else /* RSB_WANT_LOOPING_KERNELS */ 729 errval = RSB_ERR_UNSUPPORTED_OPERATION; 730#endif /* RSB_WANT_LOOPING_KERNELS */ 731 }} 732 break; 733')dnl 734 default: 735#ifdef RSB_WANT_LOOPING_KERNELS 736 errval = RSB_M4_BCOO_KERNEL_FUNCTION(`ID',mtype,matrix_storage,transposition,symmetry,RSB_M4_ROWS_FALLBACK_UNROLL,RSB_M4_COLUMNS_FALLBACK_UNROLL,`l',mop,citype,diagonal,uplo)( args ); 737#else /* RSB_WANT_LOOPING_KERNELS */ 738 errval = RSB_ERR_UNSUPPORTED_OPERATION; 739#endif /* RSB_WANT_LOOPING_KERNELS */ 740}; 741popdef(`args')dnl 742')dnl 743 dnl errval = RSB_ERR_UNSUPPORTED_TYPE; 744 return errval; 745} 746dnl 747')dnl 748dnl 749')dnl 750dnl 751popdef(`citype')dnl 752popdef(`mop')dnl 753popdef(`matrix_storage')dnl 754dnl popdef(`b_rows')dnl 755dnl popdef(`b_columns')dnl 756popdef(`transposition')dnl 757popdef(`symmetry')dnl 758popdef(`mtype')dnl 759popdef(`itype')dnl 760popdef(`unrolling')dnl 761popdef(`diagonal')dnl 762popdef(`want_what')dnl 763popdef(`uplo')dnl 764')dnl 765dnl 766dnl 767dnl 768