dnl
dnl @author: Michele Martone
dnl
dnl
dnl
dnl	RSB_M4_ARE_KERNEL_GENERATION_PARMS_ALLOWED(want_what,mtype,matrix_storage,transposition,k_symmetry,unrolling,b_rows,b_columns,mop,citype,k_diagonal,uplo)
dnl	-------------------------------------------------------------------
dnl	Expands to 1 if kernel code may be generated for this parameter
dnl	combination, and to 0 otherwise.
dnl	The check (the two RSB_M4_IMPLY terms below) enforces that the
dnl	triangular solve ("SPSX") operations get a triangular operand
dnl	(uplo different from `g'), and that every other operation gets a
dnl	general one (uplo equal to `g').
define(`RSB_M4_ARE_KERNEL_GENERATION_PARMS_ALLOWED',`dnl
dnl
pushdef(`want_what',$1)dnl
pushdef(`mtype',$2)dnl
pushdef(`matrix_storage',$3)dnl
pushdef(`transposition',$4)dnl
pushdef(`k_symmetry',$5)dnl
pushdef(`unrolling',$6)dnl
pushdef(`b_rows',$7)dnl block rows
pushdef(`b_columns',$8)dnl block columns
pushdef(`itype',`rsb_coo_idx_t ')dnl integer type (for indices)
pushdef(`mop',`$9')dnl
pushdef(`citype',`$10')dnl
pushdef(`k_diagonal',`$11')dnl
pushdef(`uplo',$12)dnl
dnl
dnl	Emit the verdict: the AND of the two implications described above.
RSB_M4_AND(dnl
RSB_M4_IMPLY(RSB_M4_IS_SPSX_KERNEL_MOP(mop),RSB_M4_NOT(RSB_M4_SAME(uplo,`g'))),dnl
RSB_M4_IMPLY(RSB_M4_NOT(RSB_M4_IS_SPSX_KERNEL_MOP(mop)),RSB_M4_SAME(uplo,`g')),dnl
1)`'dnl
dnl
dnl
popdef(`citype')dnl
popdef(`mop')dnl
popdef(`matrix_storage')dnl
popdef(`b_rows')dnl
popdef(`b_columns')dnl
popdef(`transposition')dnl
popdef(`k_symmetry')dnl
popdef(`mtype')dnl
popdef(`itype')dnl
popdef(`unrolling')dnl
popdef(`k_diagonal')dnl
popdef(`want_what')dnl
popdef(`uplo')dnl
')dnl
dnl
dnl
dnl
dnl
dnl These functions dispatch on the column size, calling the
dnl proper kernels.
dnl
dnl They assume type dispatching has just been performed.
dnl
dnl
dnl RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(want_what,mtype,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo)
dnl -----------------------------------------------------------------------------------------------------------------------------------
dnl
dnl	RSB_M4_BCXX_KERNEL_SIZE_DISPATCH_FUNCTION:
dnl	Thin format dispatcher: forwards all of its arguments ($@) to the
dnl	BCOO or to the BCSS size-dispatch generator, according to the
dnl	matrix_storage argument.
define(`RSB_M4_BCXX_KERNEL_SIZE_DISPATCH_FUNCTION',`dnl
dnl
dnl
ifelse(RSB_M4_IS_FORMAT_BCOO(matrix_storage),`1',`dnl
dnl
RSB_M4_BCOO_KERNEL_SIZE_DISPATCH_FUNCTION($@)`'dnl
dnl
')dnl
dnl
dnl
ifelse(RSB_M4_IS_FORMAT_BCSS(matrix_storage),`1',`dnl
dnl
RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION($@)`'dnl
dnl
')dnl
dnl
dnl
dnl
')dnl
dnl
dnl
dnl
dnl	Generator for the BCSS size dispatch function.  The first argument
dnl	(want_what) selects what is emitted: `all' (declaration or
dnl	definition, depending on ONLY_WANT_HEADERS), `function_declaration',
dnl	`function_definition', `ARGS' (the C argument list), or `BODY' (the
dnl	C function body, which switches on the run time block size and calls
dnl	the matching fixed-size kernel, falling back on the l-unrolled
dnl	kernels when RSB_WANT_LOOPING_KERNELS is defined).
dnl	Nothing is emitted when RSB_M4_ARE_KERNEL_GENERATION_PARMS_ALLOWED
dnl	rejects the parameter combination.
define(`RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION',`dnl
dnl
pushdef(`want_what',$1)dnl
pushdef(`mtype',$2)dnl
pushdef(`matrix_storage',$3)dnl
pushdef(`transposition',$4)dnl
pushdef(`k_symmetry',$5)dnl
pushdef(`unrolling',$6)dnl
dnl pushdef(`b_rows',$7)dnl block rows
dnl pushdef(`b_columns',$8)dnl block columns
pushdef(`itype',`rsb_coo_idx_t ')dnl integer type (for indices)
pushdef(`mop',`$9')dnl
pushdef(`citype',`$10')dnl
pushdef(`k_diagonal',`$11')dnl
pushdef(`uplo',$12)dnl
dnl
ifelse(RSB_M4_ARE_KERNEL_GENERATION_PARMS_ALLOWED(want_what,mtype,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo),`1',`dnl
dnl
ifelse(want_what,`DOC',`dnl
	/* TODO */
')dnl
dnl
ifelse(want_what,`all',`dnl
dnl `/* This code is intended for a block compressed sparse stripe matrix. */'
ifdef(`ONLY_WANT_HEADERS',`dnl
RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(`function_declaration',mtype,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo)
',`dnl
RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(`function_definition',mtype,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo)
')dnl
dnl
dnl
dnl
')dnl
dnl
ifelse(want_what,`function_definition',`dnl
rsb_err_t RSB_M4_KERNEL_SIZE_DISPATCH_FUNCTION_NAME(mtype,matrix_storage,transposition,k_symmetry,unrolling,mop,citype,k_diagonal,uplo)dnl
RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(`ARGS',mtype,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo)
RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(`BODY',mtype,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo)
')dnl
dnl
ifelse(want_what,`function_declaration',`dnl
rsb_err_t RSB_M4_KERNEL_SIZE_DISPATCH_FUNCTION_NAME(mtype,matrix_storage,transposition,k_symmetry,unrolling,mop,citype,k_diagonal,uplo)dnl
RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(`ARGS',mtype,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo);dnl
')dnl
dnl
ifelse(want_what,`ARGS',`dnl
dnl
dnl	The C argument list: the common matrix descriptor fields first
dnl	(matrix_structs), then the mop-specific operands below.
pushdef(`matrix_structs',`const itype Mdim,const itype mdim,const citype * RSB_M4_RESTRICT bindx,const rsb_nnz_idx_t * RSB_M4_RESTRICT bpntr,const rsb_nnz_idx_t *RSB_M4_RESTRICT indptr,const rsb_coo_idx_t * RSB_M4_RESTRICT rpntr,const rsb_coo_idx_t * RSB_M4_RESTRICT cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags')dnl
(`'dnl
ifelse(RSB_M4_IS_SPXX_TWO_VECTORS_OPERATING_KERNEL_MOP(mop),1,`dnl
dnl
dnl no restrict on aliasing ops
dnl
ifelse(RSB_M4_IS_ALLOWING_ALIASING_KERNEL_MOP(mop),1,`dnl
const mtype * RSB_M4_RESTRICT VA, const mtype * rhs, mtype * out, matrix_structs`'dnl
',`dnl
const mtype * RSB_M4_RESTRICT VA, const mtype * RSB_M4_RESTRICT rhs, mtype * RSB_M4_RESTRICT out, matrix_structs`'dnl
')dnl
')dnl
ifelse(RSB_M4_IS_OP_SCALING_KERNEL_MOP(mop),`1',`dnl
,const mtype * RSB_M4_RESTRICT alphap`'dnl
')dnl
ifelse(RSB_M4_IS_SCALING_KERNEL_MOP(mop),`1',`dnl
,const mtype * RSB_M4_RESTRICT betap`'dnl
')dnl
ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),`1',`dnl
,rsb_coo_idx_t incx, rsb_coo_idx_t incy`'dnl
')dnl
ifelse(mop,`spmm_az',`dnl
dnl
dnl FIXME
dnl
const itype bstride, const itype cstride, const itype nrhs`'dnl
')dnl
ifelse(mop,`scale',`dnl
mtype * VA, matrix_structs, const mtype *scale_factors`'dnl
')dnl
ifelse(RSB_M4_IS_ACC_WRITING_KERNEL_MOP(mop),`1',`dnl
const mtype * VA, mtype * row_sums, matrix_structs`'dnl
')dnl
ifelse(mop,`negation',`dnl
mtype * VA, matrix_structs`'dnl
')dnl
)dnl
dnl
')dnl
dnl
dnl
ifelse(want_what,`BODY',`dnl
dnl
dnl
{
	RSB_M4_DEBUGINFO(``$0'')dnl
dnl /*! \ingroup rsb_doc_kernels
	/*
	 * This function will dispatch the specialized looped kernel function for
	 * performing the desired matrix operation ("mop") for the current fixed
	 * block size.
	 *
	 * \return \rsb_errval_inp_param_msg
ifelse(RSB_M4_IS_FORMAT_BCSS(matrix_storage),1,`dnl
	 *
	 * Since this is strictly blocked code, you should allow the rhs and the out
	 * vector to accept a small overflow not bigger, respectively, than
	 *       mod(blockrows-mod(matrixrows,blockrows),blockrows)
	 * and
	 *       mod(blockcols-mod(matrixcols,blockcols),blockcols)
dnl	 *
dnl	 * Note: We assume this quantity is the same for each block.
dnl	 *
dnl	 * WARNING : EXPERIMENTAL FUNCTION
dnl	 * for block bigger than ~12x12 it seems that inline matrix multiplication code slows down the whole thing
')dnl
	 */
	rsb_err_t errval = RSB_ERR_NO_ERROR;

ifelse(RSB_M4_IS_FORMAT_BCSS(matrix_storage),1,`dnl
pushdef(`args',`RSB_M4_ARGS_TO_ACTUAL_ARGS(RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(`ARGS',mtype,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo))')dnl

dnl	Determine the run time block size from rpntr/cpntr when available.
	register rsb_coo_idx_t columns,rows;
	if(cpntr && rpntr)
	{
		columns=cpntr[1]-cpntr[0];
		rows =rpntr[1]-rpntr[0];
	}
	else
dnl #if RSB_EXPERIMENTAL_WANT_PURE_BCSS
ifelse(RSB_M4_WANT_20110206_BOUNDED_BOX_PATCH,1,`dnl
dnl 20110206 set the following
		columns = rows=1; /* experimental, for the bounded box patch */
',`dnl
dnl 20110206 and commented the following
		columns=bc,rows=br;
')dnl
dnl #else
dnl	columns = rows=1;
dnl #endif

dnl	Nested switch on (rows,columns): each case calls the kernel emitted
dnl	for that fixed unroll; the default cases fall back on the looping
dnl	kernels, when RSB_WANT_LOOPING_KERNELS is compiled in.
switch(rows)
{
foreach(`rowsu',RSB_M4_ROWS_UNROLL,`dnl
	case rowsu:
	{switch(columns)
	{
foreach(`colsu',RSB_M4_COLUMNS_UNROLL,`dnl
	case colsu:/* rowsu colsu matrix_storage */
		errval = RSB_M4_BCSS_KERNEL_FUNCTION(`ID',mtype,matrix_storage,transposition,k_symmetry,rowsu,colsu,unrolling,mop,citype,k_diagonal,uplo)( args );
	break;
')dnl
	default:
#ifdef RSB_WANT_LOOPING_KERNELS
		errval = RSB_M4_BCSS_KERNEL_FUNCTION(`ID',mtype,matrix_storage,transposition,k_symmetry,rowsu,RSB_M4_COLUMNS_FALLBACK_UNROLL,`l',mop,citype,k_diagonal,uplo)( args );
#else /* RSB_WANT_LOOPING_KERNELS */
		errval = RSB_ERR_UNSUPPORTED_OPERATION;
#endif /* RSB_WANT_LOOPING_KERNELS */
	}}
	break;
')dnl
	default:
#ifdef RSB_WANT_LOOPING_KERNELS
		errval = RSB_M4_BCSS_KERNEL_FUNCTION(`ID',mtype,matrix_storage,transposition,k_symmetry,RSB_M4_ROWS_FALLBACK_UNROLL,RSB_M4_COLUMNS_FALLBACK_UNROLL,`l',mop,citype,k_diagonal,uplo)( args );
#else /* RSB_WANT_LOOPING_KERNELS */
		errval = RSB_ERR_UNSUPPORTED_OPERATION;
#endif /* RSB_WANT_LOOPING_KERNELS */
};
popdef(`args')dnl
')dnl
	dnl errval = RSB_ERR_UNSUPPORTED_TYPE;
	return errval;
}
dnl
')dnl
dnl
')dnl
dnl
popdef(`citype')dnl
popdef(`mop')dnl
popdef(`matrix_storage')dnl
dnl popdef(`b_rows')dnl
dnl popdef(`b_columns')dnl
popdef(`transposition')dnl
popdef(`k_symmetry')dnl
popdef(`mtype')dnl
popdef(`itype')dnl
popdef(`unrolling')dnl
popdef(`k_diagonal')dnl
popdef(`want_what')dnl
popdef(`uplo')dnl
')dnl
dnl
dnl
dnl
dnl
dnl These functions will perform their operations on fixed block matrices.
dnl
dnl	RSB_M4_BXXX_KERNEL_FUNCTION_HAS_IMPLEMENTATION:
dnl	Expands to `1' when a kernel implementation exists for the given
dnl	parameter combination, and to `0' otherwise: each enabled ifelse
dnl	line below emits a non empty string (`no') to veto generation.
define(`RSB_M4_BXXX_KERNEL_FUNCTION_HAS_IMPLEMENTATION',`dnl
dnl
dnl
pushdef(`want_what',$1)dnl
pushdef(`mtype',$2)dnl
pushdef(`matrix_storage',$3)dnl
pushdef(`transposition',$4)dnl
pushdef(`k_symmetry',$5)dnl
pushdef(`b_rows',$6)dnl block rows
pushdef(`b_columns',$7)dnl block columns
pushdef(`itype',`rsb_coo_idx_t ')dnl integer type (for indices)
pushdef(`unrolling',$8)dnl
pushdef(`mop',$9)dnl
pushdef(`citype',$10)dnl
pushdef(`k_diagonal',$11)dnl
pushdef(`uplo',$12)dnl
dnl
ifelse(dnl
dnl
dnl The following are cases which are NOT implemented.
dnl Each line emits a non empty character (`*') to block an implementation.
dnl
dnl CSC SPSV gets blocked:
dnl ifelse(RSB_M4_AND(RSB_M4_IS_SPSX_KERNEL_MOP(mop),RSB_M4_IS_FORMAT_COLUMN_MAJOR(matrix_storage),RSB_M4_NOT(transposed)),1,`no',`')`'dnl
dnl CSR transposed SPSV gets blocked:
dnl ifelse(RSB_M4_AND(RSB_M4_IS_SPSX_KERNEL_MOP(mop),RSB_M4_IS_FORMAT_ROW_MAJOR(matrix_storage),transposed),1,`no'`')dnl
dnl SPSV for non 1x1 blockings gets blocked
ifelse(RSB_M4_IS_SPSX_KERNEL_MOP(mop),1,ifelse(RSB_M4_AND(RSB_M4_SAME(b_rows,1),RSB_M4_SAME(b_columns,1)),`1',`',`no'))`'dnl
dnl
dnl any symmetric kernel for non 1x1 blockings gets blocked
dnl TODO : should modify RSB_M4_EXTRA_SYMMETRIC_DIAGONAL_FIXING_KERNEL to support k_symmetry and blocking
ifelse(RSB_M4_OR(RSB_M4_IS_NOT_UNSYMMETRIC(k_symmetry),RSB_M4_AND(RSB_M4_SAME(b_rows,1),RSB_M4_SAME(b_columns,1))),1,`',`no')`'dnl
dnl any SPSV symmetric gets blocked
ifelse(RSB_M4_IS_SPSX_KERNEL_MOP(mop),1,ifelse(RSB_M4_IS_NOT_UNSYMMETRIC(k_symmetry),1,`no',`'))`'dnl
dnl
,`',`1',`0')dnl
dnl
popdef(`uplo')dnl
popdef(`want_what')dnl
popdef(`k_diagonal')dnl
popdef(`citype')dnl
popdef(`mop')dnl
popdef(`matrix_storage')dnl
popdef(`k_symmetry')dnl
popdef(`transposition')dnl
popdef(`mtype')dnl
popdef(`itype')dnl
popdef(`unrolling')dnl
')dnl
dnl
dnl
dnl
dnl
dnl
dnl	RSB_M4_BXXX_KERNEL_FUNCTION_HELP:
dnl	Emits the Doxygen header comment for one fixed-block kernel
dnl	function, choosing the formula text according to mop, and a
dnl	\return line which depends on whether the kernel has an
dnl	implementation (see RSB_M4_BXXX_KERNEL_FUNCTION_HAS_IMPLEMENTATION).
define(`RSB_M4_BXXX_KERNEL_FUNCTION_HELP',`dnl
dnl
dnl
pushdef(`want_what',$1)dnl
pushdef(`mtype',$2)dnl
pushdef(`matrix_storage',$3)dnl
pushdef(`transposition',$4)dnl
pushdef(`k_symmetry',$5)dnl
pushdef(`b_rows',$6)dnl block rows
pushdef(`b_columns',$7)dnl block columns
pushdef(`itype',`rsb_coo_idx_t ')dnl integer type (for indices)
pushdef(`unrolling',$8)dnl
pushdef(`mop',$9)dnl
pushdef(`citype',$10)dnl
pushdef(`k_diagonal',$11)dnl
pushdef(`uplo',$12)dnl
dnl
	/**
	 * \ingroup rsb_doc_kernels
ifelse(RSB_M4_MEMBER(mop,`spsv_uxua'),1,`dnl
	 * Computes \f$y \leftarrow RSB_M4_TRANSPOSITION_OP_EFFECT(transposition,`A')^{-1} \cdot x, where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A'). \f$
')dnl
ifelse(mop,`spmv_unua',`dnl
	 * Computes \f$y \leftarrow y - RSB_M4_TRANSPOSITION_OP_EFFECT(transposition,`A') \cdot x, where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A'). \f$
')dnl
ifelse(mop,`spmv_uaua',`dnl
	 * Computes \f$y \leftarrow y + RSB_M4_TRANSPOSITION_OP_EFFECT(transposition,`A') \cdot x, where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A'). \f$
')dnl
ifelse(mop,`spmv_sxsa',`dnl
	 * Computes \f$y \leftarrow \beta \cdot y + \alpha \cdot RSB_M4_TRANSPOSITION_OP_EFFECT(transposition,`A') \cdot x, where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A'). \f$
	 * with incx and incy as x and y vector strides
')dnl
ifelse(mop,`spmv_sxsx',`dnl
	 * Computes \f$y \leftarrow \beta \cdot y + \alpha \cdot RSB_M4_TRANSPOSITION_OP_EFFECT(transposition,`A') \cdot x, where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A'). \f$
	 * with incx and incy as x and y vector strides
')dnl
ifelse(mop,`spmv_sasa',`dnl
	 * Computes \f$y \leftarrow y + RSB_M4_TRANSPOSITION_OP_EFFECT(transposition,`A') \cdot x, where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A'). \f$
')dnl
ifelse(mop,`spmv_uxua',`dnl
	 * Computes \f$y \leftarrow y + \alpha \cdot RSB_M4_TRANSPOSITION_OP_EFFECT(transposition,`A') \cdot x, where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A'). \f$
')dnl
ifelse(mop,`spmv_uxux',`dnl
	 * Computes \f$y \leftarrow \beta \cdot y + \alpha \cdot RSB_M4_TRANSPOSITION_OP_EFFECT(transposition,`A') \cdot x, where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A'). \f$
')dnl
ifelse(mop,`spmm_az',`dnl
	 * Computes \f$y \leftarrow RSB_M4_TRANSPOSITION_OP_EFFECT(transposition,`A') \cdot x, where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A'). \f$
')dnl
ifelse(mop,`infty_norm',`dnl
	 * Computes \f$ \|A\|_{\infty} \f$ (or rather, \f$ row\_sums_i \leftarrow \sum_{j=0}^{mdim} A_{ij} ), where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A').\f$
')dnl
ifelse(mop,`rowssums',`dnl
	 * Computes \f$ \|A\|_{1} \f$ (or rather, \f$ row\_sums_i \leftarrow \sum_{i=0}^{Mdim} A^{T}_{ij} ), where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A').\f$
')dnl
ifelse(mop,`spmv_uauz',`dnl
	 * Computes \f$y \leftarrow RSB_M4_TRANSPOSITION_OP_EFFECT(transposition,`A') \cdot x, where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A'). \f$
')dnl
ifelse(mop,`scale',`dnl
	 * Computes \f$A \leftarrow A\cdot P, P_{ii}=s_{i}, where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A').\f$
')dnl
ifelse(mop,`negation',`dnl
	 * Computes \f$A \leftarrow - A \f$
')dnl
	 * Matrix A should be blocked b_rows x b_columns, stored in matrix_storage format, RSB_M4_MATRIX_DIAGONAL_DENOMINATION(k_diagonal), of `type' mtype, with citype column indices.
dnl
ifelse(RSB_M4_BXXX_KERNEL_FUNCTION_HAS_IMPLEMENTATION($@),`1',`dnl
	 * \return \rsb_errval_inp_param_msg
	 */
',`dnl
dnl FIXME: the return error is not always adequate, here.
	 * \return RSB_ERR_UNIMPLEMENTED_YET (this function is not implemented).
	 */
dnl /* or RSB_ERR_UNSUPPORTED_FEATURE ? */
')dnl
dnl
popdef(`uplo')dnl
popdef(`want_what')dnl
popdef(`k_diagonal')dnl
popdef(`citype')dnl
popdef(`mop')dnl
popdef(`matrix_storage')dnl
popdef(`k_symmetry')dnl
popdef(`transposition')dnl
popdef(`mtype')dnl
popdef(`itype')dnl
popdef(`unrolling')dnl
')dnl
dnl
dnl
dnl
dnl
dnl
dnl These functions will perform their operations on fixed block matrices.
420dnl 421define(`RSB_M4_BCSS_KERNEL_FUNCTION',`dnl 422dnl 423dnl 424pushdef(`want_what',$1)dnl 425pushdef(`mtype',$2)dnl 426pushdef(`matrix_storage',$3)dnl 427pushdef(`transposition',$4)dnl 428pushdef(`k_symmetry',$5)dnl 429pushdef(`b_rows',$6)dnl block rows 430pushdef(`b_columns',$7)dnl block columns 431pushdef(`itype',`rsb_coo_idx_t ')dnl integer type (for indices) 432pushdef(`unrolling',$8)dnl 433pushdef(`mop',$9)dnl 434pushdef(`citype',$10)dnl 435pushdef(`k_diagonal',$11)dnl 436pushdef(`uplo',$12)dnl 437dnl 438ifelse(RSB_M4_ARE_KERNEL_GENERATION_PARMS_ALLOWED(want_what,mtype,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo),`1',`dnl 439dnl 440ifelse(want_what,`all',`dnl 441dnl 442ifelse(RSB_M4_IS_FORMAT_BCSS(matrix_storage),1,`dnl 443rsb_err_t RSB_M4_BCSS_KERNEL_FUNCTION(`ID',mtype,matrix_storage,transposition,k_symmetry,b_rows,b_columns,unrolling,mop,citype,k_diagonal,uplo)dnl 444RSB_M4_BCSS_KERNEL_FUNCTION(`ARGS',mtype,matrix_storage,transposition,k_symmetry,b_rows,b_columns,unrolling,mop,citype,k_diagonal,uplo)dnl 445')dnl 446ifdef(`ONLY_WANT_HEADERS',`; 447',` 448RSB_M4_BCSS_KERNEL_FUNCTION(`BODY',mtype,matrix_storage,transposition,k_symmetry,b_rows,b_columns,unrolling,mop,citype,k_diagonal,uplo)dnl 449')dnl 450')dnl 451dnl 452ifelse(want_what,`ID',`dnl 453RSB_M4_KERNEL_DIRECT_DISPATCHER_FUNCTION_NAME(mtype,matrix_storage,transposition,k_symmetry,b_rows,b_columns,unrolling,mop,citype,k_diagonal,uplo)`'dnl 454')dnl 455dnl 456ifelse(want_what,`ARGS',`dnl 457RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(`ARGS',mtype,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo)`'dnl 458')dnl 459dnl 460ifelse(want_what,`BODY',`dnl 461dnl 462{ 463dnl 464dnl The body of a CSR/CSC computational kernel. 
465dnl 466dnl RSB_M4_DEBUGINFO(``$0'')dnl 467dnl 468ifelse(RSB_M4_IS_FORMAT_BCSS(matrix_storage),1,`dnl 469dnl 470pushdef(`total_block_columns',ifelse(RSB_M4_IS_FORMAT_COLUMN_MAJOR(matrix_storage),1,`Mdim',`mdim'))dnl 471pushdef(`total_block_rows',ifelse(RSB_M4_IS_FORMAT_COLUMN_MAJOR(matrix_storage),1,`mdim',`Mdim'))dnl 472pushdef(`total_rows',ifelse(unrolling,`l',rpntr[total_block_rows],total_block_rows*b_rows))dnl 473pushdef(`total_columns',ifelse(unrolling,`l',cpntr[total_block_columns],total_block_columns*b_columns))dnl 474dnl 475ifelse(RSB_M4_IS_FORMAT_COLUMN_MAJOR(matrix_storage),1,`dnl 476pushdef(`mi',`i')dnl 477pushdef(`Mi',`j')dnl 478')dnl 479ifelse(RSB_M4_IS_FORMAT_ROW_MAJOR(matrix_storage),1,`dnl 480pushdef(`mi',`j')dnl 481pushdef(`Mi',`i')dnl 482')dnl 483dnl 484dnl FIXME : out_dim should depend on the operation! 485dnl 486pushdef(`out_dim',ifelse(transposition,RSB_M4_TRANS_N,total_rows,total_columns))dnl 487dnl 488pushdef(`is_zero_acc_spsv_kernel',`RSB_M4_AND(RSB_M4_IS_SPSX_KERNEL_MOP(mop),RSB_M4_OR(RSB_M4_AND(RSB_M4_IS_FORMAT_ROW_MAJOR(matrix_storage),RSB_M4_SAME(transposition,RSB_M4_TRANS_N)),RSB_M4_AND(RSB_M4_IS_FORMAT_COLUMN_MAJOR(matrix_storage),RSB_M4_NOT(RSB_M4_SAME(transposition,RSB_M4_TRANS_N)))))')dnl 489dnl pushdef(`is_zero_acc_spsv_kernel',`RSB_M4_AND(RSB_M4_IS_SPSX_KERNEL_MOP(mop),RSB_M4_OR(RSB_M4_AND(RSB_M4_IS_FORMAT_ROW_MAJOR(matrix_storage),RSB_M4_SAME(transposition,RSB_M4_TRANS_N))))')dnl 490dnl 491pushdef(`is_diag_d_spsv_kernel',`RSB_M4_AND(RSB_M4_IS_SPSX_KERNEL_MOP(mop),RSB_M4_NOT(RSB_M4_OR(RSB_M4_AND(RSB_M4_IS_FORMAT_ROW_MAJOR(matrix_storage),RSB_M4_SAME(transposition,RSB_M4_TRANS_N)),RSB_M4_AND(RSB_M4_IS_FORMAT_COLUMN_MAJOR(matrix_storage),RSB_M4_NOT(RSB_M4_SAME(transposition,RSB_M4_TRANS_N))))))')dnl 492dnl 493dnl pushdef(`is_an_externally_backward_kernel',`RSB_M4_AND(RSB_M4_IS_SPSX_KERNEL_MOP(mop),RSB_M4_XOR(RSB_M4_IS_FORMAT_ROW_MAJOR(matrix_storage),RSB_M4_SAME(transposition,RSB_M4_TRANS_N)))')dnl 494dnl 
pushdef(`is_an_externally_backward_kernel',`RSB_M4_AND(RSB_M4_IS_SPSX_KERNEL_MOP(mop),RSB_M4_NOT(RSB_M4_SAME(transposition,RSB_M4_TRANS_N)))')dnl 495pushdef(`is_an_externally_backward_kernel',`RSB_M4_AND(RSB_M4_IS_SPSX_KERNEL_MOP(mop),RSB_M4_XOR(RSB_M4_NOT(RSB_M4_SAME(transposition,RSB_M4_TRANS_N)),RSB_M4_SAME(uplo,`u')))')dnl 496dnl 497pushdef(`is_a_backward_kernel',is_an_externally_backward_kernel)dnl 498dnl pushdef(`is_a_backward_kernel',`RSB_M4_AND(RSB_M4_IS_SPSX_KERNEL_MOP(mop),RSB_M4_IS_FORMAT_COLUMN_MAJOR(matrix_storage),RSB_M4_NOT(RSB_M4_SAME(transposition,RSB_M4_TRANS_N)))')dnl 499dnl 500pushdef(`block_backward',`ifelse(is_a_backward_kernel,1,`a += rows*columns',`a -= rows*columns')')dnl 501pushdef(`block_forward',`ifelse(is_a_backward_kernel,1,`a -= rows*columns',`a += rows*columns')')dnl 502dnl 503dnl 504dnl FIXME : and so the stride x/y association 505dnl 506dnl pushdef(`extra_xstride',ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),1,`incx',`0'))dnl 507dnl pushdef(`extra_ystride',ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),1,`incy',`0'))dnl 508pushdef(`xstride',ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),1,`(incx)',`1'))dnl 509pushdef(`ystride',ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),1,`(incy)',`1'))dnl 510pushdef(`extra_xstride',ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),1,`xstride',`1'))dnl 511pushdef(`extra_ystride',ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),1,`ystride',`1'))dnl 512dnl 513dnl NEW: 514dnl 515pushdef(`transposed',ifelse(transposition,RSB_M4_TRANS_N,0,1))dnl 516dnl pushdef(`transposed',dnl 517dnl ifelse(RSB_M4_IS_FORMAT_ROW_MAJOR(matrix_storage),1,eval(transposed),eval(1-transposed))dnl 518dnl )dnl 519dnl 520dnl 521pushdef(`brin',`(i*extra_ystride)')dnl 522pushdef(`bcin',`(j*extra_xstride)')dnl 523dnl 524ifelse(transposed,`1',`dnl 525dnl 526dnl block row index, block column index 527dnl 528pushdef(`bci',`(i*extra_xstride)')dnl 529pushdef(`bri',`(j*extra_ystride)')dnl 530pushdef(`bcit',`(j*extra_xstride)')dnl 531pushdef(`brit',`(i*extra_ystride)')dnl 
532',`dnl 533pushdef(`bri',`(i*extra_ystride)')dnl 534pushdef(`bci',`(j*extra_xstride)')dnl 535pushdef(`brit',`(j*extra_ystride)')dnl 536pushdef(`bcit',`(i*extra_xstride)')dnl 537')dnl 538dnl 539pushdef(`should_init_out_vector_before_outer_loop',`dnl 540RSB_M4_OR(RSB_M4_IS_SCALING_KERNEL_MOP(mop),dnl 541RSB_M4_AND(RSB_M4_IS_FORMAT_COLUMN_MAJOR(matrix_storage),RSB_M4_NOT(eval(transposed))),dnl 542RSB_M4_AND(RSB_M4_IS_FORMAT_ROW_MAJOR(matrix_storage),eval(transposed)),dnl 543RSB_M4_IS_NOT_UNSYMMETRIC(k_symmetry))dnl 544')dnl 545dnl 546dnl 547dnl 548pushdef(`has_implementation',`dnl 549RSB_M4_BXXX_KERNEL_FUNCTION_HAS_IMPLEMENTATION($@)`'dnl 550')dnl 551dnl 552') 553RSB_M4_BXXX_KERNEL_FUNCTION_HELP($@) 554ifelse(RSB_M4_AND(RSB_M4_IS_SPMX_KERNEL_MOP(mop),RSB_M4_IS_DIAGONAL_IMPLICIT(k_diagonal)),1,`dnl 555 RSB_M4_FAKE_DIAG_IMPLICIT_MSG 556')dnl 557ifelse(has_implementation,`1',`dnl 558',`dnl 559dnl /* or RSB_ERR_UNSUPPORTED_FEATURE ? */ 560 return RSB_ERR_UNIMPLEMENTED_YET; 561')dnl 562dnl 563ifelse(has_implementation,`1',`dnl 564dnl Comments 565dnl 566ifelse(RSB_M4_AND(RSB_M4_IS_SPMX_KERNEL_MOP(mop),RSB_M4_IS_NOT_UNSYMMETRIC(k_symmetry)),1,`dnl 567 /* 568ifelse(RSB_M4_want_verbose_comments,`1',`dnl 569 WARNING : This function assumes the matrix symmetric, and therefore 570 will write the output vector in the 0,Mdim and -roff+coff,-roff+coff+Mdim range. 571 So if you are using this function in a parallel environment, you should care about 572 proper locking of the output vectors. 573')dnl 574ifelse(RSB_M4_AND(RSB_M4_IS_SPMX_SCALING_KERNEL_MOP(mop),RSB_M4_IS_NOT_UNSYMMETRIC(k_symmetry)),1,`dnl 575 The output vector zero-ing is impacted, too, so if you are using this kernel with 576 recursive storage, you should care about the proper zeroing of the whole output vector. 
577')dnl 578 */ 579')dnl 580dnl 581dnl 582dnl 583dnl 584ifelse(RSB_M4_OR(RSB_M4_AND(RSB_M4_NOT(RSB_M4_IS_COMPLEX_TYPE(type)),RSB_M4_IS_NOT_UNSYMMETRIC(k_symmetry),RSB_M4_NOT(transposition,RSB_M4_TRANS_N))),1,`dnl 585dnl 586ifelse(RSB_M4_SAME(transposition,RSB_M4_TRANS_C),1,`dnl 587 /* `For non complex types, hermitian defaults to plain transposition.' */ 588 return RSB_M4_BCSS_KERNEL_FUNCTION(`ID',type,matrix_storage,RSB_M4_H2T_TRANSPOSITION(transposition),k_symmetry,rowsu,colsu,unrolling,mop,citype,k_diagonal,uplo)dnl 589(RSB_M4_ARGS_TO_ACTUAL_ARGS(RSB_M4_BCSS_KERNEL_FUNCTION(`ARGS',type,matrix_storage,RSB_M4_H2T_TRANSPOSITION(transposition),k_symmetry,rowsu,colsu,unrolling,mop,citype))); 590')dnl 591dnl 592ifelse(RSB_M4_SAME(transposition,RSB_M4_TRANS_T),1,`dnl 593 /* `This kernel performs the same as its transposed', transposition -> RSB_M4_TRANSPOSE_TRANSPOSITION(transposition). */ 594 return RSB_M4_BCSS_KERNEL_FUNCTION(`ID',type,matrix_storage,RSB_M4_TRANSPOSE_TRANSPOSITION(transposition),k_symmetry,rowsu,colsu,unrolling,mop,citype,k_diagonal,uplo)dnl 595(RSB_M4_ARGS_TO_ACTUAL_ARGS(RSB_M4_BCSS_KERNEL_FUNCTION(`ARGS',type,matrix_storage,RSB_M4_TRANSPOSE_TRANSPOSITION(transposition),k_symmetry,rowsu,colsu,unrolling,mop,citype))); 596')dnl 597dnl 598',`dnl 599ifelse(RSB_M4_OR(RSB_M4_AND(RSB_M4_IS_COMPLEX_TYPE(type),RSB_M4_SAME(k_symmetry,`hNEVEROCCURINGFIXME'),RSB_M4_SAME(transposition,RSB_M4_TRANS_C)),RSB_M4_AND(RSB_M4_IS_COMPLEX_TYPE(type),RSB_M4_SAME(k_symmetry,`s'),RSB_M4_SAME(transposition,RSB_M4_TRANS_T))),1,`dnl 600dnl 601 /* `This kernel performs the same as its transposed', transposition -> RSB_M4_TRANSPOSE_TRANSPOSITION(transposition). 
*/ 602 return RSB_M4_BCSS_KERNEL_FUNCTION(`ID',type,matrix_storage,RSB_M4_TRANSPOSE_TRANSPOSITION(transposition),k_symmetry,rowsu,colsu,unrolling,mop,citype,k_diagonal,uplo)dnl 603(RSB_M4_ARGS_TO_ACTUAL_ARGS(RSB_M4_BCSS_KERNEL_FUNCTION(`ARGS',type,matrix_storage,RSB_M4_TRANSPOSE_TRANSPOSITION(transposition),k_symmetry,rowsu,colsu,unrolling,mop,citype))); 604dnl 605dnl ifelse(RSB_M4_SAME(transposition,RSB_M4_TRANS_C),1,`dnl 606dnl /* 607dnl The matrix is treated as symmetric hermitian. 608dnl FIXME: missing implementation. 609dnl */ 610dnl return RSB_ERR_UNIMPLEMENTED_YET; 611dnl ')dnl 612dnl 613',`dnl 614dnl 615ifelse(RSB_M4_AND(RSB_M4_NOT(RSB_M4_IS_COMPLEX_TYPE(type)),RSB_M4_IS_NOT_UNSYMMETRIC(k_symmetry),RSB_M4_NOT(RSB_M4_SAME(transposition,RSB_M4_TRANS_N))),1,`dnl 616dnl 617 /* Symmetric `transposed' reverts to symmetric `not transposed' */ 618 return RSB_M4_BCSS_KERNEL_FUNCTION(`ID',type,matrix_storage,RSB_M4_TRANS_N,k_symmetry,rowsu,colsu,unrolling,mop,citype,k_diagonal,uplo)dnl 619(RSB_M4_ARGS_TO_ACTUAL_ARGS(RSB_M4_BCSS_KERNEL_FUNCTION(`ARGS',type,matrix_storage,RSB_M4_TRANS_N,k_symmetry,rowsu,colsu,unrolling,mop,citype,k_diagonal,uplo))); 620dnl 621',`dnl 622dnl 623dnl 624ifelse(unrolling,`l',/* FIXME : l-unrolled functions are broken */)dnl 625dnl 626dnl BEGIN VARIABLES DECLARATIONS 627dnl 628ifelse(RSB_M4_IS_FORMAT_BCSS(matrix_storage),1,`dnl 629ifelse(RSB_M4_IS_SPMV_KERNEL_MOP(mop),`1',`dnl 630ifelse(RSB_M4_IS_NOT_UNSYMMETRIC(k_symmetry),1,`dnl 631 register rsb_coo_idx_t i=0,j=0; 632',`dnl 633 register rsb_coo_idx_t i=0; 634')dnl 635',`dnl 636 register rsb_coo_idx_t i=0,j=0; 637')dnl 638 register rsb_nnz_idx_t k=0; 639dnl 640ifelse(RSB_M4_NOT(RSB_M4_IS_SPMV_KERNEL_MOP(mop)),`1',`dnl 641ifelse(unrolling,`l',`dnl 642 const register rsb_coo_idx_t columns=cpntr[1]-cpntr[0]; /* we assume that block_count >= 1 */ 643 const register rsb_coo_idx_t rows =rpntr[1]-rpntr[0]; /* we assume that block_count >= 1 */ 644',`dnl 645 const register rsb_coo_idx_t 
columns=b_columns,rows=b_rows; 646')dnl 647')dnl 648')dnl 649dnl 650ifelse(RSB_M4_IS_READONLY_KERNEL_MOP(mop),1,`dnl 651ifelse(RSB_M4_IS_SPMV_KERNEL_MOP(mop),`0',`dnl 652 const mtype *a=VA; 653')dnl 654')dnl 655ifelse(RSB_M4_IS_WRITEONLY_KERNEL_MOP(mop),1,`dnl 656 mtype *a=VA; 657')dnl 658dnl 659ifelse(RSB_M4_IS_RC_BIASED_KERNEL_MOP(mop),`0',`dnl 660ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),`0',`dnl 661ifelse(RSB_M4_IS_SPSV_KERNEL_MOP(mop),`0',`dnl 662ifelse(RSB_M4_IS_SPMV_KERNEL_MOP(mop),`0',`dnl 663 const rsb_coo_idx_t incx=1,incy=1;`' 664')dnl 665')dnl 666')dnl 667')dnl 668dnl 669ifelse(RSB_M4_IS_OP_SCALING_KERNEL_MOP(mop),`1',`dnl 670 const mtype alpha=*alphap;`'dnl 671')dnl 672ifelse(RSB_M4_IS_SCALING_KERNEL_MOP(mop),`1',`dnl 673 const mtype beta=*betap;`'dnl 674')dnl 675dnl 676ifelse(RSB_M4_is_transposed_spmv,1,`dnl 677 const mtype *trhs = rhs+xstride*(roff-coff);`' 678 mtype *tout=out+ystride*(coff-roff);`' 679 680')dnl 681ifelse(RSB_M4_IS_SPXX_OP_SCALING_KERNEL_MOP(mop),1,`dnl 682')dnl 683dnl 684dnl 685dnl END VARIABLES DECLARATIONS 686dnl 687dnl BEGIN CONDITIONAL VECTOR SCALING 688dnl 689ifelse(should_init_out_vector_before_outer_loop,1,`dnl 690ifelse(RSB_M4_IS_SCALING_KERNEL_MOP(mop),1,`dnl 691ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),1,`dnl 692 if(beta!=1)rsb__cblas_Xscal(RSB_M4_NUMERICAL_TYPE_PREPROCESSOR_SYMBOL(type),out_dim,&beta,out,ystride); 693',`dnl 694 rsb__cblas_Xscal(RSB_M4_NUMERICAL_TYPE_PREPROCESSOR_SYMBOL(type), out_dim,&beta, out, 1); 695') /* we scale the destination vector */ 696')dnl 697ifelse(RSB_M4_IS_ZEROING_KERNEL_MOP(mop),1,`dnl 698 rsb__cblas_Xscal(RSB_M4_NUMERICAL_TYPE_PREPROCESSOR_SYMBOL(type),out_dim,NULL,out,ystride); 699')dnl 700')dnl 701dnl 702dnl END CONDITIONAL VECTOR SCALING 703dnl 704dnl BEGIN COMMON EXTERNAL LOOP BEGINNING 705dnl 706ifelse(RSB_M4_want_verbose_comments,`1',` /* Outer loop. Occurs on the major dimension. 
*/ ')dnl 707dnl 708ifelse(is_an_externally_backward_kernel,1,` 709 for(Mi=Mdim-1; RSB_LIKELY((Mi+1)>0 /*trick for unsigned indices */);--Mi) //RSB_M4_IS_SPSX_KERNEL_MOP(mop),RSB_M4_IS_FORMAT_COLUMN_MAJOR(matrix_storage),RSB_M4_NOT(RSB_M4_SAME(transposition,RSB_M4_TRANS_N)) 710 { 711',`dnl 712dnl 713ifelse(RSB_M4_AND(RSB_M4_WANT_20110206_BOUNDED_BOX_PATCH,RSB_M4_NOT(RSB_M4_IS_SCALING_OR_ZEROING_KERNEL_MOP(mop))),1,`dnl 714dnl really, the above condition should also check for transposition! but in this way it does no wrong. 715 for(Mi=br;RSB_LIKELY(Mi<bc);++Mi) /* experimental, for the bounded box patch */ 716',`dnl 717 for(Mi=0;RSB_LIKELY(Mi<Mdim);++Mi) 718')dnl 719dnl 720 { 721')dnl 722dnl 723ifelse(RSB_M4_want_verbose_comments,`1',`dnl 724 /* logically, i is the working block row, j is the working block column */ 725 /* physically, Mi is the working block row, mi is the working block column */ 726')dnl 727dnl 728pushdef(`colsu',ifelse(unrolling,`l',columns,colsu))dnl 729pushdef(`rowsu',ifelse(unrolling,`l',rows,rowsu))dnl 730pushdef(`tcolsu',ifelse(transposition,RSB_M4_TRANS_T,rowsu,colsu))dnl 731pushdef(`trowsu',ifelse(transposition,RSB_M4_TRANS_T,colsu,rowsu))dnl 732dnl 733ifelse(RSB_M4_IS_SPXX_OP_SCALING_KERNEL_MOP(mop),1,`dnl 734pushdef(`postalphamult',`(alpha)*')dnl 735',`dnl 736dnl 737ifelse(RSB_M4_IS_SPMX_OP_NEGATING_KERNEL_MOP(mop),1,`dnl 738pushdef(`postalphamult',`(-1)*')dnl 739',`dnl 740pushdef(`postalphamult',`')dnl 741')dnl 742dnl 743')dnl 744dnl 745ifelse(RSB_M4_IS_SPMV_KERNEL_MOP(mop),`1',`dnl 746ifelse(transposed,`0',`dnl 747ifelse(RSB_M4_IS_NOT_UNSYMMETRIC(k_symmetry),0,`dnl 748 const mtype *a=VA; 749')dnl 750')dnl 751ifelse(RSB_M4_OR(RSB_M4_IS_NOT_UNSYMMETRIC(k_symmetry),RSB_M4_AND(RSB_M4_IS_UNSYMMETRIC(k_symmetry),RSB_M4_NOT(transposed))),1,`dnl 752 register mtype cacc = RSB_M4_ZERO(mtype); 753dnl mtype *outi=out+(trowsu*i*ystride); 754',`dnl 755')dnl 756')dnl 757dnl 758ifelse(RSB_M4_IS_SPXX_KERNEL_MOP(mop),`1',`dnl 
759ifelse(RSB_M4_is_transposed_spmv,1,`dnl 760 const mtype bt=postalphamult`'trhs[(tcolsu*xstride*(Mi))]; 761dnl const mtype *b = rhs+(tcolsu*bci); 762',`dnl 763dnl const mtype bn = rhs[(tcolsu*xstride*(Mi))]; /*20120915: spurious instruction commented out*/ 764')dnl 765')dnl 766 const rsb_nnz_idx_t fk=bpntr[Mi],lk=bpntr[Mi+1]; 767dnl 768dnl 769dnl END COMMON EXTERNAL LOOP BEGINNING 770dnl 771dnl BEGIN EXTERNAL LOOP VECTOR SCALING 772dnl 773ifelse(RSB_NOT(RSB_M4_IS_ALLOWING_ALIASING_KERNEL_MOP(mop)),1,` 774ifelse(should_init_out_vector_before_outer_loop,0,`dnl 775ifelse(unrolling,`l',` 776ifelse(mop,`spmv_uxux',`dnl 777 rsb__cblas_Xscal(RSB_M4_NUMERICAL_TYPE_PREPROCESSOR_SYMBOL(type), b_rows,&beta, out+rows*i, 1);/* we scale the destination vector */ 778')dnl 779ifelse(RSB_M4_IS_FORMAT_ROW_MAJOR(matrix_storage),1,`dnl 780ifelse(mop,`spmv_uauz',`dnl 781 rsb__cblas_Xscal(RSB_M4_NUMERICAL_TYPE_PREPROCESSOR_SYMBOL(type),b_rows,NULL,out+rows*i,ystride); 782')dnl 783')dnl 784dnl 785',`dnl 786dnl 787ifelse(RSB_M4_IS_ZEROING_KERNEL_MOP(mop),1,`dnl 788 forloop(`row',0,decr(trowsu),`out[trowsu*bri+row]=0;') 789')dnl 790ifelse(RSB_M4_IS_SCALING_KERNEL_MOP(mop),1,`dnl 791 forloop(`row',0,decr(trowsu),`out[trowsu*bri+row]*=beta;') 792')dnl 793')dnl 794')dnl 795')dnl 796dnl 797dnl 798ifelse(should_init_out_vector_before_outer_loop,0,`dnl 799ifelse(RSB_M4_IS_ZEROING_KERNEL_MOP(mop),1,`dnl 800 forloop(`row',0,decr(trowsu),`out[trowsu*bri+row]=0;') 801')dnl 802ifelse(RSB_M4_IS_SCALING_KERNEL_MOP(mop),1,`dnl 803 forloop(`row',0,decr(trowsu),`out[trowsu*bri+row]*=beta;') 804')dnl 805')dnl 806dnl 807dnl END EXTERNAL LOOP VECTOR SCALING 808dnl 809ifelse(RSB_M4_want_verbose_comments,`1',` /* Inner loop. Occurs on the minor dimension. 
*/ ')dnl 810dnl 811dnl BEGIN KERNELS DEFINITION 812dnl 813ifelse(RSB_M4_IS_SPMX_KERNEL_MOP(mop),`1',`dnl 814dnl 815dnl BEGIN SPMV KERNEL DEF 816dnl /* SPMV KERNEL BEGINS HERE */ 817dnl 818ifelse(RSB_M4_IS_NOT_UNSYMMETRIC(k_symmetry),1----,`dnl 819ifelse(RSB_M4_IS_SPMX_KERNEL_MOP(mop),1,`dnl 820ifelse(RSB_M4_IS_DIAGONAL_IMPLICIT(k_diagonal),1,`',`dnl 821ifelse(RSB_M4_want_verbose_comments,`1',`dnl 822/* 823 Symmetric kernels should process the first block separately, if it contains `diagonal' elements. 824 FIXME : this is NOT the case for blocked code. 825*/ 826')dnl 827 k=fk; 828 if(RSB_UNLIKELY(lk==k)) continue;/* nothing to do here */ 829 mi=bindx[k]; 830 if(mi==Mi && ((lk-k)>1) && roff==coff) /* a `diagonal' element, and not the only one, on a diagonally positioned matrix */ 831 { 832 const mtype *b = rhs+(tcolsu*bci); 833 mtype *c=out+(trowsu*bri); 834dnl const mtype *b = rhs+(trowsu*bri); 835dnl mtype *c=out+(tcolsu*bci); 836dnl 837dnl /* FIXME : THIS IS AN EXAMPLE : SHOULD INTRODUCE DIAGONAL-SUBTRACTION CODELET */ 838dnl 839{RSB_M4_EXTRA_SYMMETRIC_DIAGONAL_FIXING_KERNEL(`row',`rows',b_rows,`column',`columns',b_columns,mtype,,mop,unrolling,transposition,RSB_M4_SYMMETRY_SWITCH(k_symmetry))} 840 } 841')dnl 842')dnl 843')dnl 844dnl 845ifelse(RSB_M4_AND(RSB_M4_IS_SPMX_KERNEL_MOP(mop),RSB_M4_SAME(transposed,1)),1,`dnl 846ifelse(RSB_M4_want_verbose_comments,`1',`dnl 847dnl /* `Since this is a transposed kernel, we apply a correction to the output vector locations.' */ 848')dnl 849dnl rhs=(rhs-coff*(xstride))+roff*(xstride); out=(out-roff*(ystride))+coff*(ystride); 850')dnl 851dnl 852dnl 853ifelse(RSB_M4_IS_UNSYMMETRIC(k_symmetry),1,`dnl 854ifelse(transposed,`0',`dnl 855dnl 856dnl RSB_M4_EARLY_EVICT_INSTRUCTION((a+k,bindx+k))`'dnl 857dnl 858dnl RSB_M4_SIMPLE_LOOP_UNROLL_2S_J.. 
859RSB_M4_SIMPLE_LOOP_UNROLL_5S(`k',`LI',`fk',`lk',`dnl 860',`dnl 861dnl 862 `const rsb_coo_idx_t' `j_'``''LI`'=bindx[k+LI]; 863 `const mtype b_'``''LI`'=rhs[tcolsu*(`j_'``''LI`')*xstride]; 864 `const mtype a_'``''LI`'=a[k+LI]; 865dnl 866',`dnl 867',`dnl 868dnl cacc+=a[k+LI]*b_``''LI; 869dnl cacc+=a_``''LI*b_``''LI; 870 ``cacc+=a_''``''LI``*b_''``''LI; 871',`dnl RSB_M4_EARLY_EVICT_INSTRUCTION((a+k,bindx+k))`'dnl 872',RSB_M4_SIMPLE_LOOP_UNROLL_DEFAULT_FACTOR_SMALL) 873dnl 874dnl RSB_M4_EARLY_EVICT_INSTRUCTION((a+k,bindx+k))`'dnl 875dnl RSB_M4_EARLY_EVICT_INSTRUCTION((outi+k-12))`'dnl 876dnl 877')dnl 878')dnl 879dnl 880dnl 881ifelse(RSB_M4_IS_UNSYMMETRIC(k_symmetry),1,`dnl 882ifelse(transposed,`1',`dnl 883dnl 884RSB_M4_SIMPLE_LOOP_UNROLL_2S_J(`k',`LI',`fk',`lk',`dnl 885dnl 886 `const rsb_coo_idx_t' `j_'``''LI`'=bindx[k+LI]; 887 `const mtype a_'``''LI`'=RSB_M4_CONJ(VA[k+LI],mtype,transposition,k_symmetry); 888 `mtype c_'``''LI`'=a_``''LI*bt; 889dnl 890',`dnl 891 tout[(tcolsu)*(`j_'``''LI`')*ystride]+=`c_'``''LI`'; 892',RSB_M4_SIMPLE_LOOP_UNROLL_DEFAULT_FACTOR_SMALL) 893dnl 894dnl 895')dnl 896')dnl 897dnl 898ifelse(RSB_M4_IS_NOT_UNSYMMETRIC(k_symmetry),1,`dnl 899ifelse(k_symmetry,RSB_M4_SYMBOL_HERMITIAN,`dnl 900ifelse(RSB_M4_SAME(transposition,RSB_M4_TRANS_C),1,`dnl 901pushdef(`ntransposition',transposition)dnl 902pushdef(`ttransposition',RSB_M4_TRANSPOSE_TRANSPOSITION(transposition))dnl 903')dnl 904ifelse(RSB_M4_SAME(transposition,RSB_M4_TRANS_T),1,`dnl 905pushdef(`ntransposition',transposition)dnl 906pushdef(`ttransposition',RSB_M4_TRANS_C)dnl 907')dnl 908ifelse(RSB_M4_SAME(transposition,RSB_M4_TRANS_N),1,`dnl 909pushdef(`ntransposition',RSB_M4_TRANS_C)dnl 910pushdef(`ttransposition',transposition)dnl 911')dnl 912',`dnl 913ifelse(RSB_M4_SAME(transposition,RSB_M4_TRANS_C),1,`dnl 914pushdef(`ntransposition',transposition)dnl 915pushdef(`ttransposition',transposition)dnl 916',`dnl 917pushdef(`ntransposition',RSB_M4_TRANSPOSE_TRANSPOSITION(transposition))dnl 
918pushdef(`ttransposition',RSB_M4_TRANSPOSE_TRANSPOSITION(transposition))dnl 919')dnl 920')dnl 921dnl // nt: ntransposition ttransposition 922 k=fk; 923 if(k==lk)continue; 924 j=bindx[k]; 925 cacc += RSB_M4_CONJ(VA[k],mtype,ntransposition,k_symmetry)*rhs[tcolsu*j*xstride]; 926 if(roff!=coff || (j!=i)) 927 tout[(tcolsu)*(j)*ystride]+=RSB_M4_CONJ(VA[k],mtype,ttransposition,k_symmetry)*bt; 928 ++k; 929dnl RSB_M4_SIMPLE_LOOP_UNROLL_2S.. 930RSB_M4_SIMPLE_LOOP_UNROLL_2S_J(`k',`LI',`fk+1',`lk-1',`dnl 931dnl 932 `const rsb_coo_idx_t' `j_'``''LI`'=bindx[k+LI]; 933 `const mtype b_'``''LI`'=rhs[tcolsu*(`j_'``''LI`')*xstride]; 934 `const mtype a_'``''LI`'=VA[k+LI]; 935 `mtype c_'``''LI`'=RSB_M4_CONJ_SYM(mtype,ttransposition,k_symmetry)( `a_'``''LI)*bt; 936dnl `mtype c_'``''LI`'=RSB_M4_CONJ(( `a_'``''LI),mtype,transposition,k_symmetry) *bt ; 937dnl 938',`dnl 939 cacc += RSB_M4_CONJ_SYM(mtype,ntransposition,k_symmetry)(`a_'``''LI)*b_``''LI; 940 tout[(tcolsu)*(`j_'``''LI`')*ystride]+=`c_'``''LI`'; 941',RSB_M4_SIMPLE_LOOP_UNROLL_DEFAULT_FACTOR_SMALL) 942 if(k<lk) 943 { 944 j=bindx[k]; 945 cacc += RSB_M4_CONJ(VA[k],mtype,ntransposition,k_symmetry)*rhs[trowsu*j*xstride]; 946 if(roff!=coff || (j!=i)) 947 tout[(tcolsu)*(j)*ystride]+=RSB_M4_CONJ(VA[k],mtype,ttransposition,k_symmetry)*bt; 948 ++k; 949 } 950popdef(`ntransposition')dnl 951popdef(`ttransposition')dnl 952dnl 953')dnl 954dnl 955ifelse(RSB_M4_should_merge_value_after_inner_loop,`1',`dnl 956dnl outi[0]+=postalphamult`cacc'; 957 out[(trowsu*i*ystride)]+=postalphamult`cacc'; 958')dnl 959dnl 960dnl } 961dnl 962dnl 963dnl 964dnl FIXME : this code is only a quick hack for CSR! 
965dnl 966dnl 967dnl /* SPMV KERNEL ENDS HERE */ 968popdef(`postalphamult')dnl 969dnl END SPMV KERNEL DEF 970')dnl 971dnl 972ifelse(RSB_M4_IS_SPSX_KERNEL_MOP(mop),`1',`dnl 973dnl BEGIN SPSV KERNEL DEF 974dnl /* SPSV KERNEL BEGINS HERE */ 975dnl 976ifelse(RSB_M4_IS_SPSX_KERNEL_MOP(mop),1,`dnl 977dnl const mtype bb_0=rhs[(trowsu*bri)]; 978ifelse(is_diag_d_spsv_kernel,1,`',`dnl 979ifelse(RSB_M4_OR(RSB_M4_IS_SPSX_OP_SCALING_KERNEL_MOP(mop),RSB_M4_IS_SPSX_OP_SETTING_KERNEL_MOP(mop)),1,`dnl 980 const mtype bb_0=rhs[(trowsu*Mi*extra_xstride)]; 981')dnl 982')dnl 983 mtype ax_0; 984dnl 985ifelse(is_diag_d_spsv_kernel,1,`dnl 986dnl 987dnl FIXME: missing incx, incy support here! 988dnl 989ifelse(RSB_M4_IS_DIAGONAL_IMPLICIT(k_diagonal),1,`dnl 990 const mtype aa=1; 991',`dnl 992 const mtype aa=VA[ifelse(uplo,`u',`fk',`lk-1')]; 993ifelse(RSB_M4_WANT_SPSM_DIAG_CHECK(),1,`dnl 994 if(aa == RSB_M4_ZERO(mtype))return RSB_ERR_INVALID_NUMERICAL_DATA; 995')dnl 996')dnl 997dnl 998 999ifelse(RSB_M4_IS_SPSX_OP_SCALING_KERNEL_MOP(mop),1,`dnl 1000dnl 1001dnl out[tcolsu*bci]/=RSB_M4_CONJ(VA[bpntr[Mi+1]-1],mtype,transposition,k_symmetry); 1002dnl 1003',`dnl 1004dnl out[tcolsu*bci]/=RSB_M4_CONJ(VA[bpntr[Mi+1]-1],mtype,transposition,k_symmetry); 1005dnl 1006')dnl 1007dnl 1008 out[tcolsu*bci]/=aa; 1009dnl 1010')dnl 1011dnl 1012ifelse(is_zero_acc_spsv_kernel,1,`dnl 1013 ax_0=0; 1014',`dnl 1015 ax_0=out[tcolsu*bci]; 1016')dnl 1017dnl 1018dnl 1019')dnl 1020dnl 1021ifelse(RSB_M4_IS_SPSX_KERNEL_MOP(mop),`1',`dnl 1022pushdef(`skip_head_row_elements',ifelse(RSB_M4_IS_DIAGONAL_IMPLICIT(k_diagonal),`1',`0',ifelse(uplo,`u',`1',`0')))dnl 1023pushdef(`skip_tail_row_elements',ifelse(RSB_M4_IS_DIAGONAL_IMPLICIT(k_diagonal),`1',`0',ifelse(uplo,`u',`0',`1')))dnl 1024',`dnl 1025pushdef(`skip_head_row_elements',0)dnl 1026pushdef(`skip_tail_row_elements',0)dnl 1027')dnl 1028dnl 1029ifelse(is_a_backward_kernel,1,` 1030dnl 1031dnl FIXME : backward kernels are noly used for SPSV, and they start with one element less 
1032dnl 1033 for(k=lk-1-skip_tail_row_elements`'dnl 1034,a=VA+k,mi=bindx[k];k+1>=fk+1+skip_head_row_elements ;--k,block_forward,mi=bindx[k]) 1035dnl /* k is the index of the block */ 1036',`dnl 1037 ifelse(skip_head_row_elements,1,block_forward;) 1038 for(k=fk+skip_head_row_elements,mi=bindx[k];k<lk-skip_tail_row_elements ;++k,block_forward,mi=bindx[k]) 1039dnl /* k is the index of the block */ 1040')dnl 1041 { 1042ifelse(RSB_M4_SAME(transposition,RSB_M4_TRANS_N),1,`dnl 1043 const mtype *b=out + (tcolsu*bci); 1044 mtype *c=&ax_0; 1045')dnl 1046dnl 1047dnl Fixed for Hermitian k_symmetry. 1048dnl 1049ifelse(is_diag_d_spsv_kernel,1,`dnl 1050 out[trowsu*bri]-=RSB_M4_CONJ(*a,mtype,transposition,k_symmetry)*ax_0; 1051',`dnl 1052{RSB_M4_KERNEL_FUNCTION_BODY(`row',`rows',b_rows,`column',`columns',b_columns,mtype,,mop,unrolling,RSB_M4_SYMBOL_UNSYMMETRIC)} 1053')dnl 1054dnl 1055 } 1056dnl 1057ifelse(is_diag_d_spsv_kernel,1,`dnl 1058ifelse(RSB_M4_IS_SPSX_OP_SCALING_KERNEL_MOP(mop),1,`dnl 1059 out[tcolsu*bci]*=alpha; 1060')dnl 1061')dnl 1062dnl 1063ifelse(is_diag_d_spsv_kernel,1,`',`dnl 1064ifelse(RSB_M4_IS_DIAGONAL_IMPLICIT(k_diagonal),1,`',`dnl 1065 if(lk-fk>0) 1066dnl /* if this row block was not empty */ 1067')dnl 1068 { 1069 /* `the last element (which for a lower triangular solve is on the diagonal')*/ 1070dnl block_backward; 1071 /* Lx=y ; x_0=y_0/L_1_1 */ 1072 mtype *c_0=out+(trowsu*bri); 1073ifelse(RSB_M4_IS_DIAGONAL_IMPLICIT(k_diagonal),1,`dnl 1074 const mtype aa=1; 1075',`dnl 1076dnl elements on the diagonal are real, and no conjugation is needed 1077 const mtype aa=VA[ifelse(uplo,`u',`fk',`lk-1')]; 1078ifelse(RSB_M4_WANT_SPSM_DIAG_CHECK(),1,`dnl 1079 if(aa == RSB_M4_ZERO(mtype))return RSB_ERR_INVALID_NUMERICAL_DATA; 1080')dnl 1081')dnl 1082dnl 1083dnl 1084ifelse(RSB_M4_IS_SPSX_OP_SCALING_KERNEL_MOP(mop),1,`dnl 1085 *c_0 =(alpha*bb_0 - ax_0)/aa; /* ax_0 + *a * *c_0=bb_0 -> (*c_0)=(bb_0 - ax_0 )/(*a) */ 1086')dnl 
1087ifelse(RSB_M4_IS_SPSX_OP_SETTING_KERNEL_MOP(mop),1,`dnl 1088 *c_0=(bb_0 - ax_0)/aa; /* ax_0 + *a * *c_0=bb_0 -> (*c_0)=(bb_0 - ax_0 )/(*a) */ 1089')dnl 1090dnl 1091ifelse(RSB_M4_IS_DIAGONAL_IMPLICIT(k_diagonal),1,`',`dnl 1092 block_forward; 1093')dnl 1094 } 1095')dnl 1096dnl 1097popdef(`skip_head_row_elements')dnl 1098popdef(`skip_tail_row_elements')dnl 1099dnl 1100dnl 1101dnl /* SPSV KERNEL ENDS HERE */ 1102dnl END SPSV KERNEL DEF 1103')dnl 1104dnl 1105ifelse(RSB_M4_NOT(RSB_M4_IS_SPXX_KERNEL_MOP(mop)),`1',`dnl 1106dnl BEGIN MISC KERNEL DEF 1107dnl 1108 /* touppercase(mop) KERNEL HERE */ 1109dnl for(k=fk,mi=bindx[k];k<lk;++k,block_forward,mi=bindx[k]) 20120915 /*buggy loop */ 1110 for(k=fk;k<lk;++k,block_forward) 1111 { 1112 mi=bindx[k]; 1113 { 1114ifelse(mop,`scale',`dnl 1115 /*a=VA+indptr[(k)];*/ 1116 const mtype *d=scale_factors+(trowsu*bri); 1117')dnl 1118ifelse(mop,`negation',`dnl 1119 /*a=VA+indptr[k];*/ 1120')dnl 1121ifelse(RSB_M4_IS_ACC_WRITING_KERNEL_MOP(mop),`1',`dnl 1122 /*a=VA+indptr[k];*/ 1123 mtype *local_row_sums = row_sums+(trowsu*bri); 1124')dnl 1125dnl {RSB_M4_KERNEL_FUNCTION_BODY(`row',`rows',b_rows,`column',`columns',b_columns,mtype,,mop,unrolling,RSB_M4_SYMBOL_UNSYMMETRIC)} 1126{RSB_M4_KERNEL_FUNCTION_BODY(`row',`rows',b_rows,`column',`columns',b_columns,mtype,,mop,unrolling,k_symmetry)} 1127 } 1128 } 1129dnl 1130dnl END MISC KERNEL DEF 1131')dnl 1132dnl 1133dnl END KERNELS DEFINITION 1134dnl 1135dnl BEGIN COMMON EXTERNAL LOOP CLOSING 1136 } 1137dnl END COMMON EXTERNAL LOOP CLOSING 1138dnl 1139dnl ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),`1',dnl 1140dnl `incx--;incy--;/* we are interested in the increment off 1 */ 1141dnl ')dnl 1142dnl 1143dnl 1144ifelse(RSB_M4_IS_FORMAT_BCSS(matrix_storage),1,`dnl 1145dnl this check would be good for no-looped functions only! 
1146dnl if(columns != b_columns || rows != b_rows)return RSB_ERR_BADARGS; /* a non comprehensive check of course*/ 1147 1148dnl FIXME : ONLY EXPERIMENTAL OPENMP SUPPORT 1149dnl 1150dnl 1151ifelse(RSB_M4_WANT_OMP_IN_KERNELS,`1',`dnl 1152 size_t tn; 1153 size_t nt; 1154`#'dnl 1155 pragma omp parallel num_threads(rsb_global_session_handle.rsb_g_threads) private(mi,Mi,k,tn,nt) 1156 { 1157 tn = omp_get_thread_num(); 1158 nt = omp_get_num_threads(); 1159 /*RSB_INFO("working on %d / %d threads\n",tn,nt);*/ 1160 //for(Mi=tn;Mi<Mdim;Mi+=nt) 1161 size_t ui=((Mdim/nt)*(tn+1)); 1162 size_t li=(Mdim/nt)*tn; 1163 if(ui>Mdim)ui=Mdim; 1164dnl #pragma omp for schedule(static,1) /* shared L1 cache */ 1165#pragma omp for schedule(static,(Mdim+1)/2) /* separate L1 caches */ 1166 for(Mi=li;RSB_LIKELY(Mi<ui);++Mi) 1167 { 1168 //RSB_INFO("row %d working on %d / %d threads\n",mi,tn,nt); 1169',`dnl 1170dnl 1171')dnl 1172dnl ifelse(RSB_M4_IS_FORMAT_ROW_MAJOR(matrix_storage),1,`dnl 1173dnl /* should zero output block here (for efficiency) instead of function top */ 1174dnl ')dnl 1175dnl 1176dnl FIXME: the following is NEW, and useful also for SYMMETRIC 1177dnl /* transpose.. 
is transposed */
dnl /* useless for storage matrix_storage */
dnl /*if(bpntr[Mi]==bpntr[Mi+1])continue;*/ /* empty */
ifelse(mop,`spmv_uauz',`dnl
dnl mtype *c=out+(rowsu*mi); /* declaration of c put here for experimental purposes */
')dnl
dnl
dnl
dnl FIXME : blocked TRS kernels are broken, in this way
dnl
dnl mi=bindx[k];
dnl /* `mop' is mop */
dnl
dnl
dnl
dnl
dnl
dnl NOTE(review): below, pop the per-invocation definitions pushed earlier in
dnl this macro, restoring any outer (shadowed) values on the pushdef stacks.
popdef(`is_diag_d_spsv_kernel')dnl
popdef(`tcolsu')dnl
popdef(`trowsu')dnl
popdef(`colsu')dnl
popdef(`rowsu')dnl
popdef(`transposed')dnl 1/2
dnl popdef(`transposed')dnl 2/2
popdef(`should_init_out_vector_before_outer_loop')dnl
popdef(`total_block_columns')dnl
popdef(`total_block_rows')dnl
popdef(`total_rows')dnl
popdef(`total_columns')dnl
dnl
dnl
ifelse(RSB_M4_WANT_OMP_IN_KERNELS,`1',`dnl
 }
')dnl
popdef(`mi')dnl
popdef(`Mi')dnl
popdef(`brit')dnl
popdef(`bcit')dnl
popdef(`brin')dnl
popdef(`bcin')dnl
popdef(`bri')dnl
popdef(`bci')dnl
')dnl
dnl
 return RSB_ERR_NO_ERROR;
dnl
')dnl
')dnl
dnl
')')dnl
dnl
dnl
popdef(`skip_implementation')dnl
popdef(`out_dim')dnl
popdef(`is_a_backward_kernel')dnl
popdef(`is_an_externally_backward_kernel')dnl
popdef(`is_zero_acc_spsv_kernel')dnl
popdef(`block_forward')dnl
popdef(`block_backward')dnl
popdef(`extra_xstride')dnl
popdef(`extra_ystride')dnl
}
dnl
')dnl
dnl
')dnl
dnl
popdef(`uplo')dnl
popdef(`want_what')dnl
popdef(`k_diagonal')dnl
popdef(`citype')dnl
popdef(`mop')dnl
popdef(`matrix_storage')dnl
popdef(`k_symmetry')dnl
popdef(`transposition')dnl
popdef(`mtype')dnl
popdef(`itype')dnl
popdef(`unrolling')dnl
')dnl
dnl
dnl
dnl RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION ends above.
dnl
dnl RSB_M4_BCSS_MISC_KERNELS(unrollings)
dnl ------------------------------------
dnl Emits fixed-block-size kernel functions and their size dispatchers for
dnl every matrix operation (mop) which is NEITHER an SPSV nor an SPMV kernel
dnl mop (enforced by the `00' ifelse guard below), looping over all numerical
dnl types, BCSS storage formats, block unrollings, symmetries, transpositions,
dnl coordinate index types, diagonal flavors and uplo variants.
dnl $1 (`unrollings'): list of loop unrolling flavors to instantiate.
dnl
define(`RSB_M4_BCSS_MISC_KERNELS',`dnl
dnl
pushdef(`unrollings',$1)dnl
dnl
dnl FIXED BLOCK SIZE KERNELS :
dnl
foreach(`type',RSB_M4_MATRIX_TYPES,`dnl
foreach(`mop',RSB_M4_MATRIX_OPS,`dnl
foreach(`matrix_storage',RSB_M4_BCSS_FORMATS,`dnl
foreach(`unrolling',unrollings,`dnl
ifelse(RSB_M4_IS_SPSV_KERNEL_MOP(mop)RSB_M4_IS_SPMV_KERNEL_MOP(mop),00,`dnl
foreach(`rowsu',RSB_M4_ROWS_UNROLL,`dnl
foreach(`colsu',RSB_M4_COLUMNS_UNROLL,`dnl
foreach(`k_symmetry',RSB_M4_MATRIX_SYMMETRY,`dnl
foreach(`transposition',RSB_M4_MATRIX_TRANSPOSITIONS,`dnl
foreach(`citype',RSB_M4_MATRIX_COORDINATE_TYPES,`dnl
foreach(`k_diagonal',RSB_M4_MATRIX_DIAGONAL_TYPES,`dnl
foreach(`uplo',RSB_M4_MATRIX_UPLO_TYPES,`dnl
RSB_M4_BCSS_KERNEL_FUNCTION(`all',type,matrix_storage,transposition,k_symmetry,rowsu,colsu,unrolling,mop,citype,k_diagonal,uplo)
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
dnl
dnl FIXED BLOCK SIZE DISPATCHERS :
dnl
foreach(`type',RSB_M4_MATRIX_TYPES,`dnl
foreach(`mop',RSB_M4_MATRIX_OPS,`dnl
ifelse(RSB_M4_IS_SPSV_KERNEL_MOP(mop)RSB_M4_IS_SPMV_KERNEL_MOP(mop),00,`dnl
foreach(`matrix_storage',RSB_M4_BCSS_FORMATS,`dnl
foreach(`unrolling',unrollings,`dnl
foreach(`k_symmetry',RSB_M4_MATRIX_SYMMETRY,`dnl
foreach(`transposition',RSB_M4_MATRIX_TRANSPOSITIONS,`dnl
foreach(`citype',RSB_M4_MATRIX_COORDINATE_TYPES,`dnl
foreach(`k_diagonal',RSB_M4_MATRIX_DIAGONAL_TYPES,`dnl
foreach(`uplo',RSB_M4_MATRIX_UPLO_TYPES,`dnl
RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(`all',type,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo)
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
dnl
dnl
popdef(`unrollings')dnl
dnl
')dnl
dnl
dnl
dnl
dnl
dnl RSB_M4_BCSS_SPMV_KERNELS(unrollings)
dnl ------------------------------------
dnl Same expansion scheme as RSB_M4_BCSS_MISC_KERNELS, but restricted to the
dnl SPMV kernel mops (guard: RSB_M4_IS_SPMV_KERNEL_MOP(mop) == 1).
dnl $1 (`unrollings'): list of loop unrolling flavors to instantiate.
dnl
define(`RSB_M4_BCSS_SPMV_KERNELS',`dnl
dnl
pushdef(`unrollings',$1)dnl
dnl
dnl FIXED BLOCK SIZE KERNELS :
dnl
foreach(`type',RSB_M4_MATRIX_TYPES,`dnl
foreach(`mop',RSB_M4_MATRIX_OPS,`dnl
foreach(`matrix_storage',RSB_M4_BCSS_FORMATS,`dnl
foreach(`unrolling',unrollings,`dnl
ifelse(RSB_M4_IS_SPMV_KERNEL_MOP(mop),1,`dnl
foreach(`k_diagonal',RSB_M4_MATRIX_DIAGONAL_TYPES,`dnl
foreach(`rowsu',RSB_M4_ROWS_UNROLL,`dnl
foreach(`colsu',RSB_M4_COLUMNS_UNROLL,`dnl
foreach(`k_symmetry',RSB_M4_MATRIX_SYMMETRY,`dnl
foreach(`transposition',RSB_M4_MATRIX_TRANSPOSITIONS,`dnl
foreach(`citype',RSB_M4_MATRIX_COORDINATE_TYPES,`dnl
foreach(`uplo',RSB_M4_MATRIX_UPLO_TYPES,`dnl
RSB_M4_BCSS_KERNEL_FUNCTION(`all',type,matrix_storage,transposition,k_symmetry,rowsu,colsu,unrolling,mop,citype,k_diagonal,uplo)
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
dnl
dnl FIXED BLOCK SIZE DISPATCHERS :
dnl
foreach(`type',RSB_M4_MATRIX_TYPES,`dnl
foreach(`mop',RSB_M4_MATRIX_OPS,`dnl
ifelse(RSB_M4_IS_SPMV_KERNEL_MOP(mop),1,`dnl
foreach(`matrix_storage',RSB_M4_BCSS_FORMATS,`dnl
foreach(`unrolling',unrollings,`dnl
foreach(`k_symmetry',RSB_M4_MATRIX_SYMMETRY,`dnl
foreach(`transposition',RSB_M4_MATRIX_TRANSPOSITIONS,`dnl
foreach(`citype',RSB_M4_MATRIX_COORDINATE_TYPES,`dnl
foreach(`k_diagonal',RSB_M4_MATRIX_DIAGONAL_TYPES,`dnl
foreach(`uplo',RSB_M4_MATRIX_UPLO_TYPES,`dnl
RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(`all',type,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo)
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
dnl
dnl
popdef(`unrollings')dnl
dnl
')dnl
dnl
dnl
dnl
dnl
dnl RSB_M4_BCSS_SPSV_KERNELS(unrollings)
dnl ------------------------------------
dnl Same expansion scheme as above, restricted to the SPSV (triangular solve)
dnl kernel mops (guard: RSB_M4_IS_SPSV_KERNEL_MOP(mop) == 1).
dnl $1 (`unrollings'): list of loop unrolling flavors to instantiate.
dnl
define(`RSB_M4_BCSS_SPSV_KERNELS',`dnl
dnl
pushdef(`unrollings',$1)dnl
dnl
dnl FIXED BLOCK SIZE KERNELS :
dnl
foreach(`type',RSB_M4_MATRIX_TYPES,`dnl
foreach(`mop',RSB_M4_MATRIX_OPS,`dnl
ifelse(RSB_M4_IS_SPSV_KERNEL_MOP(mop),1,`dnl
foreach(`k_diagonal',RSB_M4_MATRIX_DIAGONAL_TYPES,`dnl
foreach(`matrix_storage',RSB_M4_BCSS_FORMATS,`dnl
foreach(`unrolling',unrollings,`dnl
foreach(`rowsu',RSB_M4_ROWS_UNROLL,`dnl
foreach(`colsu',RSB_M4_COLUMNS_UNROLL,`dnl
foreach(`k_symmetry',RSB_M4_MATRIX_SYMMETRY,`dnl
foreach(`transposition',RSB_M4_MATRIX_TRANSPOSITIONS,`dnl
foreach(`citype',RSB_M4_MATRIX_COORDINATE_TYPES,`dnl
foreach(`uplo',RSB_M4_MATRIX_UPLO_TYPES,`dnl
RSB_M4_BCSS_KERNEL_FUNCTION(`all',type,matrix_storage,transposition,k_symmetry,rowsu,colsu,unrolling,mop,citype,k_diagonal,uplo)
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
dnl
dnl FIXED BLOCK SIZE DISPATCHERS :
dnl
foreach(`type',RSB_M4_MATRIX_TYPES,`dnl
foreach(`mop',RSB_M4_MATRIX_OPS,`dnl
ifelse(RSB_M4_IS_SPSV_KERNEL_MOP(mop),1,`dnl
foreach(`matrix_storage',RSB_M4_BCSS_FORMATS,`dnl
foreach(`unrolling',unrollings,`dnl
foreach(`k_symmetry',RSB_M4_MATRIX_SYMMETRY,`dnl
foreach(`transposition',RSB_M4_MATRIX_TRANSPOSITIONS,`dnl
foreach(`citype',RSB_M4_MATRIX_COORDINATE_TYPES,`dnl
foreach(`k_diagonal',RSB_M4_MATRIX_DIAGONAL_TYPES,`dnl
foreach(`uplo',RSB_M4_MATRIX_UPLO_TYPES,`dnl
RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(`all',type,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo)
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
dnl
dnl
popdef(`unrollings')dnl
dnl
')dnl
dnl
dnl
dnl
dnl
dnl
dnl RSB_M4_BCSS_KERNELS(unrollings)
dnl -------------------------------
dnl Emits fixed-block-size kernel functions and size dispatchers for ALL
dnl matrix operations (no mop guard), over all numerical types, BCSS formats,
dnl diagonal flavors, unrollings, symmetries, transpositions, coordinate
dnl index types and uplo variants.
dnl $1 (`unrollings'): list of loop unrolling flavors to instantiate.
dnl
define(`RSB_M4_BCSS_KERNELS',`dnl
dnl
pushdef(`unrollings',$1)dnl
dnl
dnl FIXED BLOCK SIZE KERNELS :
dnl
foreach(`type',RSB_M4_MATRIX_TYPES,`dnl
foreach(`mop',RSB_M4_MATRIX_OPS,`dnl
foreach(`matrix_storage',RSB_M4_BCSS_FORMATS,`dnl
foreach(`k_diagonal',RSB_M4_MATRIX_DIAGONAL_TYPES,`dnl
foreach(`unrolling',unrollings,`dnl
foreach(`rowsu',RSB_M4_ROWS_UNROLL,`dnl
foreach(`colsu',RSB_M4_COLUMNS_UNROLL,`dnl
foreach(`k_symmetry',RSB_M4_MATRIX_SYMMETRY,`dnl
foreach(`transposition',RSB_M4_MATRIX_TRANSPOSITIONS,`dnl
foreach(`citype',RSB_M4_MATRIX_COORDINATE_TYPES,`dnl
foreach(`uplo',RSB_M4_MATRIX_UPLO_TYPES,`dnl
RSB_M4_BCSS_KERNEL_FUNCTION(`all',type,matrix_storage,transposition,k_symmetry,rowsu,colsu,unrolling,mop,citype,k_diagonal,uplo)
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
dnl
dnl FIXED BLOCK SIZE DISPATCHERS :
dnl
foreach(`type',RSB_M4_MATRIX_TYPES,`dnl
foreach(`mop',RSB_M4_MATRIX_OPS,`dnl
foreach(`k_diagonal',RSB_M4_MATRIX_DIAGONAL_TYPES,`dnl
foreach(`matrix_storage',RSB_M4_BCSS_FORMATS,`dnl
foreach(`unrolling',unrollings,`dnl
foreach(`k_symmetry',RSB_M4_MATRIX_SYMMETRY,`dnl
foreach(`transposition',RSB_M4_MATRIX_TRANSPOSITIONS,`dnl
foreach(`citype',RSB_M4_MATRIX_COORDINATE_TYPES,`dnl
foreach(`uplo',RSB_M4_MATRIX_UPLO_TYPES,`dnl
RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(`all',type,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo)
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
dnl
dnl
popdef(`unrollings')dnl
dnl
')dnl
dnl
dnl
dnl