1 /* 2 3 BLIS 4 An object-based framework for developing high-performance BLAS-like 5 libraries. 6 7 Copyright (C) 2014, The University of Texas at Austin 8 9 Redistribution and use in source and binary forms, with or without 10 modification, are permitted provided that the following conditions are 11 met: 12 - Redistributions of source code must retain the above copyright 13 notice, this list of conditions and the following disclaimer. 14 - Redistributions in binary form must reproduce the above copyright 15 notice, this list of conditions and the following disclaimer in the 16 documentation and/or other materials provided with the distribution. 17 - Neither the name(s) of the copyright holder(s) nor the names of its 18 contributors may be used to endorse or promote products derived 19 from this software without specific prior written permission. 20 21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 33 */ 34 35 #include "blis.h" 36 37 // 38 // Define BLAS-like interfaces with typed operands. 39 // 40 41 #undef GENTFUNC 42 #define GENTFUNC( ctype, ch, opname, kername, kerid ) \ 43 \ 44 void PASTEMAC(ch,opname) \ 45 ( \ 46 doff_t diagoffx, \ 47 diag_t diagx, \ 48 uplo_t uplox, \ 49 trans_t transx, \ 50 dim_t m, \ 51 dim_t n, \ 52 ctype* x, inc_t rs_x, inc_t cs_x, \ 53 ctype* y, inc_t rs_y, inc_t cs_y, \ 54 cntx_t* cntx, \ 55 rntm_t* rntm \ 56 ) \ 57 { \ 58 const num_t dt = PASTEMAC(ch,type); \ 59 \ 60 ctype* x1; \ 61 ctype* y1; \ 62 uplo_t uplox_eff; \ 63 conj_t conjx; \ 64 dim_t n_iter; \ 65 dim_t n_elem, n_elem_max; \ 66 inc_t ldx, incx; \ 67 inc_t ldy, incy; \ 68 dim_t j, i; \ 69 dim_t ij0, n_shift; \ 70 \ 71 /* Set various loop parameters. */ \ 72 bli_set_dims_incs_uplo_2m \ 73 ( \ 74 diagoffx, diagx, transx, \ 75 uplox, m, n, rs_x, cs_x, rs_y, cs_y, \ 76 &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, &incy, &ldy, \ 77 &ij0, &n_shift \ 78 ); \ 79 \ 80 if ( bli_is_zeros( uplox_eff ) ) return; \ 81 \ 82 /* Extract the conjugation component from the transx parameter. */ \ 83 conjx = bli_extract_conj( transx ); \ 84 \ 85 /* Query the kernel needed for this operation. */ \ 86 PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ 87 \ 88 /* Handle dense and upper/lower storage cases separately. */ \ 89 if ( bli_is_dense( uplox_eff ) ) \ 90 { \ 91 for ( j = 0; j < n_iter; ++j ) \ 92 { \ 93 n_elem = n_elem_max; \ 94 \ 95 x1 = x + (j )*ldx + (0 )*incx; \ 96 y1 = y + (j )*ldy + (0 )*incy; \ 97 \ 98 /* Invoke the kernel with the appropriate parameters. */ \ 99 f( \ 100 conjx, \ 101 n_elem, \ 102 x1, incx, \ 103 y1, incy, \ 104 cntx \ 105 ); \ 106 } \ 107 } \ 108 else \ 109 { \ 110 if ( bli_is_upper( uplox_eff ) ) \ 111 { \ 112 for ( j = 0; j < n_iter; ++j ) \ 113 { \ 114 n_elem = bli_min( n_shift + j + 1, n_elem_max ); \ 115 \ 116 x1 = x + (ij0+j )*ldx + (0 )*incx; \ 117 y1 = y + (ij0+j )*ldy + (0 )*incy; \ 118 \ 119 /* Invoke the kernel with the appropriate parameters. */ \ 120 f( \ 121 conjx, \ 122 n_elem, \ 123 x1, incx, \ 124 y1, incy, \ 125 cntx \ 126 ); \ 127 } \ 128 } \ 129 else if ( bli_is_lower( uplox_eff ) ) \ 130 { \ 131 for ( j = 0; j < n_iter; ++j ) \ 132 { \ 133 i = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \ 134 n_elem = n_elem_max - i; \ 135 \ 136 x1 = x + (j )*ldx + (ij0+i )*incx; \ 137 y1 = y + (j )*ldy + (ij0+i )*incy; \ 138 \ 139 /* Invoke the kernel with the appropriate parameters. */ \ 140 f( \ 141 conjx, \ 142 n_elem, \ 143 x1, incx, \ 144 y1, incy, \ 145 cntx \ 146 ); \ 147 } \ 148 } \ 149 } \ 150 } 151 152 INSERT_GENTFUNC_BASIC2( addm_unb_var1, addv, BLIS_ADDV_KER ) 153 INSERT_GENTFUNC_BASIC2( copym_unb_var1, copyv, BLIS_COPYV_KER ) 154 INSERT_GENTFUNC_BASIC2( subm_unb_var1, subv, BLIS_SUBV_KER ) 155 156 157 #undef GENTFUNC 158 #define GENTFUNC( ctype, ch, opname, kername, kerid ) \ 159 \ 160 void PASTEMAC(ch,opname) \ 161 ( \ 162 doff_t diagoffx, \ 163 diag_t diagx, \ 164 uplo_t uplox, \ 165 trans_t transx, \ 166 dim_t m, \ 167 dim_t n, \ 168 ctype* alpha, \ 169 ctype* x, inc_t rs_x, inc_t cs_x, \ 170 ctype* y, inc_t rs_y, inc_t cs_y, \ 171 cntx_t* cntx, \ 172 rntm_t* rntm \ 173 ) \ 174 { \ 175 const num_t dt = PASTEMAC(ch,type); \ 176 \ 177 ctype* x1; \ 178 ctype* y1; \ 179 uplo_t uplox_eff; \ 180 conj_t conjx; \ 181 dim_t n_iter; \ 182 dim_t n_elem, n_elem_max; \ 183 inc_t ldx, incx; \ 184 inc_t ldy, incy; \ 185 dim_t j, i; \ 186 dim_t ij0, n_shift; \ 187 \ 188 /* Set various loop parameters. */ \ 189 bli_set_dims_incs_uplo_2m \ 190 ( \ 191 diagoffx, diagx, transx, \ 192 uplox, m, n, rs_x, cs_x, rs_y, cs_y, \ 193 &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, &incy, &ldy, \ 194 &ij0, &n_shift \ 195 ); \ 196 \ 197 if ( bli_is_zeros( uplox_eff ) ) return; \ 198 \ 199 /* Extract the conjugation component from the transx parameter. */ \ 200 conjx = bli_extract_conj( transx ); \ 201 \ 202 /* Query the kernel needed for this operation. */ \ 203 PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ 204 \ 205 /* Handle dense and upper/lower storage cases separately. */ \ 206 if ( bli_is_dense( uplox_eff ) ) \ 207 { \ 208 for ( j = 0; j < n_iter; ++j ) \ 209 { \ 210 n_elem = n_elem_max; \ 211 \ 212 x1 = x + (j )*ldx + (0 )*incx; \ 213 y1 = y + (j )*ldy + (0 )*incy; \ 214 \ 215 /* Invoke the kernel with the appropriate parameters. */ \ 216 f( \ 217 conjx, \ 218 n_elem, \ 219 alpha, \ 220 x1, incx, \ 221 y1, incy, \ 222 cntx \ 223 ); \ 224 } \ 225 } \ 226 else \ 227 { \ 228 if ( bli_is_upper( uplox_eff ) ) \ 229 { \ 230 for ( j = 0; j < n_iter; ++j ) \ 231 { \ 232 n_elem = bli_min( n_shift + j + 1, n_elem_max ); \ 233 \ 234 x1 = x + (ij0+j )*ldx + (0 )*incx; \ 235 y1 = y + (ij0+j )*ldy + (0 )*incy; \ 236 \ 237 /* Invoke the kernel with the appropriate parameters. */ \ 238 f( \ 239 conjx, \ 240 n_elem, \ 241 alpha, \ 242 x1, incx, \ 243 y1, incy, \ 244 cntx \ 245 ); \ 246 } \ 247 } \ 248 else if ( bli_is_lower( uplox_eff ) ) \ 249 { \ 250 for ( j = 0; j < n_iter; ++j ) \ 251 { \ 252 i = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \ 253 n_elem = n_elem_max - i; \ 254 \ 255 x1 = x + (j )*ldx + (ij0+i )*incx; \ 256 y1 = y + (j )*ldy + (ij0+i )*incy; \ 257 \ 258 /* Invoke the kernel with the appropriate parameters. */ \ 259 f( \ 260 conjx, \ 261 n_elem, \ 262 alpha, \ 263 x1, incx, \ 264 y1, incy, \ 265 cntx \ 266 ); \ 267 } \ 268 } \ 269 } \ 270 } 271 272 INSERT_GENTFUNC_BASIC2( axpym_unb_var1, axpyv, BLIS_AXPYV_KER ) 273 INSERT_GENTFUNC_BASIC2( scal2m_unb_var1, scal2v, BLIS_SCAL2V_KER ) 274 275 276 #undef GENTFUNC 277 #define GENTFUNC( ctype, ch, opname, kername, kerid ) \ 278 \ 279 void PASTEMAC(ch,opname) \ 280 ( \ 281 conj_t conjalpha, \ 282 doff_t diagoffx, \ 283 diag_t diagx, \ 284 uplo_t uplox, \ 285 dim_t m, \ 286 dim_t n, \ 287 ctype* alpha, \ 288 ctype* x, inc_t rs_x, inc_t cs_x, \ 289 cntx_t* cntx, \ 290 rntm_t* rntm \ 291 ) \ 292 { \ 293 const num_t dt = PASTEMAC(ch,type); \ 294 \ 295 ctype* x1; \ 296 uplo_t uplox_eff; \ 297 dim_t n_iter; \ 298 dim_t n_elem, n_elem_max; \ 299 inc_t ldx, incx; \ 300 dim_t j, i; \ 301 dim_t ij0, n_shift; \ 302 \ 303 /* Set various loop parameters. */ \ 304 bli_set_dims_incs_uplo_1m \ 305 ( \ 306 diagoffx, diagx, \ 307 uplox, m, n, rs_x, cs_x, \ 308 &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, \ 309 &ij0, &n_shift \ 310 ); \ 311 \ 312 if ( bli_is_zeros( uplox_eff ) ) return; \ 313 \ 314 /* Query the kernel needed for this operation. */ \ 315 PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ 316 \ 317 /* Handle dense and upper/lower storage cases separately. */ \ 318 if ( bli_is_dense( uplox_eff ) ) \ 319 { \ 320 for ( j = 0; j < n_iter; ++j ) \ 321 { \ 322 n_elem = n_elem_max; \ 323 \ 324 x1 = x + (j )*ldx + (0 )*incx; \ 325 \ 326 /* Invoke the kernel with the appropriate parameters. */ \ 327 f( \ 328 conjalpha, \ 329 n_elem, \ 330 alpha, \ 331 x1, incx, \ 332 cntx \ 333 ); \ 334 } \ 335 } \ 336 else \ 337 { \ 338 if ( bli_is_upper( uplox_eff ) ) \ 339 { \ 340 for ( j = 0; j < n_iter; ++j ) \ 341 { \ 342 n_elem = bli_min( n_shift + j + 1, n_elem_max ); \ 343 \ 344 x1 = x + (ij0+j )*ldx + (0 )*incx; \ 345 \ 346 /* Invoke the kernel with the appropriate parameters. */ \ 347 f( \ 348 conjalpha, \ 349 n_elem, \ 350 alpha, \ 351 x1, incx, \ 352 cntx \ 353 ); \ 354 } \ 355 } \ 356 else if ( bli_is_lower( uplox_eff ) ) \ 357 { \ 358 for ( j = 0; j < n_iter; ++j ) \ 359 { \ 360 i = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \ 361 n_elem = n_elem_max - i; \ 362 \ 363 x1 = x + (j )*ldx + (ij0+i )*incx; \ 364 \ 365 /* Invoke the kernel with the appropriate parameters. */ \ 366 f( \ 367 conjalpha, \ 368 n_elem, \ 369 alpha, \ 370 x1, incx, \ 371 cntx \ 372 ); \ 373 } \ 374 } \ 375 } \ 376 } 377 378 INSERT_GENTFUNC_BASIC2( scalm_unb_var1, scalv, BLIS_SCALV_KER ) 379 INSERT_GENTFUNC_BASIC2( setm_unb_var1, setv, BLIS_SETV_KER ) 380 381 382 #undef GENTFUNC 383 #define GENTFUNC( ctype, ch, opname, kername, kerid ) \ 384 \ 385 void PASTEMAC(ch,opname) \ 386 ( \ 387 doff_t diagoffx, \ 388 diag_t diagx, \ 389 uplo_t uplox, \ 390 trans_t transx, \ 391 dim_t m, \ 392 dim_t n, \ 393 ctype* x, inc_t rs_x, inc_t cs_x, \ 394 ctype* beta, \ 395 ctype* y, inc_t rs_y, inc_t cs_y, \ 396 cntx_t* cntx, \ 397 rntm_t* rntm \ 398 ) \ 399 { \ 400 const num_t dt = PASTEMAC(ch,type); \ 401 \ 402 ctype* x1; \ 403 ctype* y1; \ 404 uplo_t uplox_eff; \ 405 conj_t conjx; \ 406 dim_t n_iter; \ 407 dim_t n_elem, n_elem_max; \ 408 inc_t ldx, incx; \ 409 inc_t ldy, incy; \ 410 dim_t j, i; \ 411 dim_t ij0, n_shift; \ 412 \ 413 /* Set various loop parameters. */ \ 414 bli_set_dims_incs_uplo_2m \ 415 ( \ 416 diagoffx, diagx, transx, \ 417 uplox, m, n, rs_x, cs_x, rs_y, cs_y, \ 418 &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, &incy, &ldy, \ 419 &ij0, &n_shift \ 420 ); \ 421 \ 422 if ( bli_is_zeros( uplox_eff ) ) return; \ 423 \ 424 /* Extract the conjugation component from the transx parameter. */ \ 425 conjx = bli_extract_conj( transx ); \ 426 \ 427 /* Query the kernel needed for this operation. */ \ 428 PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ 429 \ 430 /* Handle dense and upper/lower storage cases separately. */ \ 431 if ( bli_is_dense( uplox_eff ) ) \ 432 { \ 433 for ( j = 0; j < n_iter; ++j ) \ 434 { \ 435 n_elem = n_elem_max; \ 436 \ 437 x1 = x + (j )*ldx + (0 )*incx; \ 438 y1 = y + (j )*ldy + (0 )*incy; \ 439 \ 440 /* Invoke the kernel with the appropriate parameters. */ \ 441 f( \ 442 conjx, \ 443 n_elem, \ 444 x1, incx, \ 445 beta, \ 446 y1, incy, \ 447 cntx \ 448 ); \ 449 } \ 450 } \ 451 else \ 452 { \ 453 if ( bli_is_upper( uplox_eff ) ) \ 454 { \ 455 for ( j = 0; j < n_iter; ++j ) \ 456 { \ 457 n_elem = bli_min( n_shift + j + 1, n_elem_max ); \ 458 \ 459 x1 = x + (ij0+j )*ldx + (0 )*incx; \ 460 y1 = y + (ij0+j )*ldy + (0 )*incy; \ 461 \ 462 /* Invoke the kernel with the appropriate parameters. */ \ 463 f( \ 464 conjx, \ 465 n_elem, \ 466 x1, incx, \ 467 beta, \ 468 y1, incy, \ 469 cntx \ 470 ); \ 471 } \ 472 } \ 473 else if ( bli_is_lower( uplox_eff ) ) \ 474 { \ 475 for ( j = 0; j < n_iter; ++j ) \ 476 { \ 477 i = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \ 478 n_elem = n_elem_max - i; \ 479 \ 480 x1 = x + (j )*ldx + (ij0+i )*incx; \ 481 y1 = y + (j )*ldy + (ij0+i )*incy; \ 482 \ 483 /* Invoke the kernel with the appropriate parameters. */ \ 484 f( \ 485 conjx, \ 486 n_elem, \ 487 x1, incx, \ 488 beta, \ 489 y1, incy, \ 490 cntx \ 491 ); \ 492 } \ 493 } \ 494 } \ 495 } 496 497 INSERT_GENTFUNC_BASIC2( xpbym_unb_var1, xpbyv, BLIS_XPBYV_KER ) 498 499 500 #undef GENTFUNC2 501 #define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname ) \ 502 \ 503 void PASTEMAC2(chx,chy,opname) \ 504 ( \ 505 doff_t diagoffx, \ 506 diag_t diagx, \ 507 uplo_t uplox, \ 508 trans_t transx, \ 509 dim_t m, \ 510 dim_t n, \ 511 ctype_x* x, inc_t rs_x, inc_t cs_x, \ 512 ctype_y* beta, \ 513 ctype_y* y, inc_t rs_y, inc_t cs_y, \ 514 cntx_t* cntx, \ 515 rntm_t* rntm \ 516 ) \ 517 { \ 518 ctype_x* restrict x1; \ 519 ctype_y* restrict y1; \ 520 uplo_t uplox_eff; \ 521 dim_t n_iter; \ 522 dim_t n_elem, n_elem_max; \ 523 inc_t ldx, incx; \ 524 inc_t ldy, incy; \ 525 dim_t j, i; \ 526 dim_t ij0, n_shift; \ 527 \ 528 /* Set various loop parameters. */ \ 529 bli_set_dims_incs_uplo_2m \ 530 ( \ 531 diagoffx, diagx, transx, \ 532 uplox, m, n, rs_x, cs_x, rs_y, cs_y, \ 533 &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, &incy, &ldy, \ 534 &ij0, &n_shift \ 535 ); \ 536 \ 537 /* Extract the conjugation component from the transx parameter. */ \ 538 /*conjx = bli_extract_conj( transx );*/ \ 539 \ 540 /* Handle dense and upper/lower storage cases separately. */ \ 541 if ( PASTEMAC(chy,eq1)( *beta ) ) \ 542 { \ 543 if ( incx == 1 && incy == 1 ) \ 544 { \ 545 n_elem = n_elem_max; \ 546 \ 547 for ( j = 0; j < n_iter; ++j ) \ 548 { \ 549 x1 = x + (j )*ldx + (0 )*incx; \ 550 y1 = y + (j )*ldy + (0 )*incy; \ 551 \ 552 ctype_x* restrict chi1 = x1; \ 553 ctype_y* restrict psi1 = y1; \ 554 \ 555 for ( i = 0; i < n_elem; ++i ) \ 556 { \ 557 PASTEMAC2(chx,chy,adds)( chi1[i], psi1[i] ); \ 558 } \ 559 } \ 560 } \ 561 else \ 562 { \ 563 n_elem = n_elem_max; \ 564 \ 565 for ( j = 0; j < n_iter; ++j ) \ 566 { \ 567 x1 = x + (j )*ldx + (0 )*incx; \ 568 y1 = y + (j )*ldy + (0 )*incy; \ 569 \ 570 ctype_x* restrict chi1 = x1; \ 571 ctype_y* restrict psi1 = y1; \ 572 \ 573 for ( i = 0; i < n_elem; ++i ) \ 574 { \ 575 PASTEMAC2(chx,chy,adds)( *chi1, *psi1 ); \ 576 \ 577 chi1 += incx; \ 578 psi1 += incy; \ 579 } \ 580 } \ 581 } \ 582 } \ 583 else /* ( !PASTEMAC(chy,eq1)( *beta ) ) */ \ 584 { \ 585 if ( incx == 1 && incy == 1 ) \ 586 { \ 587 n_elem = n_elem_max; \ 588 \ 589 for ( j = 0; j < n_iter; ++j ) \ 590 { \ 591 x1 = x + (j )*ldx + (0 )*incx; \ 592 y1 = y + (j )*ldy + (0 )*incy; \ 593 \ 594 ctype_x* restrict chi1 = x1; \ 595 ctype_y* restrict psi1 = y1; \ 596 \ 597 for ( i = 0; i < n_elem; ++i ) \ 598 { \ 599 PASTEMAC3(chx,chy,chy,xpbys)( chi1[i], *beta, psi1[i] ); \ 600 } \ 601 } \ 602 } \ 603 else \ 604 { \ 605 n_elem = n_elem_max; \ 606 \ 607 for ( j = 0; j < n_iter; ++j ) \ 608 { \ 609 x1 = x + (j )*ldx + (0 )*incx; \ 610 y1 = y + (j )*ldy + (0 )*incy; \ 611 \ 612 ctype_x* restrict chi1 = x1; \ 613 ctype_y* restrict psi1 = y1; \ 614 \ 615 for ( i = 0; i < n_elem; ++i ) \ 616 { \ 617 PASTEMAC3(chx,chy,chy,xpbys)( *chi1, *beta, *psi1 ); \ 618 \ 619 chi1 += incx; \ 620 psi1 += incy; \ 621 } \ 622 } \ 623 } \ 624 } \ 625 } 626 627 INSERT_GENTFUNC2_BASIC0( xpbym_md_unb_var1 ) 628 INSERT_GENTFUNC2_MIXDP0( xpbym_md_unb_var1 ) 629 630