1 /* 2 3 BLIS 4 An object-based framework for developing high-performance BLAS-like 5 libraries. 6 7 Copyright (C) 2019, Advanced Micro Devices, Inc. 8 9 Redistribution and use in source and binary forms, with or without 10 modification, are permitted provided that the following conditions are 11 met: 12 - Redistributions of source code must retain the above copyright 13 notice, this list of conditions and the following disclaimer. 14 - Redistributions in binary form must reproduce the above copyright 15 notice, this list of conditions and the following disclaimer in the 16 documentation and/or other materials provided with the distribution. 17 - Neither the name(s) of the copyright holder(s) nor the names of its 18 contributors may be used to endorse or promote products derived 19 from this software without specific prior written permission. 20 21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 33 */ 34 35 #include "blis.h" 36 37 // 38 // -- Row storage case --------------------------------------------------------- 39 // 40 41 #undef GENTFUNC 42 #define GENTFUNC( ctype, ch, opname, arch, suf ) \ 43 \ 44 void PASTEMAC3(ch,opname,arch,suf) \ 45 ( \ 46 conj_t conja, \ 47 conj_t conjb, \ 48 dim_t m, \ 49 dim_t n, \ 50 dim_t k, \ 51 ctype* restrict alpha, \ 52 ctype* restrict a, inc_t rs_a, inc_t cs_a, \ 53 ctype* restrict b, inc_t rs_b, inc_t cs_b, \ 54 ctype* restrict beta, \ 55 ctype* restrict c, inc_t rs_c, inc_t cs_c, \ 56 auxinfo_t* restrict data, \ 57 cntx_t* restrict cntx \ 58 ) \ 59 { \ 60 /* NOTE: This microkernel can actually handle arbitrarily large 61 values of m, n, and k. */ \ 62 \ 63 if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \ 64 { \ 65 /* Traverse c by rows. */ \ 66 for ( dim_t i = 0; i < m; ++i ) \ 67 { \ 68 ctype* restrict ci = &c[ i*rs_c ]; \ 69 ctype* restrict ai = &a[ i*rs_a ]; \ 70 \ 71 for ( dim_t j = 0; j < n; ++j ) \ 72 { \ 73 ctype* restrict cij = &ci[ j*cs_c ]; \ 74 ctype* restrict bj = &b [ j*cs_b ]; \ 75 ctype ab; \ 76 \ 77 PASTEMAC(ch,set0s)( ab ); \ 78 \ 79 /* Perform a dot product to update the (i,j) element of c. */ \ 80 for ( dim_t l = 0; l < k; ++l ) \ 81 { \ 82 ctype* restrict aij = &ai[ l*cs_a ]; \ 83 ctype* restrict bij = &bj[ l*rs_b ]; \ 84 \ 85 PASTEMAC(ch,dots)( *aij, *bij, ab ); \ 86 } \ 87 \ 88 /* If beta is one, add ab into c. If beta is zero, overwrite c 89 with the result in ab. Otherwise, scale by beta and accumulate 90 ab to c. */ \ 91 if ( PASTEMAC(ch,eq1)( *beta ) ) \ 92 { \ 93 PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ 94 } \ 95 else if ( PASTEMAC(ch,eq0)( *beta ) ) \ 96 { \ 97 PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ 98 } \ 99 else \ 100 { \ 101 PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ 102 } \ 103 } \ 104 } \ 105 } \ 106 else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \ 107 { \ 108 /* Traverse c by rows. */ \ 109 for ( dim_t i = 0; i < m; ++i ) \ 110 { \ 111 ctype* restrict ci = &c[ i*rs_c ]; \ 112 ctype* restrict ai = &a[ i*rs_a ]; \ 113 \ 114 for ( dim_t j = 0; j < n; ++j ) \ 115 { \ 116 ctype* restrict cij = &ci[ j*cs_c ]; \ 117 ctype* restrict bj = &b [ j*cs_b ]; \ 118 ctype ab; \ 119 \ 120 PASTEMAC(ch,set0s)( ab ); \ 121 \ 122 /* Perform a dot product to update the (i,j) element of c. */ \ 123 for ( dim_t l = 0; l < k; ++l ) \ 124 { \ 125 ctype* restrict aij = &ai[ l*cs_a ]; \ 126 ctype* restrict bij = &bj[ l*rs_b ]; \ 127 \ 128 PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \ 129 } \ 130 \ 131 /* If beta is one, add ab into c. If beta is zero, overwrite c 132 with the result in ab. Otherwise, scale by beta and accumulate 133 ab to c. */ \ 134 if ( PASTEMAC(ch,eq1)( *beta ) ) \ 135 { \ 136 PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ 137 } \ 138 else if ( PASTEMAC(ch,eq0)( *beta ) ) \ 139 { \ 140 PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ 141 } \ 142 else \ 143 { \ 144 PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ 145 } \ 146 } \ 147 } \ 148 } \ 149 else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \ 150 { \ 151 /* Traverse c by rows. */ \ 152 for ( dim_t i = 0; i < m; ++i ) \ 153 { \ 154 ctype* restrict ci = &c[ i*rs_c ]; \ 155 ctype* restrict ai = &a[ i*rs_a ]; \ 156 \ 157 for ( dim_t j = 0; j < n; ++j ) \ 158 { \ 159 ctype* restrict cij = &ci[ j*cs_c ]; \ 160 ctype* restrict bj = &b [ j*cs_b ]; \ 161 ctype ab; \ 162 \ 163 PASTEMAC(ch,set0s)( ab ); \ 164 \ 165 /* Perform a dot product to update the (i,j) element of c. */ \ 166 for ( dim_t l = 0; l < k; ++l ) \ 167 { \ 168 ctype* restrict aij = &ai[ l*cs_a ]; \ 169 ctype* restrict bij = &bj[ l*rs_b ]; \ 170 \ 171 PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \ 172 } \ 173 \ 174 /* If beta is one, add ab into c. If beta is zero, overwrite c 175 with the result in ab. Otherwise, scale by beta and accumulate 176 ab to c. */ \ 177 if ( PASTEMAC(ch,eq1)( *beta ) ) \ 178 { \ 179 PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ 180 } \ 181 else if ( PASTEMAC(ch,eq0)( *beta ) ) \ 182 { \ 183 PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ 184 } \ 185 else \ 186 { \ 187 PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ 188 } \ 189 } \ 190 } \ 191 } \ 192 else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \ 193 { \ 194 /* Traverse c by rows. */ \ 195 for ( dim_t i = 0; i < m; ++i ) \ 196 { \ 197 ctype* restrict ci = &c[ i*rs_c ]; \ 198 ctype* restrict ai = &a[ i*rs_a ]; \ 199 \ 200 for ( dim_t j = 0; j < n; ++j ) \ 201 { \ 202 ctype* restrict cij = &ci[ j*cs_c ]; \ 203 ctype* restrict bj = &b [ j*cs_b ]; \ 204 ctype ab; \ 205 \ 206 PASTEMAC(ch,set0s)( ab ); \ 207 \ 208 /* Perform a dot product to update the (i,j) element of c. */ \ 209 for ( dim_t l = 0; l < k; ++l ) \ 210 { \ 211 ctype* restrict aij = &ai[ l*cs_a ]; \ 212 ctype* restrict bij = &bj[ l*rs_b ]; \ 213 \ 214 PASTEMAC(ch,dots)( *aij, *bij, ab ); \ 215 } \ 216 \ 217 /* Conjugate the result to simulate conj(a^T) * conj(b). */ \ 218 PASTEMAC(ch,conjs)( ab ); \ 219 \ 220 /* If beta is one, add ab into c. If beta is zero, overwrite c 221 with the result in ab. Otherwise, scale by beta and accumulate 222 ab to c. */ \ 223 if ( PASTEMAC(ch,eq1)( *beta ) ) \ 224 { \ 225 PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ 226 } \ 227 else if ( PASTEMAC(ch,eq0)( *beta ) ) \ 228 { \ 229 PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ 230 } \ 231 else \ 232 { \ 233 PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ 234 } \ 235 } \ 236 } \ 237 } \ 238 } 239 240 INSERT_GENTFUNC_BASIC2( gemmsup_r, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) 241 242 // 243 // -- Column storage case ------------------------------------------------------ 244 // 245 246 #undef GENTFUNC 247 #define GENTFUNC( ctype, ch, opname, arch, suf ) \ 248 \ 249 void PASTEMAC3(ch,opname,arch,suf) \ 250 ( \ 251 conj_t conja, \ 252 conj_t conjb, \ 253 dim_t m, \ 254 dim_t n, \ 255 dim_t k, \ 256 ctype* restrict alpha, \ 257 ctype* restrict a, inc_t rs_a, inc_t cs_a, \ 258 ctype* restrict b, inc_t rs_b, inc_t cs_b, \ 259 ctype* restrict beta, \ 260 ctype* restrict c, inc_t rs_c, inc_t cs_c, \ 261 auxinfo_t* restrict data, \ 262 cntx_t* restrict cntx \ 263 ) \ 264 { \ 265 /* NOTE: This microkernel can actually handle arbitrarily large 266 values of m, n, and k. */ \ 267 \ 268 if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \ 269 { \ 270 /* Traverse c by columns. */ \ 271 for ( dim_t j = 0; j < n; ++j ) \ 272 { \ 273 ctype* restrict cj = &c[ j*cs_c ]; \ 274 ctype* restrict bj = &b[ j*cs_b ]; \ 275 \ 276 for ( dim_t i = 0; i < m; ++i ) \ 277 { \ 278 ctype* restrict cij = &cj[ i*rs_c ]; \ 279 ctype* restrict ai = &a [ i*rs_a ]; \ 280 ctype ab; \ 281 \ 282 PASTEMAC(ch,set0s)( ab ); \ 283 \ 284 /* Perform a dot product to update the (i,j) element of c. */ \ 285 for ( dim_t l = 0; l < k; ++l ) \ 286 { \ 287 ctype* restrict aij = &ai[ l*cs_a ]; \ 288 ctype* restrict bij = &bj[ l*rs_b ]; \ 289 \ 290 PASTEMAC(ch,dots)( *aij, *bij, ab ); \ 291 } \ 292 \ 293 /* If beta is one, add ab into c. If beta is zero, overwrite c 294 with the result in ab. Otherwise, scale by beta and accumulate 295 ab to c. */ \ 296 if ( PASTEMAC(ch,eq1)( *beta ) ) \ 297 { \ 298 PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ 299 } \ 300 else if ( PASTEMAC(ch,eq0)( *beta ) ) \ 301 { \ 302 PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ 303 } \ 304 else \ 305 { \ 306 PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ 307 } \ 308 } \ 309 } \ 310 } \ 311 else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \ 312 { \ 313 /* Traverse c by columns. */ \ 314 for ( dim_t j = 0; j < n; ++j ) \ 315 { \ 316 ctype* restrict cj = &c[ j*cs_c ]; \ 317 ctype* restrict bj = &b[ j*cs_b ]; \ 318 \ 319 for ( dim_t i = 0; i < m; ++i ) \ 320 { \ 321 ctype* restrict cij = &cj[ i*rs_c ]; \ 322 ctype* restrict ai = &a [ i*rs_a ]; \ 323 ctype ab; \ 324 \ 325 PASTEMAC(ch,set0s)( ab ); \ 326 \ 327 /* Perform a dot product to update the (i,j) element of c. */ \ 328 for ( dim_t l = 0; l < k; ++l ) \ 329 { \ 330 ctype* restrict aij = &ai[ l*cs_a ]; \ 331 ctype* restrict bij = &bj[ l*rs_b ]; \ 332 \ 333 PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \ 334 } \ 335 \ 336 /* If beta is one, add ab into c. If beta is zero, overwrite c 337 with the result in ab. Otherwise, scale by beta and accumulate 338 ab to c. */ \ 339 if ( PASTEMAC(ch,eq1)( *beta ) ) \ 340 { \ 341 PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ 342 } \ 343 else if ( PASTEMAC(ch,eq0)( *beta ) ) \ 344 { \ 345 PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ 346 } \ 347 else \ 348 { \ 349 PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ 350 } \ 351 } \ 352 } \ 353 } \ 354 else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \ 355 { \ 356 /* Traverse c by columns. */ \ 357 for ( dim_t j = 0; j < n; ++j ) \ 358 { \ 359 ctype* restrict cj = &c[ j*cs_c ]; \ 360 ctype* restrict bj = &b[ j*cs_b ]; \ 361 \ 362 for ( dim_t i = 0; i < m; ++i ) \ 363 { \ 364 ctype* restrict cij = &cj[ i*rs_c ]; \ 365 ctype* restrict ai = &a [ i*rs_a ]; \ 366 ctype ab; \ 367 \ 368 PASTEMAC(ch,set0s)( ab ); \ 369 \ 370 /* Perform a dot product to update the (i,j) element of c. */ \ 371 for ( dim_t l = 0; l < k; ++l ) \ 372 { \ 373 ctype* restrict aij = &ai[ l*cs_a ]; \ 374 ctype* restrict bij = &bj[ l*rs_b ]; \ 375 \ 376 PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \ 377 } \ 378 \ 379 /* If beta is one, add ab into c. If beta is zero, overwrite c 380 with the result in ab. Otherwise, scale by beta and accumulate 381 ab to c. */ \ 382 if ( PASTEMAC(ch,eq1)( *beta ) ) \ 383 { \ 384 PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ 385 } \ 386 else if ( PASTEMAC(ch,eq0)( *beta ) ) \ 387 { \ 388 PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ 389 } \ 390 else \ 391 { \ 392 PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ 393 } \ 394 } \ 395 } \ 396 } \ 397 else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \ 398 { \ 399 /* Traverse c by columns. */ \ 400 for ( dim_t j = 0; j < n; ++j ) \ 401 { \ 402 ctype* restrict cj = &c[ j*cs_c ]; \ 403 ctype* restrict bj = &b[ j*cs_b ]; \ 404 \ 405 for ( dim_t i = 0; i < m; ++i ) \ 406 { \ 407 ctype* restrict cij = &cj[ i*rs_c ]; \ 408 ctype* restrict ai = &a [ i*rs_a ]; \ 409 ctype ab; \ 410 \ 411 PASTEMAC(ch,set0s)( ab ); \ 412 \ 413 /* Perform a dot product to update the (i,j) element of c. */ \ 414 for ( dim_t l = 0; l < k; ++l ) \ 415 { \ 416 ctype* restrict aij = &ai[ l*cs_a ]; \ 417 ctype* restrict bij = &bj[ l*rs_b ]; \ 418 \ 419 PASTEMAC(ch,dots)( *aij, *bij, ab ); \ 420 } \ 421 \ 422 /* Conjugate the result to simulate conj(a^T) * conj(b). */ \ 423 PASTEMAC(ch,conjs)( ab ); \ 424 \ 425 /* If beta is one, add ab into c. If beta is zero, overwrite c 426 with the result in ab. Otherwise, scale by beta and accumulate 427 ab to c. */ \ 428 if ( PASTEMAC(ch,eq1)( *beta ) ) \ 429 { \ 430 PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ 431 } \ 432 else if ( PASTEMAC(ch,eq0)( *beta ) ) \ 433 { \ 434 PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ 435 } \ 436 else \ 437 { \ 438 PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ 439 } \ 440 } \ 441 } \ 442 } \ 443 } 444 445 INSERT_GENTFUNC_BASIC2( gemmsup_c, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) 446 447 // 448 // -- General storage case ----------------------------------------------------- 449 // 450 451 INSERT_GENTFUNC_BASIC2( gemmsup_g, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) 452 453 454 455 456 457 458 459 460 #if 0 461 462 // 463 // -- Row storage case --------------------------------------------------------- 464 // 465 466 #undef GENTFUNC 467 #define GENTFUNC( ctype, ch, opname, arch, suf ) \ 468 \ 469 void PASTEMAC3(ch,opname,arch,suf) \ 470 ( \ 471 conj_t conja, \ 472 conj_t conjb, \ 473 dim_t m, \ 474 dim_t n, \ 475 dim_t k, \ 476 ctype* restrict alpha, \ 477 ctype* restrict a, inc_t rs_a, inc_t cs_a, \ 478 ctype* restrict b, inc_t rs_b, inc_t cs_b, \ 479 ctype* restrict beta, \ 480 ctype* restrict c, inc_t rs_c, inc_t cs_c, \ 481 auxinfo_t* restrict data, \ 482 cntx_t* restrict cntx \ 483 ) \ 484 { \ 485 const dim_t mn = m * n; \ 486 \ 487 ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ 488 / sizeof( ctype ) ] \ 489 __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ 490 const inc_t rs_ab = n; \ 491 const inc_t cs_ab = 1; \ 492 \ 493 \ 494 /* Assumptions: m <= mr, n <= nr so that the temporary array ab is 495 sufficiently large enough to hold the m x n microtile. 496 497 The ability to handle m < mr and n < nr is being provided so that 498 optimized ukernels can call one of these reference implementations 499 for their edge cases, if they choose. When they do so, they will 500 need to call the function directly, by its configuration-mangled 501 name, since it will have been overwritten in the context when 502 the optimized ukernel functions are registered. */ \ 503 \ 504 \ 505 /* Initialize the accumulator elements in ab to zero. */ \ 506 for ( dim_t i = 0; i < mn; ++i ) \ 507 { \ 508 PASTEMAC(ch,set0s)( ab[i] ); \ 509 } \ 510 \ 511 /* Perform a series of k rank-1 updates into ab. */ \ 512 for ( dim_t l = 0; l < k; ++l ) \ 513 { \ 514 /* Traverse ab by rows; assume cs_ab = 1. */ \ 515 for ( dim_t i = 0; i < m; ++i ) \ 516 { \ 517 for ( dim_t j = 0; j < n; ++j ) \ 518 { \ 519 PASTEMAC(ch,dots) \ 520 ( \ 521 a[ i*rs_a ], \ 522 b[ j*cs_b ], \ 523 ab[ i*rs_ab + j*cs_ab ] \ 524 ); \ 525 } \ 526 } \ 527 \ 528 a += cs_a; \ 529 b += rs_b; \ 530 } \ 531 \ 532 /* Scale the result in ab by alpha. */ \ 533 for ( dim_t i = 0; i < mn; ++i ) \ 534 { \ 535 PASTEMAC(ch,scals)( *alpha, ab[i] ); \ 536 } \ 537 \ 538 \ 539 /* If beta is one, add ab into c. If beta is zero, overwrite c with the 540 result in ab. Otherwise, scale by beta and accumulate ab to c. */ \ 541 if ( PASTEMAC(ch,eq1)( *beta ) ) \ 542 { \ 543 /* Traverse ab and c by rows; assume cs_a = cs_a = 1. */ \ 544 for ( dim_t i = 0; i < m; ++i ) \ 545 for ( dim_t j = 0; j < n; ++j ) \ 546 { \ 547 PASTEMAC(ch,adds) \ 548 ( \ 549 ab[ i*rs_ab + j*1 ], \ 550 c[ i*rs_c + j*1 ] \ 551 ) \ 552 } \ 553 } \ 554 else if ( PASTEMAC(ch,eq0)( *beta ) ) \ 555 { \ 556 \ 557 /* Traverse ab and c by rows; assume cs_a = cs_a = 1. */ \ 558 for ( dim_t i = 0; i < m; ++i ) \ 559 for ( dim_t j = 0; j < n; ++j ) \ 560 { \ 561 PASTEMAC(ch,copys) \ 562 ( \ 563 ab[ i*rs_ab + j*1 ], \ 564 c[ i*rs_c + j*1 ] \ 565 ) \ 566 } \ 567 } \ 568 else /* beta != 0 && beta != 1 */ \ 569 { \ 570 /* Traverse ab and c by rows; assume cs_a = cs_a = 1. */ \ 571 for ( dim_t i = 0; i < m; ++i ) \ 572 for ( dim_t j = 0; j < n; ++j ) \ 573 { \ 574 PASTEMAC(ch,xpbys) \ 575 ( \ 576 ab[ i*rs_ab + j*1 ], \ 577 *beta, \ 578 c[ i*rs_c + j*1 ] \ 579 ) \ 580 } \ 581 } \ 582 } 583 584 INSERT_GENTFUNC_BASIC2( gemmsup_r, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) 585 586 // 587 // -- Column storage case ------------------------------------------------------ 588 // 589 590 #undef GENTFUNC 591 #define GENTFUNC( ctype, ch, opname, arch, suf ) \ 592 \ 593 void PASTEMAC3(ch,opname,arch,suf) \ 594 ( \ 595 conj_t conja, \ 596 conj_t conjb, \ 597 dim_t m, \ 598 dim_t n, \ 599 dim_t k, \ 600 ctype* restrict alpha, \ 601 ctype* restrict a, inc_t rs_a, inc_t cs_a, \ 602 ctype* restrict b, inc_t rs_b, inc_t cs_b, \ 603 ctype* restrict beta, \ 604 ctype* restrict c, inc_t rs_c, inc_t cs_c, \ 605 auxinfo_t* restrict data, \ 606 cntx_t* restrict cntx \ 607 ) \ 608 { \ 609 const dim_t mn = m * n; \ 610 \ 611 ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ 612 / sizeof( ctype ) ] \ 613 __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ 614 const inc_t rs_ab = 1; \ 615 const inc_t cs_ab = m; \ 616 \ 617 \ 618 /* Assumptions: m <= mr, n <= nr so that the temporary array ab is 619 sufficiently large enough to hold the m x n microtile. 620 621 The ability to handle m < mr and n < nr is being provided so that 622 optimized ukernels can call one of these reference implementations 623 for their edge cases, if they choose. When they do so, they will 624 need to call the function directly, by its configuration-mangled 625 name, since it will have been overwritten in the context when 626 the optimized ukernel functions are registered. */ \ 627 \ 628 \ 629 /* Initialize the accumulator elements in ab to zero. */ \ 630 for ( dim_t i = 0; i < mn; ++i ) \ 631 { \ 632 PASTEMAC(ch,set0s)( ab[i] ); \ 633 } \ 634 \ 635 /* Perform a series of k rank-1 updates into ab. */ \ 636 for ( dim_t l = 0; l < k; ++l ) \ 637 { \ 638 /* Traverse ab by columns; assume rs_ab = 1. */ \ 639 for ( dim_t j = 0; j < n; ++j ) \ 640 { \ 641 for ( dim_t i = 0; i < m; ++i ) \ 642 { \ 643 PASTEMAC(ch,dots) \ 644 ( \ 645 a[ i*rs_a ], \ 646 b[ j*cs_b ], \ 647 ab[ i*rs_ab + j*cs_ab ] \ 648 ); \ 649 } \ 650 } \ 651 \ 652 a += cs_a; \ 653 b += rs_b; \ 654 } \ 655 \ 656 /* Scale the result in ab by alpha. */ \ 657 for ( dim_t i = 0; i < mn; ++i ) \ 658 { \ 659 PASTEMAC(ch,scals)( *alpha, ab[i] ); \ 660 } \ 661 \ 662 \ 663 /* If beta is one, add ab into c. If beta is zero, overwrite c with the 664 result in ab. Otherwise, scale by beta and accumulate ab to c. */ \ 665 if ( PASTEMAC(ch,eq1)( *beta ) ) \ 666 { \ 667 /* Traverse ab and c by columns; assume rs_a = rs_a = 1. */ \ 668 for ( dim_t j = 0; j < n; ++j ) \ 669 for ( dim_t i = 0; i < m; ++i ) \ 670 { \ 671 PASTEMAC(ch,adds) \ 672 ( \ 673 ab[ i*1 + j*cs_ab ], \ 674 c[ i*1 + j*cs_c ] \ 675 ) \ 676 } \ 677 } \ 678 else if ( PASTEMAC(ch,eq0)( *beta ) ) \ 679 { \ 680 /* Traverse ab and c by columns; assume rs_a = rs_a = 1. */ \ 681 for ( dim_t j = 0; j < n; ++j ) \ 682 for ( dim_t i = 0; i < m; ++i ) \ 683 { \ 684 PASTEMAC(ch,copys) \ 685 ( \ 686 ab[ i*1 + j*cs_ab ], \ 687 c[ i*1 + j*cs_c ] \ 688 ) \ 689 } \ 690 } \ 691 else /* beta != 0 && beta != 1 */ \ 692 { \ 693 /* Traverse ab and c by columns; assume rs_a = rs_a = 1. */ \ 694 for ( dim_t j = 0; j < n; ++j ) \ 695 for ( dim_t i = 0; i < m; ++i ) \ 696 { \ 697 PASTEMAC(ch,xpbys) \ 698 ( \ 699 ab[ i*1 + j*cs_ab ], \ 700 *beta, \ 701 c[ i*1 + j*cs_c ] \ 702 ) \ 703 } \ 704 } \ 705 } 706 707 INSERT_GENTFUNC_BASIC2( gemmsup_c, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) 708 709 // 710 // -- General storage case ----------------------------------------------------- 711 // 712 713 #undef GENTFUNC 714 #define GENTFUNC( ctype, ch, opname, arch, suf ) \ 715 \ 716 void PASTEMAC3(ch,opname,arch,suf) \ 717 ( \ 718 conj_t conja, \ 719 conj_t conjb, \ 720 dim_t m, \ 721 dim_t n, \ 722 dim_t k, \ 723 ctype* restrict alpha, \ 724 ctype* restrict a, inc_t rs_a, inc_t cs_a, \ 725 ctype* restrict b, inc_t rs_b, inc_t cs_b, \ 726 ctype* restrict beta, \ 727 ctype* restrict c, inc_t rs_c, inc_t cs_c, \ 728 auxinfo_t* restrict data, \ 729 cntx_t* restrict cntx \ 730 ) \ 731 { \ 732 const dim_t mn = m * n; \ 733 \ 734 ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ 735 / sizeof( ctype ) ] \ 736 __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ 737 const inc_t rs_ab = 1; \ 738 const inc_t cs_ab = m; \ 739 \ 740 \ 741 /* Assumptions: m <= mr, n <= nr so that the temporary array ab is 742 sufficiently large enough to hold the m x n microtile. 743 744 The ability to handle m < mr and n < nr is being provided so that 745 optimized ukernels can call one of these reference implementations 746 for their edge cases, if they choose. When they do so, they will 747 need to call the function directly, by its configuration-mangled 748 name, since it will have been overwritten in the context when 749 the optimized ukernel functions are registered. */ \ 750 \ 751 \ 752 /* Initialize the accumulator elements in ab to zero. */ \ 753 for ( dim_t i = 0; i < mn; ++i ) \ 754 { \ 755 PASTEMAC(ch,set0s)( ab[i] ); \ 756 } \ 757 \ 758 /* Perform a series of k rank-1 updates into ab. */ \ 759 for ( dim_t l = 0; l < k; ++l ) \ 760 { \ 761 /* General storage: doesn't matter how we traverse ab. */ \ 762 for ( dim_t j = 0; j < n; ++j ) \ 763 { \ 764 for ( dim_t i = 0; i < m; ++i ) \ 765 { \ 766 PASTEMAC(ch,dots) \ 767 ( \ 768 a[ i*rs_a ], \ 769 b[ j*cs_b ], \ 770 ab[ i*rs_ab + j*cs_ab ] \ 771 ); \ 772 } \ 773 } \ 774 \ 775 a += cs_a; \ 776 b += rs_b; \ 777 } \ 778 \ 779 /* Scale the result in ab by alpha. */ \ 780 for ( dim_t i = 0; i < mn; ++i ) \ 781 { \ 782 PASTEMAC(ch,scals)( *alpha, ab[i] ); \ 783 } \ 784 \ 785 \ 786 /* If beta is one, add ab into c. If beta is zero, overwrite c with the 787 result in ab. Otherwise, scale by beta and accumulate ab to c. */ \ 788 if ( PASTEMAC(ch,eq1)( *beta ) ) \ 789 { \ 790 /* General storage: doesn't matter how we traverse ab and c. */ \ 791 for ( dim_t j = 0; j < n; ++j ) \ 792 for ( dim_t i = 0; i < m; ++i ) \ 793 { \ 794 PASTEMAC(ch,adds) \ 795 ( \ 796 ab[ i*rs_ab + j*cs_ab ], \ 797 c[ i*rs_c + j*cs_c ] \ 798 ) \ 799 } \ 800 } \ 801 else if ( PASTEMAC(ch,eq0)( *beta ) ) \ 802 { \ 803 /* General storage: doesn't matter how we traverse ab and c. */ \ 804 for ( dim_t j = 0; j < n; ++j ) \ 805 for ( dim_t i = 0; i < m; ++i ) \ 806 { \ 807 PASTEMAC(ch,copys) \ 808 ( \ 809 ab[ i*rs_ab + j*cs_ab ], \ 810 c[ i*rs_c + j*cs_c ] \ 811 ) \ 812 } \ 813 } \ 814 else /* beta != 0 && beta != 1 */ \ 815 { \ 816 /* General storage: doesn't matter how we traverse ab and c. */ \ 817 for ( dim_t j = 0; j < n; ++j ) \ 818 for ( dim_t i = 0; i < m; ++i ) \ 819 { \ 820 PASTEMAC(ch,xpbys) \ 821 ( \ 822 ab[ i*rs_ab + j*cs_ab ], \ 823 *beta, \ 824 c[ i*rs_c + j*cs_c ] \ 825 ) \ 826 } \ 827 } \ 828 } 829 830 INSERT_GENTFUNC_BASIC2( gemmsup_g, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) 831 832 #endif 833