1 /* 2 3 BLIS 4 An object-based framework for developing high-performance BLAS-like 5 libraries. 6 7 Copyright (C) 2014, The University of Texas at Austin 8 9 Redistribution and use in source and binary forms, with or without 10 modification, are permitted provided that the following conditions are 11 met: 12 - Redistributions of source code must retain the above copyright 13 notice, this list of conditions and the following disclaimer. 14 - Redistributions in binary form must reproduce the above copyright 15 notice, this list of conditions and the following disclaimer in the 16 documentation and/or other materials provided with the distribution. 17 - Neither the name(s) of the copyright holder(s) nor the names of its 18 contributors may be used to endorse or promote products derived 19 from this software without specific prior written permission. 20 21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 33 */ 34 35 #include "blis.h" 36 37 38 #undef GENTFUNCCO 39 #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, arch, suf, diagop ) \ 40 \ 41 void PASTEMAC3(ch,opname,arch,suf) \ 42 ( \ 43 ctype* restrict a, \ 44 ctype* restrict b, \ 45 ctype* restrict c, inc_t rs_c, inc_t cs_c, \ 46 auxinfo_t* restrict data, \ 47 cntx_t* restrict cntx \ 48 ) \ 49 { \ 50 const num_t dt = PASTEMAC(ch,type); \ 51 \ 52 const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ 53 const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ 54 \ 55 const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ 56 const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ 57 \ 58 const dim_t m = mr; \ 59 const dim_t n = nr; \ 60 \ 61 const inc_t rs_a = 1; \ 62 const inc_t cs_a = packmr; \ 63 \ 64 const inc_t rs_b = packnr; \ 65 const inc_t cs_b = 1; \ 66 \ 67 const inc_t ld_a = cs_a; \ 68 const inc_t ld_b = rs_b; \ 69 \ 70 const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \ 71 \ 72 dim_t iter, i, j, l; \ 73 dim_t n_behind; \ 74 \ 75 \ 76 if ( bli_is_1e_packed( schema_b ) ) \ 77 { \ 78 const inc_t rs_a2 = 1 * rs_a; \ 79 const inc_t cs_a2 = 2 * cs_a; \ 80 \ 81 ctype_r* restrict a_r = ( ctype_r* )a; \ 82 ctype_r* restrict a_i = ( ctype_r* )a + ld_a; \ 83 \ 84 ctype* restrict b_ri = ( ctype* )b; \ 85 ctype* restrict b_ir = ( ctype* )b + ld_b/2; \ 86 \ 87 for ( iter = 0; iter < m; ++iter ) \ 88 { \ 89 i = iter; \ 90 n_behind = i; \ 91 \ 92 ctype_r* restrict alpha11_r = a_r + (i )*rs_a2 + (i )*cs_a2; \ 93 ctype_r* restrict alpha11_i = a_i + (i )*rs_a2 + (i )*cs_a2; \ 94 ctype_r* restrict a10t_r = a_r + (i )*rs_a2 + (0 )*cs_a2; \ 95 ctype_r* restrict a10t_i = a_i + (i )*rs_a2 + (0 )*cs_a2; \ 96 ctype* restrict b1_ri = b_ri + (i )*rs_b + (0 )*cs_b; \ 97 ctype* restrict b1_ir = b_ir + (i )*rs_b + (0 )*cs_b; \ 98 ctype* restrict B0_ri = b_ri + (0 )*rs_b + (0 )*cs_b; \ 99 \ 100 /* b1 = b1 - a10t * B0; */ \ 101 /* b1 = b1 / alpha11; */ \ 102 for ( j = 0; j < n; ++j ) \ 103 { \ 104 ctype* restrict beta11_ri = b1_ri + (0 )*rs_b + (j )*cs_b; \ 105 ctype* restrict beta11_ir = b1_ir + (0 )*rs_b + (j )*cs_b; \ 106 ctype* restrict b01_ri = B0_ri + (0 )*rs_b + (j )*cs_b; \ 107 ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ 108 ctype_r beta11c_r = PASTEMAC(ch,real)( *beta11_ri ); \ 109 ctype_r beta11c_i = PASTEMAC(ch,imag)( *beta11_ri ); \ 110 ctype_r rho11_r; \ 111 ctype_r rho11_i; \ 112 \ 113 /* beta11 = beta11 - a10t * b01; */ \ 114 PASTEMAC(ch,set0ris)( rho11_r, \ 115 rho11_i ); \ 116 for ( l = 0; l < n_behind; ++l ) \ 117 { \ 118 ctype_r* restrict alpha10_r = a10t_r + (l )*cs_a2; \ 119 ctype_r* restrict alpha10_i = a10t_i + (l )*cs_a2; \ 120 ctype* restrict beta01_ri = b01_ri + (l )*rs_b; \ 121 ctype_r* restrict beta01_r = &PASTEMAC(ch,real)( *beta01_ri ); \ 122 ctype_r* restrict beta01_i = &PASTEMAC(ch,imag)( *beta01_ri ); \ 123 \ 124 PASTEMAC(ch,axpyris)( *alpha10_r, \ 125 *alpha10_i, \ 126 *beta01_r, \ 127 *beta01_i, \ 128 rho11_r, \ 129 rho11_i ); \ 130 } \ 131 PASTEMAC(ch,subris)( rho11_r, \ 132 rho11_i, \ 133 beta11c_r, \ 134 beta11c_i ); \ 135 \ 136 /* beta11 = beta11 / alpha11; */ \ 137 /* NOTE: When preinversion is enabled, the INVERSE of alpha11 138 (1.0/alpha11) is stored during packing instead alpha11 so we 139 can multiply rather than divide. When preinversion is disabled, 140 alpha11 is stored and division happens below explicitly. */ \ 141 PASTEMAC(ch,diagop)( *alpha11_r, \ 142 *alpha11_i, \ 143 beta11c_r, \ 144 beta11c_i ); \ 145 \ 146 /* Output final result to matrix c. */ \ 147 PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *gamma11 ); \ 148 \ 149 /* Store the local values back to b11. */ \ 150 PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *beta11_ri ); \ 151 PASTEMAC(ch,sets)( -beta11c_i, beta11c_r, *beta11_ir ); \ 152 } \ 153 } \ 154 } \ 155 else /* ( bli_is_1r_packed( schema_b ) ) */ \ 156 { \ 157 const inc_t rs_b2 = 2 * rs_b; \ 158 const inc_t cs_b2 = 1 * cs_b; \ 159 \ 160 ctype* restrict a_ri = ( ctype* )a; \ 161 /*ctype* restrict a_ir = ( ctype* )a + ld_a/2;*/ \ 162 \ 163 ctype_r* restrict b_r = ( ctype_r* )b; \ 164 ctype_r* restrict b_i = ( ctype_r* )b + ld_b; \ 165 \ 166 for ( iter = 0; iter < m; ++iter ) \ 167 { \ 168 i = iter; \ 169 n_behind = i; \ 170 \ 171 ctype* restrict alpha11_ri = a_ri + (i )*rs_a + (i )*cs_a; \ 172 ctype_r* restrict alpha11_r = &PASTEMAC(ch,real)( *alpha11_ri ); \ 173 ctype_r* restrict alpha11_i = &PASTEMAC(ch,imag)( *alpha11_ri ); \ 174 ctype* restrict a10t_ri = a_ri + (i )*rs_a + (0 )*cs_a; \ 175 ctype_r* restrict b1_r = b_r + (i )*rs_b2 + (0 )*cs_b2; \ 176 ctype_r* restrict b1_i = b_i + (i )*rs_b2 + (0 )*cs_b2; \ 177 ctype_r* restrict B0_r = b_r + (0 )*rs_b2 + (0 )*cs_b2; \ 178 ctype_r* restrict B0_i = b_i + (0 )*rs_b2 + (0 )*cs_b2; \ 179 \ 180 /* b1 = b1 - a10t * B0; */ \ 181 /* b1 = b1 / alpha11; */ \ 182 for ( j = 0; j < n; ++j ) \ 183 { \ 184 ctype_r* restrict beta11_r = b1_r + (0 )*rs_b2 + (j )*cs_b2; \ 185 ctype_r* restrict beta11_i = b1_i + (0 )*rs_b2 + (j )*cs_b2; \ 186 ctype_r* restrict b01_r = B0_r + (0 )*rs_b2 + (j )*cs_b2; \ 187 ctype_r* restrict b01_i = B0_i + (0 )*rs_b2 + (j )*cs_b2; \ 188 ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ 189 ctype_r beta11c_r = *beta11_r; \ 190 ctype_r beta11c_i = *beta11_i; \ 191 ctype_r rho11_r; \ 192 ctype_r rho11_i; \ 193 \ 194 /* beta11 = beta11 - a10t * b01; */ \ 195 PASTEMAC(ch,set0ris)( rho11_r, \ 196 rho11_i ); \ 197 for ( l = 0; l < n_behind; ++l ) \ 198 { \ 199 ctype* restrict alpha10_ri = a10t_ri + (l )*cs_a; \ 200 ctype_r* restrict alpha10_r = &PASTEMAC(ch,real)( *alpha10_ri ); \ 201 ctype_r* restrict alpha10_i = &PASTEMAC(ch,imag)( *alpha10_ri ); \ 202 ctype_r* restrict beta01_r = b01_r + (l )*rs_b2; \ 203 ctype_r* restrict beta01_i = b01_i + (l )*rs_b2; \ 204 \ 205 PASTEMAC(ch,axpyris)( *alpha10_r, \ 206 *alpha10_i, \ 207 *beta01_r, \ 208 *beta01_i, \ 209 rho11_r, \ 210 rho11_i ); \ 211 } \ 212 PASTEMAC(ch,subris)( rho11_r, \ 213 rho11_i, \ 214 beta11c_r, \ 215 beta11c_i ); \ 216 \ 217 /* beta11 = beta11 / alpha11; */ \ 218 /* NOTE: When preinversion is enabled, the INVERSE of alpha11 219 (1.0/alpha11) is stored during packing instead alpha11 so we 220 can multiply rather than divide. When preinversion is disabled, 221 alpha11 is stored and division happens below explicitly. */ \ 222 PASTEMAC(ch,diagop)( *alpha11_r, \ 223 *alpha11_i, \ 224 beta11c_r, \ 225 beta11c_i ); \ 226 \ 227 /* Output final result to matrix c. */ \ 228 PASTEMAC(ch,sets)( beta11c_r, \ 229 beta11c_i, *gamma11 ); \ 230 \ 231 /* Store the local values back to b11. */ \ 232 PASTEMAC(ch,copyris)( beta11c_r, \ 233 beta11c_i, \ 234 *beta11_r, \ 235 *beta11_i ); \ 236 } \ 237 } \ 238 } \ 239 } 240 241 #ifdef BLIS_ENABLE_TRSM_PREINVERSION 242 INSERT_GENTFUNCCO_BASIC3( trsm1m_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scalris ) 243 #else 244 INSERT_GENTFUNCCO_BASIC3( trsm1m_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscalris ) 245 #endif 246 247 248 #undef GENTFUNCCO 249 #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, arch, suf, diagop ) \ 250 \ 251 void PASTEMAC3(ch,opname,arch,suf) \ 252 ( \ 253 ctype* restrict a, \ 254 ctype* restrict b, \ 255 ctype* restrict c, inc_t rs_c, inc_t cs_c, \ 256 auxinfo_t* restrict data, \ 257 cntx_t* restrict cntx \ 258 ) \ 259 { \ 260 const num_t dt = PASTEMAC(ch,type); \ 261 \ 262 const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ 263 const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ 264 \ 265 const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ 266 const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ 267 \ 268 const dim_t m = mr; \ 269 const dim_t n = nr; \ 270 \ 271 const inc_t rs_a = 1; \ 272 const inc_t cs_a = packmr; \ 273 \ 274 const inc_t rs_b = packnr; \ 275 const inc_t cs_b = 1; \ 276 \ 277 const inc_t ld_a = cs_a; \ 278 const inc_t ld_b = rs_b; \ 279 \ 280 const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \ 281 \ 282 dim_t iter, i, j, l; \ 283 dim_t n_behind; \ 284 \ 285 \ 286 if ( bli_is_1e_packed( schema_b ) ) \ 287 { \ 288 const inc_t rs_a2 = 1 * rs_a; \ 289 const inc_t cs_a2 = 2 * cs_a; \ 290 \ 291 ctype_r* restrict a_r = ( ctype_r* )a; \ 292 ctype_r* restrict a_i = ( ctype_r* )a + ld_a; \ 293 \ 294 ctype* restrict b_ri = ( ctype* )b; \ 295 ctype* restrict b_ir = ( ctype* )b + ld_b/2; \ 296 \ 297 for ( iter = 0; iter < m; ++iter ) \ 298 { \ 299 i = m - iter - 1; \ 300 n_behind = iter; \ 301 \ 302 ctype_r* restrict alpha11_r = a_r + (i )*rs_a2 + (i )*cs_a2; \ 303 ctype_r* restrict alpha11_i = a_i + (i )*rs_a2 + (i )*cs_a2; \ 304 ctype_r* restrict a12t_r = a_r + (i )*rs_a2 + (i+1)*cs_a2; \ 305 ctype_r* restrict a12t_i = a_i + (i )*rs_a2 + (i+1)*cs_a2; \ 306 ctype* restrict b1_ri = b_ri + (i )*rs_b + (0 )*cs_b; \ 307 ctype* restrict b1_ir = b_ir + (i )*rs_b + (0 )*cs_b; \ 308 ctype* restrict B2_ri = b_ri + (i+1)*rs_b + (0 )*cs_b; \ 309 \ 310 /* b1 = b1 - a12t * B2; */ \ 311 /* b1 = b1 / alpha11; */ \ 312 for ( j = 0; j < n; ++j ) \ 313 { \ 314 ctype* restrict beta11_ri = b1_ri + (0 )*rs_b + (j )*cs_b; \ 315 ctype* restrict beta11_ir = b1_ir + (0 )*rs_b + (j )*cs_b; \ 316 ctype* restrict b21_ri = B2_ri + (0 )*rs_b + (j )*cs_b; \ 317 ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ 318 ctype_r beta11c_r = PASTEMAC(ch,real)( *beta11_ri ); \ 319 ctype_r beta11c_i = PASTEMAC(ch,imag)( *beta11_ri ); \ 320 ctype_r rho11_r; \ 321 ctype_r rho11_i; \ 322 \ 323 /* beta11 = beta11 - a10t * b01; */ \ 324 PASTEMAC(ch,set0ris)( rho11_r, \ 325 rho11_i ); \ 326 for ( l = 0; l < n_behind; ++l ) \ 327 { \ 328 ctype_r* restrict alpha12_r = a12t_r + (l )*cs_a2; \ 329 ctype_r* restrict alpha12_i = a12t_i + (l )*cs_a2; \ 330 ctype* restrict beta21_ri = b21_ri + (l )*rs_b; \ 331 ctype_r* restrict beta21_r = &PASTEMAC(ch,real)( *beta21_ri ); \ 332 ctype_r* restrict beta21_i = &PASTEMAC(ch,imag)( *beta21_ri ); \ 333 \ 334 PASTEMAC(ch,axpyris)( *alpha12_r, \ 335 *alpha12_i, \ 336 *beta21_r, \ 337 *beta21_i, \ 338 rho11_r, \ 339 rho11_i ); \ 340 } \ 341 PASTEMAC(ch,subris)( rho11_r, \ 342 rho11_i, \ 343 beta11c_r, \ 344 beta11c_i ); \ 345 \ 346 /* beta11 = beta11 / alpha11; */ \ 347 /* NOTE: When preinversion is enabled, the INVERSE of alpha11 348 (1.0/alpha11) is stored during packing instead alpha11 so we 349 can multiply rather than divide. When preinversion is disabled, 350 alpha11 is stored and division happens below explicitly. */ \ 351 PASTEMAC(ch,diagop)( *alpha11_r, \ 352 *alpha11_i, \ 353 beta11c_r, \ 354 beta11c_i ); \ 355 \ 356 /* Output final result to matrix c. */ \ 357 PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *gamma11 ); \ 358 \ 359 /* Store the local values back to b11. */ \ 360 PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *beta11_ri ); \ 361 PASTEMAC(ch,sets)( -beta11c_i, beta11c_r, *beta11_ir ); \ 362 } \ 363 } \ 364 } \ 365 else /* if ( bli_is_1r_packed( schema_b ) ) */ \ 366 { \ 367 const inc_t rs_b2 = 2 * rs_b; \ 368 const inc_t cs_b2 = 1 * cs_b; \ 369 \ 370 ctype* restrict a_ri = ( ctype* )a; \ 371 /*ctype* restrict a_ir = ( ctype* )a + ld_a/2;*/ \ 372 \ 373 ctype_r* restrict b_r = ( ctype_r* )b; \ 374 ctype_r* restrict b_i = ( ctype_r* )b + ld_b; \ 375 \ 376 for ( iter = 0; iter < m; ++iter ) \ 377 { \ 378 i = m - iter - 1; \ 379 n_behind = iter; \ 380 \ 381 ctype* restrict alpha11_ri = a_ri + (i )*rs_a + (i )*cs_a; \ 382 ctype_r* restrict alpha11_r = &PASTEMAC(ch,real)( *alpha11_ri ); \ 383 ctype_r* restrict alpha11_i = &PASTEMAC(ch,imag)( *alpha11_ri ); \ 384 ctype* restrict a12t_ri = a_ri + (i )*rs_a + (i+1)*cs_a; \ 385 ctype_r* restrict b1_r = b_r + (i )*rs_b2 + (0 )*cs_b2; \ 386 ctype_r* restrict b1_i = b_i + (i )*rs_b2 + (0 )*cs_b2; \ 387 ctype_r* restrict B2_r = b_r + (i+1)*rs_b2 + (0 )*cs_b2; \ 388 ctype_r* restrict B2_i = b_i + (i+1)*rs_b2 + (0 )*cs_b2; \ 389 \ 390 /* b1 = b1 - a12t * B2; */ \ 391 /* b1 = b1 / alpha11; */ \ 392 for ( j = 0; j < n; ++j ) \ 393 { \ 394 ctype_r* restrict beta11_r = b1_r + (0 )*rs_b2 + (j )*cs_b2; \ 395 ctype_r* restrict beta11_i = b1_i + (0 )*rs_b2 + (j )*cs_b2; \ 396 ctype_r* restrict b21_r = B2_r + (0 )*rs_b2 + (j )*cs_b2; \ 397 ctype_r* restrict b21_i = B2_i + (0 )*rs_b2 + (j )*cs_b2; \ 398 ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ 399 ctype_r beta11c_r = *beta11_r; \ 400 ctype_r beta11c_i = *beta11_i; \ 401 ctype_r rho11_r; \ 402 ctype_r rho11_i; \ 403 \ 404 /* beta11 = beta11 - a10t * b01; */ \ 405 PASTEMAC(ch,set0ris)( rho11_r, \ 406 rho11_i ); \ 407 for ( l = 0; l < n_behind; ++l ) \ 408 { \ 409 ctype* restrict alpha12_ri = a12t_ri + (l )*cs_a; \ 410 ctype_r* restrict alpha12_r = &PASTEMAC(ch,real)( *alpha12_ri ); \ 411 ctype_r* restrict alpha12_i = &PASTEMAC(ch,imag)( *alpha12_ri ); \ 412 ctype_r* restrict beta21_r = b21_r + (l )*rs_b2; \ 413 ctype_r* restrict beta21_i = b21_i + (l )*rs_b2; \ 414 \ 415 PASTEMAC(ch,axpyris)( *alpha12_r, \ 416 *alpha12_i, \ 417 *beta21_r, \ 418 *beta21_i, \ 419 rho11_r, \ 420 rho11_i ); \ 421 } \ 422 PASTEMAC(ch,subris)( rho11_r, \ 423 rho11_i, \ 424 beta11c_r, \ 425 beta11c_i ); \ 426 \ 427 /* beta11 = beta11 / alpha11; */ \ 428 /* NOTE: When preinversion is enabled, the INVERSE of alpha11 429 (1.0/alpha11) is stored during packing instead alpha11 so we 430 can multiply rather than divide. When preinversion is disabled, 431 alpha11 is stored and division happens below explicitly. */ \ 432 PASTEMAC(ch,diagop)( *alpha11_r, \ 433 *alpha11_i, \ 434 beta11c_r, \ 435 beta11c_i ); \ 436 \ 437 /* Output final result to matrix c. */ \ 438 PASTEMAC(ch,sets)( beta11c_r, \ 439 beta11c_i, *gamma11 ); \ 440 \ 441 /* Store the local values back to b11. */ \ 442 PASTEMAC(ch,copyris)( beta11c_r, \ 443 beta11c_i, \ 444 *beta11_r, \ 445 *beta11_i ); \ 446 } \ 447 } \ 448 } \ 449 } 450 451 #ifdef BLIS_ENABLE_TRSM_PREINVERSION 452 INSERT_GENTFUNCCO_BASIC3( trsm1m_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scalris ) 453 #else 454 INSERT_GENTFUNCCO_BASIC3( trsm1m_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscalris ) 455 #endif 456