1 // @HEADER 2 // *********************************************************************** 3 // 4 // Teuchos: Common Tools Package 5 // Copyright (2004) Sandia Corporation 6 // 7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive 8 // license for use of this work by or on behalf of the U.S. Government. 9 // 10 // Redistribution and use in source and binary forms, with or without 11 // modification, are permitted provided that the following conditions are 12 // met: 13 // 14 // 1. Redistributions of source code must retain the above copyright 15 // notice, this list of conditions and the following disclaimer. 16 // 17 // 2. Redistributions in binary form must reproduce the above copyright 18 // notice, this list of conditions and the following disclaimer in the 19 // documentation and/or other materials provided with the distribution. 20 // 21 // 3. Neither the name of the Corporation nor the names of the 22 // contributors may be used to endorse or promote products derived from 23 // this software without specific prior written permission. 24 // 25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY 26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE 29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 36 // 37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 38 // 39 // *********************************************************************** 40 // @HEADER 41 42 // Kris 43 // 06.16.03 -- Start over from scratch 44 // 06.16.03 -- Initial templatization (Tpetra_BLAS.cpp is no longer needed) 45 // 06.18.03 -- Changed xxxxx_() function calls to XXXXX_F77() 46 // -- Added warning messages for generic calls 47 // 07.08.03 -- Move into Teuchos package/namespace 48 // 07.24.03 -- The first iteration of BLAS generics is nearing completion. Caveats: 49 // * TRSM isn't finished yet; it works for one case at the moment (left side, upper tri., no transpose, no unit diag.) 50 // * Many of the generic implementations are quite inefficient, ugly, or both. I wrote these to be easy to debug, not for efficiency or legibility. The next iteration will improve both of these aspects as much as possible. 51 // * Very little verification of input parameters is done, save for the character-type arguments (TRANS, etc.) which is quite robust. 52 // * All of the routines that make use of both an incx and incy parameter (which includes much of the L1 BLAS) are set up to work iff incx == incy && incx > 0. Allowing for differing/negative values of incx/incy should be relatively trivial. 53 // * All of the L2/L3 routines assume that the entire matrix is being used (that is, if A is mxn, lda = m); they don't work on submatrices yet. This *should* be a reasonably trivial thing to fix, as well. 54 // -- Removed warning messages for generic calls 55 // 08.08.03 -- TRSM now works for all cases where SIDE == L and DIAG == N. DIAG == U is implemented but does not work correctly; SIDE == R is not yet implemented. 56 // 08.14.03 -- TRSM now works for all cases and accepts (and uses) leading-dimension information. 57 // 09.26.03 -- character input replaced with enumerated input to cause compiling errors and not run-time errors ( suggested by RAB ). 58 59 #ifndef _TEUCHOS_BLAS_HPP_ 60 #define _TEUCHOS_BLAS_HPP_ 61 62 /*! \file Teuchos_BLAS.hpp 63 \brief Templated interface class to BLAS routines. 64 */ 65 /** \example BLAS/cxx_main.cpp 66 This is an example of how to use the Teuchos::BLAS class. 67 */ 68 69 #include "Teuchos_ConfigDefs.hpp" 70 #include "Teuchos_ScalarTraits.hpp" 71 #include "Teuchos_OrdinalTraits.hpp" 72 #include "Teuchos_BLAS_types.hpp" 73 #include "Teuchos_Assert.hpp" 74 75 /*! \class Teuchos::BLAS 76 \brief Templated BLAS wrapper. 77 78 The Teuchos::BLAS class provides functionality similar to the BLAS 79 (Basic Linear Algebra Subprograms). The BLAS provide portable, 80 high- performance implementations of kernels such as dense vector 81 sums, inner products, and norms (the BLAS 1 routines), dense 82 matrix-vector multiplication and triangular solve (the BLAS 2 83 routines), and dense matrix-matrix multiplication and triangular 84 solve with multiple right-hand sides (the BLAS 3 routines). 85 86 The standard BLAS interface is Fortran-specific. Unfortunately, 87 the interface between C++ and Fortran is not standard across all 88 computer platforms. The Teuchos::BLAS class provides C++ bindings 89 for the BLAS kernels in order to insulate the rest of Petra from 90 the details of C++ to Fortran translation. 91 92 In addition to giving access to the standard BLAS functionality, 93 Teuchos::BLAS also provides a generic fall-back implementation for 94 any ScalarType class that defines the +, - * and / operators. 95 96 Teuchos::BLAS only operates within a single shared-memory space, 97 just like the BLAS. It does not attempt to implement 98 distributed-memory parallel matrix operations. 99 100 \note This class has specializations for ScalarType=float and 101 double, which use the BLAS library directly. If you configure 102 Teuchos to enable complex arithmetic support, via the CMake 103 option -DTeuchos_ENABLE_COMPLEX:BOOL=ON, then this class will 104 also invoke the BLAS library directly for 105 ScalarType=std::complex<float> and std::complex<double>. 106 */ 107 namespace Teuchos 108 { 109 extern TEUCHOSNUMERICS_LIB_DLL_EXPORT const char ESideChar[]; 110 extern TEUCHOSNUMERICS_LIB_DLL_EXPORT const char ETranspChar[]; 111 extern TEUCHOSNUMERICS_LIB_DLL_EXPORT const char EUploChar[]; 112 extern TEUCHOSNUMERICS_LIB_DLL_EXPORT const char EDiagChar[]; 113 extern TEUCHOSNUMERICS_LIB_DLL_EXPORT const char ETypeChar[]; 114 115 // Forward declaration 116 namespace details { 117 template<typename ScalarType, 118 bool isComplex = Teuchos::ScalarTraits<ScalarType>::isComplex> 119 class GivensRotator; 120 } 121 122 //! Default implementation for BLAS routines 123 /*! 124 * This class provides the default implementation for the BLAS routines. It 125 * is put in a separate class so that specializations of BLAS for other types 126 * still have this implementation available. 127 */ 128 template<typename OrdinalType, typename ScalarType> 129 class DefaultBLASImpl 130 { 131 132 typedef typename Teuchos::ScalarTraits<ScalarType>::magnitudeType MagnitudeType; 133 134 public: 135 //! @name Constructor/Destructor. 136 //@{ 137 138 //! Default constructor. DefaultBLASImpl(void)139 inline DefaultBLASImpl(void) {} 140 141 //! Copy constructor. DefaultBLASImpl(const DefaultBLASImpl<OrdinalType,ScalarType> &)142 inline DefaultBLASImpl(const DefaultBLASImpl<OrdinalType, ScalarType>& /*BLAS_source*/) {} 143 144 //! Destructor. ~DefaultBLASImpl(void)145 inline virtual ~DefaultBLASImpl(void) {} 146 //@} 147 148 //! @name Level 1 BLAS Routines. 149 //@{ 150 151 //! The type used for c in ROTG 152 /*! This is MagnitudeType if ScalarType is complex and ScalarType otherwise. 153 */ 154 typedef typename details::GivensRotator<ScalarType>::c_type rotg_c_type; 155 156 //! Computes a Givens plane rotation. 157 void ROTG(ScalarType* da, ScalarType* db, rotg_c_type* c, ScalarType* s) const; 158 159 //! Applies a Givens plane rotation. 160 void ROT(const OrdinalType& n, ScalarType* dx, const OrdinalType& incx, ScalarType* dy, const OrdinalType& incy, MagnitudeType* c, ScalarType* s) const; 161 162 //! Scale the vector \c x by the constant \c alpha. 163 void SCAL(const OrdinalType& n, const ScalarType& alpha, ScalarType* x, const OrdinalType& incx) const; 164 165 //! Copy the vector \c x to the vector \c y. 166 void COPY(const OrdinalType& n, const ScalarType* x, const OrdinalType& incx, ScalarType* y, const OrdinalType& incy) const; 167 168 //! Perform the operation: \c y \c <- \c y+alpha*x. 169 template <typename alpha_type, typename x_type> 170 void AXPY(const OrdinalType& n, const alpha_type alpha, const x_type* x, const OrdinalType& incx, ScalarType* y, const OrdinalType& incy) const; 171 172 //! Sum the absolute values of the entries of \c x. 173 typename ScalarTraits<ScalarType>::magnitudeType ASUM(const OrdinalType& n, const ScalarType* x, const OrdinalType& incx) const; 174 175 //! Form the dot product of the vectors \c x and \c y. 176 template <typename x_type, typename y_type> 177 ScalarType DOT(const OrdinalType& n, const x_type* x, const OrdinalType& incx, const y_type* y, const OrdinalType& incy) const; 178 179 //! Compute the 2-norm of the vector \c x. 180 typename ScalarTraits<ScalarType>::magnitudeType NRM2(const OrdinalType& n, const ScalarType* x, const OrdinalType& incx) const; 181 182 //! Return the index of the element of \c x with the maximum magnitude. 183 OrdinalType IAMAX(const OrdinalType& n, const ScalarType* x, const OrdinalType& incx) const; 184 //@} 185 186 //! @name Level 2 BLAS Routines. 187 //@{ 188 189 //! Performs the matrix-vector operation: \c y \c <- \c alpha*A*x+beta*y or \c y \c <- \c alpha*A'*x+beta*y where \c A is a general \c m by \c n matrix. 190 template <typename alpha_type, typename A_type, typename x_type, typename beta_type> 191 void GEMV(ETransp trans, const OrdinalType& m, const OrdinalType& n, const alpha_type alpha, const A_type* A, 192 const OrdinalType& lda, const x_type* x, const OrdinalType& incx, const beta_type beta, ScalarType* y, const OrdinalType& incy) const; 193 194 //! Performs the matrix-vector operation: \c x \c <- \c A*x or \c x \c <- \c A'*x where \c A is a unit/non-unit \c n by \c n upper/lower triangular matrix. 195 template <typename A_type> 196 void TRMV(EUplo uplo, ETransp trans, EDiag diag, const OrdinalType& n, const A_type* A, 197 const OrdinalType& lda, ScalarType* x, const OrdinalType& incx) const; 198 199 //! \brief Performs the rank 1 operation: \c A \c <- \c alpha*x*y'+A. 200 /// \note For complex arithmetic, this routine performs [Z/C]GERU. 201 template <typename alpha_type, typename x_type, typename y_type> 202 void GER(const OrdinalType& m, const OrdinalType& n, const alpha_type alpha, const x_type* x, const OrdinalType& incx, 203 const y_type* y, const OrdinalType& incy, ScalarType* A, const OrdinalType& lda) const; 204 //@} 205 206 //! @name Level 3 BLAS Routines. 207 //@{ 208 209 /// \brief General matrix-matrix multiply. 210 /// 211 /// This computes C = alpha*op(A)*op(B) + beta*C. op(X) here may 212 /// be either X, the transpose of X, or the conjugate transpose of 213 /// X. op(A) has m rows and k columns, op(B) has k rows and n 214 /// columns, and C has m rows and n columns. 215 template <typename alpha_type, typename A_type, typename B_type, typename beta_type> 216 void GEMM(ETransp transa, ETransp transb, const OrdinalType& m, const OrdinalType& n, const OrdinalType& k, const alpha_type alpha, const A_type* A, const OrdinalType& lda, const B_type* B, const OrdinalType& ldb, const beta_type beta, ScalarType* C, const OrdinalType& ldc) const; 217 218 //! Swap the entries of x and y. 219 void 220 SWAP (const OrdinalType& n, ScalarType* const x, const OrdinalType& incx, 221 ScalarType* const y, const OrdinalType& incy) const; 222 223 //! Performs the matrix-matrix operation: \c C \c <- \c alpha*A*B+beta*C or \c C \c <- \c alpha*B*A+beta*C where \c A is an \c m by \c m or \c n by \c n symmetric matrix and \c B is a general matrix. 224 template <typename alpha_type, typename A_type, typename B_type, typename beta_type> 225 void SYMM(ESide side, EUplo uplo, const OrdinalType& m, const OrdinalType& n, const alpha_type alpha, const A_type* A, const OrdinalType& lda, const B_type* B, const OrdinalType& ldb, const beta_type beta, ScalarType* C, const OrdinalType& ldc) const; 226 227 //! Performs the symmetric rank k operation: \c C <- \c alpha*A*A'+beta*C or \c C <- \c alpha*A'*A+beta*C, where \c alpha and \c beta are scalars, \c C is an \c n by \c n symmetric matrix and \c A is an \c n by \c k matrix in the first case or \c k by \c n matrix in the second case. 228 template <typename alpha_type, typename A_type, typename beta_type> 229 void SYRK(EUplo uplo, ETransp trans, const OrdinalType& n, const OrdinalType& k, const alpha_type alpha, const A_type* A, const OrdinalType& lda, const beta_type beta, ScalarType* C, const OrdinalType& ldc) const; 230 231 //! Performs the matrix-matrix operation: \c B \c <- \c alpha*op(A)*B or \c B \c <- \c alpha*B*op(A) where \c op(A) is an unit/non-unit, upper/lower triangular matrix and \c B is a general matrix. 232 template <typename alpha_type, typename A_type> 233 void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const OrdinalType& m, const OrdinalType& n, 234 const alpha_type alpha, const A_type* A, const OrdinalType& lda, ScalarType* B, const OrdinalType& ldb) const; 235 236 //! Solves the matrix equations: \c op(A)*X=alpha*B or \c X*op(A)=alpha*B where \c X and \c B are \c m by \c n matrices, \c A is a unit/non-unit, upper/lower triangular matrix and \c op(A) is \c A or \c A'. The matrix \c X is overwritten on \c B. 237 template <typename alpha_type, typename A_type> 238 void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const OrdinalType& m, const OrdinalType& n, 239 const alpha_type alpha, const A_type* A, const OrdinalType& lda, ScalarType* B, const OrdinalType& ldb) const; 240 //@} 241 }; 242 243 template<typename OrdinalType, typename ScalarType> 244 class TEUCHOSNUMERICS_LIB_DLL_EXPORT BLAS : public DefaultBLASImpl<OrdinalType,ScalarType> 245 { 246 247 typedef typename Teuchos::ScalarTraits<ScalarType>::magnitudeType MagnitudeType; 248 249 public: 250 //! @name Constructor/Destructor. 251 //@{ 252 253 //! Default constructor. BLAS(void)254 inline BLAS(void) {} 255 256 //! Copy constructor. BLAS(const BLAS<OrdinalType,ScalarType> &)257 inline BLAS(const BLAS<OrdinalType, ScalarType>& /*BLAS_source*/) {} 258 259 //! Destructor. ~BLAS(void)260 inline virtual ~BLAS(void) {} 261 //@} 262 }; 263 264 //------------------------------------------------------------------------------------------ 265 // LEVEL 1 BLAS ROUTINES 266 //------------------------------------------------------------------------------------------ 267 268 /// \namespace details 269 /// \brief Teuchos implementation details. 270 /// 271 /// \warning Teuchos users should not use anything in this 272 /// namespace. They should not even assume that the namespace 273 /// will continue to exist between releases. The namespace's name 274 /// itself or anything it contains may change at any time. 275 namespace details { 276 277 // Compute magnitude. 278 template<typename ScalarType, bool isComplex> 279 class MagValue { 280 public: 281 void 282 blas_dabs1(const ScalarType* a, typename ScalarTraits<ScalarType>::magnitudeType* ret) const; 283 }; 284 285 // Complex-arithmetic specialization. 286 template<typename ScalarType> 287 class MagValue<ScalarType, true> { 288 public: 289 void 290 blas_dabs1(const ScalarType* a, typename ScalarTraits<ScalarType>::magnitudeType* ret) const; 291 }; 292 293 // Real-arithmetic specialization. 294 template<typename ScalarType> 295 class MagValue<ScalarType, false> { 296 public: 297 void 298 blas_dabs1(const ScalarType* a, ScalarType* ret) const; 299 }; 300 301 template<typename ScalarType, bool isComplex> 302 class GivensRotator {}; 303 304 // Complex-arithmetic specialization. 305 template<typename ScalarType> 306 class GivensRotator<ScalarType, true> { 307 public: 308 typedef typename ScalarTraits<ScalarType>::magnitudeType c_type; 309 void 310 ROTG (ScalarType* ca, 311 ScalarType* cb, 312 typename ScalarTraits<ScalarType>::magnitudeType* c, 313 ScalarType* s) const; 314 }; 315 316 // Real-arithmetic specialization. 317 template<typename ScalarType> 318 class GivensRotator<ScalarType, false> { 319 public: 320 typedef ScalarType c_type; 321 void 322 ROTG (ScalarType* da, 323 ScalarType* db, 324 ScalarType* c, 325 ScalarType* s) const; 326 327 /// Return ABS(x) if y > 0 or y is +0, else -ABS(x) (if y is -0 or < 0). 328 /// 329 /// Note that SIGN respects IEEE 754 floating-point signed zero. 330 /// This is a hopefully correct implementation of the Fortran 331 /// type-generic SIGN intrinsic. ROTG for complex arithmetic 332 /// doesn't require this function. C99 provides a copysign() 333 /// math library function, but we are not able to rely on the 334 /// existence of C99 functions here. 335 /// 336 /// We provide this method on purpose only for the 337 /// real-arithmetic specialization of GivensRotator. Complex 338 /// numbers don't have a sign; they have an angle. SIGN(const ScalarType & x,const ScalarType & y) const339 ScalarType SIGN (const ScalarType& x, const ScalarType& y) const { 340 typedef ScalarTraits<ScalarType> STS; 341 342 if (y > STS::zero()) { 343 return STS::magnitude (x); 344 } else if (y < STS::zero()) { 345 return -STS::magnitude (x); 346 } else { // y == STS::zero() 347 // Suppose that ScalarType& implements signed zero, as IEEE 348 // 754 - compliant floating-point numbers should. You can't 349 // use == to test for signed zero, since +0 == -0. However, 350 // 1/0 = Inf > 0 and 1/-0 = -Inf < 0. Let's hope ScalarType 351 // supports Inf... we don't need to test for Inf, just see 352 // if it's greater than or less than zero. 353 // 354 // NOTE: This ONLY works if ScalarType is real. Complex 355 // infinity doesn't have a sign, so we can't compare it with 356 // zero. That's OK, because finite complex numbers don't 357 // have a sign either; they have an angle. 358 ScalarType signedInfinity = STS::one() / y; 359 if (signedInfinity > STS::zero()) { 360 return STS::magnitude (x); 361 } else { 362 // Even if ScalarType doesn't implement signed zero, 363 // Fortran's SIGN intrinsic returns -ABS(X) if the second 364 // argument Y is zero. We imitate this behavior here. 365 return -STS::magnitude (x); 366 } 367 } 368 } 369 }; 370 371 // Implementation of complex-arithmetic specialization. 372 template<typename ScalarType> 373 void 374 GivensRotator<ScalarType, true>:: ROTG(ScalarType * ca,ScalarType * cb,typename ScalarTraits<ScalarType>::magnitudeType * c,ScalarType * s) const375 ROTG (ScalarType* ca, 376 ScalarType* cb, 377 typename ScalarTraits<ScalarType>::magnitudeType* c, 378 ScalarType* s) const 379 { 380 typedef ScalarTraits<ScalarType> STS; 381 typedef typename STS::magnitudeType MagnitudeType; 382 typedef ScalarTraits<MagnitudeType> STM; 383 384 // This is a straightforward translation into C++ of the 385 // reference BLAS' implementation of ZROTG. You can get 386 // the Fortran 77 source code of ZROTG here: 387 // 388 // http://www.netlib.org/blas/zrotg.f 389 // 390 // I used the following rules to translate Fortran types and 391 // intrinsic functions into C++: 392 // 393 // DOUBLE PRECISION -> MagnitudeType 394 // DOUBLE COMPLEX -> ScalarType 395 // CDABS -> STS::magnitude 396 // DCMPLX -> ScalarType constructor (assuming that ScalarType 397 // is std::complex<MagnitudeType>) 398 // DCONJG -> STS::conjugate 399 // DSQRT -> STM::squareroot 400 ScalarType alpha; 401 MagnitudeType norm, scale; 402 403 if (STS::magnitude (*ca) == STM::zero()) { 404 *c = STM::zero(); 405 *s = STS::one(); 406 *ca = *cb; 407 } else { 408 scale = STS::magnitude (*ca) + STS::magnitude (*cb); 409 { // I introduced temporaries into the translated BLAS code in 410 // order to make the expression easier to read and also save a 411 // few floating-point operations. 412 const MagnitudeType ca_scaled = 413 STS::magnitude (*ca / ScalarType(scale, STM::zero())); 414 const MagnitudeType cb_scaled = 415 STS::magnitude (*cb / ScalarType(scale, STM::zero())); 416 norm = scale * 417 STM::squareroot (ca_scaled*ca_scaled + cb_scaled*cb_scaled); 418 } 419 alpha = *ca / STS::magnitude (*ca); 420 *c = STS::magnitude (*ca) / norm; 421 *s = alpha * STS::conjugate (*cb) / norm; 422 *ca = alpha * norm; 423 } 424 } 425 426 // Implementation of real-arithmetic specialization. 427 template<typename ScalarType> 428 void 429 GivensRotator<ScalarType, false>:: ROTG(ScalarType * da,ScalarType * db,ScalarType * c,ScalarType * s) const430 ROTG (ScalarType* da, 431 ScalarType* db, 432 ScalarType* c, 433 ScalarType* s) const 434 { 435 typedef ScalarTraits<ScalarType> STS; 436 437 // This is a straightforward translation into C++ of the 438 // reference BLAS' implementation of DROTG. You can get 439 // the Fortran 77 source code of DROTG here: 440 // 441 // http://www.netlib.org/blas/drotg.f 442 // 443 // I used the following rules to translate Fortran types and 444 // intrinsic functions into C++: 445 // 446 // DOUBLE PRECISION -> ScalarType 447 // DABS -> STS::magnitude 448 // DSQRT -> STM::squareroot 449 // DSIGN -> SIGN (see below) 450 // 451 // DSIGN(x,y) (the old DOUBLE PRECISION type-specific form of 452 // the Fortran type-generic SIGN intrinsic) required special 453 // translation, which we did in a separate utility function in 454 // the specializaton of GivensRotator for real arithmetic. 455 // (ROTG for complex arithmetic doesn't require this function.) 456 // C99 provides a copysign() math library function, but we are 457 // not able to rely on the existence of C99 functions here. 458 ScalarType r, roe, scale, z; 459 460 roe = *db; 461 if (STS::magnitude (*da) > STS::magnitude (*db)) { 462 roe = *da; 463 } 464 scale = STS::magnitude (*da) + STS::magnitude (*db); 465 if (scale == STS::zero()) { 466 *c = STS::one(); 467 *s = STS::zero(); 468 r = STS::zero(); 469 z = STS::zero(); 470 } else { 471 // I introduced temporaries into the translated BLAS code in 472 // order to make the expression easier to read and also save 473 // a few floating-point& operations. 474 const ScalarType da_scaled = *da / scale; 475 const ScalarType db_scaled = *db / scale; 476 r = scale * STS::squareroot (da_scaled*da_scaled + db_scaled*db_scaled); 477 r = SIGN (STS::one(), roe) * r; 478 *c = *da / r; 479 *s = *db / r; 480 z = STS::one(); 481 if (STS::magnitude (*da) > STS::magnitude (*db)) { 482 z = *s; 483 } 484 if (STS::magnitude (*db) >= STS::magnitude (*da) && *c != STS::zero()) { 485 z = STS::one() / *c; 486 } 487 } 488 489 *da = r; 490 *db = z; 491 } 492 493 // Real-valued implementation of MagValue 494 template<typename ScalarType> 495 void 496 MagValue<ScalarType, false>:: blas_dabs1(const ScalarType * a,ScalarType * ret) const497 blas_dabs1(const ScalarType* a, ScalarType* ret) const 498 { 499 *ret = Teuchos::ScalarTraits<ScalarType>::magnitude( *a ); 500 } 501 502 // Complex-valued implementation of MagValue 503 template<typename ScalarType> 504 void 505 MagValue<ScalarType, true>:: blas_dabs1(const ScalarType * a,typename ScalarTraits<ScalarType>::magnitudeType * ret) const506 blas_dabs1(const ScalarType* a, typename ScalarTraits<ScalarType>::magnitudeType* ret) const 507 { 508 *ret = ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::magnitude(a->real()); 509 *ret += ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::magnitude(a->imag()); 510 } 511 512 } // namespace details 513 514 template<typename OrdinalType, typename ScalarType> 515 void 516 DefaultBLASImpl<OrdinalType, ScalarType>:: ROTG(ScalarType * da,ScalarType * db,rotg_c_type * c,ScalarType * s) const517 ROTG (ScalarType* da, 518 ScalarType* db, 519 rotg_c_type* c, 520 ScalarType* s) const 521 { 522 details::GivensRotator<ScalarType> rotator; 523 rotator.ROTG (da, db, c, s); 524 } 525 526 template<typename OrdinalType, typename ScalarType> ROT(const OrdinalType & n,ScalarType * dx,const OrdinalType & incx,ScalarType * dy,const OrdinalType & incy,MagnitudeType * c,ScalarType * s) const527 void DefaultBLASImpl<OrdinalType,ScalarType>::ROT(const OrdinalType& n, ScalarType* dx, const OrdinalType& incx, ScalarType* dy, const OrdinalType& incy, MagnitudeType* c, ScalarType* s) const 528 { 529 OrdinalType izero = OrdinalTraits<OrdinalType>::zero(); 530 ScalarType sconj = Teuchos::ScalarTraits<ScalarType>::conjugate(*s); 531 if (n <= 0) return; 532 if (incx==1 && incy==1) { 533 for(OrdinalType i=0; i<n; ++i) { 534 ScalarType temp = *c*dx[i] + sconj*dy[i]; 535 dy[i] = *c*dy[i] - sconj*dx[i]; 536 dx[i] = temp; 537 } 538 } 539 else { 540 OrdinalType ix = 0, iy = 0; 541 if (incx < izero) ix = (-n+1)*incx; 542 if (incy < izero) iy = (-n+1)*incy; 543 for(OrdinalType i=0; i<n; ++i) { 544 ScalarType temp = *c*dx[ix] + sconj*dy[iy]; 545 dy[iy] = *c*dy[iy] - sconj*dx[ix]; 546 dx[ix] = temp; 547 ix += incx; 548 iy += incy; 549 } 550 } 551 } 552 553 template<typename OrdinalType, typename ScalarType> SCAL(const OrdinalType & n,const ScalarType & alpha,ScalarType * x,const OrdinalType & incx) const554 void DefaultBLASImpl<OrdinalType, ScalarType>::SCAL(const OrdinalType& n, const ScalarType& alpha, ScalarType* x, const OrdinalType& incx) const 555 { 556 OrdinalType izero = OrdinalTraits<OrdinalType>::zero(); 557 OrdinalType ione = OrdinalTraits<OrdinalType>::one(); 558 OrdinalType i, ix = izero; 559 560 if ( n < ione || incx < ione ) 561 return; 562 563 // Scale the vector. 564 for(i = izero; i < n; i++) 565 { 566 x[ix] *= alpha; 567 ix += incx; 568 } 569 } /* end SCAL */ 570 571 template<typename OrdinalType, typename ScalarType> COPY(const OrdinalType & n,const ScalarType * x,const OrdinalType & incx,ScalarType * y,const OrdinalType & incy) const572 void DefaultBLASImpl<OrdinalType, ScalarType>::COPY(const OrdinalType& n, const ScalarType* x, const OrdinalType& incx, ScalarType* y, const OrdinalType& incy) const 573 { 574 OrdinalType izero = OrdinalTraits<OrdinalType>::zero(); 575 OrdinalType ione = OrdinalTraits<OrdinalType>::one(); 576 OrdinalType i, ix = izero, iy = izero; 577 if ( n > izero ) { 578 // Set the initial indices (ix, iy). 579 if (incx < izero) { ix = (-n+ione)*incx; } 580 if (incy < izero) { iy = (-n+ione)*incy; } 581 582 for(i = izero; i < n; i++) 583 { 584 y[iy] = x[ix]; 585 ix += incx; 586 iy += incy; 587 } 588 } 589 } /* end COPY */ 590 591 template<typename OrdinalType, typename ScalarType> 592 template <typename alpha_type, typename x_type> AXPY(const OrdinalType & n,const alpha_type alpha,const x_type * x,const OrdinalType & incx,ScalarType * y,const OrdinalType & incy) const593 void DefaultBLASImpl<OrdinalType, ScalarType>::AXPY(const OrdinalType& n, const alpha_type alpha, const x_type* x, const OrdinalType& incx, ScalarType* y, const OrdinalType& incy) const 594 { 595 OrdinalType izero = OrdinalTraits<OrdinalType>::zero(); 596 OrdinalType ione = OrdinalTraits<OrdinalType>::one(); 597 OrdinalType i, ix = izero, iy = izero; 598 if( n > izero && alpha != ScalarTraits<alpha_type>::zero()) 599 { 600 // Set the initial indices (ix, iy). 601 if (incx < izero) { ix = (-n+ione)*incx; } 602 if (incy < izero) { iy = (-n+ione)*incy; } 603 604 for(i = izero; i < n; i++) 605 { 606 y[iy] += alpha * x[ix]; 607 ix += incx; 608 iy += incy; 609 } 610 } 611 } /* end AXPY */ 612 613 template<typename OrdinalType, typename ScalarType> ASUM(const OrdinalType & n,const ScalarType * x,const OrdinalType & incx) const614 typename ScalarTraits<ScalarType>::magnitudeType DefaultBLASImpl<OrdinalType, ScalarType>::ASUM(const OrdinalType& n, const ScalarType* x, const OrdinalType& incx) const 615 { 616 OrdinalType izero = OrdinalTraits<OrdinalType>::zero(); 617 OrdinalType ione = OrdinalTraits<OrdinalType>::one(); 618 typename ScalarTraits<ScalarType>::magnitudeType temp, result = 619 ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::zero(); 620 OrdinalType i, ix = izero; 621 622 if ( n < ione || incx < ione ) 623 return result; 624 625 details::MagValue<ScalarType, ScalarTraits<ScalarType>::isComplex> mval; 626 for (i = izero; i < n; i++) 627 { 628 mval.blas_dabs1( &x[ix], &temp ); 629 result += temp; 630 ix += incx; 631 } 632 633 return result; 634 } /* end ASUM */ 635 636 template<typename OrdinalType, typename ScalarType> 637 template <typename x_type, typename y_type> DOT(const OrdinalType & n,const x_type * x,const OrdinalType & incx,const y_type * y,const OrdinalType & incy) const638 ScalarType DefaultBLASImpl<OrdinalType, ScalarType>::DOT(const OrdinalType& n, const x_type* x, const OrdinalType& incx, const y_type* y, const OrdinalType& incy) const 639 { 640 OrdinalType izero = OrdinalTraits<OrdinalType>::zero(); 641 OrdinalType ione = OrdinalTraits<OrdinalType>::one(); 642 ScalarType result = ScalarTraits<ScalarType>::zero(); 643 OrdinalType i, ix = izero, iy = izero; 644 if( n > izero ) 645 { 646 // Set the initial indices (ix,iy). 647 if (incx < izero) { ix = (-n+ione)*incx; } 648 if (incy < izero) { iy = (-n+ione)*incy; } 649 650 for(i = izero; i < n; i++) 651 { 652 result += ScalarTraits<x_type>::conjugate(x[ix]) * y[iy]; 653 ix += incx; 654 iy += incy; 655 } 656 } 657 return result; 658 } /* end DOT */ 659 660 template<typename OrdinalType, typename ScalarType> NRM2(const OrdinalType & n,const ScalarType * x,const OrdinalType & incx) const661 typename ScalarTraits<ScalarType>::magnitudeType DefaultBLASImpl<OrdinalType, ScalarType>::NRM2(const OrdinalType& n, const ScalarType* x, const OrdinalType& incx) const 662 { 663 OrdinalType izero = OrdinalTraits<OrdinalType>::zero(); 664 OrdinalType ione = OrdinalTraits<OrdinalType>::one(); 665 typename ScalarTraits<ScalarType>::magnitudeType result = 666 ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::zero(); 667 OrdinalType i, ix = izero; 668 669 if ( n < ione || incx < ione ) 670 return result; 671 672 for(i = izero; i < n; i++) 673 { 674 result += ScalarTraits<ScalarType>::magnitude(ScalarTraits<ScalarType>::conjugate(x[ix]) * x[ix]); 675 ix += incx; 676 } 677 result = ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::squareroot(result); 678 return result; 679 } /* end NRM2 */ 680 681 template<typename OrdinalType, typename ScalarType> 682 OrdinalType DefaultBLASImpl<OrdinalType, ScalarType>::IAMAX(const OrdinalType& n, const ScalarType* x, const OrdinalType& incx) const 683 { 684 OrdinalType izero = OrdinalTraits<OrdinalType>::zero(); 685 OrdinalType ione = OrdinalTraits<OrdinalType>::one(); 686 OrdinalType result = izero, ix = izero, i; 687 typename ScalarTraits<ScalarType>::magnitudeType curval = 688 ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::zero(); 689 typename ScalarTraits<ScalarType>::magnitudeType maxval = 690 ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::zero(); 691 692 if ( n < ione || incx < ione ) 693 return result; 694 695 details::MagValue<ScalarType, ScalarTraits<ScalarType>::isComplex> mval; 696 697 mval.blas_dabs1( &x[ix], &maxval ); 698 ix += incx; 699 for(i = ione; i < n; i++) 700 { 701 mval.blas_dabs1( &x[ix], &curval ); 702 if(curval > maxval) 703 { 704 result = i; 705 maxval = curval; 706 } 707 ix += incx; 708 } 709 710 return result + 1; // the BLAS I?AMAX functions return 1-indexed (Fortran-style) values 711 } /* end IAMAX */ 712 713 //------------------------------------------------------------------------------------------ 714 // LEVEL 2 BLAS ROUTINES 715 //------------------------------------------------------------------------------------------ 716 template<typename OrdinalType, typename ScalarType> 717 template <typename alpha_type, typename A_type, typename x_type, typename beta_type> GEMV(ETransp trans,const OrdinalType & m,const OrdinalType & n,const alpha_type alpha,const A_type * A,const OrdinalType & lda,const x_type * x,const OrdinalType & incx,const beta_type beta,ScalarType * y,const OrdinalType & incy) const718 void DefaultBLASImpl<OrdinalType, ScalarType>::GEMV(ETransp trans, const OrdinalType& m, const OrdinalType& n, const alpha_type alpha, const A_type* A, const OrdinalType& lda, const x_type* x, const OrdinalType& incx, const beta_type beta, ScalarType* y, const OrdinalType& incy) const 719 { 720 OrdinalType izero = OrdinalTraits<OrdinalType>::zero(); 721 OrdinalType ione = OrdinalTraits<OrdinalType>::one(); 722 alpha_type alpha_zero = ScalarTraits<alpha_type>::zero(); 723 beta_type beta_zero = ScalarTraits<beta_type>::zero(); 724 x_type x_zero = ScalarTraits<x_type>::zero(); 725 ScalarType y_zero = ScalarTraits<ScalarType>::zero(); 726 beta_type beta_one = ScalarTraits<beta_type>::one(); 727 bool noConj = true; 728 bool BadArgument = false; 729 730 // Quick return if there is nothing to do! 731 if( m == izero || n == izero || ( alpha == alpha_zero && beta == beta_one ) ){ return; } 732 733 // Otherwise, we need to check the argument list. 734 if( m < izero ) { 735 std::cout << "BLAS::GEMV Error: M == " << m << std::endl; 736 BadArgument = true; 737 } 738 if( n < izero ) { 739 std::cout << "BLAS::GEMV Error: N == " << n << std::endl; 740 BadArgument = true; 741 } 742 if( lda < m ) { 743 std::cout << "BLAS::GEMV Error: LDA < MAX(1,M)"<< std::endl; 744 BadArgument = true; 745 } 746 if( incx == izero ) { 747 std::cout << "BLAS::GEMV Error: INCX == 0"<< std::endl; 748 BadArgument = true; 749 } 750 if( incy == izero ) { 751 std::cout << "BLAS::GEMV Error: INCY == 0"<< std::endl; 752 BadArgument = true; 753 } 754 755 if(!BadArgument) { 756 OrdinalType i, j, lenx, leny, ix, iy, jx, jy; 757 OrdinalType kx = izero, ky = izero; 758 ScalarType temp; 759 760 // Determine the lengths of the vectors x and y. 761 if(ETranspChar[trans] == 'N') { 762 lenx = n; 763 leny = m; 764 } else { 765 lenx = m; 766 leny = n; 767 } 768 769 // Determine if this is a conjugate tranpose 770 noConj = (ETranspChar[trans] == 'T'); 771 772 // Set the starting pointers for the vectors x and y if incx/y < 0. 773 if (incx < izero ) { kx = (ione - lenx)*incx; } 774 if (incy < izero ) { ky = (ione - leny)*incy; } 775 776 // Form y = beta*y 777 ix = kx; iy = ky; 778 if(beta != beta_one) { 779 if (incy == ione) { 780 if (beta == beta_zero) { 781 for(i = izero; i < leny; i++) { y[i] = y_zero; } 782 } else { 783 for(i = izero; i < leny; i++) { y[i] *= beta; } 784 } 785 } else { 786 if (beta == beta_zero) { 787 for(i = izero; i < leny; i++) { 788 y[iy] = y_zero; 789 iy += incy; 790 } 791 } else { 792 for(i = izero; i < leny; i++) { 793 y[iy] *= beta; 794 iy += incy; 795 } 796 } 797 } 798 } 799 800 // Return if we don't have to do anything more. 801 if(alpha == alpha_zero) { return; } 802 803 if( ETranspChar[trans] == 'N' ) { 804 // Form y = alpha*A*y 805 jx = kx; 806 if (incy == ione) { 807 for(j = izero; j < n; j++) { 808 if (x[jx] != x_zero) { 809 temp = alpha*x[jx]; 810 for(i = izero; i < m; i++) { 811 y[i] += temp*A[j*lda + i]; 812 } 813 } 814 jx += incx; 815 } 816 } else { 817 for(j = izero; j < n; j++) { 818 if (x[jx] != x_zero) { 819 temp = alpha*x[jx]; 820 iy = ky; 821 for(i = izero; i < m; i++) { 822 y[iy] += temp*A[j*lda + i]; 823 iy += incy; 824 } 825 } 826 jx += incx; 827 } 828 } 829 } else { 830 jy = ky; 831 if (incx == ione) { 832 for(j = izero; j < n; j++) { 833 temp = y_zero; 834 if ( noConj ) { 835 for(i = izero; i < m; i++) { 836 temp += A[j*lda + i]*x[i]; 837 } 838 } else { 839 for(i = izero; i < m; i++) { 840 temp += ScalarTraits<A_type>::conjugate(A[j*lda + i])*x[i]; 841 } 842 } 843 y[jy] += alpha*temp; 844 jy += incy; 845 } 846 } else { 847 for(j = izero; j < n; j++) { 848 temp = y_zero; 849 ix = kx; 850 if ( noConj ) { 851 for (i = izero; i < m; i++) { 852 temp += A[j*lda + i]*x[ix]; 853 ix += incx; 854 } 855 } else { 856 for (i = izero; i < m; i++) { 857 temp += ScalarTraits<A_type>::conjugate(A[j*lda + i])*x[ix]; 858 ix += incx; 859 } 860 } 861 y[jy] += alpha*temp; 862 jy += incy; 863 } 864 } 865 } 866 } /* if (!BadArgument) */ 867 } /* end GEMV */ 868 869 template<typename OrdinalType, typename ScalarType> 870 template <typename A_type> TRMV(EUplo uplo,ETransp trans,EDiag diag,const OrdinalType & n,const A_type * A,const OrdinalType & lda,ScalarType * x,const OrdinalType & incx) const871 void DefaultBLASImpl<OrdinalType, ScalarType>::TRMV(EUplo uplo, ETransp trans, EDiag diag, const OrdinalType& n, const A_type* A, const OrdinalType& lda, ScalarType* x, const OrdinalType& incx) const 872 { 873 OrdinalType izero = OrdinalTraits<OrdinalType>::zero(); 874 OrdinalType ione = OrdinalTraits<OrdinalType>::one(); 875 ScalarType zero = ScalarTraits<ScalarType>::zero(); 876 bool BadArgument = false; 877 bool noConj = true; 878 879 // Quick return if there is nothing to do! 880 if( n == izero ){ return; } 881 882 // Otherwise, we need to check the argument list. 883 if( n < izero ) { 884 std::cout << "BLAS::TRMV Error: N == " << n << std::endl; 885 BadArgument = true; 886 } 887 if( lda < n ) { 888 std::cout << "BLAS::TRMV Error: LDA < MAX(1,N)"<< std::endl; 889 BadArgument = true; 890 } 891 if( incx == izero ) { 892 std::cout << "BLAS::TRMV Error: INCX == 0"<< std::endl; 893 BadArgument = true; 894 } 895 896 if(!BadArgument) { 897 OrdinalType i, j, ix, jx, kx = izero; 898 ScalarType temp; 899 bool noUnit = (EDiagChar[diag] == 'N'); 900 901 // Determine if this is a conjugate tranpose 902 noConj = (ETranspChar[trans] == 'T'); 903 904 // Set the starting pointer for the vector x if incx < 0. 905 if (incx < izero) { kx = (-n+ione)*incx; } 906 907 // Start the operations for a nontransposed triangular matrix 908 if (ETranspChar[trans] == 'N') { 909 /* Compute x = A*x */ 910 if (EUploChar[uplo] == 'U') { 911 /* A is an upper triangular matrix */ 912 if (incx == ione) { 913 for (j=izero; j<n; j++) { 914 if (x[j] != zero) { 915 temp = x[j]; 916 for (i=izero; i<j; i++) { 917 x[i] += temp*A[j*lda + i]; 918 } 919 if ( noUnit ) 920 x[j] *= A[j*lda + j]; 921 } 922 } 923 } else { 924 jx = kx; 925 for (j=izero; j<n; j++) { 926 if (x[jx] != zero) { 927 temp = x[jx]; 928 ix = kx; 929 for (i=izero; i<j; i++) { 930 x[ix] += temp*A[j*lda + i]; 931 ix += incx; 932 } 933 if ( noUnit ) 934 x[jx] *= A[j*lda + j]; 935 } 936 jx += incx; 937 } 938 } /* if (incx == ione) */ 939 } else { /* A is a lower triangular matrix */ 940 if (incx == ione) { 941 for (j=n-ione; j>-ione; j--) { 942 if (x[j] != zero) { 943 temp = x[j]; 944 for (i=n-ione; i>j; i--) { 945 x[i] += temp*A[j*lda + i]; 946 } 947 if ( noUnit ) 948 x[j] *= A[j*lda + j]; 949 } 950 } 951 } else { 952 kx += (n-ione)*incx; 953 jx = kx; 954 for (j=n-ione; j>-ione; j--) { 955 if (x[jx] != zero) { 956 temp = x[jx]; 957 ix = kx; 958 for (i=n-ione; i>j; i--) { 959 x[ix] += temp*A[j*lda + i]; 960 ix -= incx; 961 } 962 if ( noUnit ) 963 x[jx] *= A[j*lda + j]; 964 } 965 jx -= incx; 966 } 967 } 968 } /* if (EUploChar[uplo]=='U') */ 969 } else { /* A is transposed/conjugated */ 970 /* Compute x = A'*x */ 971 if (EUploChar[uplo]=='U') { 972 /* A is an upper triangular matrix */ 973 if (incx == ione) { 974 for (j=n-ione; j>-ione; j--) { 975 temp = x[j]; 976 if ( noConj ) { 977 if ( noUnit ) 978 temp *= A[j*lda + j]; 979 for (i=j-ione; i>-ione; i--) { 980 temp += A[j*lda + i]*x[i]; 981 } 982 } else { 983 if ( noUnit ) 984 temp *= ScalarTraits<A_type>::conjugate(A[j*lda + j]); 985 for (i=j-ione; i>-ione; i--) { 986 temp += ScalarTraits<A_type>::conjugate(A[j*lda + i])*x[i]; 987 } 988 } 989 x[j] = temp; 990 } 991 } else { 992 jx = kx + (n-ione)*incx; 993 for (j=n-ione; j>-ione; j--) { 994 temp = x[jx]; 995 ix = jx; 996 if ( noConj ) { 997 if ( noUnit ) 998 temp *= A[j*lda + j]; 999 for (i=j-ione; i>-ione; i--) { 1000 ix -= incx; 1001 temp += A[j*lda + i]*x[ix]; 1002 } 1003 } else { 1004 if ( noUnit ) 1005 temp *= ScalarTraits<A_type>::conjugate(A[j*lda + j]); 1006 for (i=j-ione; i>-ione; i--) { 1007 ix -= incx; 1008 temp += ScalarTraits<A_type>::conjugate(A[j*lda + i])*x[ix]; 1009 } 1010 } 1011 x[jx] = temp; 1012 jx -= incx; 1013 } 1014 } 1015 } else { 1016 /* A is a lower triangular matrix */ 1017 if (incx == ione) { 1018 for (j=izero; j<n; j++) { 1019 temp = x[j]; 1020 if ( noConj ) { 1021 if ( noUnit ) 1022 temp *= A[j*lda + j]; 1023 for (i=j+ione; i<n; i++) { 1024 temp += A[j*lda + i]*x[i]; 1025 } 1026 } else { 1027 if ( noUnit ) 1028 temp *= ScalarTraits<A_type>::conjugate(A[j*lda + j]); 1029 for (i=j+ione; i<n; i++) { 1030 temp += ScalarTraits<A_type>::conjugate(A[j*lda + i])*x[i]; 1031 } 1032 } 1033 x[j] = temp; 1034 } 1035 } else { 1036 jx = kx; 1037 for (j=izero; j<n; j++) { 1038 temp = x[jx]; 1039 ix = jx; 1040 if ( noConj ) { 1041 if ( noUnit ) 1042 temp *= A[j*lda + j]; 1043 for (i=j+ione; i<n; i++) { 1044 ix += incx; 1045 temp += A[j*lda + i]*x[ix]; 1046 } 1047 } else { 1048 if ( noUnit ) 1049 temp *= ScalarTraits<A_type>::conjugate(A[j*lda + j]); 1050 for (i=j+ione; i<n; i++) { 1051 ix += incx; 1052 temp += ScalarTraits<A_type>::conjugate(A[j*lda + i])*x[ix]; 1053 } 1054 } 1055 x[jx] = temp; 1056 jx += incx; 1057 } 1058 } 1059 } /* if (EUploChar[uplo]=='U') */ 1060 } /* if (ETranspChar[trans]=='N') */ 1061 } /* if (!BadArgument) */ 1062 } /* end TRMV */ 1063 1064 template<typename OrdinalType, typename ScalarType> 1065 template <typename alpha_type, typename x_type, typename y_type> GER(const OrdinalType & m,const OrdinalType & n,const alpha_type alpha,const x_type * x,const OrdinalType & incx,const y_type * y,const OrdinalType & incy,ScalarType * A,const OrdinalType & lda) const1066 void DefaultBLASImpl<OrdinalType, ScalarType>::GER(const OrdinalType& m, const OrdinalType& n, const alpha_type alpha, const x_type* x, const OrdinalType& incx, const y_type* y, const OrdinalType& incy, ScalarType* A, const OrdinalType& lda) const 1067 { 1068 OrdinalType izero = OrdinalTraits<OrdinalType>::zero(); 1069 OrdinalType ione = OrdinalTraits<OrdinalType>::one(); 1070 alpha_type alpha_zero = ScalarTraits<alpha_type>::zero(); 1071 y_type y_zero = ScalarTraits<y_type>::zero(); 1072 bool BadArgument = false; 1073 1074 // Quick return if there is nothing to do! 1075 if( m == izero || n == izero || alpha == alpha_zero ){ return; } 1076 1077 // Otherwise, we need to check the argument list. 1078 if( m < izero ) { 1079 std::cout << "BLAS::GER Error: M == " << m << std::endl; 1080 BadArgument = true; 1081 } 1082 if( n < izero ) { 1083 std::cout << "BLAS::GER Error: N == " << n << std::endl; 1084 BadArgument = true; 1085 } 1086 if( lda < m ) { 1087 std::cout << "BLAS::GER Error: LDA < MAX(1,M)"<< std::endl; 1088 BadArgument = true; 1089 } 1090 if( incx == 0 ) { 1091 std::cout << "BLAS::GER Error: INCX == 0"<< std::endl; 1092 BadArgument = true; 1093 } 1094 if( incy == 0 ) { 1095 std::cout << "BLAS::GER Error: INCY == 0"<< std::endl; 1096 BadArgument = true; 1097 } 1098 1099 if(!BadArgument) { 1100 OrdinalType i, j, ix, jy = izero, kx = izero; 1101 ScalarType temp; 1102 1103 // Set the starting pointers for the vectors x and y if incx/y < 0. 1104 if (incx < izero) { kx = (-m+ione)*incx; } 1105 if (incy < izero) { jy = (-n+ione)*incy; } 1106 1107 // Start the operations for incx == 1 1108 if( incx == ione ) { 1109 for( j=izero; j<n; j++ ) { 1110 if ( y[jy] != y_zero ) { 1111 temp = alpha*y[jy]; 1112 for ( i=izero; i<m; i++ ) { 1113 A[j*lda + i] += x[i]*temp; 1114 } 1115 } 1116 jy += incy; 1117 } 1118 } 1119 else { // Start the operations for incx != 1 1120 for( j=izero; j<n; j++ ) { 1121 if ( y[jy] != y_zero ) { 1122 temp = alpha*y[jy]; 1123 ix = kx; 1124 for( i=izero; i<m; i++ ) { 1125 A[j*lda + i] += x[ix]*temp; 1126 ix += incx; 1127 } 1128 } 1129 jy += incy; 1130 } 1131 } 1132 } /* if(!BadArgument) */ 1133 } /* end GER */ 1134 1135 //------------------------------------------------------------------------------------------ 1136 // LEVEL 3 BLAS ROUTINES 1137 //------------------------------------------------------------------------------------------ 1138 1139 template<typename OrdinalType, typename ScalarType> 1140 template <typename alpha_type, typename A_type, typename B_type, typename beta_type> GEMM(ETransp transa,ETransp transb,const OrdinalType & m,const OrdinalType & n,const OrdinalType & k,const alpha_type alpha,const A_type * A,const OrdinalType & lda,const B_type * B,const OrdinalType & ldb,const beta_type beta,ScalarType * C,const OrdinalType & ldc) const1141 void DefaultBLASImpl<OrdinalType, ScalarType>::GEMM(ETransp transa, ETransp transb, const OrdinalType& m, const OrdinalType& n, const OrdinalType& k, const alpha_type alpha, const A_type* A, const OrdinalType& lda, const B_type* B, const OrdinalType& ldb, const beta_type beta, ScalarType* C, const OrdinalType& ldc) const 1142 { 1143 1144 OrdinalType izero = OrdinalTraits<OrdinalType>::zero(); 1145 alpha_type alpha_zero = ScalarTraits<alpha_type>::zero(); 1146 beta_type beta_zero = ScalarTraits<beta_type>::zero(); 1147 B_type B_zero = ScalarTraits<B_type>::zero(); 1148 ScalarType C_zero = ScalarTraits<ScalarType>::zero(); 1149 beta_type beta_one = ScalarTraits<beta_type>::one(); 1150 OrdinalType i, j, p; 1151 OrdinalType NRowA = m, NRowB = k; 1152 ScalarType temp; 1153 bool BadArgument = false; 1154 bool conjA = false, conjB = false; 1155 1156 // Change dimensions of matrix if either matrix is transposed 1157 if( !(ETranspChar[transa]=='N') ) { 1158 NRowA = k; 1159 } 1160 if( !(ETranspChar[transb]=='N') ) { 1161 NRowB = n; 1162 } 1163 1164 // Quick return if there is nothing to do! 1165 if( (m==izero) || (n==izero) || (((alpha==alpha_zero)||(k==izero)) && (beta==beta_one)) ){ return; } 1166 if( m < izero ) { 1167 std::cout << "BLAS::GEMM Error: M == " << m << std::endl; 1168 BadArgument = true; 1169 } 1170 if( n < izero ) { 1171 std::cout << "BLAS::GEMM Error: N == " << n << std::endl; 1172 BadArgument = true; 1173 } 1174 if( k < izero ) { 1175 std::cout << "BLAS::GEMM Error: K == " << k << std::endl; 1176 BadArgument = true; 1177 } 1178 if( lda < NRowA ) { 1179 std::cout << "BLAS::GEMM Error: LDA < "<<NRowA<<std::endl; 1180 BadArgument = true; 1181 } 1182 if( ldb < NRowB ) { 1183 std::cout << "BLAS::GEMM Error: LDB < "<<NRowB<<std::endl; 1184 BadArgument = true; 1185 } 1186 if( ldc < m ) { 1187 std::cout << "BLAS::GEMM Error: LDC < MAX(1,M)"<< std::endl; 1188 BadArgument = true; 1189 } 1190 1191 if(!BadArgument) { 1192 1193 // Determine if this is a conjugate tranpose 1194 conjA = (ETranspChar[transa] == 'C'); 1195 conjB = (ETranspChar[transb] == 'C'); 1196 1197 // Only need to scale the resulting matrix C. 1198 if( alpha == alpha_zero ) { 1199 if( beta == beta_zero ) { 1200 for (j=izero; j<n; j++) { 1201 for (i=izero; i<m; i++) { 1202 C[j*ldc + i] = C_zero; 1203 } 1204 } 1205 } else { 1206 for (j=izero; j<n; j++) { 1207 for (i=izero; i<m; i++) { 1208 C[j*ldc + i] *= beta; 1209 } 1210 } 1211 } 1212 return; 1213 } 1214 // 1215 // Now start the operations. 1216 // 1217 if ( ETranspChar[transb]=='N' ) { 1218 if ( ETranspChar[transa]=='N' ) { 1219 // Compute C = alpha*A*B + beta*C 1220 for (j=izero; j<n; j++) { 1221 if( beta == beta_zero ) { 1222 for (i=izero; i<m; i++){ 1223 C[j*ldc + i] = C_zero; 1224 } 1225 } else if( beta != beta_one ) { 1226 for (i=izero; i<m; i++){ 1227 C[j*ldc + i] *= beta; 1228 } 1229 } 1230 for (p=izero; p<k; p++){ 1231 if (B[j*ldb + p] != B_zero ){ 1232 temp = alpha*B[j*ldb + p]; 1233 for (i=izero; i<m; i++) { 1234 C[j*ldc + i] += temp*A[p*lda + i]; 1235 } 1236 } 1237 } 1238 } 1239 } else if ( conjA ) { 1240 // Compute C = alpha*conj(A')*B + beta*C 1241 for (j=izero; j<n; j++) { 1242 for (i=izero; i<m; i++) { 1243 temp = C_zero; 1244 for (p=izero; p<k; p++) { 1245 temp += ScalarTraits<A_type>::conjugate(A[i*lda + p])*B[j*ldb + p]; 1246 } 1247 if (beta == beta_zero) { 1248 C[j*ldc + i] = alpha*temp; 1249 } else { 1250 C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i]; 1251 } 1252 } 1253 } 1254 } else { 1255 // Compute C = alpha*A'*B + beta*C 1256 for (j=izero; j<n; j++) { 1257 for (i=izero; i<m; i++) { 1258 temp = C_zero; 1259 for (p=izero; p<k; p++) { 1260 temp += A[i*lda + p]*B[j*ldb + p]; 1261 } 1262 if (beta == beta_zero) { 1263 C[j*ldc + i] = alpha*temp; 1264 } else { 1265 C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i]; 1266 } 1267 } 1268 } 1269 } 1270 } else if ( ETranspChar[transa]=='N' ) { 1271 if ( conjB ) { 1272 // Compute C = alpha*A*conj(B') + beta*C 1273 for (j=izero; j<n; j++) { 1274 if (beta == beta_zero) { 1275 for (i=izero; i<m; i++) { 1276 C[j*ldc + i] = C_zero; 1277 } 1278 } else if ( beta != beta_one ) { 1279 for (i=izero; i<m; i++) { 1280 C[j*ldc + i] *= beta; 1281 } 1282 } 1283 for (p=izero; p<k; p++) { 1284 if (B[p*ldb + j] != B_zero) { 1285 temp = alpha*ScalarTraits<B_type>::conjugate(B[p*ldb + j]); 1286 for (i=izero; i<m; i++) { 1287 C[j*ldc + i] += temp*A[p*lda + i]; 1288 } 1289 } 1290 } 1291 } 1292 } else { 1293 // Compute C = alpha*A*B' + beta*C 1294 for (j=izero; j<n; j++) { 1295 if (beta == beta_zero) { 1296 for (i=izero; i<m; i++) { 1297 C[j*ldc + i] = C_zero; 1298 } 1299 } else if ( beta != beta_one ) { 1300 for (i=izero; i<m; i++) { 1301 C[j*ldc + i] *= beta; 1302 } 1303 } 1304 for (p=izero; p<k; p++) { 1305 if (B[p*ldb + j] != B_zero) { 1306 temp = alpha*B[p*ldb + j]; 1307 for (i=izero; i<m; i++) { 1308 C[j*ldc + i] += temp*A[p*lda + i]; 1309 } 1310 } 1311 } 1312 } 1313 } 1314 } else if ( conjA ) { 1315 if ( conjB ) { 1316 // Compute C = alpha*conj(A')*conj(B') + beta*C 1317 for (j=izero; j<n; j++) { 1318 for (i=izero; i<m; i++) { 1319 temp = C_zero; 1320 for (p=izero; p<k; p++) { 1321 temp += ScalarTraits<A_type>::conjugate(A[i*lda + p]) 1322 * ScalarTraits<B_type>::conjugate(B[p*ldb + j]); 1323 } 1324 if (beta == beta_zero) { 1325 C[j*ldc + i] = alpha*temp; 1326 } else { 1327 C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i]; 1328 } 1329 } 1330 } 1331 } else { 1332 // Compute C = alpha*conj(A')*B' + beta*C 1333 for (j=izero; j<n; j++) { 1334 for (i=izero; i<m; i++) { 1335 temp = C_zero; 1336 for (p=izero; p<k; p++) { 1337 temp += ScalarTraits<A_type>::conjugate(A[i*lda + p]) 1338 * B[p*ldb + j]; 1339 } 1340 if (beta == beta_zero) { 1341 C[j*ldc + i] = alpha*temp; 1342 } else { 1343 C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i]; 1344 } 1345 } 1346 } 1347 } 1348 } else { 1349 if ( conjB ) { 1350 // Compute C = alpha*A'*conj(B') + beta*C 1351 for (j=izero; j<n; j++) { 1352 for (i=izero; i<m; i++) { 1353 temp = C_zero; 1354 for (p=izero; p<k; p++) { 1355 temp += A[i*lda + p] 1356 * ScalarTraits<B_type>::conjugate(B[p*ldb + j]); 1357 } 1358 if (beta == beta_zero) { 1359 C[j*ldc + i] = alpha*temp; 1360 } else { 1361 C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i]; 1362 } 1363 } 1364 } 1365 } else { 1366 // Compute C = alpha*A'*B' + beta*C 1367 for (j=izero; j<n; j++) { 1368 for (i=izero; i<m; i++) { 1369 temp = C_zero; 1370 for (p=izero; p<k; p++) { 1371 temp += A[i*lda + p]*B[p*ldb + j]; 1372 } 1373 if (beta == beta_zero) { 1374 C[j*ldc + i] = alpha*temp; 1375 } else { 1376 C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i]; 1377 } 1378 } 1379 } 1380 } // end if (ETranspChar[transa]=='N') ... 1381 } // end if (ETranspChar[transb]=='N') ... 1382 } // end if (!BadArgument) ... 1383 } // end of GEMM 1384 1385 1386 template<typename OrdinalType, typename ScalarType> 1387 void DefaultBLASImpl<OrdinalType, ScalarType>:: SWAP(const OrdinalType & n,ScalarType * const x,const OrdinalType & incx,ScalarType * const y,const OrdinalType & incy) const1388 SWAP (const OrdinalType& n, ScalarType* const x, const OrdinalType& incx, 1389 ScalarType* const y, const OrdinalType& incy) const 1390 { 1391 if (n <= 0) { 1392 return; 1393 } 1394 1395 if (incx == 1 && incy == 1) { 1396 for (int i = 0; i < n; ++i) { 1397 ScalarType tmp = x[i]; 1398 x[i] = y[i]; 1399 y[i] = tmp; 1400 } 1401 return; 1402 } 1403 1404 int ix = 1; 1405 int iy = 1; 1406 if (incx < 0) { 1407 ix = (1-n) * incx + 1; 1408 } 1409 if (incy < 0) { 1410 iy = (1-n) * incy + 1; 1411 } 1412 1413 for (int i = 1; i <= n; ++i) { 1414 ScalarType tmp = x[ix - 1]; 1415 x[ix - 1] = y[iy - 1]; 1416 y[iy - 1] = tmp; 1417 ix += incx; 1418 iy += incy; 1419 } 1420 } 1421 1422 1423 template<typename OrdinalType, typename ScalarType> 1424 template <typename alpha_type, typename A_type, typename B_type, typename beta_type> SYMM(ESide side,EUplo uplo,const OrdinalType & m,const OrdinalType & n,const alpha_type alpha,const A_type * A,const OrdinalType & lda,const B_type * B,const OrdinalType & ldb,const beta_type beta,ScalarType * C,const OrdinalType & ldc) const1425 void DefaultBLASImpl<OrdinalType, ScalarType>::SYMM(ESide side, EUplo uplo, const OrdinalType& m, const OrdinalType& n, const alpha_type alpha, const A_type* A, const OrdinalType& lda, const B_type* B, const OrdinalType& ldb, const beta_type beta, ScalarType* C, const OrdinalType& ldc) const 1426 { 1427 OrdinalType izero = OrdinalTraits<OrdinalType>::zero(); 1428 OrdinalType ione = OrdinalTraits<OrdinalType>::one(); 1429 alpha_type alpha_zero = ScalarTraits<alpha_type>::zero(); 1430 beta_type beta_zero = ScalarTraits<beta_type>::zero(); 1431 ScalarType C_zero = ScalarTraits<ScalarType>::zero(); 1432 beta_type beta_one = ScalarTraits<beta_type>::one(); 1433 OrdinalType i, j, k, NRowA = m; 1434 ScalarType temp1, temp2; 1435 bool BadArgument = false; 1436 bool Upper = (EUploChar[uplo] == 'U'); 1437 if (ESideChar[side] == 'R') { NRowA = n; } 1438 1439 // Quick return. 1440 if ( (m==izero) || (n==izero) || ( (alpha==alpha_zero)&&(beta==beta_one) ) ) { return; } 1441 if( m < izero ) { 1442 std::cout << "BLAS::SYMM Error: M == "<< m << std::endl; 1443 BadArgument = true; } 1444 if( n < izero ) { 1445 std::cout << "BLAS::SYMM Error: N == "<< n << std::endl; 1446 BadArgument = true; } 1447 if( lda < NRowA ) { 1448 std::cout << "BLAS::SYMM Error: LDA < "<<NRowA<<std::endl; 1449 BadArgument = true; } 1450 if( ldb < m ) { 1451 std::cout << "BLAS::SYMM Error: LDB < MAX(1,M)"<<std::endl; 1452 BadArgument = true; } 1453 if( ldc < m ) { 1454 std::cout << "BLAS::SYMM Error: LDC < MAX(1,M)"<<std::endl; 1455 BadArgument = true; } 1456 1457 if(!BadArgument) { 1458 1459 // Only need to scale C and return. 1460 if (alpha == alpha_zero) { 1461 if (beta == beta_zero ) { 1462 for (j=izero; j<n; j++) { 1463 for (i=izero; i<m; i++) { 1464 C[j*ldc + i] = C_zero; 1465 } 1466 } 1467 } else { 1468 for (j=izero; j<n; j++) { 1469 for (i=izero; i<m; i++) { 1470 C[j*ldc + i] *= beta; 1471 } 1472 } 1473 } 1474 return; 1475 } 1476 1477 if ( ESideChar[side] == 'L') { 1478 // Compute C = alpha*A*B + beta*C 1479 1480 if (Upper) { 1481 // The symmetric part of A is stored in the upper triangular part of the matrix. 1482 for (j=izero; j<n; j++) { 1483 for (i=izero; i<m; i++) { 1484 temp1 = alpha*B[j*ldb + i]; 1485 temp2 = C_zero; 1486 for (k=izero; k<i; k++) { 1487 C[j*ldc + k] += temp1*A[i*lda + k]; 1488 temp2 += B[j*ldb + k]*A[i*lda + k]; 1489 } 1490 if (beta == beta_zero) { 1491 C[j*ldc + i] = temp1*A[i*lda + i] + alpha*temp2; 1492 } else { 1493 C[j*ldc + i] = beta*C[j*ldc + i] + temp1*A[i*lda + i] + alpha*temp2; 1494 } 1495 } 1496 } 1497 } else { 1498 // The symmetric part of A is stored in the lower triangular part of the matrix. 1499 for (j=izero; j<n; j++) { 1500 for (i=m-ione; i>-ione; i--) { 1501 temp1 = alpha*B[j*ldb + i]; 1502 temp2 = C_zero; 1503 for (k=i+ione; k<m; k++) { 1504 C[j*ldc + k] += temp1*A[i*lda + k]; 1505 temp2 += B[j*ldb + k]*A[i*lda + k]; 1506 } 1507 if (beta == beta_zero) { 1508 C[j*ldc + i] = temp1*A[i*lda + i] + alpha*temp2; 1509 } else { 1510 C[j*ldc + i] = beta*C[j*ldc + i] + temp1*A[i*lda + i] + alpha*temp2; 1511 } 1512 } 1513 } 1514 } 1515 } else { 1516 // Compute C = alpha*B*A + beta*C. 1517 for (j=izero; j<n; j++) { 1518 temp1 = alpha*A[j*lda + j]; 1519 if (beta == beta_zero) { 1520 for (i=izero; i<m; i++) { 1521 C[j*ldc + i] = temp1*B[j*ldb + i]; 1522 } 1523 } else { 1524 for (i=izero; i<m; i++) { 1525 C[j*ldc + i] = beta*C[j*ldc + i] + temp1*B[j*ldb + i]; 1526 } 1527 } 1528 for (k=izero; k<j; k++) { 1529 if (Upper) { 1530 temp1 = alpha*A[j*lda + k]; 1531 } else { 1532 temp1 = alpha*A[k*lda + j]; 1533 } 1534 for (i=izero; i<m; i++) { 1535 C[j*ldc + i] += temp1*B[k*ldb + i]; 1536 } 1537 } 1538 for (k=j+ione; k<n; k++) { 1539 if (Upper) { 1540 temp1 = alpha*A[k*lda + j]; 1541 } else { 1542 temp1 = alpha*A[j*lda + k]; 1543 } 1544 for (i=izero; i<m; i++) { 1545 C[j*ldc + i] += temp1*B[k*ldb + i]; 1546 } 1547 } 1548 } 1549 } // end if (ESideChar[side]=='L') ... 1550 } // end if(!BadArgument) ... 1551 } // end SYMM 1552 1553 template<typename OrdinalType, typename ScalarType> 1554 template <typename alpha_type, typename A_type, typename beta_type> SYRK(EUplo uplo,ETransp trans,const OrdinalType & n,const OrdinalType & k,const alpha_type alpha,const A_type * A,const OrdinalType & lda,const beta_type beta,ScalarType * C,const OrdinalType & ldc) const1555 void DefaultBLASImpl<OrdinalType, ScalarType>::SYRK(EUplo uplo, ETransp trans, const OrdinalType& n, const OrdinalType& k, const alpha_type alpha, const A_type* A, const OrdinalType& lda, const beta_type beta, ScalarType* C, const OrdinalType& ldc) const 1556 { 1557 typedef TypeNameTraits<OrdinalType> OTNT; 1558 typedef TypeNameTraits<ScalarType> STNT; 1559 1560 OrdinalType izero = OrdinalTraits<OrdinalType>::zero(); 1561 alpha_type alpha_zero = ScalarTraits<alpha_type>::zero(); 1562 beta_type beta_zero = ScalarTraits<beta_type>::zero(); 1563 beta_type beta_one = ScalarTraits<beta_type>::one(); 1564 A_type temp, A_zero = ScalarTraits<A_type>::zero(); 1565 ScalarType C_zero = ScalarTraits<ScalarType>::zero(); 1566 OrdinalType i, j, l, NRowA = n; 1567 bool BadArgument = false; 1568 bool Upper = (EUploChar[uplo] == 'U'); 1569 1570 TEUCHOS_TEST_FOR_EXCEPTION( 1571 Teuchos::ScalarTraits<ScalarType>::isComplex 1572 && (trans == CONJ_TRANS), 1573 std::logic_error, 1574 "Teuchos::BLAS<"<<OTNT::name()<<","<<STNT::name()<<">::SYRK()" 1575 " does not support CONJ_TRANS for complex data types." 1576 ); 1577 1578 // Change dimensions of A matrix is transposed 1579 if( !(ETranspChar[trans]=='N') ) { 1580 NRowA = k; 1581 } 1582 1583 // Quick return. 1584 if ( n==izero ) { return; } 1585 if ( ( (alpha==alpha_zero) || (k==izero) ) && (beta==beta_one) ) { return; } 1586 if( n < izero ) { 1587 std::cout << "BLAS::SYRK Error: N == "<< n <<std::endl; 1588 BadArgument = true; } 1589 if( k < izero ) { 1590 std::cout << "BLAS::SYRK Error: K == "<< k <<std::endl; 1591 BadArgument = true; } 1592 if( lda < NRowA ) { 1593 std::cout << "BLAS::SYRK Error: LDA < "<<NRowA<<std::endl; 1594 BadArgument = true; } 1595 if( ldc < n ) { 1596 std::cout << "BLAS::SYRK Error: LDC < MAX(1,N)"<<std::endl; 1597 BadArgument = true; } 1598 1599 if(!BadArgument) { 1600 1601 // Scale C when alpha is zero 1602 if (alpha == alpha_zero) { 1603 if (Upper) { 1604 if (beta==beta_zero) { 1605 for (j=izero; j<n; j++) { 1606 for (i=izero; i<=j; i++) { 1607 C[j*ldc + i] = C_zero; 1608 } 1609 } 1610 } 1611 else { 1612 for (j=izero; j<n; j++) { 1613 for (i=izero; i<=j; i++) { 1614 C[j*ldc + i] *= beta; 1615 } 1616 } 1617 } 1618 } 1619 else { 1620 if (beta==beta_zero) { 1621 for (j=izero; j<n; j++) { 1622 for (i=j; i<n; i++) { 1623 C[j*ldc + i] = C_zero; 1624 } 1625 } 1626 } 1627 else { 1628 for (j=izero; j<n; j++) { 1629 for (i=j; i<n; i++) { 1630 C[j*ldc + i] *= beta; 1631 } 1632 } 1633 } 1634 } 1635 return; 1636 } 1637 1638 // Now we can start the computation 1639 1640 if ( ETranspChar[trans]=='N' ) { 1641 1642 // Form C <- alpha*A*A' + beta*C 1643 if (Upper) { 1644 for (j=izero; j<n; j++) { 1645 if (beta==beta_zero) { 1646 for (i=izero; i<=j; i++) { 1647 C[j*ldc + i] = C_zero; 1648 } 1649 } 1650 else if (beta!=beta_one) { 1651 for (i=izero; i<=j; i++) { 1652 C[j*ldc + i] *= beta; 1653 } 1654 } 1655 for (l=izero; l<k; l++) { 1656 if (A[l*lda + j] != A_zero) { 1657 temp = alpha*A[l*lda + j]; 1658 for (i = izero; i <=j; i++) { 1659 C[j*ldc + i] += temp*A[l*lda + i]; 1660 } 1661 } 1662 } 1663 } 1664 } 1665 else { 1666 for (j=izero; j<n; j++) { 1667 if (beta==beta_zero) { 1668 for (i=j; i<n; i++) { 1669 C[j*ldc + i] = C_zero; 1670 } 1671 } 1672 else if (beta!=beta_one) { 1673 for (i=j; i<n; i++) { 1674 C[j*ldc + i] *= beta; 1675 } 1676 } 1677 for (l=izero; l<k; l++) { 1678 if (A[l*lda + j] != A_zero) { 1679 temp = alpha*A[l*lda + j]; 1680 for (i=j; i<n; i++) { 1681 C[j*ldc + i] += temp*A[l*lda + i]; 1682 } 1683 } 1684 } 1685 } 1686 } 1687 } 1688 else { 1689 1690 // Form C <- alpha*A'*A + beta*C 1691 if (Upper) { 1692 for (j=izero; j<n; j++) { 1693 for (i=izero; i<=j; i++) { 1694 temp = A_zero; 1695 for (l=izero; l<k; l++) { 1696 temp += A[i*lda + l]*A[j*lda + l]; 1697 } 1698 if (beta==beta_zero) { 1699 C[j*ldc + i] = alpha*temp; 1700 } 1701 else { 1702 C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i]; 1703 } 1704 } 1705 } 1706 } 1707 else { 1708 for (j=izero; j<n; j++) { 1709 for (i=j; i<n; i++) { 1710 temp = A_zero; 1711 for (l=izero; l<k; ++l) { 1712 temp += A[i*lda + l]*A[j*lda + l]; 1713 } 1714 if (beta==beta_zero) { 1715 C[j*ldc + i] = alpha*temp; 1716 } 1717 else { 1718 C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i]; 1719 } 1720 } 1721 } 1722 } 1723 } 1724 } /* if (!BadArgument) */ 1725 } /* END SYRK */ 1726 1727 template<typename OrdinalType, typename ScalarType> 1728 template <typename alpha_type, typename A_type> TRMM(ESide side,EUplo uplo,ETransp transa,EDiag diag,const OrdinalType & m,const OrdinalType & n,const alpha_type alpha,const A_type * A,const OrdinalType & lda,ScalarType * B,const OrdinalType & ldb) const1729 void DefaultBLASImpl<OrdinalType, ScalarType>::TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const OrdinalType& m, const OrdinalType& n, const alpha_type alpha, const A_type* A, const OrdinalType& lda, ScalarType* B, const OrdinalType& ldb) const 1730 { 1731 OrdinalType izero = OrdinalTraits<OrdinalType>::zero(); 1732 OrdinalType ione = OrdinalTraits<OrdinalType>::one(); 1733 alpha_type alpha_zero = ScalarTraits<alpha_type>::zero(); 1734 A_type A_zero = ScalarTraits<A_type>::zero(); 1735 ScalarType B_zero = ScalarTraits<ScalarType>::zero(); 1736 ScalarType one = ScalarTraits<ScalarType>::one(); 1737 OrdinalType i, j, k, NRowA = m; 1738 ScalarType temp; 1739 bool BadArgument = false; 1740 bool LSide = (ESideChar[side] == 'L'); 1741 bool noUnit = (EDiagChar[diag] == 'N'); 1742 bool Upper = (EUploChar[uplo] == 'U'); 1743 bool noConj = (ETranspChar[transa] == 'T'); 1744 1745 if(!LSide) { NRowA = n; } 1746 1747 // Quick return. 1748 if (n==izero || m==izero) { return; } 1749 if( m < izero ) { 1750 std::cout << "BLAS::TRMM Error: M == "<< m <<std::endl; 1751 BadArgument = true; } 1752 if( n < izero ) { 1753 std::cout << "BLAS::TRMM Error: N == "<< n <<std::endl; 1754 BadArgument = true; } 1755 if( lda < NRowA ) { 1756 std::cout << "BLAS::TRMM Error: LDA < "<<NRowA<<std::endl; 1757 BadArgument = true; } 1758 if( ldb < m ) { 1759 std::cout << "BLAS::TRMM Error: LDB < MAX(1,M)"<<std::endl; 1760 BadArgument = true; } 1761 1762 if(!BadArgument) { 1763 1764 // B only needs to be zeroed out. 1765 if( alpha == alpha_zero ) { 1766 for( j=izero; j<n; j++ ) { 1767 for( i=izero; i<m; i++ ) { 1768 B[j*ldb + i] = B_zero; 1769 } 1770 } 1771 return; 1772 } 1773 1774 // Start the computations. 1775 if ( LSide ) { 1776 // A is on the left side of B. 1777 1778 if ( ETranspChar[transa]=='N' ) { 1779 // Compute B = alpha*A*B 1780 1781 if ( Upper ) { 1782 // A is upper triangular 1783 for( j=izero; j<n; j++ ) { 1784 for( k=izero; k<m; k++) { 1785 if ( B[j*ldb + k] != B_zero ) { 1786 temp = alpha*B[j*ldb + k]; 1787 for( i=izero; i<k; i++ ) { 1788 B[j*ldb + i] += temp*A[k*lda + i]; 1789 } 1790 if ( noUnit ) 1791 temp *=A[k*lda + k]; 1792 B[j*ldb + k] = temp; 1793 } 1794 } 1795 } 1796 } else { 1797 // A is lower triangular 1798 for( j=izero; j<n; j++ ) { 1799 for( k=m-ione; k>-ione; k-- ) { 1800 if( B[j*ldb + k] != B_zero ) { 1801 temp = alpha*B[j*ldb + k]; 1802 B[j*ldb + k] = temp; 1803 if ( noUnit ) 1804 B[j*ldb + k] *= A[k*lda + k]; 1805 for( i=k+ione; i<m; i++ ) { 1806 B[j*ldb + i] += temp*A[k*lda + i]; 1807 } 1808 } 1809 } 1810 } 1811 } 1812 } else { 1813 // Compute B = alpha*A'*B or B = alpha*conj(A')*B 1814 if( Upper ) { 1815 for( j=izero; j<n; j++ ) { 1816 for( i=m-ione; i>-ione; i-- ) { 1817 temp = B[j*ldb + i]; 1818 if ( noConj ) { 1819 if( noUnit ) 1820 temp *= A[i*lda + i]; 1821 for( k=izero; k<i; k++ ) { 1822 temp += A[i*lda + k]*B[j*ldb + k]; 1823 } 1824 } else { 1825 if( noUnit ) 1826 temp *= ScalarTraits<A_type>::conjugate(A[i*lda + i]); 1827 for( k=izero; k<i; k++ ) { 1828 temp += ScalarTraits<A_type>::conjugate(A[i*lda + k])*B[j*ldb + k]; 1829 } 1830 } 1831 B[j*ldb + i] = alpha*temp; 1832 } 1833 } 1834 } else { 1835 for( j=izero; j<n; j++ ) { 1836 for( i=izero; i<m; i++ ) { 1837 temp = B[j*ldb + i]; 1838 if ( noConj ) { 1839 if( noUnit ) 1840 temp *= A[i*lda + i]; 1841 for( k=i+ione; k<m; k++ ) { 1842 temp += A[i*lda + k]*B[j*ldb + k]; 1843 } 1844 } else { 1845 if( noUnit ) 1846 temp *= ScalarTraits<A_type>::conjugate(A[i*lda + i]); 1847 for( k=i+ione; k<m; k++ ) { 1848 temp += ScalarTraits<A_type>::conjugate(A[i*lda + k])*B[j*ldb + k]; 1849 } 1850 } 1851 B[j*ldb + i] = alpha*temp; 1852 } 1853 } 1854 } 1855 } 1856 } else { 1857 // A is on the right hand side of B. 1858 1859 if( ETranspChar[transa] == 'N' ) { 1860 // Compute B = alpha*B*A 1861 1862 if( Upper ) { 1863 // A is upper triangular. 1864 for( j=n-ione; j>-ione; j-- ) { 1865 temp = alpha; 1866 if( noUnit ) 1867 temp *= A[j*lda + j]; 1868 for( i=izero; i<m; i++ ) { 1869 B[j*ldb + i] *= temp; 1870 } 1871 for( k=izero; k<j; k++ ) { 1872 if( A[j*lda + k] != A_zero ) { 1873 temp = alpha*A[j*lda + k]; 1874 for( i=izero; i<m; i++ ) { 1875 B[j*ldb + i] += temp*B[k*ldb + i]; 1876 } 1877 } 1878 } 1879 } 1880 } else { 1881 // A is lower triangular. 1882 for( j=izero; j<n; j++ ) { 1883 temp = alpha; 1884 if( noUnit ) 1885 temp *= A[j*lda + j]; 1886 for( i=izero; i<m; i++ ) { 1887 B[j*ldb + i] *= temp; 1888 } 1889 for( k=j+ione; k<n; k++ ) { 1890 if( A[j*lda + k] != A_zero ) { 1891 temp = alpha*A[j*lda + k]; 1892 for( i=izero; i<m; i++ ) { 1893 B[j*ldb + i] += temp*B[k*ldb + i]; 1894 } 1895 } 1896 } 1897 } 1898 } 1899 } else { 1900 // Compute B = alpha*B*A' or B = alpha*B*conj(A') 1901 1902 if( Upper ) { 1903 for( k=izero; k<n; k++ ) { 1904 for( j=izero; j<k; j++ ) { 1905 if( A[k*lda + j] != A_zero ) { 1906 if ( noConj ) 1907 temp = alpha*A[k*lda + j]; 1908 else 1909 temp = alpha*ScalarTraits<A_type>::conjugate(A[k*lda + j]); 1910 for( i=izero; i<m; i++ ) { 1911 B[j*ldb + i] += temp*B[k*ldb + i]; 1912 } 1913 } 1914 } 1915 temp = alpha; 1916 if( noUnit ) { 1917 if ( noConj ) 1918 temp *= A[k*lda + k]; 1919 else 1920 temp *= ScalarTraits<A_type>::conjugate(A[k*lda + k]); 1921 } 1922 if( temp != one ) { 1923 for( i=izero; i<m; i++ ) { 1924 B[k*ldb + i] *= temp; 1925 } 1926 } 1927 } 1928 } else { 1929 for( k=n-ione; k>-ione; k-- ) { 1930 for( j=k+ione; j<n; j++ ) { 1931 if( A[k*lda + j] != A_zero ) { 1932 if ( noConj ) 1933 temp = alpha*A[k*lda + j]; 1934 else 1935 temp = alpha*ScalarTraits<A_type>::conjugate(A[k*lda + j]); 1936 for( i=izero; i<m; i++ ) { 1937 B[j*ldb + i] += temp*B[k*ldb + i]; 1938 } 1939 } 1940 } 1941 temp = alpha; 1942 if( noUnit ) { 1943 if ( noConj ) 1944 temp *= A[k*lda + k]; 1945 else 1946 temp *= ScalarTraits<A_type>::conjugate(A[k*lda + k]); 1947 } 1948 if( temp != one ) { 1949 for( i=izero; i<m; i++ ) { 1950 B[k*ldb + i] *= temp; 1951 } 1952 } 1953 } 1954 } 1955 } // end if( ETranspChar[transa] == 'N' ) ... 1956 } // end if ( LSide ) ... 1957 } // end if (!BadArgument) 1958 } // end TRMM 1959 1960 template<typename OrdinalType, typename ScalarType> 1961 template <typename alpha_type, typename A_type> TRSM(ESide side,EUplo uplo,ETransp transa,EDiag diag,const OrdinalType & m,const OrdinalType & n,const alpha_type alpha,const A_type * A,const OrdinalType & lda,ScalarType * B,const OrdinalType & ldb) const1962 void DefaultBLASImpl<OrdinalType, ScalarType>::TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const OrdinalType& m, const OrdinalType& n, const alpha_type alpha, const A_type* A, const OrdinalType& lda, ScalarType* B, const OrdinalType& ldb) const 1963 { 1964 OrdinalType izero = OrdinalTraits<OrdinalType>::zero(); 1965 OrdinalType ione = OrdinalTraits<OrdinalType>::one(); 1966 alpha_type alpha_zero = ScalarTraits<alpha_type>::zero(); 1967 A_type A_zero = ScalarTraits<A_type>::zero(); 1968 ScalarType B_zero = ScalarTraits<ScalarType>::zero(); 1969 alpha_type alpha_one = ScalarTraits<alpha_type>::one(); 1970 ScalarType B_one = ScalarTraits<ScalarType>::one(); 1971 ScalarType temp; 1972 OrdinalType NRowA = m; 1973 bool BadArgument = false; 1974 bool noUnit = (EDiagChar[diag]=='N'); 1975 bool noConj = (ETranspChar[transa] == 'T'); 1976 1977 if (!(ESideChar[side] == 'L')) { NRowA = n; } 1978 1979 // Quick return. 1980 if (n == izero || m == izero) { return; } 1981 if( m < izero ) { 1982 std::cout << "BLAS::TRSM Error: M == "<<m<<std::endl; 1983 BadArgument = true; } 1984 if( n < izero ) { 1985 std::cout << "BLAS::TRSM Error: N == "<<n<<std::endl; 1986 BadArgument = true; } 1987 if( lda < NRowA ) { 1988 std::cout << "BLAS::TRSM Error: LDA < "<<NRowA<<std::endl; 1989 BadArgument = true; } 1990 if( ldb < m ) { 1991 std::cout << "BLAS::TRSM Error: LDB < MAX(1,M)"<<std::endl; 1992 BadArgument = true; } 1993 1994 if(!BadArgument) 1995 { 1996 int i, j, k; 1997 // Set the solution to the zero vector. 1998 if(alpha == alpha_zero) { 1999 for(j = izero; j < n; j++) { 2000 for( i = izero; i < m; i++) { 2001 B[j*ldb+i] = B_zero; 2002 } 2003 } 2004 } 2005 else 2006 { // Start the operations. 2007 if(ESideChar[side] == 'L') { 2008 // 2009 // Perform computations for OP(A)*X = alpha*B 2010 // 2011 if(ETranspChar[transa] == 'N') { 2012 // 2013 // Compute B = alpha*inv( A )*B 2014 // 2015 if(EUploChar[uplo] == 'U') { 2016 // A is upper triangular. 2017 for(j = izero; j < n; j++) { 2018 // Perform alpha*B if alpha is not 1. 2019 if(alpha != alpha_one) { 2020 for( i = izero; i < m; i++) { 2021 B[j*ldb+i] *= alpha; 2022 } 2023 } 2024 // Perform a backsolve for column j of B. 2025 for(k = (m - ione); k > -ione; k--) { 2026 // If this entry is zero, we don't have to do anything. 2027 if (B[j*ldb + k] != B_zero) { 2028 if ( noUnit ) { 2029 B[j*ldb + k] /= A[k*lda + k]; 2030 } 2031 for(i = izero; i < k; i++) { 2032 B[j*ldb + i] -= B[j*ldb + k] * A[k*lda + i]; 2033 } 2034 } 2035 } 2036 } 2037 } 2038 else 2039 { // A is lower triangular. 2040 for(j = izero; j < n; j++) { 2041 // Perform alpha*B if alpha is not 1. 2042 if(alpha != alpha_one) { 2043 for( i = izero; i < m; i++) { 2044 B[j*ldb+i] *= alpha; 2045 } 2046 } 2047 // Perform a forward solve for column j of B. 2048 for(k = izero; k < m; k++) { 2049 // If this entry is zero, we don't have to do anything. 2050 if (B[j*ldb + k] != B_zero) { 2051 if ( noUnit ) { 2052 B[j*ldb + k] /= A[k*lda + k]; 2053 } 2054 for(i = k+ione; i < m; i++) { 2055 B[j*ldb + i] -= B[j*ldb + k] * A[k*lda + i]; 2056 } 2057 } 2058 } 2059 } 2060 } // end if (uplo == 'U') 2061 } // if (transa =='N') 2062 else { 2063 // 2064 // Compute B = alpha*inv( A' )*B 2065 // or B = alpha*inv( conj(A') )*B 2066 // 2067 if(EUploChar[uplo] == 'U') { 2068 // A is upper triangular. 2069 for(j = izero; j < n; j++) { 2070 for( i = izero; i < m; i++) { 2071 temp = alpha*B[j*ldb+i]; 2072 if ( noConj ) { 2073 for(k = izero; k < i; k++) { 2074 temp -= A[i*lda + k] * B[j*ldb + k]; 2075 } 2076 if ( noUnit ) { 2077 temp /= A[i*lda + i]; 2078 } 2079 } else { 2080 for(k = izero; k < i; k++) { 2081 temp -= ScalarTraits<A_type>::conjugate(A[i*lda + k]) 2082 * B[j*ldb + k]; 2083 } 2084 if ( noUnit ) { 2085 temp /= ScalarTraits<A_type>::conjugate(A[i*lda + i]); 2086 } 2087 } 2088 B[j*ldb + i] = temp; 2089 } 2090 } 2091 } 2092 else 2093 { // A is lower triangular. 2094 for(j = izero; j < n; j++) { 2095 for(i = (m - ione) ; i > -ione; i--) { 2096 temp = alpha*B[j*ldb+i]; 2097 if ( noConj ) { 2098 for(k = i+ione; k < m; k++) { 2099 temp -= A[i*lda + k] * B[j*ldb + k]; 2100 } 2101 if ( noUnit ) { 2102 temp /= A[i*lda + i]; 2103 } 2104 } else { 2105 for(k = i+ione; k < m; k++) { 2106 temp -= ScalarTraits<A_type>::conjugate(A[i*lda + k]) 2107 * B[j*ldb + k]; 2108 } 2109 if ( noUnit ) { 2110 temp /= ScalarTraits<A_type>::conjugate(A[i*lda + i]); 2111 } 2112 } 2113 B[j*ldb + i] = temp; 2114 } 2115 } 2116 } 2117 } 2118 } // if (side == 'L') 2119 else { 2120 // side == 'R' 2121 // 2122 // Perform computations for X*OP(A) = alpha*B 2123 // 2124 if (ETranspChar[transa] == 'N') { 2125 // 2126 // Compute B = alpha*B*inv( A ) 2127 // 2128 if(EUploChar[uplo] == 'U') { 2129 // A is upper triangular. 2130 // Perform a backsolve for column j of B. 2131 for(j = izero; j < n; j++) { 2132 // Perform alpha*B if alpha is not 1. 2133 if(alpha != alpha_one) { 2134 for( i = izero; i < m; i++) { 2135 B[j*ldb+i] *= alpha; 2136 } 2137 } 2138 for(k = izero; k < j; k++) { 2139 // If this entry is zero, we don't have to do anything. 2140 if (A[j*lda + k] != A_zero) { 2141 for(i = izero; i < m; i++) { 2142 B[j*ldb + i] -= A[j*lda + k] * B[k*ldb + i]; 2143 } 2144 } 2145 } 2146 if ( noUnit ) { 2147 temp = B_one/A[j*lda + j]; 2148 for(i = izero; i < m; i++) { 2149 B[j*ldb + i] *= temp; 2150 } 2151 } 2152 } 2153 } 2154 else 2155 { // A is lower triangular. 2156 for(j = (n - ione); j > -ione; j--) { 2157 // Perform alpha*B if alpha is not 1. 2158 if(alpha != alpha_one) { 2159 for( i = izero; i < m; i++) { 2160 B[j*ldb+i] *= alpha; 2161 } 2162 } 2163 // Perform a forward solve for column j of B. 2164 for(k = j+ione; k < n; k++) { 2165 // If this entry is zero, we don't have to do anything. 2166 if (A[j*lda + k] != A_zero) { 2167 for(i = izero; i < m; i++) { 2168 B[j*ldb + i] -= A[j*lda + k] * B[k*ldb + i]; 2169 } 2170 } 2171 } 2172 if ( noUnit ) { 2173 temp = B_one/A[j*lda + j]; 2174 for(i = izero; i < m; i++) { 2175 B[j*ldb + i] *= temp; 2176 } 2177 } 2178 } 2179 } // end if (uplo == 'U') 2180 } // if (transa =='N') 2181 else { 2182 // 2183 // Compute B = alpha*B*inv( A' ) 2184 // or B = alpha*B*inv( conj(A') ) 2185 // 2186 if(EUploChar[uplo] == 'U') { 2187 // A is upper triangular. 2188 for(k = (n - ione); k > -ione; k--) { 2189 if ( noUnit ) { 2190 if ( noConj ) 2191 temp = B_one/A[k*lda + k]; 2192 else 2193 temp = B_one/ScalarTraits<A_type>::conjugate(A[k*lda + k]); 2194 for(i = izero; i < m; i++) { 2195 B[k*ldb + i] *= temp; 2196 } 2197 } 2198 for(j = izero; j < k; j++) { 2199 if (A[k*lda + j] != A_zero) { 2200 if ( noConj ) 2201 temp = A[k*lda + j]; 2202 else 2203 temp = ScalarTraits<A_type>::conjugate(A[k*lda + j]); 2204 for(i = izero; i < m; i++) { 2205 B[j*ldb + i] -= temp*B[k*ldb + i]; 2206 } 2207 } 2208 } 2209 if (alpha != alpha_one) { 2210 for (i = izero; i < m; i++) { 2211 B[k*ldb + i] *= alpha; 2212 } 2213 } 2214 } 2215 } 2216 else 2217 { // A is lower triangular. 2218 for(k = izero; k < n; k++) { 2219 if ( noUnit ) { 2220 if ( noConj ) 2221 temp = B_one/A[k*lda + k]; 2222 else 2223 temp = B_one/ScalarTraits<A_type>::conjugate(A[k*lda + k]); 2224 for (i = izero; i < m; i++) { 2225 B[k*ldb + i] *= temp; 2226 } 2227 } 2228 for(j = k+ione; j < n; j++) { 2229 if(A[k*lda + j] != A_zero) { 2230 if ( noConj ) 2231 temp = A[k*lda + j]; 2232 else 2233 temp = ScalarTraits<A_type>::conjugate(A[k*lda + j]); 2234 for(i = izero; i < m; i++) { 2235 B[j*ldb + i] -= temp*B[k*ldb + i]; 2236 } 2237 } 2238 } 2239 if (alpha != alpha_one) { 2240 for (i = izero; i < m; i++) { 2241 B[k*ldb + i] *= alpha; 2242 } 2243 } 2244 } 2245 } 2246 } 2247 } 2248 } 2249 } 2250 } 2251 2252 // Explicit instantiation for template<int,float> 2253 2254 template <> 2255 class TEUCHOSNUMERICS_LIB_DLL_EXPORT BLAS<int, float> 2256 { 2257 public: BLAS(void)2258 inline BLAS(void) {} BLAS(const BLAS<int,float> &)2259 inline BLAS(const BLAS<int, float>& /*BLAS_source*/) {} ~BLAS(void)2260 inline virtual ~BLAS(void) {} 2261 void ROTG(float* da, float* db, float* c, float* s) const; 2262 void ROT(const int& n, float* dx, const int& incx, float* dy, const int& incy, float* c, float* s) const; 2263 float ASUM(const int& n, const float* x, const int& incx) const; 2264 void AXPY(const int& n, const float& alpha, const float* x, const int& incx, float* y, const int& incy) const; 2265 void COPY(const int& n, const float* x, const int& incx, float* y, const int& incy) const; 2266 float DOT(const int& n, const float* x, const int& incx, const float* y, const int& incy) const; 2267 float NRM2(const int& n, const float* x, const int& incx) const; 2268 void SCAL(const int& n, const float& alpha, float* x, const int& incx) const; 2269 int IAMAX(const int& n, const float* x, const int& incx) const; 2270 void GEMV(ETransp trans, const int& m, const int& n, const float& alpha, const float* A, const int& lda, const float* x, const int& incx, const float& beta, float* y, const int& incy) const; 2271 void TRMV(EUplo uplo, ETransp trans, EDiag diag, const int& n, const float* A, const int& lda, float* x, const int& incx) const; 2272 void GER(const int& m, const int& n, const float& alpha, const float* x, const int& incx, const float* y, const int& incy, float* A, const int& lda) const; 2273 void GEMM(ETransp transa, ETransp transb, const int& m, const int& n, const int& k, const float& alpha, const float* A, const int& lda, const float* B, const int& ldb, const float& beta, float* C, const int& ldc) const; 2274 void SWAP(const int& n, float* const x, const int& incx, float* const y, const int& incy) const; 2275 void SYMM(ESide side, EUplo uplo, const int& m, const int& n, const float& alpha, const float* A, const int& lda, const float* B, const int& ldb, const float& beta, float* C, const int& ldc) const; 2276 void SYRK(EUplo uplo, ETransp trans, const int& n, const int& k, const float& alpha, const float* A, const int& lda, const float& beta, float* C, const int& ldc) const; 2277 void HERK(EUplo uplo, ETransp trans, const int& n, const int& k, const float& alpha, const float* A, const int& lda, const float& beta, float* C, const int& ldc) const; 2278 void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const float& alpha, const float* A, const int& lda, float* B, const int& ldb) const; 2279 void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const float& alpha, const float* A, const int& lda, float* B, const int& ldb) const; 2280 }; 2281 2282 // Explicit instantiation for template<int,double> 2283 2284 template<> 2285 class TEUCHOSNUMERICS_LIB_DLL_EXPORT BLAS<int, double> 2286 { 2287 public: BLAS(void)2288 inline BLAS(void) {} BLAS(const BLAS<int,double> &)2289 inline BLAS(const BLAS<int, double>& /*BLAS_source*/) {} ~BLAS(void)2290 inline virtual ~BLAS(void) {} 2291 void ROTG(double* da, double* db, double* c, double* s) const; 2292 void ROT(const int& n, double* dx, const int& incx, double* dy, const int& incy, double* c, double* s) const; 2293 double ASUM(const int& n, const double* x, const int& incx) const; 2294 void AXPY(const int& n, const double& alpha, const double* x, const int& incx, double* y, const int& incy) const; 2295 void COPY(const int& n, const double* x, const int& incx, double* y, const int& incy) const; 2296 double DOT(const int& n, const double* x, const int& incx, const double* y, const int& incy) const; 2297 double NRM2(const int& n, const double* x, const int& incx) const; 2298 void SCAL(const int& n, const double& alpha, double* x, const int& incx) const; 2299 int IAMAX(const int& n, const double* x, const int& incx) const; 2300 void GEMV(ETransp trans, const int& m, const int& n, const double& alpha, const double* A, const int& lda, const double* x, const int& incx, const double& beta, double* y, const int& incy) const; 2301 void TRMV(EUplo uplo, ETransp trans, EDiag diag, const int& n, const double* A, const int& lda, double* x, const int& incx) const; 2302 void GER(const int& m, const int& n, const double& alpha, const double* x, const int& incx, const double* y, const int& incy, double* A, const int& lda) const; 2303 void GEMM(ETransp transa, ETransp transb, const int& m, const int& n, const int& k, const double& alpha, const double* A, const int& lda, const double* B, const int& ldb, const double& beta, double* C, const int& ldc) const; 2304 void SWAP(const int& n, double* const x, const int& incx, double* const y, const int& incy) const; 2305 void SYMM(ESide side, EUplo uplo, const int& m, const int& n, const double& alpha, const double* A, const int& lda, const double* B, const int& ldb, const double& beta, double* C, const int& ldc) const; 2306 void SYRK(EUplo uplo, ETransp trans, const int& n, const int& k, const double& alpha, const double* A, const int& lda, const double& beta, double* C, const int& ldc) const; 2307 void HERK(EUplo uplo, ETransp trans, const int& n, const int& k, const double& alpha, const double* A, const int& lda, const double& beta, double* C, const int& ldc) const; 2308 void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const double& alpha, const double* A, const int& lda, double* B, const int& ldb) const; 2309 void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const double& alpha, const double* A, const int& lda, double* B, const int& ldb) const; 2310 }; 2311 2312 // Explicit instantiation for template<int,complex<float> > 2313 2314 template<> 2315 class TEUCHOSNUMERICS_LIB_DLL_EXPORT BLAS<int, std::complex<float> > 2316 { 2317 public: BLAS(void)2318 inline BLAS(void) {} BLAS(const BLAS<int,std::complex<float>> &)2319 inline BLAS(const BLAS<int, std::complex<float> >& /*BLAS_source*/) {} ~BLAS(void)2320 inline virtual ~BLAS(void) {} 2321 void ROTG(std::complex<float>* da, std::complex<float>* db, float* c, std::complex<float>* s) const; 2322 void ROT(const int& n, std::complex<float>* dx, const int& incx, std::complex<float>* dy, const int& incy, float* c, std::complex<float>* s) const; 2323 float ASUM(const int& n, const std::complex<float>* x, const int& incx) const; 2324 void AXPY(const int& n, const std::complex<float> alpha, const std::complex<float>* x, const int& incx, std::complex<float>* y, const int& incy) const; 2325 void COPY(const int& n, const std::complex<float>* x, const int& incx, std::complex<float>* y, const int& incy) const; 2326 std::complex<float> DOT(const int& n, const std::complex<float>* x, const int& incx, const std::complex<float>* y, const int& incy) const; 2327 float NRM2(const int& n, const std::complex<float>* x, const int& incx) const; 2328 void SCAL(const int& n, const std::complex<float> alpha, std::complex<float>* x, const int& incx) const; 2329 int IAMAX(const int& n, const std::complex<float>* x, const int& incx) const; 2330 void GEMV(ETransp trans, const int& m, const int& n, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, const std::complex<float>* x, const int& incx, const std::complex<float> beta, std::complex<float>* y, const int& incy) const; 2331 void TRMV(EUplo uplo, ETransp trans, EDiag diag, const int& n, const std::complex<float>* A, const int& lda, std::complex<float>* x, const int& incx) const; 2332 void GER(const int& m, const int& n, const std::complex<float> alpha, const std::complex<float>* x, const int& incx, const std::complex<float>* y, const int& incy, std::complex<float>* A, const int& lda) const; 2333 void GEMM(ETransp transa, ETransp transb, const int& m, const int& n, const int& k, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, const std::complex<float>* B, const int& ldb, const std::complex<float> beta, std::complex<float>* C, const int& ldc) const; 2334 void SWAP(const int& n, std::complex<float>* const x, const int& incx, std::complex<float>* const y, const int& incy) const; 2335 void SYMM(ESide side, EUplo uplo, const int& m, const int& n, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, const std::complex<float> *B, const int& ldb, const std::complex<float> beta, std::complex<float> *C, const int& ldc) const; 2336 void SYRK(EUplo uplo, ETransp trans, const int& n, const int& k, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, const std::complex<float> beta, std::complex<float>* C, const int& ldc) const; 2337 void HERK(EUplo uplo, ETransp trans, const int& n, const int& k, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, const std::complex<float> beta, std::complex<float>* C, const int& ldc) const; 2338 void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, std::complex<float>* B, const int& ldb) const; 2339 void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, std::complex<float>* B, const int& ldb) const; 2340 }; 2341 2342 // Explicit instantiation for template<int,complex<double> > 2343 2344 template<> 2345 class TEUCHOSNUMERICS_LIB_DLL_EXPORT BLAS<int, std::complex<double> > 2346 { 2347 public: BLAS(void)2348 inline BLAS(void) {} BLAS(const BLAS<int,std::complex<double>> &)2349 inline BLAS(const BLAS<int, std::complex<double> >& /*BLAS_source*/) {} ~BLAS(void)2350 inline virtual ~BLAS(void) {} 2351 void ROTG(std::complex<double>* da, std::complex<double>* db, double* c, std::complex<double>* s) const; 2352 void ROT(const int& n, std::complex<double>* dx, const int& incx, std::complex<double>* dy, const int& incy, double* c, std::complex<double>* s) const; 2353 double ASUM(const int& n, const std::complex<double>* x, const int& incx) const; 2354 void AXPY(const int& n, const std::complex<double> alpha, const std::complex<double>* x, const int& incx, std::complex<double>* y, const int& incy) const; 2355 void COPY(const int& n, const std::complex<double>* x, const int& incx, std::complex<double>* y, const int& incy) const; 2356 std::complex<double> DOT(const int& n, const std::complex<double>* x, const int& incx, const std::complex<double>* y, const int& incy) const; 2357 double NRM2(const int& n, const std::complex<double>* x, const int& incx) const; 2358 void SCAL(const int& n, const std::complex<double> alpha, std::complex<double>* x, const int& incx) const; 2359 int IAMAX(const int& n, const std::complex<double>* x, const int& incx) const; 2360 void GEMV(ETransp trans, const int& m, const int& n, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, const std::complex<double>* x, const int& incx, const std::complex<double> beta, std::complex<double>* y, const int& incy) const; 2361 void TRMV(EUplo uplo, ETransp trans, EDiag diag, const int& n, const std::complex<double>* A, const int& lda, std::complex<double>* x, const int& incx) const; 2362 void GER(const int& m, const int& n, const std::complex<double> alpha, const std::complex<double>* x, const int& incx, const std::complex<double>* y, const int& incy, std::complex<double>* A, const int& lda) const; 2363 void GEMM(ETransp transa, ETransp transb, const int& m, const int& n, const int& k, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, const std::complex<double>* B, const int& ldb, const std::complex<double> beta, std::complex<double>* C, const int& ldc) const; 2364 void SWAP(const int& n, std::complex<double>* const x, const int& incx, std::complex<double>* const y, const int& incy) const; 2365 void SYMM(ESide side, EUplo uplo, const int& m, const int& n, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, const std::complex<double> *B, const int& ldb, const std::complex<double> beta, std::complex<double> *C, const int& ldc) const; 2366 void SYRK(EUplo uplo, ETransp trans, const int& n, const int& k, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, const std::complex<double> beta, std::complex<double>* C, const int& ldc) const; 2367 void HERK(EUplo uplo, ETransp trans, const int& n, const int& k, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, const std::complex<double> beta, std::complex<double>* C, const int& ldc) const; 2368 void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, std::complex<double>* B, const int& ldb) const; 2369 void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, std::complex<double>* B, const int& ldb) const; 2370 }; 2371 2372 } // namespace Teuchos 2373 2374 #endif // _TEUCHOS_BLAS_HPP_ 2375