1 // @HEADER
2 // ***********************************************************************
3 //
4 //                    Teuchos: Common Tools Package
5 //                 Copyright (2004) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ***********************************************************************
40 // @HEADER
41 
42 // Kris
43 // 06.16.03 -- Start over from scratch
44 // 06.16.03 -- Initial templatization (Tpetra_BLAS.cpp is no longer needed)
45 // 06.18.03 -- Changed xxxxx_() function calls to XXXXX_F77()
46 //          -- Added warning messages for generic calls
47 // 07.08.03 -- Move into Teuchos package/namespace
48 // 07.24.03 -- The first iteration of BLAS generics is nearing completion. Caveats:
49 //             * TRSM isn't finished yet; it works for one case at the moment (left side, upper tri., no transpose, no unit diag.)
50 //             * Many of the generic implementations are quite inefficient, ugly, or both. I wrote these to be easy to debug, not for efficiency or legibility. The next iteration will improve both of these aspects as much as possible.
51 //             * Very little verification of input parameters is done, save for the character-type arguments (TRANS, etc.) which is quite robust.
52 //             * All of the routines that make use of both an incx and incy parameter (which includes much of the L1 BLAS) are set up to work iff incx == incy && incx > 0. Allowing for differing/negative values of incx/incy should be relatively trivial.
53 //             * All of the L2/L3 routines assume that the entire matrix is being used (that is, if A is mxn, lda = m); they don't work on submatrices yet. This *should* be a reasonably trivial thing to fix, as well.
54 //          -- Removed warning messages for generic calls
55 // 08.08.03 -- TRSM now works for all cases where SIDE == L and DIAG == N. DIAG == U is implemented but does not work correctly; SIDE == R is not yet implemented.
56 // 08.14.03 -- TRSM now works for all cases and accepts (and uses) leading-dimension information.
57 // 09.26.03 -- character input replaced with enumerated input to cause compiling errors and not run-time errors ( suggested by RAB ).
58 
59 #ifndef _TEUCHOS_BLAS_HPP_
60 #define _TEUCHOS_BLAS_HPP_
61 
62 /*! \file Teuchos_BLAS.hpp
63     \brief Templated interface class to BLAS routines.
64 */
65 /** \example BLAS/cxx_main.cpp
66     This is an example of how to use the Teuchos::BLAS class.
67 */
68 
69 #include "Teuchos_ConfigDefs.hpp"
70 #include "Teuchos_ScalarTraits.hpp"
71 #include "Teuchos_OrdinalTraits.hpp"
72 #include "Teuchos_BLAS_types.hpp"
73 #include "Teuchos_Assert.hpp"
74 
75 /*! \class Teuchos::BLAS
76     \brief Templated BLAS wrapper.
77 
78     The Teuchos::BLAS class provides functionality similar to the BLAS
79     (Basic Linear Algebra Subprograms).  The BLAS provide portable,
80     high- performance implementations of kernels such as dense vector
81     sums, inner products, and norms (the BLAS 1 routines), dense
82     matrix-vector multiplication and triangular solve (the BLAS 2
83     routines), and dense matrix-matrix multiplication and triangular
84     solve with multiple right-hand sides (the BLAS 3 routines).
85 
86     The standard BLAS interface is Fortran-specific.  Unfortunately,
87     the interface between C++ and Fortran is not standard across all
88     computer platforms.  The Teuchos::BLAS class provides C++ bindings
89     for the BLAS kernels in order to insulate the rest of Petra from
90     the details of C++ to Fortran translation.
91 
92     In addition to giving access to the standard BLAS functionality,
93     Teuchos::BLAS also provides a generic fall-back implementation for
94     any ScalarType class that defines the +, - * and / operators.
95 
96     Teuchos::BLAS only operates within a single shared-memory space,
97     just like the BLAS.  It does not attempt to implement
98     distributed-memory parallel matrix operations.
99 
100     \note This class has specializations for ScalarType=float and
101       double, which use the BLAS library directly.  If you configure
102       Teuchos to enable complex arithmetic support, via the CMake
103       option -DTeuchos_ENABLE_COMPLEX:BOOL=ON, then this class will
104       also invoke the BLAS library directly for
105       ScalarType=std::complex<float> and std::complex<double>.
106 */
107 namespace Teuchos
108 {
109   extern TEUCHOSNUMERICS_LIB_DLL_EXPORT const char ESideChar[];
110   extern TEUCHOSNUMERICS_LIB_DLL_EXPORT const char ETranspChar[];
111   extern TEUCHOSNUMERICS_LIB_DLL_EXPORT const char EUploChar[];
112   extern TEUCHOSNUMERICS_LIB_DLL_EXPORT const char EDiagChar[];
113   extern TEUCHOSNUMERICS_LIB_DLL_EXPORT const char ETypeChar[];
114 
115   // Forward declaration
116   namespace details {
117     template<typename ScalarType,
118              bool isComplex = Teuchos::ScalarTraits<ScalarType>::isComplex>
119     class GivensRotator;
120   }
121 
122   //! Default implementation for BLAS routines
123   /*!
124    * This class provides the default implementation for the BLAS routines.  It
125    * is put in a separate class so that specializations of BLAS for other types
126    * still have this implementation available.
127    */
128   template<typename OrdinalType, typename ScalarType>
129   class DefaultBLASImpl
130   {
131 
132     typedef typename Teuchos::ScalarTraits<ScalarType>::magnitudeType MagnitudeType;
133 
134   public:
135     //! @name Constructor/Destructor.
136     //@{
137 
138     //! Default constructor.
DefaultBLASImpl(void)139     inline DefaultBLASImpl(void) {}
140 
141     //! Copy constructor.
DefaultBLASImpl(const DefaultBLASImpl<OrdinalType,ScalarType> &)142     inline DefaultBLASImpl(const DefaultBLASImpl<OrdinalType, ScalarType>& /*BLAS_source*/) {}
143 
144     //! Destructor.
~DefaultBLASImpl(void)145     inline virtual ~DefaultBLASImpl(void) {}
146     //@}
147 
148     //! @name Level 1 BLAS Routines.
149     //@{
150 
151     //! The type used for c in ROTG
152     /*! This is MagnitudeType if ScalarType is complex and ScalarType otherwise.
153      */
154     typedef typename details::GivensRotator<ScalarType>::c_type rotg_c_type;
155 
156     //! Computes a Givens plane rotation.
157     void ROTG(ScalarType* da, ScalarType* db, rotg_c_type* c, ScalarType* s) const;
158 
159     //! Applies a Givens plane rotation.
160     void ROT(const OrdinalType& n, ScalarType* dx, const OrdinalType& incx, ScalarType* dy, const OrdinalType& incy, MagnitudeType* c, ScalarType* s) const;
161 
162     //! Scale the vector \c x by the constant \c alpha.
163     void SCAL(const OrdinalType& n, const ScalarType& alpha, ScalarType* x, const OrdinalType& incx) const;
164 
165     //! Copy the vector \c x to the vector \c y.
166     void COPY(const OrdinalType& n, const ScalarType* x, const OrdinalType& incx, ScalarType* y, const OrdinalType& incy) const;
167 
168     //! Perform the operation: \c y \c <- \c y+alpha*x.
169     template <typename alpha_type, typename x_type>
170     void AXPY(const OrdinalType& n, const alpha_type alpha, const x_type* x, const OrdinalType& incx, ScalarType* y, const OrdinalType& incy) const;
171 
172     //! Sum the absolute values of the entries of \c x.
173     typename ScalarTraits<ScalarType>::magnitudeType ASUM(const OrdinalType& n, const ScalarType* x, const OrdinalType& incx) const;
174 
175     //! Form the dot product of the vectors \c x and \c y.
176     template <typename x_type, typename y_type>
177     ScalarType DOT(const OrdinalType& n, const x_type* x, const OrdinalType& incx, const y_type* y, const OrdinalType& incy) const;
178 
179     //! Compute the 2-norm of the vector \c x.
180     typename ScalarTraits<ScalarType>::magnitudeType NRM2(const OrdinalType& n, const ScalarType* x, const OrdinalType& incx) const;
181 
182     //! Return the index of the element of \c x with the maximum magnitude.
183     OrdinalType IAMAX(const OrdinalType& n, const ScalarType* x, const OrdinalType& incx) const;
184     //@}
185 
186     //! @name Level 2 BLAS Routines.
187     //@{
188 
189     //! Performs the matrix-vector operation:  \c y \c <- \c alpha*A*x+beta*y or \c y \c <- \c alpha*A'*x+beta*y where \c A is a general \c m by \c n matrix.
190     template <typename alpha_type, typename A_type, typename x_type, typename beta_type>
191     void GEMV(ETransp trans, const OrdinalType& m, const OrdinalType& n, const alpha_type alpha, const A_type* A,
192               const OrdinalType& lda, const x_type* x, const OrdinalType& incx, const beta_type beta, ScalarType* y, const OrdinalType& incy) const;
193 
194     //! Performs the matrix-vector operation:  \c x \c <- \c A*x or \c x \c <- \c A'*x where \c A is a unit/non-unit \c n by \c n upper/lower triangular matrix.
195     template <typename A_type>
196     void TRMV(EUplo uplo, ETransp trans, EDiag diag, const OrdinalType& n, const A_type* A,
197               const OrdinalType& lda, ScalarType* x, const OrdinalType& incx) const;
198 
199     //! \brief Performs the rank 1 operation:  \c A \c <- \c alpha*x*y'+A.
200     /// \note  For complex arithmetic, this routine performs [Z/C]GERU.
201     template <typename alpha_type, typename x_type, typename y_type>
202     void GER(const OrdinalType& m, const OrdinalType& n, const alpha_type alpha, const x_type* x, const OrdinalType& incx,
203              const y_type* y, const OrdinalType& incy, ScalarType* A, const OrdinalType& lda) const;
204     //@}
205 
206     //! @name Level 3 BLAS Routines.
207     //@{
208 
209     /// \brief General matrix-matrix multiply.
210     ///
211     /// This computes C = alpha*op(A)*op(B) + beta*C.  op(X) here may
212     /// be either X, the transpose of X, or the conjugate transpose of
213     /// X.  op(A) has m rows and k columns, op(B) has k rows and n
214     /// columns, and C has m rows and n columns.
215     template <typename alpha_type, typename A_type, typename B_type, typename beta_type>
216     void GEMM(ETransp transa, ETransp transb, const OrdinalType& m, const OrdinalType& n, const OrdinalType& k, const alpha_type alpha, const A_type* A, const OrdinalType& lda, const B_type* B, const OrdinalType& ldb, const beta_type beta, ScalarType* C, const OrdinalType& ldc) const;
217 
218     //! Swap the entries of x and y.
219     void
220     SWAP (const OrdinalType& n, ScalarType* const x, const OrdinalType& incx,
221           ScalarType* const y, const OrdinalType& incy) const;
222 
223     //! Performs the matrix-matrix operation: \c C \c <- \c alpha*A*B+beta*C or \c C \c <- \c alpha*B*A+beta*C where \c A is an \c m by \c m or \c n by \c n symmetric matrix and \c B is a general matrix.
224     template <typename alpha_type, typename A_type, typename B_type, typename beta_type>
225     void SYMM(ESide side, EUplo uplo, const OrdinalType& m, const OrdinalType& n, const alpha_type alpha, const A_type* A, const OrdinalType& lda, const B_type* B, const OrdinalType& ldb, const beta_type beta, ScalarType* C, const OrdinalType& ldc) const;
226 
227     //! Performs the symmetric rank k operation: \c C <- \c alpha*A*A'+beta*C or \c C <- \c alpha*A'*A+beta*C, where \c alpha and \c beta are scalars, \c C is an \c n by \c n symmetric matrix and \c A is an \c n by \c k matrix in the first case or \c k by \c n matrix in the second case.
228     template <typename alpha_type, typename A_type, typename beta_type>
229     void SYRK(EUplo uplo, ETransp trans, const OrdinalType& n, const OrdinalType& k, const alpha_type alpha, const A_type* A, const OrdinalType& lda, const beta_type beta, ScalarType* C, const OrdinalType& ldc) const;
230 
231     //! Performs the matrix-matrix operation: \c B \c <- \c alpha*op(A)*B or \c B \c <- \c alpha*B*op(A) where \c op(A) is an unit/non-unit, upper/lower triangular matrix and \c B is a general matrix.
232     template <typename alpha_type, typename A_type>
233     void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const OrdinalType& m, const OrdinalType& n,
234                 const alpha_type alpha, const A_type* A, const OrdinalType& lda, ScalarType* B, const OrdinalType& ldb) const;
235 
236     //! Solves the matrix equations:  \c op(A)*X=alpha*B or \c X*op(A)=alpha*B where \c X and \c B are \c m by \c n matrices, \c A is a unit/non-unit, upper/lower triangular matrix and \c op(A) is \c A or \c A'.  The matrix \c X is overwritten on \c B.
237     template <typename alpha_type, typename A_type>
238     void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const OrdinalType& m, const OrdinalType& n,
239                 const alpha_type alpha, const A_type* A, const OrdinalType& lda, ScalarType* B, const OrdinalType& ldb) const;
240     //@}
241   };
242 
243   template<typename OrdinalType, typename ScalarType>
244   class TEUCHOSNUMERICS_LIB_DLL_EXPORT BLAS : public DefaultBLASImpl<OrdinalType,ScalarType>
245   {
246 
247     typedef typename Teuchos::ScalarTraits<ScalarType>::magnitudeType MagnitudeType;
248 
249   public:
250     //! @name Constructor/Destructor.
251     //@{
252 
253     //! Default constructor.
BLAS(void)254     inline BLAS(void) {}
255 
256     //! Copy constructor.
BLAS(const BLAS<OrdinalType,ScalarType> &)257     inline BLAS(const BLAS<OrdinalType, ScalarType>& /*BLAS_source*/) {}
258 
259     //! Destructor.
~BLAS(void)260     inline virtual ~BLAS(void) {}
261     //@}
262   };
263 
264 //------------------------------------------------------------------------------------------
265 //      LEVEL 1 BLAS ROUTINES
266 //------------------------------------------------------------------------------------------
267 
268   /// \namespace details
269   /// \brief Teuchos implementation details.
270   ///
271   /// \warning Teuchos users should not use anything in this
272   ///   namespace.  They should not even assume that the namespace
273   ///   will continue to exist between releases.  The namespace's name
274   ///   itself or anything it contains may change at any time.
275   namespace details {
276 
277     // Compute magnitude.
278     template<typename ScalarType, bool isComplex>
279     class MagValue {
280     public:
281       void
282       blas_dabs1(const ScalarType* a, typename ScalarTraits<ScalarType>::magnitudeType* ret) const;
283     };
284 
285     // Complex-arithmetic specialization.
286     template<typename ScalarType>
287     class MagValue<ScalarType, true> {
288     public:
289       void
290       blas_dabs1(const ScalarType* a, typename ScalarTraits<ScalarType>::magnitudeType* ret) const;
291     };
292 
293     // Real-arithmetic specialization.
294     template<typename ScalarType>
295     class MagValue<ScalarType, false> {
296     public:
297       void
298       blas_dabs1(const ScalarType* a, ScalarType* ret) const;
299     };
300 
301     template<typename ScalarType, bool isComplex>
302     class GivensRotator {};
303 
304     // Complex-arithmetic specialization.
305     template<typename ScalarType>
306     class GivensRotator<ScalarType, true> {
307     public:
308       typedef typename ScalarTraits<ScalarType>::magnitudeType c_type;
309       void
310       ROTG (ScalarType* ca,
311             ScalarType* cb,
312             typename ScalarTraits<ScalarType>::magnitudeType* c,
313             ScalarType* s) const;
314     };
315 
316     // Real-arithmetic specialization.
317     template<typename ScalarType>
318     class GivensRotator<ScalarType, false> {
319     public:
320       typedef ScalarType c_type;
321       void
322       ROTG (ScalarType* da,
323             ScalarType* db,
324             ScalarType* c,
325             ScalarType* s) const;
326 
327       /// Return ABS(x) if y > 0 or y is +0, else -ABS(x) (if y is -0 or < 0).
328       ///
329       /// Note that SIGN respects IEEE 754 floating-point signed zero.
330       /// This is a hopefully correct implementation of the Fortran
331       /// type-generic SIGN intrinsic.  ROTG for complex arithmetic
332       /// doesn't require this function.  C99 provides a copysign()
333       /// math library function, but we are not able to rely on the
334       /// existence of C99 functions here.
335       ///
336       /// We provide this method on purpose only for the
337       /// real-arithmetic specialization of GivensRotator.  Complex
338       /// numbers don't have a sign; they have an angle.
SIGN(const ScalarType & x,const ScalarType & y) const339       ScalarType SIGN (const ScalarType& x, const ScalarType& y) const {
340         typedef ScalarTraits<ScalarType> STS;
341 
342         if (y > STS::zero()) {
343           return STS::magnitude (x);
344         } else if (y < STS::zero()) {
345           return -STS::magnitude (x);
346         } else { // y == STS::zero()
347           // Suppose that ScalarType& implements signed zero, as IEEE
348           // 754 - compliant floating-point numbers should.  You can't
349           // use == to test for signed zero, since +0 == -0.  However,
350           // 1/0 = Inf > 0 and 1/-0 = -Inf < 0.  Let's hope ScalarType
351           // supports Inf... we don't need to test for Inf, just see
352           // if it's greater than or less than zero.
353           //
354           // NOTE: This ONLY works if ScalarType is real.  Complex
355           // infinity doesn't have a sign, so we can't compare it with
356           // zero.  That's OK, because finite complex numbers don't
357           // have a sign either; they have an angle.
358           ScalarType signedInfinity = STS::one() / y;
359           if (signedInfinity > STS::zero()) {
360             return STS::magnitude (x);
361           } else {
362             // Even if ScalarType doesn't implement signed zero,
363             // Fortran's SIGN intrinsic returns -ABS(X) if the second
364             // argument Y is zero.  We imitate this behavior here.
365             return -STS::magnitude (x);
366           }
367         }
368       }
369     };
370 
371     // Implementation of complex-arithmetic specialization.
372     template<typename ScalarType>
373     void
374     GivensRotator<ScalarType, true>::
ROTG(ScalarType * ca,ScalarType * cb,typename ScalarTraits<ScalarType>::magnitudeType * c,ScalarType * s) const375     ROTG (ScalarType* ca,
376           ScalarType* cb,
377           typename ScalarTraits<ScalarType>::magnitudeType* c,
378           ScalarType* s) const
379     {
380       typedef ScalarTraits<ScalarType> STS;
381       typedef typename STS::magnitudeType MagnitudeType;
382       typedef ScalarTraits<MagnitudeType> STM;
383 
384       // This is a straightforward translation into C++ of the
385       // reference BLAS' implementation of ZROTG.  You can get
386       // the Fortran 77 source code of ZROTG here:
387       //
388       // http://www.netlib.org/blas/zrotg.f
389       //
390       // I used the following rules to translate Fortran types and
391       // intrinsic functions into C++:
392       //
393       // DOUBLE PRECISION -> MagnitudeType
394       // DOUBLE COMPLEX -> ScalarType
395       // CDABS -> STS::magnitude
396       // DCMPLX -> ScalarType constructor (assuming that ScalarType
397       //   is std::complex<MagnitudeType>)
398       // DCONJG -> STS::conjugate
399       // DSQRT -> STM::squareroot
400       ScalarType alpha;
401       MagnitudeType norm, scale;
402 
403       if (STS::magnitude (*ca) == STM::zero()) {
404         *c = STM::zero();
405         *s = STS::one();
406         *ca = *cb;
407       } else {
408         scale = STS::magnitude (*ca) + STS::magnitude (*cb);
409         { // I introduced temporaries into the translated BLAS code in
410           // order to make the expression easier to read and also save a
411           // few floating-point operations.
412           const MagnitudeType ca_scaled =
413             STS::magnitude (*ca / ScalarType(scale, STM::zero()));
414           const MagnitudeType cb_scaled =
415             STS::magnitude (*cb / ScalarType(scale, STM::zero()));
416           norm = scale *
417             STM::squareroot (ca_scaled*ca_scaled + cb_scaled*cb_scaled);
418         }
419         alpha = *ca / STS::magnitude (*ca);
420         *c = STS::magnitude (*ca) / norm;
421         *s = alpha * STS::conjugate (*cb) / norm;
422         *ca = alpha * norm;
423       }
424     }
425 
426     // Implementation of real-arithmetic specialization.
427     template<typename ScalarType>
428     void
429     GivensRotator<ScalarType, false>::
ROTG(ScalarType * da,ScalarType * db,ScalarType * c,ScalarType * s) const430     ROTG (ScalarType* da,
431           ScalarType* db,
432           ScalarType* c,
433           ScalarType* s) const
434     {
435       typedef ScalarTraits<ScalarType> STS;
436 
437       // This is a straightforward translation into C++ of the
438       // reference BLAS' implementation of DROTG.  You can get
439       // the Fortran 77 source code of DROTG here:
440       //
441       // http://www.netlib.org/blas/drotg.f
442       //
443       // I used the following rules to translate Fortran types and
444       // intrinsic functions into C++:
445       //
446       // DOUBLE PRECISION -> ScalarType
447       // DABS -> STS::magnitude
448       // DSQRT -> STM::squareroot
449       // DSIGN -> SIGN (see below)
450       //
451       // DSIGN(x,y) (the old DOUBLE PRECISION type-specific form of
452       // the Fortran type-generic SIGN intrinsic) required special
453       // translation, which we did in a separate utility function in
454       // the specializaton of GivensRotator for real arithmetic.
455       // (ROTG for complex arithmetic doesn't require this function.)
456       // C99 provides a copysign() math library function, but we are
457       // not able to rely on the existence of C99 functions here.
458       ScalarType r, roe, scale, z;
459 
460       roe = *db;
461       if (STS::magnitude (*da) > STS::magnitude (*db)) {
462         roe = *da;
463       }
464       scale = STS::magnitude (*da) + STS::magnitude (*db);
465       if (scale == STS::zero()) {
466         *c = STS::one();
467         *s = STS::zero();
468         r = STS::zero();
469         z = STS::zero();
470       } else {
471         // I introduced temporaries into the translated BLAS code in
472         // order to make the expression easier to read and also save
473         // a few floating-point& operations.
474         const ScalarType da_scaled = *da / scale;
475         const ScalarType db_scaled = *db / scale;
476         r = scale * STS::squareroot (da_scaled*da_scaled + db_scaled*db_scaled);
477         r = SIGN (STS::one(), roe) * r;
478         *c = *da / r;
479         *s = *db / r;
480         z = STS::one();
481         if (STS::magnitude (*da) > STS::magnitude (*db)) {
482           z = *s;
483         }
484         if (STS::magnitude (*db) >= STS::magnitude (*da) && *c != STS::zero()) {
485           z = STS::one() / *c;
486         }
487       }
488 
489       *da = r;
490       *db = z;
491     }
492 
493     // Real-valued implementation of MagValue
494     template<typename ScalarType>
495     void
496     MagValue<ScalarType, false>::
blas_dabs1(const ScalarType * a,ScalarType * ret) const497     blas_dabs1(const ScalarType* a, ScalarType* ret) const
498     {
499       *ret = Teuchos::ScalarTraits<ScalarType>::magnitude( *a );
500     }
501 
502     // Complex-valued implementation of MagValue
503     template<typename ScalarType>
504     void
505     MagValue<ScalarType, true>::
blas_dabs1(const ScalarType * a,typename ScalarTraits<ScalarType>::magnitudeType * ret) const506     blas_dabs1(const ScalarType* a, typename ScalarTraits<ScalarType>::magnitudeType* ret) const
507     {
508       *ret = ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::magnitude(a->real());
509       *ret += ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::magnitude(a->imag());
510     }
511 
512   } // namespace details
513 
514   template<typename OrdinalType, typename ScalarType>
515   void
516   DefaultBLASImpl<OrdinalType, ScalarType>::
ROTG(ScalarType * da,ScalarType * db,rotg_c_type * c,ScalarType * s) const517   ROTG (ScalarType* da,
518         ScalarType* db,
519         rotg_c_type* c,
520         ScalarType* s) const
521   {
522     details::GivensRotator<ScalarType> rotator;
523     rotator.ROTG (da, db, c, s);
524   }
525 
526   template<typename OrdinalType, typename ScalarType>
ROT(const OrdinalType & n,ScalarType * dx,const OrdinalType & incx,ScalarType * dy,const OrdinalType & incy,MagnitudeType * c,ScalarType * s) const527   void DefaultBLASImpl<OrdinalType,ScalarType>::ROT(const OrdinalType& n, ScalarType* dx, const OrdinalType& incx, ScalarType* dy, const OrdinalType& incy, MagnitudeType* c, ScalarType* s) const
528   {
529     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
530     ScalarType sconj = Teuchos::ScalarTraits<ScalarType>::conjugate(*s);
531     if (n <= 0) return;
532     if (incx==1 && incy==1) {
533       for(OrdinalType i=0; i<n; ++i) {
534         ScalarType temp = *c*dx[i] + sconj*dy[i];
535         dy[i] = *c*dy[i] - sconj*dx[i];
536         dx[i] = temp;
537       }
538     }
539     else {
540       OrdinalType ix = 0, iy = 0;
541       if (incx < izero) ix = (-n+1)*incx;
542       if (incy < izero) iy = (-n+1)*incy;
543       for(OrdinalType i=0; i<n; ++i) {
544         ScalarType temp = *c*dx[ix] + sconj*dy[iy];
545         dy[iy] = *c*dy[iy] - sconj*dx[ix];
546         dx[ix] = temp;
547         ix += incx;
548         iy += incy;
549       }
550     }
551   }
552 
553   template<typename OrdinalType, typename ScalarType>
SCAL(const OrdinalType & n,const ScalarType & alpha,ScalarType * x,const OrdinalType & incx) const554   void DefaultBLASImpl<OrdinalType, ScalarType>::SCAL(const OrdinalType& n, const ScalarType& alpha, ScalarType* x, const OrdinalType& incx) const
555   {
556     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
557     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
558     OrdinalType i, ix = izero;
559 
560     if ( n < ione || incx < ione )
561       return;
562 
563     // Scale the vector.
564     for(i = izero; i < n; i++)
565     {
566       x[ix] *= alpha;
567       ix += incx;
568     }
569   } /* end SCAL */
570 
571   template<typename OrdinalType, typename ScalarType>
COPY(const OrdinalType & n,const ScalarType * x,const OrdinalType & incx,ScalarType * y,const OrdinalType & incy) const572   void DefaultBLASImpl<OrdinalType, ScalarType>::COPY(const OrdinalType& n, const ScalarType* x, const OrdinalType& incx, ScalarType* y, const OrdinalType& incy) const
573   {
574     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
575     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
576     OrdinalType i, ix = izero, iy = izero;
577     if ( n > izero ) {
578         // Set the initial indices (ix, iy).
579         if (incx < izero) { ix = (-n+ione)*incx; }
580         if (incy < izero) { iy = (-n+ione)*incy; }
581 
582         for(i = izero; i < n; i++)
583           {
584             y[iy] = x[ix];
585             ix += incx;
586             iy += incy;
587           }
588       }
589   } /* end COPY */
590 
591   template<typename OrdinalType, typename ScalarType>
592   template <typename alpha_type, typename x_type>
AXPY(const OrdinalType & n,const alpha_type alpha,const x_type * x,const OrdinalType & incx,ScalarType * y,const OrdinalType & incy) const593   void DefaultBLASImpl<OrdinalType, ScalarType>::AXPY(const OrdinalType& n, const alpha_type alpha, const x_type* x, const OrdinalType& incx, ScalarType* y, const OrdinalType& incy) const
594   {
595     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
596     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
597     OrdinalType i, ix = izero, iy = izero;
598     if( n > izero && alpha != ScalarTraits<alpha_type>::zero())
599       {
600         // Set the initial indices (ix, iy).
601         if (incx < izero) { ix = (-n+ione)*incx; }
602         if (incy < izero) { iy = (-n+ione)*incy; }
603 
604         for(i = izero; i < n; i++)
605           {
606             y[iy] += alpha * x[ix];
607             ix += incx;
608             iy += incy;
609           }
610       }
611   } /* end AXPY */
612 
613   template<typename OrdinalType, typename ScalarType>
ASUM(const OrdinalType & n,const ScalarType * x,const OrdinalType & incx) const614   typename ScalarTraits<ScalarType>::magnitudeType DefaultBLASImpl<OrdinalType, ScalarType>::ASUM(const OrdinalType& n, const ScalarType* x, const OrdinalType& incx) const
615   {
616     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
617     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
618     typename ScalarTraits<ScalarType>::magnitudeType temp, result =
619       ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::zero();
620     OrdinalType i, ix = izero;
621 
622     if ( n < ione || incx < ione )
623       return result;
624 
625     details::MagValue<ScalarType, ScalarTraits<ScalarType>::isComplex> mval;
626     for (i = izero; i < n; i++)
627       {
628         mval.blas_dabs1( &x[ix], &temp );
629         result += temp;
630         ix += incx;
631       }
632 
633     return result;
634   } /* end ASUM */
635 
636   template<typename OrdinalType, typename ScalarType>
637   template <typename x_type, typename y_type>
DOT(const OrdinalType & n,const x_type * x,const OrdinalType & incx,const y_type * y,const OrdinalType & incy) const638   ScalarType DefaultBLASImpl<OrdinalType, ScalarType>::DOT(const OrdinalType& n, const x_type* x, const OrdinalType& incx, const y_type* y, const OrdinalType& incy) const
639   {
640     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
641     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
642     ScalarType result = ScalarTraits<ScalarType>::zero();
643     OrdinalType i, ix = izero, iy = izero;
644     if( n > izero )
645       {
646         // Set the initial indices (ix,iy).
647         if (incx < izero) { ix = (-n+ione)*incx; }
648         if (incy < izero) { iy = (-n+ione)*incy; }
649 
650         for(i = izero; i < n; i++)
651           {
652             result += ScalarTraits<x_type>::conjugate(x[ix]) * y[iy];
653             ix += incx;
654             iy += incy;
655           }
656       }
657     return result;
658   } /* end DOT */
659 
660   template<typename OrdinalType, typename ScalarType>
NRM2(const OrdinalType & n,const ScalarType * x,const OrdinalType & incx) const661   typename ScalarTraits<ScalarType>::magnitudeType DefaultBLASImpl<OrdinalType, ScalarType>::NRM2(const OrdinalType& n, const ScalarType* x, const OrdinalType& incx) const
662   {
663     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
664     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
665     typename ScalarTraits<ScalarType>::magnitudeType result =
666       ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::zero();
667     OrdinalType i, ix = izero;
668 
669     if ( n < ione || incx < ione )
670       return result;
671 
672     for(i = izero; i < n; i++)
673       {
674         result += ScalarTraits<ScalarType>::magnitude(ScalarTraits<ScalarType>::conjugate(x[ix]) * x[ix]);
675         ix += incx;
676       }
677     result = ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::squareroot(result);
678     return result;
679   } /* end NRM2 */
680 
681   template<typename OrdinalType, typename ScalarType>
682   OrdinalType DefaultBLASImpl<OrdinalType, ScalarType>::IAMAX(const OrdinalType& n, const ScalarType* x, const OrdinalType& incx) const
683   {
684     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
685     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
686     OrdinalType result = izero, ix = izero, i;
687     typename ScalarTraits<ScalarType>::magnitudeType curval =
688       ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::zero();
689     typename ScalarTraits<ScalarType>::magnitudeType maxval =
690       ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::zero();
691 
692     if ( n < ione || incx < ione )
693       return result;
694 
695     details::MagValue<ScalarType, ScalarTraits<ScalarType>::isComplex> mval;
696 
697     mval.blas_dabs1( &x[ix], &maxval );
698     ix += incx;
699     for(i = ione; i < n; i++)
700       {
701         mval.blas_dabs1( &x[ix], &curval );
702         if(curval > maxval)
703           {
704             result = i;
705             maxval = curval;
706           }
707         ix += incx;
708       }
709 
710     return result + 1; // the BLAS I?AMAX functions return 1-indexed (Fortran-style) values
711   } /* end IAMAX */
712 
713 //------------------------------------------------------------------------------------------
714 //      LEVEL 2 BLAS ROUTINES
715 //------------------------------------------------------------------------------------------
716   template<typename OrdinalType, typename ScalarType>
717   template <typename alpha_type, typename A_type, typename x_type, typename beta_type>
GEMV(ETransp trans,const OrdinalType & m,const OrdinalType & n,const alpha_type alpha,const A_type * A,const OrdinalType & lda,const x_type * x,const OrdinalType & incx,const beta_type beta,ScalarType * y,const OrdinalType & incy) const718   void DefaultBLASImpl<OrdinalType, ScalarType>::GEMV(ETransp trans, const OrdinalType& m, const OrdinalType& n, const alpha_type alpha, const A_type* A, const OrdinalType& lda, const x_type* x, const OrdinalType& incx, const beta_type beta, ScalarType* y, const OrdinalType& incy) const
719   {
720     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
721     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
722     alpha_type alpha_zero = ScalarTraits<alpha_type>::zero();
723     beta_type beta_zero = ScalarTraits<beta_type>::zero();
724     x_type x_zero = ScalarTraits<x_type>::zero();
725     ScalarType y_zero = ScalarTraits<ScalarType>::zero();
726     beta_type beta_one = ScalarTraits<beta_type>::one();
727     bool noConj = true;
728     bool BadArgument = false;
729 
730     // Quick return if there is nothing to do!
731     if( m == izero || n == izero || ( alpha == alpha_zero && beta == beta_one ) ){ return; }
732 
733     // Otherwise, we need to check the argument list.
734     if( m < izero ) {
735         std::cout << "BLAS::GEMV Error: M == " << m << std::endl;
736         BadArgument = true;
737     }
738     if( n < izero ) {
739         std::cout << "BLAS::GEMV Error: N == " << n << std::endl;
740         BadArgument = true;
741     }
742     if( lda < m ) {
743         std::cout << "BLAS::GEMV Error: LDA < MAX(1,M)"<< std::endl;
744         BadArgument = true;
745     }
746     if( incx == izero ) {
747         std::cout << "BLAS::GEMV Error: INCX == 0"<< std::endl;
748         BadArgument = true;
749     }
750     if( incy == izero ) {
751         std::cout << "BLAS::GEMV Error: INCY == 0"<< std::endl;
752         BadArgument = true;
753     }
754 
755     if(!BadArgument) {
756       OrdinalType i, j, lenx, leny, ix, iy, jx, jy;
757       OrdinalType kx = izero, ky = izero;
758       ScalarType temp;
759 
760       // Determine the lengths of the vectors x and y.
761       if(ETranspChar[trans] == 'N') {
762         lenx = n;
763         leny = m;
764       } else {
765         lenx = m;
766         leny = n;
767       }
768 
769       // Determine if this is a conjugate tranpose
770       noConj = (ETranspChar[trans] == 'T');
771 
772       // Set the starting pointers for the vectors x and y if incx/y < 0.
773       if (incx < izero ) { kx =  (ione - lenx)*incx; }
774       if (incy < izero ) { ky =  (ione - leny)*incy; }
775 
776       // Form y = beta*y
777       ix = kx; iy = ky;
778       if(beta != beta_one) {
779         if (incy == ione) {
780           if (beta == beta_zero) {
781             for(i = izero; i < leny; i++) { y[i] = y_zero; }
782           } else {
783             for(i = izero; i < leny; i++) { y[i] *= beta; }
784           }
785         } else {
786           if (beta == beta_zero) {
787             for(i = izero; i < leny; i++) {
788               y[iy] = y_zero;
789               iy += incy;
790             }
791           } else {
792             for(i = izero; i < leny; i++) {
793               y[iy] *= beta;
794               iy += incy;
795             }
796           }
797         }
798       }
799 
800       // Return if we don't have to do anything more.
801       if(alpha == alpha_zero) { return; }
802 
803       if( ETranspChar[trans] == 'N' ) {
804         // Form y = alpha*A*y
805         jx = kx;
806         if (incy == ione) {
807           for(j = izero; j < n; j++) {
808             if (x[jx] != x_zero) {
809               temp = alpha*x[jx];
810               for(i = izero; i < m; i++) {
811                 y[i] += temp*A[j*lda + i];
812               }
813             }
814             jx += incx;
815           }
816         } else {
817           for(j = izero; j < n; j++) {
818             if (x[jx] != x_zero) {
819               temp = alpha*x[jx];
820               iy = ky;
821               for(i = izero; i < m; i++) {
822                 y[iy] += temp*A[j*lda + i];
823                 iy += incy;
824               }
825             }
826             jx += incx;
827           }
828         }
829       } else {
830         jy = ky;
831         if (incx == ione) {
832           for(j = izero; j < n; j++) {
833             temp = y_zero;
834             if ( noConj ) {
835               for(i = izero; i < m; i++) {
836                 temp += A[j*lda + i]*x[i];
837               }
838             } else {
839               for(i = izero; i < m; i++) {
840                 temp += ScalarTraits<A_type>::conjugate(A[j*lda + i])*x[i];
841               }
842             }
843             y[jy] += alpha*temp;
844             jy += incy;
845           }
846         } else {
847           for(j = izero; j < n; j++) {
848             temp = y_zero;
849             ix = kx;
850             if ( noConj ) {
851               for (i = izero; i < m; i++) {
852                 temp += A[j*lda + i]*x[ix];
853                 ix += incx;
854               }
855             } else {
856               for (i = izero; i < m; i++) {
857                 temp += ScalarTraits<A_type>::conjugate(A[j*lda + i])*x[ix];
858                 ix += incx;
859               }
860             }
861             y[jy] += alpha*temp;
862             jy += incy;
863           }
864         }
865       }
866     } /* if (!BadArgument) */
867   } /* end GEMV */
868 
869  template<typename OrdinalType, typename ScalarType>
870  template <typename A_type>
TRMV(EUplo uplo,ETransp trans,EDiag diag,const OrdinalType & n,const A_type * A,const OrdinalType & lda,ScalarType * x,const OrdinalType & incx) const871  void DefaultBLASImpl<OrdinalType, ScalarType>::TRMV(EUplo uplo, ETransp trans, EDiag diag, const OrdinalType& n, const A_type* A, const OrdinalType& lda, ScalarType* x, const OrdinalType& incx) const
872   {
873     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
874     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
875     ScalarType zero = ScalarTraits<ScalarType>::zero();
876     bool BadArgument = false;
877     bool noConj = true;
878 
879     // Quick return if there is nothing to do!
880     if( n == izero ){ return; }
881 
882     // Otherwise, we need to check the argument list.
883     if( n < izero ) {
884       std::cout << "BLAS::TRMV Error: N == " << n << std::endl;
885       BadArgument = true;
886     }
887     if( lda < n ) {
888       std::cout << "BLAS::TRMV Error: LDA < MAX(1,N)"<< std::endl;
889       BadArgument = true;
890     }
891     if( incx == izero ) {
892       std::cout << "BLAS::TRMV Error: INCX == 0"<< std::endl;
893       BadArgument = true;
894     }
895 
896     if(!BadArgument) {
897       OrdinalType i, j, ix, jx, kx = izero;
898       ScalarType temp;
899       bool noUnit = (EDiagChar[diag] == 'N');
900 
901       // Determine if this is a conjugate tranpose
902       noConj = (ETranspChar[trans] == 'T');
903 
904       // Set the starting pointer for the vector x if incx < 0.
905       if (incx < izero) { kx = (-n+ione)*incx; }
906 
907       // Start the operations for a nontransposed triangular matrix
908       if (ETranspChar[trans] == 'N') {
909         /* Compute x = A*x */
910         if (EUploChar[uplo] == 'U') {
911           /* A is an upper triangular matrix */
912           if (incx == ione) {
913             for (j=izero; j<n; j++) {
914               if (x[j] != zero) {
915                 temp = x[j];
916                 for (i=izero; i<j; i++) {
917                   x[i] += temp*A[j*lda + i];
918                 }
919                 if ( noUnit )
920                   x[j] *= A[j*lda + j];
921               }
922             }
923           } else {
924             jx = kx;
925             for (j=izero; j<n; j++) {
926               if (x[jx] != zero) {
927                 temp = x[jx];
928                 ix = kx;
929                 for (i=izero; i<j; i++) {
930                   x[ix] += temp*A[j*lda + i];
931                   ix += incx;
932                 }
933                 if ( noUnit )
934                   x[jx] *= A[j*lda + j];
935               }
936               jx += incx;
937             }
938           } /* if (incx == ione) */
939         } else { /* A is a lower triangular matrix */
940           if (incx == ione) {
941             for (j=n-ione; j>-ione; j--) {
942               if (x[j] != zero) {
943                 temp = x[j];
944                 for (i=n-ione; i>j; i--) {
945                   x[i] += temp*A[j*lda + i];
946                 }
947                 if ( noUnit )
948                   x[j] *= A[j*lda + j];
949               }
950             }
951           } else {
952             kx += (n-ione)*incx;
953             jx = kx;
954             for (j=n-ione; j>-ione; j--) {
955               if (x[jx] != zero) {
956                 temp = x[jx];
957                 ix = kx;
958                 for (i=n-ione; i>j; i--) {
959                   x[ix] += temp*A[j*lda + i];
960                   ix -= incx;
961                 }
962                 if ( noUnit )
963                   x[jx] *= A[j*lda + j];
964               }
965               jx -= incx;
966             }
967           }
968         } /* if (EUploChar[uplo]=='U') */
969       } else { /* A is transposed/conjugated */
970         /* Compute x = A'*x */
971         if (EUploChar[uplo]=='U') {
972           /* A is an upper triangular matrix */
973           if (incx == ione) {
974             for (j=n-ione; j>-ione; j--) {
975               temp = x[j];
976               if ( noConj ) {
977                 if ( noUnit )
978                   temp *= A[j*lda + j];
979                 for (i=j-ione; i>-ione; i--) {
980                   temp += A[j*lda + i]*x[i];
981                 }
982               } else {
983                 if ( noUnit )
984                   temp *= ScalarTraits<A_type>::conjugate(A[j*lda + j]);
985                 for (i=j-ione; i>-ione; i--) {
986                   temp += ScalarTraits<A_type>::conjugate(A[j*lda + i])*x[i];
987                 }
988               }
989               x[j] = temp;
990             }
991           } else {
992             jx = kx + (n-ione)*incx;
993             for (j=n-ione; j>-ione; j--) {
994               temp = x[jx];
995               ix = jx;
996               if ( noConj ) {
997                 if ( noUnit )
998                   temp *= A[j*lda + j];
999                 for (i=j-ione; i>-ione; i--) {
1000                   ix -= incx;
1001                   temp += A[j*lda + i]*x[ix];
1002                 }
1003               } else {
1004                 if ( noUnit )
1005                   temp *= ScalarTraits<A_type>::conjugate(A[j*lda + j]);
1006                 for (i=j-ione; i>-ione; i--) {
1007                   ix -= incx;
1008                   temp += ScalarTraits<A_type>::conjugate(A[j*lda + i])*x[ix];
1009                 }
1010               }
1011               x[jx] = temp;
1012               jx -= incx;
1013             }
1014           }
1015         } else {
1016           /* A is a lower triangular matrix */
1017           if (incx == ione) {
1018             for (j=izero; j<n; j++) {
1019               temp = x[j];
1020               if ( noConj ) {
1021                 if ( noUnit )
1022                   temp *= A[j*lda + j];
1023                 for (i=j+ione; i<n; i++) {
1024                   temp += A[j*lda + i]*x[i];
1025                 }
1026               } else {
1027                 if ( noUnit )
1028                   temp *= ScalarTraits<A_type>::conjugate(A[j*lda + j]);
1029                 for (i=j+ione; i<n; i++) {
1030                   temp += ScalarTraits<A_type>::conjugate(A[j*lda + i])*x[i];
1031                 }
1032               }
1033               x[j] = temp;
1034             }
1035           } else {
1036             jx = kx;
1037             for (j=izero; j<n; j++) {
1038               temp = x[jx];
1039               ix = jx;
1040               if ( noConj ) {
1041                 if ( noUnit )
1042                   temp *= A[j*lda + j];
1043                 for (i=j+ione; i<n; i++) {
1044                   ix += incx;
1045                   temp += A[j*lda + i]*x[ix];
1046                 }
1047               } else {
1048                 if ( noUnit )
1049                   temp *= ScalarTraits<A_type>::conjugate(A[j*lda + j]);
1050                 for (i=j+ione; i<n; i++) {
1051                   ix += incx;
1052                   temp += ScalarTraits<A_type>::conjugate(A[j*lda + i])*x[ix];
1053                 }
1054               }
1055               x[jx] = temp;
1056               jx += incx;
1057             }
1058           }
1059         } /* if (EUploChar[uplo]=='U') */
1060       } /* if (ETranspChar[trans]=='N') */
1061     } /* if (!BadArgument) */
1062   } /* end TRMV */
1063 
1064   template<typename OrdinalType, typename ScalarType>
1065   template <typename alpha_type, typename x_type, typename y_type>
GER(const OrdinalType & m,const OrdinalType & n,const alpha_type alpha,const x_type * x,const OrdinalType & incx,const y_type * y,const OrdinalType & incy,ScalarType * A,const OrdinalType & lda) const1066   void DefaultBLASImpl<OrdinalType, ScalarType>::GER(const OrdinalType& m, const OrdinalType& n, const alpha_type alpha, const x_type* x, const OrdinalType& incx, const y_type* y, const OrdinalType& incy, ScalarType* A, const OrdinalType& lda) const
1067   {
1068     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
1069     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
1070     alpha_type alpha_zero = ScalarTraits<alpha_type>::zero();
1071     y_type y_zero = ScalarTraits<y_type>::zero();
1072     bool BadArgument = false;
1073 
1074     // Quick return if there is nothing to do!
1075     if( m == izero || n == izero || alpha == alpha_zero ){ return; }
1076 
1077     // Otherwise, we need to check the argument list.
1078     if( m < izero ) {
1079         std::cout << "BLAS::GER Error: M == " << m << std::endl;
1080         BadArgument = true;
1081     }
1082     if( n < izero ) {
1083         std::cout << "BLAS::GER Error: N == " << n << std::endl;
1084         BadArgument = true;
1085     }
1086     if( lda < m ) {
1087         std::cout << "BLAS::GER Error: LDA < MAX(1,M)"<< std::endl;
1088         BadArgument = true;
1089     }
1090     if( incx == 0 ) {
1091         std::cout << "BLAS::GER Error: INCX == 0"<< std::endl;
1092         BadArgument = true;
1093     }
1094     if( incy == 0 ) {
1095         std::cout << "BLAS::GER Error: INCY == 0"<< std::endl;
1096         BadArgument = true;
1097     }
1098 
1099     if(!BadArgument) {
1100       OrdinalType i, j, ix, jy = izero, kx = izero;
1101       ScalarType temp;
1102 
1103       // Set the starting pointers for the vectors x and y if incx/y < 0.
1104       if (incx < izero) { kx = (-m+ione)*incx; }
1105       if (incy < izero) { jy = (-n+ione)*incy; }
1106 
1107       // Start the operations for incx == 1
1108       if( incx == ione ) {
1109         for( j=izero; j<n; j++ ) {
1110           if ( y[jy] != y_zero ) {
1111             temp = alpha*y[jy];
1112             for ( i=izero; i<m; i++ ) {
1113               A[j*lda + i] += x[i]*temp;
1114             }
1115           }
1116           jy += incy;
1117         }
1118       }
1119       else { // Start the operations for incx != 1
1120         for( j=izero; j<n; j++ ) {
1121           if ( y[jy] != y_zero ) {
1122             temp = alpha*y[jy];
1123             ix = kx;
1124             for( i=izero; i<m; i++ ) {
1125               A[j*lda + i] += x[ix]*temp;
1126               ix += incx;
1127             }
1128           }
1129           jy += incy;
1130         }
1131       }
1132     } /* if(!BadArgument) */
1133   } /* end GER */
1134 
1135 //------------------------------------------------------------------------------------------
1136 //      LEVEL 3 BLAS ROUTINES
1137 //------------------------------------------------------------------------------------------
1138 
1139   template<typename OrdinalType, typename ScalarType>
1140   template <typename alpha_type, typename A_type, typename B_type, typename beta_type>
GEMM(ETransp transa,ETransp transb,const OrdinalType & m,const OrdinalType & n,const OrdinalType & k,const alpha_type alpha,const A_type * A,const OrdinalType & lda,const B_type * B,const OrdinalType & ldb,const beta_type beta,ScalarType * C,const OrdinalType & ldc) const1141   void DefaultBLASImpl<OrdinalType, ScalarType>::GEMM(ETransp transa, ETransp transb, const OrdinalType& m, const OrdinalType& n, const OrdinalType& k, const alpha_type alpha, const A_type* A, const OrdinalType& lda, const B_type* B, const OrdinalType& ldb, const beta_type beta, ScalarType* C, const OrdinalType& ldc) const
1142   {
1143 
1144     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
1145     alpha_type alpha_zero = ScalarTraits<alpha_type>::zero();
1146     beta_type beta_zero = ScalarTraits<beta_type>::zero();
1147     B_type B_zero = ScalarTraits<B_type>::zero();
1148     ScalarType C_zero = ScalarTraits<ScalarType>::zero();
1149     beta_type beta_one = ScalarTraits<beta_type>::one();
1150     OrdinalType i, j, p;
1151     OrdinalType NRowA = m, NRowB = k;
1152     ScalarType temp;
1153     bool BadArgument = false;
1154     bool conjA = false, conjB = false;
1155 
1156     // Change dimensions of matrix if either matrix is transposed
1157     if( !(ETranspChar[transa]=='N') ) {
1158       NRowA = k;
1159     }
1160     if( !(ETranspChar[transb]=='N') ) {
1161       NRowB = n;
1162     }
1163 
1164     // Quick return if there is nothing to do!
1165     if( (m==izero) || (n==izero) || (((alpha==alpha_zero)||(k==izero)) && (beta==beta_one)) ){ return; }
1166     if( m < izero ) {
1167       std::cout << "BLAS::GEMM Error: M == " << m << std::endl;
1168       BadArgument = true;
1169     }
1170     if( n < izero ) {
1171       std::cout << "BLAS::GEMM Error: N == " << n << std::endl;
1172       BadArgument = true;
1173     }
1174     if( k < izero ) {
1175       std::cout << "BLAS::GEMM Error: K == " << k << std::endl;
1176       BadArgument = true;
1177     }
1178     if( lda < NRowA ) {
1179       std::cout << "BLAS::GEMM Error: LDA < "<<NRowA<<std::endl;
1180       BadArgument = true;
1181     }
1182     if( ldb < NRowB ) {
1183       std::cout << "BLAS::GEMM Error: LDB < "<<NRowB<<std::endl;
1184       BadArgument = true;
1185     }
1186      if( ldc < m ) {
1187       std::cout << "BLAS::GEMM Error: LDC < MAX(1,M)"<< std::endl;
1188       BadArgument = true;
1189     }
1190 
1191     if(!BadArgument) {
1192 
1193       // Determine if this is a conjugate tranpose
1194       conjA = (ETranspChar[transa] == 'C');
1195       conjB = (ETranspChar[transb] == 'C');
1196 
1197       // Only need to scale the resulting matrix C.
1198       if( alpha == alpha_zero ) {
1199         if( beta == beta_zero ) {
1200           for (j=izero; j<n; j++) {
1201             for (i=izero; i<m; i++) {
1202               C[j*ldc + i] = C_zero;
1203             }
1204           }
1205         } else {
1206           for (j=izero; j<n; j++) {
1207             for (i=izero; i<m; i++) {
1208               C[j*ldc + i] *= beta;
1209             }
1210           }
1211         }
1212         return;
1213       }
1214       //
1215       // Now start the operations.
1216       //
1217       if ( ETranspChar[transb]=='N' ) {
1218         if ( ETranspChar[transa]=='N' ) {
1219           // Compute C = alpha*A*B + beta*C
1220           for (j=izero; j<n; j++) {
1221             if( beta == beta_zero ) {
1222               for (i=izero; i<m; i++){
1223                 C[j*ldc + i] = C_zero;
1224               }
1225             } else if( beta != beta_one ) {
1226               for (i=izero; i<m; i++){
1227                 C[j*ldc + i] *= beta;
1228               }
1229             }
1230             for (p=izero; p<k; p++){
1231               if (B[j*ldb + p] != B_zero ){
1232                 temp = alpha*B[j*ldb + p];
1233                 for (i=izero; i<m; i++) {
1234                   C[j*ldc + i] += temp*A[p*lda + i];
1235                 }
1236               }
1237             }
1238           }
1239         } else if ( conjA ) {
1240           // Compute C = alpha*conj(A')*B + beta*C
1241           for (j=izero; j<n; j++) {
1242             for (i=izero; i<m; i++) {
1243               temp = C_zero;
1244               for (p=izero; p<k; p++) {
1245                 temp += ScalarTraits<A_type>::conjugate(A[i*lda + p])*B[j*ldb + p];
1246               }
1247               if (beta == beta_zero) {
1248                 C[j*ldc + i] = alpha*temp;
1249               } else {
1250                 C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i];
1251               }
1252             }
1253           }
1254         } else {
1255           // Compute C = alpha*A'*B + beta*C
1256           for (j=izero; j<n; j++) {
1257             for (i=izero; i<m; i++) {
1258               temp = C_zero;
1259               for (p=izero; p<k; p++) {
1260                 temp += A[i*lda + p]*B[j*ldb + p];
1261               }
1262               if (beta == beta_zero) {
1263                 C[j*ldc + i] = alpha*temp;
1264               } else {
1265                 C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i];
1266               }
1267             }
1268           }
1269         }
1270       } else if ( ETranspChar[transa]=='N' ) {
1271         if ( conjB ) {
1272           // Compute C = alpha*A*conj(B') + beta*C
1273           for (j=izero; j<n; j++) {
1274             if (beta == beta_zero) {
1275               for (i=izero; i<m; i++) {
1276                 C[j*ldc + i] = C_zero;
1277               }
1278             } else if ( beta != beta_one ) {
1279               for (i=izero; i<m; i++) {
1280                 C[j*ldc + i] *= beta;
1281               }
1282             }
1283             for (p=izero; p<k; p++) {
1284               if (B[p*ldb + j] != B_zero) {
1285                 temp = alpha*ScalarTraits<B_type>::conjugate(B[p*ldb + j]);
1286                 for (i=izero; i<m; i++) {
1287                   C[j*ldc + i] += temp*A[p*lda + i];
1288                 }
1289               }
1290             }
1291           }
1292         } else {
1293           // Compute C = alpha*A*B' + beta*C
1294           for (j=izero; j<n; j++) {
1295             if (beta == beta_zero) {
1296               for (i=izero; i<m; i++) {
1297                 C[j*ldc + i] = C_zero;
1298               }
1299             } else if ( beta != beta_one ) {
1300               for (i=izero; i<m; i++) {
1301                 C[j*ldc + i] *= beta;
1302               }
1303             }
1304             for (p=izero; p<k; p++) {
1305               if (B[p*ldb + j] != B_zero) {
1306                 temp = alpha*B[p*ldb + j];
1307                 for (i=izero; i<m; i++) {
1308                   C[j*ldc + i] += temp*A[p*lda + i];
1309                 }
1310               }
1311             }
1312           }
1313         }
1314       } else if ( conjA ) {
1315         if ( conjB ) {
1316           // Compute C = alpha*conj(A')*conj(B') + beta*C
1317           for (j=izero; j<n; j++) {
1318             for (i=izero; i<m; i++) {
1319               temp = C_zero;
1320               for (p=izero; p<k; p++) {
1321                 temp += ScalarTraits<A_type>::conjugate(A[i*lda + p])
1322                       * ScalarTraits<B_type>::conjugate(B[p*ldb + j]);
1323               }
1324               if (beta == beta_zero) {
1325                 C[j*ldc + i] = alpha*temp;
1326               } else {
1327                 C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i];
1328               }
1329             }
1330           }
1331         } else {
1332           // Compute C = alpha*conj(A')*B' + beta*C
1333           for (j=izero; j<n; j++) {
1334             for (i=izero; i<m; i++) {
1335               temp = C_zero;
1336               for (p=izero; p<k; p++) {
1337                 temp += ScalarTraits<A_type>::conjugate(A[i*lda + p])
1338                       * B[p*ldb + j];
1339               }
1340               if (beta == beta_zero) {
1341                 C[j*ldc + i] = alpha*temp;
1342               } else {
1343                 C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i];
1344               }
1345             }
1346           }
1347         }
1348      } else {
1349        if ( conjB ) {
1350          // Compute C = alpha*A'*conj(B') + beta*C
1351          for (j=izero; j<n; j++) {
1352             for (i=izero; i<m; i++) {
1353               temp = C_zero;
1354               for (p=izero; p<k; p++) {
1355                 temp += A[i*lda + p]
1356                       * ScalarTraits<B_type>::conjugate(B[p*ldb + j]);
1357               }
1358               if (beta == beta_zero) {
1359                 C[j*ldc + i] = alpha*temp;
1360               } else {
1361                 C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i];
1362               }
1363             }
1364           }
1365         } else {
1366           // Compute C = alpha*A'*B' + beta*C
1367           for (j=izero; j<n; j++) {
1368             for (i=izero; i<m; i++) {
1369               temp = C_zero;
1370               for (p=izero; p<k; p++) {
1371                 temp += A[i*lda + p]*B[p*ldb + j];
1372               }
1373               if (beta == beta_zero) {
1374                 C[j*ldc + i] = alpha*temp;
1375               } else {
1376                 C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i];
1377               }
1378             }
1379           }
1380         } // end if (ETranspChar[transa]=='N') ...
1381       } // end if (ETranspChar[transb]=='N') ...
1382     } // end if (!BadArgument) ...
1383   } // end of GEMM
1384 
1385 
1386   template<typename OrdinalType, typename ScalarType>
1387   void DefaultBLASImpl<OrdinalType, ScalarType>::
SWAP(const OrdinalType & n,ScalarType * const x,const OrdinalType & incx,ScalarType * const y,const OrdinalType & incy) const1388   SWAP (const OrdinalType& n, ScalarType* const x, const OrdinalType& incx,
1389         ScalarType* const y, const OrdinalType& incy) const
1390   {
1391     if (n <= 0) {
1392       return;
1393     }
1394 
1395     if (incx == 1 && incy == 1) {
1396       for (int i = 0; i < n; ++i) {
1397         ScalarType tmp = x[i];
1398         x[i] = y[i];
1399         y[i] = tmp;
1400       }
1401       return;
1402     }
1403 
1404     int ix = 1;
1405     int iy = 1;
1406     if (incx < 0) {
1407       ix = (1-n) * incx + 1;
1408     }
1409     if (incy < 0) {
1410       iy = (1-n) * incy + 1;
1411     }
1412 
1413     for (int i = 1; i <= n; ++i) {
1414       ScalarType tmp = x[ix - 1];
1415       x[ix - 1] = y[iy - 1];
1416       y[iy - 1] = tmp;
1417       ix += incx;
1418       iy += incy;
1419     }
1420   }
1421 
1422 
1423   template<typename OrdinalType, typename ScalarType>
1424   template <typename alpha_type, typename A_type, typename B_type, typename beta_type>
SYMM(ESide side,EUplo uplo,const OrdinalType & m,const OrdinalType & n,const alpha_type alpha,const A_type * A,const OrdinalType & lda,const B_type * B,const OrdinalType & ldb,const beta_type beta,ScalarType * C,const OrdinalType & ldc) const1425   void DefaultBLASImpl<OrdinalType, ScalarType>::SYMM(ESide side, EUplo uplo, const OrdinalType& m, const OrdinalType& n, const alpha_type alpha, const A_type* A, const OrdinalType& lda, const B_type* B, const OrdinalType& ldb, const beta_type beta, ScalarType* C, const OrdinalType& ldc) const
1426   {
1427     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
1428     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
1429     alpha_type alpha_zero = ScalarTraits<alpha_type>::zero();
1430     beta_type beta_zero = ScalarTraits<beta_type>::zero();
1431     ScalarType C_zero = ScalarTraits<ScalarType>::zero();
1432     beta_type beta_one = ScalarTraits<beta_type>::one();
1433     OrdinalType i, j, k, NRowA = m;
1434     ScalarType temp1, temp2;
1435     bool BadArgument = false;
1436     bool Upper = (EUploChar[uplo] == 'U');
1437     if (ESideChar[side] == 'R') { NRowA = n; }
1438 
1439     // Quick return.
1440     if ( (m==izero) || (n==izero) || ( (alpha==alpha_zero)&&(beta==beta_one) ) ) { return; }
1441     if( m < izero ) {
1442       std::cout << "BLAS::SYMM Error: M == "<< m << std::endl;
1443       BadArgument = true; }
1444     if( n < izero ) {
1445       std::cout << "BLAS::SYMM Error: N == "<< n << std::endl;
1446       BadArgument = true; }
1447     if( lda < NRowA ) {
1448       std::cout << "BLAS::SYMM Error: LDA < "<<NRowA<<std::endl;
1449       BadArgument = true; }
1450     if( ldb < m ) {
1451       std::cout << "BLAS::SYMM Error: LDB < MAX(1,M)"<<std::endl;
1452       BadArgument = true; }
1453     if( ldc < m ) {
1454       std::cout << "BLAS::SYMM Error: LDC < MAX(1,M)"<<std::endl;
1455       BadArgument = true; }
1456 
1457     if(!BadArgument) {
1458 
1459       // Only need to scale C and return.
1460       if (alpha == alpha_zero) {
1461         if (beta == beta_zero ) {
1462           for (j=izero; j<n; j++) {
1463             for (i=izero; i<m; i++) {
1464               C[j*ldc + i] = C_zero;
1465             }
1466           }
1467         } else {
1468           for (j=izero; j<n; j++) {
1469             for (i=izero; i<m; i++) {
1470               C[j*ldc + i] *= beta;
1471             }
1472           }
1473         }
1474         return;
1475       }
1476 
1477       if ( ESideChar[side] == 'L') {
1478         // Compute C = alpha*A*B + beta*C
1479 
1480         if (Upper) {
1481           // The symmetric part of A is stored in the upper triangular part of the matrix.
1482           for (j=izero; j<n; j++) {
1483             for (i=izero; i<m; i++) {
1484               temp1 = alpha*B[j*ldb + i];
1485               temp2 = C_zero;
1486               for (k=izero; k<i; k++) {
1487                 C[j*ldc + k] += temp1*A[i*lda + k];
1488                 temp2 += B[j*ldb + k]*A[i*lda + k];
1489               }
1490               if (beta == beta_zero) {
1491                 C[j*ldc + i] = temp1*A[i*lda + i] + alpha*temp2;
1492               } else {
1493                 C[j*ldc + i] = beta*C[j*ldc + i] + temp1*A[i*lda + i] + alpha*temp2;
1494               }
1495             }
1496           }
1497         } else {
1498           // The symmetric part of A is stored in the lower triangular part of the matrix.
1499           for (j=izero; j<n; j++) {
1500             for (i=m-ione; i>-ione; i--) {
1501               temp1 = alpha*B[j*ldb + i];
1502               temp2 = C_zero;
1503               for (k=i+ione; k<m; k++) {
1504                 C[j*ldc + k] += temp1*A[i*lda + k];
1505                 temp2 += B[j*ldb + k]*A[i*lda + k];
1506               }
1507               if (beta == beta_zero) {
1508                 C[j*ldc + i] = temp1*A[i*lda + i] + alpha*temp2;
1509               } else {
1510                 C[j*ldc + i] = beta*C[j*ldc + i] + temp1*A[i*lda + i] + alpha*temp2;
1511               }
1512             }
1513           }
1514         }
1515       } else {
1516         // Compute C = alpha*B*A + beta*C.
1517         for (j=izero; j<n; j++) {
1518           temp1 = alpha*A[j*lda + j];
1519           if (beta == beta_zero) {
1520             for (i=izero; i<m; i++) {
1521               C[j*ldc + i] = temp1*B[j*ldb + i];
1522             }
1523           } else {
1524             for (i=izero; i<m; i++) {
1525               C[j*ldc + i] = beta*C[j*ldc + i] + temp1*B[j*ldb + i];
1526             }
1527           }
1528           for (k=izero; k<j; k++) {
1529             if (Upper) {
1530               temp1 = alpha*A[j*lda + k];
1531             } else {
1532               temp1 = alpha*A[k*lda + j];
1533             }
1534             for (i=izero; i<m; i++) {
1535               C[j*ldc + i] += temp1*B[k*ldb + i];
1536             }
1537           }
1538           for (k=j+ione; k<n; k++) {
1539             if (Upper) {
1540               temp1 = alpha*A[k*lda + j];
1541             } else {
1542               temp1 = alpha*A[j*lda + k];
1543             }
1544             for (i=izero; i<m; i++) {
1545               C[j*ldc + i] += temp1*B[k*ldb + i];
1546             }
1547           }
1548         }
1549       } // end if (ESideChar[side]=='L') ...
1550     } // end if(!BadArgument) ...
1551 } // end SYMM
1552 
1553   template<typename OrdinalType, typename ScalarType>
1554   template <typename alpha_type, typename A_type, typename beta_type>
SYRK(EUplo uplo,ETransp trans,const OrdinalType & n,const OrdinalType & k,const alpha_type alpha,const A_type * A,const OrdinalType & lda,const beta_type beta,ScalarType * C,const OrdinalType & ldc) const1555   void DefaultBLASImpl<OrdinalType, ScalarType>::SYRK(EUplo uplo, ETransp trans, const OrdinalType& n, const OrdinalType& k, const alpha_type alpha, const A_type* A, const OrdinalType& lda, const beta_type beta, ScalarType* C, const OrdinalType& ldc) const
1556   {
1557     typedef TypeNameTraits<OrdinalType> OTNT;
1558     typedef TypeNameTraits<ScalarType> STNT;
1559 
1560     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
1561     alpha_type alpha_zero = ScalarTraits<alpha_type>::zero();
1562     beta_type beta_zero = ScalarTraits<beta_type>::zero();
1563     beta_type beta_one = ScalarTraits<beta_type>::one();
1564     A_type temp, A_zero = ScalarTraits<A_type>::zero();
1565     ScalarType C_zero = ScalarTraits<ScalarType>::zero();
1566     OrdinalType i, j, l, NRowA = n;
1567     bool BadArgument = false;
1568     bool Upper = (EUploChar[uplo] == 'U');
1569 
1570     TEUCHOS_TEST_FOR_EXCEPTION(
1571       Teuchos::ScalarTraits<ScalarType>::isComplex
1572       && (trans == CONJ_TRANS),
1573       std::logic_error,
1574             "Teuchos::BLAS<"<<OTNT::name()<<","<<STNT::name()<<">::SYRK()"
1575             " does not support CONJ_TRANS for complex data types."
1576       );
1577 
1578     // Change dimensions of A matrix is transposed
1579     if( !(ETranspChar[trans]=='N') ) {
1580       NRowA = k;
1581     }
1582 
1583     // Quick return.
1584     if ( n==izero ) { return; }
1585     if ( ( (alpha==alpha_zero) || (k==izero) )  && (beta==beta_one) ) { return; }
1586     if( n < izero ) {
1587       std::cout << "BLAS::SYRK Error: N == "<< n <<std::endl;
1588       BadArgument = true; }
1589     if( k < izero ) {
1590       std::cout << "BLAS::SYRK Error: K == "<< k <<std::endl;
1591       BadArgument = true; }
1592     if( lda < NRowA ) {
1593       std::cout << "BLAS::SYRK Error: LDA < "<<NRowA<<std::endl;
1594       BadArgument = true; }
1595     if( ldc < n ) {
1596       std::cout << "BLAS::SYRK Error: LDC < MAX(1,N)"<<std::endl;
1597       BadArgument = true; }
1598 
1599     if(!BadArgument) {
1600 
1601       // Scale C when alpha is zero
1602       if (alpha == alpha_zero) {
1603         if (Upper) {
1604           if (beta==beta_zero) {
1605             for (j=izero; j<n; j++) {
1606               for (i=izero; i<=j; i++) {
1607                 C[j*ldc + i] = C_zero;
1608               }
1609             }
1610           }
1611           else {
1612             for (j=izero; j<n; j++) {
1613               for (i=izero; i<=j; i++) {
1614                 C[j*ldc + i] *= beta;
1615               }
1616             }
1617           }
1618         }
1619         else {
1620           if (beta==beta_zero) {
1621             for (j=izero; j<n; j++) {
1622               for (i=j; i<n; i++) {
1623                 C[j*ldc + i] = C_zero;
1624               }
1625             }
1626           }
1627           else {
1628             for (j=izero; j<n; j++) {
1629               for (i=j; i<n; i++) {
1630                 C[j*ldc + i] *= beta;
1631               }
1632             }
1633           }
1634         }
1635         return;
1636       }
1637 
1638       // Now we can start the computation
1639 
1640       if ( ETranspChar[trans]=='N' ) {
1641 
1642         // Form C <- alpha*A*A' + beta*C
1643         if (Upper) {
1644           for (j=izero; j<n; j++) {
1645             if (beta==beta_zero) {
1646               for (i=izero; i<=j; i++) {
1647                 C[j*ldc + i] = C_zero;
1648               }
1649             }
1650             else if (beta!=beta_one) {
1651               for (i=izero; i<=j; i++) {
1652                 C[j*ldc + i] *= beta;
1653               }
1654             }
1655             for (l=izero; l<k; l++) {
1656               if (A[l*lda + j] != A_zero) {
1657                 temp = alpha*A[l*lda + j];
1658                 for (i = izero; i <=j; i++) {
1659                   C[j*ldc + i] += temp*A[l*lda + i];
1660                 }
1661               }
1662             }
1663           }
1664         }
1665         else {
1666           for (j=izero; j<n; j++) {
1667             if (beta==beta_zero) {
1668               for (i=j; i<n; i++) {
1669                 C[j*ldc + i] = C_zero;
1670               }
1671             }
1672             else if (beta!=beta_one) {
1673               for (i=j; i<n; i++) {
1674                 C[j*ldc + i] *= beta;
1675               }
1676             }
1677             for (l=izero; l<k; l++) {
1678               if (A[l*lda + j] != A_zero) {
1679                 temp = alpha*A[l*lda + j];
1680                 for (i=j; i<n; i++) {
1681                   C[j*ldc + i] += temp*A[l*lda + i];
1682                 }
1683               }
1684             }
1685           }
1686         }
1687       }
1688       else {
1689 
1690         // Form C <- alpha*A'*A + beta*C
1691         if (Upper) {
1692           for (j=izero; j<n; j++) {
1693             for (i=izero; i<=j; i++) {
1694               temp = A_zero;
1695               for (l=izero; l<k; l++) {
1696                 temp += A[i*lda + l]*A[j*lda + l];
1697               }
1698               if (beta==beta_zero) {
1699                 C[j*ldc + i] = alpha*temp;
1700               }
1701               else {
1702                 C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i];
1703               }
1704             }
1705           }
1706         }
1707         else {
1708           for (j=izero; j<n; j++) {
1709             for (i=j; i<n; i++) {
1710               temp = A_zero;
1711               for (l=izero; l<k; ++l) {
1712                 temp += A[i*lda + l]*A[j*lda + l];
1713               }
1714               if (beta==beta_zero) {
1715                 C[j*ldc + i] = alpha*temp;
1716               }
1717               else {
1718                 C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i];
1719               }
1720             }
1721           }
1722         }
1723       }
1724     } /* if (!BadArgument) */
1725   } /* END SYRK */
1726 
1727   template<typename OrdinalType, typename ScalarType>
1728   template <typename alpha_type, typename A_type>
TRMM(ESide side,EUplo uplo,ETransp transa,EDiag diag,const OrdinalType & m,const OrdinalType & n,const alpha_type alpha,const A_type * A,const OrdinalType & lda,ScalarType * B,const OrdinalType & ldb) const1729   void DefaultBLASImpl<OrdinalType, ScalarType>::TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const OrdinalType& m, const OrdinalType& n, const alpha_type alpha, const A_type* A, const OrdinalType& lda, ScalarType* B, const OrdinalType& ldb) const
1730   {
1731     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
1732     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
1733     alpha_type alpha_zero = ScalarTraits<alpha_type>::zero();
1734     A_type A_zero = ScalarTraits<A_type>::zero();
1735     ScalarType B_zero = ScalarTraits<ScalarType>::zero();
1736     ScalarType one = ScalarTraits<ScalarType>::one();
1737     OrdinalType i, j, k, NRowA = m;
1738     ScalarType temp;
1739     bool BadArgument = false;
1740     bool LSide = (ESideChar[side] == 'L');
1741     bool noUnit = (EDiagChar[diag] == 'N');
1742     bool Upper = (EUploChar[uplo] == 'U');
1743     bool noConj = (ETranspChar[transa] == 'T');
1744 
1745     if(!LSide) { NRowA = n; }
1746 
1747     // Quick return.
1748     if (n==izero || m==izero) { return; }
1749     if( m < izero ) {
1750       std::cout << "BLAS::TRMM Error: M == "<< m <<std::endl;
1751       BadArgument = true; }
1752     if( n < izero ) {
1753       std::cout << "BLAS::TRMM Error: N == "<< n <<std::endl;
1754       BadArgument = true; }
1755     if( lda < NRowA ) {
1756       std::cout << "BLAS::TRMM Error: LDA < "<<NRowA<<std::endl;
1757       BadArgument = true; }
1758     if( ldb < m ) {
1759       std::cout << "BLAS::TRMM Error: LDB < MAX(1,M)"<<std::endl;
1760       BadArgument = true; }
1761 
1762     if(!BadArgument) {
1763 
1764       // B only needs to be zeroed out.
1765       if( alpha == alpha_zero ) {
1766         for( j=izero; j<n; j++ ) {
1767           for( i=izero; i<m; i++ ) {
1768             B[j*ldb + i] = B_zero;
1769           }
1770         }
1771         return;
1772       }
1773 
1774       //  Start the computations.
1775       if ( LSide ) {
1776         // A is on the left side of B.
1777 
1778         if ( ETranspChar[transa]=='N' ) {
1779           // Compute B = alpha*A*B
1780 
1781           if ( Upper ) {
1782             // A is upper triangular
1783             for( j=izero; j<n; j++ ) {
1784               for( k=izero; k<m; k++) {
1785                 if ( B[j*ldb + k] != B_zero ) {
1786                   temp = alpha*B[j*ldb + k];
1787                   for( i=izero; i<k; i++ ) {
1788                     B[j*ldb + i] += temp*A[k*lda + i];
1789                   }
1790                   if ( noUnit )
1791                     temp *=A[k*lda + k];
1792                   B[j*ldb + k] = temp;
1793                 }
1794               }
1795             }
1796           } else {
1797             // A is lower triangular
1798             for( j=izero; j<n; j++ ) {
1799               for( k=m-ione; k>-ione; k-- ) {
1800                 if( B[j*ldb + k] != B_zero ) {
1801                   temp = alpha*B[j*ldb + k];
1802                   B[j*ldb + k] = temp;
1803                   if ( noUnit )
1804                     B[j*ldb + k] *= A[k*lda + k];
1805                   for( i=k+ione; i<m; i++ ) {
1806                     B[j*ldb + i] += temp*A[k*lda + i];
1807                   }
1808                 }
1809               }
1810             }
1811           }
1812         } else {
1813           // Compute B = alpha*A'*B or B = alpha*conj(A')*B
1814           if( Upper ) {
1815             for( j=izero; j<n; j++ ) {
1816               for( i=m-ione; i>-ione; i-- ) {
1817                 temp = B[j*ldb + i];
1818                 if ( noConj ) {
1819                   if( noUnit )
1820                     temp *= A[i*lda + i];
1821                   for( k=izero; k<i; k++ ) {
1822                     temp += A[i*lda + k]*B[j*ldb + k];
1823                   }
1824                 } else {
1825                   if( noUnit )
1826                     temp *= ScalarTraits<A_type>::conjugate(A[i*lda + i]);
1827                   for( k=izero; k<i; k++ ) {
1828                     temp += ScalarTraits<A_type>::conjugate(A[i*lda + k])*B[j*ldb + k];
1829                   }
1830                 }
1831                 B[j*ldb + i] = alpha*temp;
1832               }
1833             }
1834           } else {
1835             for( j=izero; j<n; j++ ) {
1836               for( i=izero; i<m; i++ ) {
1837                 temp = B[j*ldb + i];
1838                 if ( noConj ) {
1839                   if( noUnit )
1840                     temp *= A[i*lda + i];
1841                   for( k=i+ione; k<m; k++ ) {
1842                     temp += A[i*lda + k]*B[j*ldb + k];
1843                   }
1844                 } else {
1845                   if( noUnit )
1846                     temp *= ScalarTraits<A_type>::conjugate(A[i*lda + i]);
1847                   for( k=i+ione; k<m; k++ ) {
1848                     temp += ScalarTraits<A_type>::conjugate(A[i*lda + k])*B[j*ldb + k];
1849                   }
1850                 }
1851                 B[j*ldb + i] = alpha*temp;
1852               }
1853             }
1854           }
1855         }
1856       } else {
1857         // A is on the right hand side of B.
1858 
1859         if( ETranspChar[transa] == 'N' ) {
1860           // Compute B = alpha*B*A
1861 
1862           if( Upper ) {
1863             // A is upper triangular.
1864             for( j=n-ione; j>-ione; j-- ) {
1865               temp = alpha;
1866               if( noUnit )
1867                 temp *= A[j*lda + j];
1868               for( i=izero; i<m; i++ ) {
1869                 B[j*ldb + i] *= temp;
1870               }
1871               for( k=izero; k<j; k++ ) {
1872                 if( A[j*lda + k] != A_zero ) {
1873                   temp = alpha*A[j*lda + k];
1874                   for( i=izero; i<m; i++ ) {
1875                     B[j*ldb + i] += temp*B[k*ldb + i];
1876                   }
1877                 }
1878               }
1879             }
1880           } else {
1881             // A is lower triangular.
1882             for( j=izero; j<n; j++ ) {
1883               temp = alpha;
1884               if( noUnit )
1885                 temp *= A[j*lda + j];
1886               for( i=izero; i<m; i++ ) {
1887                 B[j*ldb + i] *= temp;
1888               }
1889               for( k=j+ione; k<n; k++ ) {
1890                 if( A[j*lda + k] != A_zero ) {
1891                   temp = alpha*A[j*lda + k];
1892                   for( i=izero; i<m; i++ ) {
1893                     B[j*ldb + i] += temp*B[k*ldb + i];
1894                   }
1895                 }
1896               }
1897             }
1898           }
1899         } else {
1900           // Compute B = alpha*B*A' or B = alpha*B*conj(A')
1901 
1902           if( Upper ) {
1903             for( k=izero; k<n; k++ ) {
1904               for( j=izero; j<k; j++ ) {
1905                 if( A[k*lda + j] != A_zero ) {
1906                   if ( noConj )
1907                     temp = alpha*A[k*lda + j];
1908                   else
1909                     temp = alpha*ScalarTraits<A_type>::conjugate(A[k*lda + j]);
1910                   for( i=izero; i<m; i++ ) {
1911                     B[j*ldb + i] += temp*B[k*ldb + i];
1912                   }
1913                 }
1914               }
1915               temp = alpha;
1916               if( noUnit ) {
1917                 if ( noConj )
1918                   temp *= A[k*lda + k];
1919                 else
1920                   temp *= ScalarTraits<A_type>::conjugate(A[k*lda + k]);
1921               }
1922               if( temp != one ) {
1923                 for( i=izero; i<m; i++ ) {
1924                   B[k*ldb + i] *= temp;
1925                 }
1926               }
1927             }
1928           } else {
1929             for( k=n-ione; k>-ione; k-- ) {
1930               for( j=k+ione; j<n; j++ ) {
1931                 if( A[k*lda + j] != A_zero ) {
1932                   if ( noConj )
1933                     temp = alpha*A[k*lda + j];
1934                   else
1935                     temp = alpha*ScalarTraits<A_type>::conjugate(A[k*lda + j]);
1936                   for( i=izero; i<m; i++ ) {
1937                     B[j*ldb + i] += temp*B[k*ldb + i];
1938                   }
1939                 }
1940               }
1941               temp = alpha;
1942               if( noUnit ) {
1943                 if ( noConj )
1944                   temp *= A[k*lda + k];
1945                 else
1946                   temp *= ScalarTraits<A_type>::conjugate(A[k*lda + k]);
1947               }
1948               if( temp != one ) {
1949                 for( i=izero; i<m; i++ ) {
1950                   B[k*ldb + i] *= temp;
1951                 }
1952               }
1953             }
1954           }
1955         } // end if( ETranspChar[transa] == 'N' ) ...
1956       } // end if ( LSide ) ...
1957     } // end if (!BadArgument)
1958   } // end TRMM
1959 
1960   template<typename OrdinalType, typename ScalarType>
1961   template <typename alpha_type, typename A_type>
TRSM(ESide side,EUplo uplo,ETransp transa,EDiag diag,const OrdinalType & m,const OrdinalType & n,const alpha_type alpha,const A_type * A,const OrdinalType & lda,ScalarType * B,const OrdinalType & ldb) const1962   void DefaultBLASImpl<OrdinalType, ScalarType>::TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const OrdinalType& m, const OrdinalType& n, const alpha_type alpha, const A_type* A, const OrdinalType& lda, ScalarType* B, const OrdinalType& ldb) const
1963   {
1964     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
1965     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
1966     alpha_type alpha_zero = ScalarTraits<alpha_type>::zero();
1967     A_type A_zero = ScalarTraits<A_type>::zero();
1968     ScalarType B_zero = ScalarTraits<ScalarType>::zero();
1969     alpha_type alpha_one = ScalarTraits<alpha_type>::one();
1970     ScalarType B_one = ScalarTraits<ScalarType>::one();
1971     ScalarType temp;
1972     OrdinalType NRowA = m;
1973     bool BadArgument = false;
1974     bool noUnit = (EDiagChar[diag]=='N');
1975     bool noConj = (ETranspChar[transa] == 'T');
1976 
1977     if (!(ESideChar[side] == 'L')) { NRowA = n; }
1978 
1979     // Quick return.
1980     if (n == izero || m == izero) { return; }
1981     if( m < izero ) {
1982       std::cout << "BLAS::TRSM Error: M == "<<m<<std::endl;
1983       BadArgument = true; }
1984     if( n < izero ) {
1985       std::cout << "BLAS::TRSM Error: N == "<<n<<std::endl;
1986       BadArgument = true; }
1987     if( lda < NRowA ) {
1988       std::cout << "BLAS::TRSM Error: LDA < "<<NRowA<<std::endl;
1989       BadArgument = true; }
1990     if( ldb < m ) {
1991       std::cout << "BLAS::TRSM Error: LDB < MAX(1,M)"<<std::endl;
1992       BadArgument = true; }
1993 
1994     if(!BadArgument)
1995       {
1996         int i, j, k;
1997         // Set the solution to the zero vector.
1998         if(alpha == alpha_zero) {
1999             for(j = izero; j < n; j++) {
2000                 for( i = izero; i < m; i++) {
2001                     B[j*ldb+i] = B_zero;
2002                 }
2003             }
2004         }
2005         else
2006         { // Start the operations.
2007             if(ESideChar[side] == 'L') {
2008                 //
2009                 // Perform computations for OP(A)*X = alpha*B
2010                 //
2011                 if(ETranspChar[transa] == 'N') {
2012                     //
2013                     //  Compute B = alpha*inv( A )*B
2014                     //
2015                     if(EUploChar[uplo] == 'U') {
2016                         // A is upper triangular.
2017                         for(j = izero; j < n; j++) {
2018                             // Perform alpha*B if alpha is not 1.
2019                           if(alpha != alpha_one) {
2020                                 for( i = izero; i < m; i++) {
2021                                     B[j*ldb+i] *= alpha;
2022                                 }
2023                             }
2024                             // Perform a backsolve for column j of B.
2025                             for(k = (m - ione); k > -ione; k--) {
2026                                 // If this entry is zero, we don't have to do anything.
2027                                 if (B[j*ldb + k] != B_zero) {
2028                                     if ( noUnit ) {
2029                                         B[j*ldb + k] /= A[k*lda + k];
2030                                     }
2031                                     for(i = izero; i < k; i++) {
2032                                         B[j*ldb + i] -= B[j*ldb + k] * A[k*lda + i];
2033                                     }
2034                                 }
2035                             }
2036                         }
2037                     }
2038                     else
2039                     { // A is lower triangular.
2040                         for(j = izero; j < n; j++) {
2041                             // Perform alpha*B if alpha is not 1.
2042                             if(alpha != alpha_one) {
2043                                 for( i = izero; i < m; i++) {
2044                                     B[j*ldb+i] *= alpha;
2045                                 }
2046                             }
2047                             // Perform a forward solve for column j of B.
2048                             for(k = izero; k < m; k++) {
2049                                 // If this entry is zero, we don't have to do anything.
2050                                 if (B[j*ldb + k] != B_zero) {
2051                                     if ( noUnit ) {
2052                                         B[j*ldb + k] /= A[k*lda + k];
2053                                     }
2054                                     for(i = k+ione; i < m; i++) {
2055                                         B[j*ldb + i] -= B[j*ldb + k] * A[k*lda + i];
2056                                     }
2057                                 }
2058                             }
2059                         }
2060                     } // end if (uplo == 'U')
2061                 }  // if (transa =='N')
2062                 else {
2063                     //
2064                     //  Compute B = alpha*inv( A' )*B
2065                     //  or      B = alpha*inv( conj(A') )*B
2066                     //
2067                     if(EUploChar[uplo] == 'U') {
2068                         // A is upper triangular.
2069                         for(j = izero; j < n; j++) {
2070                             for( i = izero; i < m; i++) {
2071                                 temp = alpha*B[j*ldb+i];
2072                                 if ( noConj ) {
2073                                   for(k = izero; k < i; k++) {
2074                                       temp -= A[i*lda + k] * B[j*ldb + k];
2075                                   }
2076                                   if ( noUnit ) {
2077                                       temp /= A[i*lda + i];
2078                                   }
2079                                 } else {
2080                                   for(k = izero; k < i; k++) {
2081                                       temp -= ScalarTraits<A_type>::conjugate(A[i*lda + k])
2082                                             * B[j*ldb + k];
2083                                   }
2084                                   if ( noUnit ) {
2085                                       temp /= ScalarTraits<A_type>::conjugate(A[i*lda + i]);
2086                                   }
2087                                 }
2088                                 B[j*ldb + i] = temp;
2089                             }
2090                         }
2091                     }
2092                     else
2093                     { // A is lower triangular.
2094                         for(j = izero; j < n; j++) {
2095                             for(i = (m - ione) ; i > -ione; i--) {
2096                                 temp = alpha*B[j*ldb+i];
2097                                 if ( noConj ) {
2098                                   for(k = i+ione; k < m; k++) {
2099                                     temp -= A[i*lda + k] * B[j*ldb + k];
2100                                   }
2101                                   if ( noUnit ) {
2102                                     temp /= A[i*lda + i];
2103                                   }
2104                                 } else {
2105                                   for(k = i+ione; k < m; k++) {
2106                                     temp -= ScalarTraits<A_type>::conjugate(A[i*lda + k])
2107                                           * B[j*ldb + k];
2108                                   }
2109                                   if ( noUnit ) {
2110                                     temp /= ScalarTraits<A_type>::conjugate(A[i*lda + i]);
2111                                   }
2112                                 }
2113                                 B[j*ldb + i] = temp;
2114                             }
2115                         }
2116                     }
2117                 }
2118             }  // if (side == 'L')
2119             else {
2120                // side == 'R'
2121                //
2122                // Perform computations for X*OP(A) = alpha*B
2123                //
2124               if (ETranspChar[transa] == 'N') {
2125                     //
2126                     //  Compute B = alpha*B*inv( A )
2127                     //
2128                     if(EUploChar[uplo] == 'U') {
2129                         // A is upper triangular.
2130                         // Perform a backsolve for column j of B.
2131                         for(j = izero; j < n; j++) {
2132                             // Perform alpha*B if alpha is not 1.
2133                             if(alpha != alpha_one) {
2134                                 for( i = izero; i < m; i++) {
2135                                     B[j*ldb+i] *= alpha;
2136                                 }
2137                             }
2138                             for(k = izero; k < j; k++) {
2139                                 // If this entry is zero, we don't have to do anything.
2140                                 if (A[j*lda + k] != A_zero) {
2141                                     for(i = izero; i < m; i++) {
2142                                         B[j*ldb + i] -= A[j*lda + k] * B[k*ldb + i];
2143                                     }
2144                                 }
2145                             }
2146                             if ( noUnit ) {
2147                                 temp = B_one/A[j*lda + j];
2148                                 for(i = izero; i < m; i++) {
2149                                     B[j*ldb + i] *= temp;
2150                                 }
2151                             }
2152                         }
2153                     }
2154                     else
2155                     { // A is lower triangular.
2156                         for(j = (n - ione); j > -ione; j--) {
2157                             // Perform alpha*B if alpha is not 1.
2158                             if(alpha != alpha_one) {
2159                                 for( i = izero; i < m; i++) {
2160                                     B[j*ldb+i] *= alpha;
2161                                 }
2162                             }
2163                             // Perform a forward solve for column j of B.
2164                             for(k = j+ione; k < n; k++) {
2165                                 // If this entry is zero, we don't have to do anything.
2166                                 if (A[j*lda + k] != A_zero) {
2167                                     for(i = izero; i < m; i++) {
2168                                         B[j*ldb + i] -= A[j*lda + k] * B[k*ldb + i];
2169                                     }
2170                                 }
2171                             }
2172                             if ( noUnit ) {
2173                                 temp = B_one/A[j*lda + j];
2174                                 for(i = izero; i < m; i++) {
2175                                     B[j*ldb + i] *= temp;
2176                                 }
2177                             }
2178                         }
2179                     } // end if (uplo == 'U')
2180                 }  // if (transa =='N')
2181                 else {
2182                     //
2183                     //  Compute B = alpha*B*inv( A' )
2184                     //  or      B = alpha*B*inv( conj(A') )
2185                     //
2186                     if(EUploChar[uplo] == 'U') {
2187                         // A is upper triangular.
2188                         for(k = (n - ione); k > -ione; k--) {
2189                             if ( noUnit ) {
2190                                 if ( noConj )
2191                                   temp = B_one/A[k*lda + k];
2192                                 else
2193                                   temp = B_one/ScalarTraits<A_type>::conjugate(A[k*lda + k]);
2194                                 for(i = izero; i < m; i++) {
2195                                     B[k*ldb + i] *= temp;
2196                                 }
2197                             }
2198                             for(j = izero; j < k; j++) {
2199                                 if (A[k*lda + j] != A_zero) {
2200                                     if ( noConj )
2201                                       temp = A[k*lda + j];
2202                                     else
2203                                       temp = ScalarTraits<A_type>::conjugate(A[k*lda + j]);
2204                                     for(i = izero; i < m; i++) {
2205                                         B[j*ldb + i] -= temp*B[k*ldb + i];
2206                                     }
2207                                 }
2208                             }
2209                             if (alpha != alpha_one) {
2210                                 for (i = izero; i < m; i++) {
2211                                     B[k*ldb + i] *= alpha;
2212                                 }
2213                             }
2214                         }
2215                     }
2216                     else
2217                     { // A is lower triangular.
2218                         for(k = izero; k < n; k++) {
2219                             if ( noUnit ) {
2220                                 if ( noConj )
2221                                   temp = B_one/A[k*lda + k];
2222                                 else
2223                                   temp = B_one/ScalarTraits<A_type>::conjugate(A[k*lda + k]);
2224                                 for (i = izero; i < m; i++) {
2225                                     B[k*ldb + i] *= temp;
2226                                 }
2227                             }
2228                             for(j = k+ione; j < n; j++) {
2229                                 if(A[k*lda + j] != A_zero) {
2230                                     if ( noConj )
2231                                       temp = A[k*lda + j];
2232                                     else
2233                                       temp = ScalarTraits<A_type>::conjugate(A[k*lda + j]);
2234                                     for(i = izero; i < m; i++) {
2235                                         B[j*ldb + i] -= temp*B[k*ldb + i];
2236                                     }
2237                                 }
2238                             }
2239                             if (alpha != alpha_one) {
2240                                 for (i = izero; i < m; i++) {
2241                                     B[k*ldb + i] *= alpha;
2242                                 }
2243                             }
2244                         }
2245                     }
2246                 }
2247             }
2248         }
2249     }
2250   }
2251 
2252   // Explicit instantiation for template<int,float>
2253 
2254   template <>
2255   class TEUCHOSNUMERICS_LIB_DLL_EXPORT BLAS<int, float>
2256   {
2257   public:
BLAS(void)2258     inline BLAS(void) {}
BLAS(const BLAS<int,float> &)2259     inline BLAS(const BLAS<int, float>& /*BLAS_source*/) {}
~BLAS(void)2260     inline virtual ~BLAS(void) {}
2261     void ROTG(float* da, float* db, float* c, float* s) const;
2262     void ROT(const int& n, float* dx, const int& incx, float* dy, const int& incy, float* c, float* s) const;
2263     float ASUM(const int& n, const float* x, const int& incx) const;
2264     void AXPY(const int& n, const float& alpha, const float* x, const int& incx, float* y, const int& incy) const;
2265     void COPY(const int& n, const float* x, const int& incx, float* y, const int& incy) const;
2266     float DOT(const int& n, const float* x, const int& incx, const float* y, const int& incy) const;
2267     float NRM2(const int& n, const float* x, const int& incx) const;
2268     void SCAL(const int& n, const float& alpha, float* x, const int& incx) const;
2269     int IAMAX(const int& n, const float* x, const int& incx) const;
2270     void GEMV(ETransp trans, const int& m, const int& n, const float& alpha, const float* A, const int& lda, const float* x, const int& incx, const float& beta, float* y, const int& incy) const;
2271     void TRMV(EUplo uplo, ETransp trans, EDiag diag, const int& n, const float* A, const int& lda, float* x, const int& incx) const;
2272     void GER(const int& m, const int& n, const float& alpha, const float* x, const int& incx, const float* y, const int& incy, float* A, const int& lda) const;
2273     void GEMM(ETransp transa, ETransp transb, const int& m, const int& n, const int& k, const float& alpha, const float* A, const int& lda, const float* B, const int& ldb, const float& beta, float* C, const int& ldc) const;
2274     void SWAP(const int& n, float* const x, const int& incx, float* const y, const int& incy) const;
2275     void SYMM(ESide side, EUplo uplo, const int& m, const int& n, const float& alpha, const float* A, const int& lda, const float* B, const int& ldb, const float& beta, float* C, const int& ldc) const;
2276     void SYRK(EUplo uplo, ETransp trans, const int& n, const int& k, const float& alpha, const float* A, const int& lda, const float& beta, float* C, const int& ldc) const;
2277     void HERK(EUplo uplo, ETransp trans, const int& n, const int& k, const float& alpha, const float* A, const int& lda, const float& beta, float* C, const int& ldc) const;
2278     void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const float& alpha, const float* A, const int& lda, float* B, const int& ldb) const;
2279     void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const float& alpha, const float* A, const int& lda, float* B, const int& ldb) const;
2280   };
2281 
2282   // Explicit instantiation for template<int,double>
2283 
2284   template<>
2285   class TEUCHOSNUMERICS_LIB_DLL_EXPORT BLAS<int, double>
2286   {
2287   public:
BLAS(void)2288     inline BLAS(void) {}
BLAS(const BLAS<int,double> &)2289     inline BLAS(const BLAS<int, double>& /*BLAS_source*/) {}
~BLAS(void)2290     inline virtual ~BLAS(void) {}
2291     void ROTG(double* da, double* db, double* c, double* s) const;
2292     void ROT(const int& n, double* dx, const int& incx, double* dy, const int& incy, double* c, double* s) const;
2293     double ASUM(const int& n, const double* x, const int& incx) const;
2294     void AXPY(const int& n, const double& alpha, const double* x, const int& incx, double* y, const int& incy) const;
2295     void COPY(const int& n, const double* x, const int& incx, double* y, const int& incy) const;
2296     double DOT(const int& n, const double* x, const int& incx, const double* y, const int& incy) const;
2297     double NRM2(const int& n, const double* x, const int& incx) const;
2298     void SCAL(const int& n, const double& alpha, double* x, const int& incx) const;
2299     int IAMAX(const int& n, const double* x, const int& incx) const;
2300     void GEMV(ETransp trans, const int& m, const int& n, const double& alpha, const double* A, const int& lda, const double* x, const int& incx, const double& beta, double* y, const int& incy) const;
2301     void TRMV(EUplo uplo, ETransp trans, EDiag diag, const int& n, const double* A, const int& lda, double* x, const int& incx) const;
2302     void GER(const int& m, const int& n, const double& alpha, const double* x, const int& incx, const double* y, const int& incy, double* A, const int& lda) const;
2303     void GEMM(ETransp transa, ETransp transb, const int& m, const int& n, const int& k, const double& alpha, const double* A, const int& lda, const double* B, const int& ldb, const double& beta, double* C, const int& ldc) const;
2304     void SWAP(const int& n, double* const x, const int& incx, double* const y, const int& incy) const;
2305     void SYMM(ESide side, EUplo uplo, const int& m, const int& n, const double& alpha, const double* A, const int& lda, const double* B, const int& ldb, const double& beta, double* C, const int& ldc) const;
2306     void SYRK(EUplo uplo, ETransp trans, const int& n, const int& k, const double& alpha, const double* A, const int& lda, const double& beta, double* C, const int& ldc) const;
2307     void HERK(EUplo uplo, ETransp trans, const int& n, const int& k, const double& alpha, const double* A, const int& lda, const double& beta, double* C, const int& ldc) const;
2308     void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const double& alpha, const double* A, const int& lda, double* B, const int& ldb) const;
2309     void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const double& alpha, const double* A, const int& lda, double* B, const int& ldb) const;
2310   };
2311 
2312   // Explicit instantiation for template<int,complex<float> >
2313 
2314   template<>
2315   class TEUCHOSNUMERICS_LIB_DLL_EXPORT BLAS<int, std::complex<float> >
2316   {
2317   public:
BLAS(void)2318     inline BLAS(void) {}
BLAS(const BLAS<int,std::complex<float>> &)2319     inline BLAS(const BLAS<int, std::complex<float> >& /*BLAS_source*/) {}
~BLAS(void)2320     inline virtual ~BLAS(void) {}
2321     void ROTG(std::complex<float>* da, std::complex<float>* db, float* c, std::complex<float>* s) const;
2322     void ROT(const int& n, std::complex<float>* dx, const int& incx, std::complex<float>* dy, const int& incy, float* c, std::complex<float>* s) const;
2323     float ASUM(const int& n, const std::complex<float>* x, const int& incx) const;
2324     void AXPY(const int& n, const std::complex<float> alpha, const std::complex<float>* x, const int& incx, std::complex<float>* y, const int& incy) const;
2325     void COPY(const int& n, const std::complex<float>* x, const int& incx, std::complex<float>* y, const int& incy) const;
2326     std::complex<float> DOT(const int& n, const std::complex<float>* x, const int& incx, const std::complex<float>* y, const int& incy) const;
2327     float NRM2(const int& n, const std::complex<float>* x, const int& incx) const;
2328     void SCAL(const int& n, const std::complex<float> alpha, std::complex<float>* x, const int& incx) const;
2329     int IAMAX(const int& n, const std::complex<float>* x, const int& incx) const;
2330     void GEMV(ETransp trans, const int& m, const int& n, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, const std::complex<float>* x, const int& incx, const std::complex<float> beta, std::complex<float>* y, const int& incy) const;
2331     void TRMV(EUplo uplo, ETransp trans, EDiag diag, const int& n, const std::complex<float>* A, const int& lda, std::complex<float>* x, const int& incx) const;
2332     void GER(const int& m, const int& n, const std::complex<float> alpha, const std::complex<float>* x, const int& incx, const std::complex<float>* y, const int& incy, std::complex<float>* A, const int& lda) const;
2333     void GEMM(ETransp transa, ETransp transb, const int& m, const int& n, const int& k, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, const std::complex<float>* B, const int& ldb, const std::complex<float> beta, std::complex<float>* C, const int& ldc) const;
2334     void SWAP(const int& n, std::complex<float>* const x, const int& incx, std::complex<float>* const y, const int& incy) const;
2335     void SYMM(ESide side, EUplo uplo, const int& m, const int& n, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, const std::complex<float> *B, const int& ldb, const std::complex<float> beta, std::complex<float> *C, const int& ldc) const;
2336     void SYRK(EUplo uplo, ETransp trans, const int& n, const int& k, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, const std::complex<float> beta, std::complex<float>* C, const int& ldc) const;
2337     void HERK(EUplo uplo, ETransp trans, const int& n, const int& k, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, const std::complex<float> beta, std::complex<float>* C, const int& ldc) const;
2338     void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, std::complex<float>* B, const int& ldb) const;
2339     void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, std::complex<float>* B, const int& ldb) const;
2340   };
2341 
2342   // Explicit instantiation for template<int,complex<double> >
2343 
2344   template<>
2345   class TEUCHOSNUMERICS_LIB_DLL_EXPORT BLAS<int, std::complex<double> >
2346   {
2347   public:
BLAS(void)2348     inline BLAS(void) {}
BLAS(const BLAS<int,std::complex<double>> &)2349     inline BLAS(const BLAS<int, std::complex<double> >& /*BLAS_source*/) {}
~BLAS(void)2350     inline virtual ~BLAS(void) {}
2351     void ROTG(std::complex<double>* da, std::complex<double>* db, double* c, std::complex<double>* s) const;
2352     void ROT(const int& n, std::complex<double>* dx, const int& incx, std::complex<double>* dy, const int& incy, double* c, std::complex<double>* s) const;
2353     double ASUM(const int& n, const std::complex<double>* x, const int& incx) const;
2354     void AXPY(const int& n, const std::complex<double> alpha, const std::complex<double>* x, const int& incx, std::complex<double>* y, const int& incy) const;
2355     void COPY(const int& n, const std::complex<double>* x, const int& incx, std::complex<double>* y, const int& incy) const;
2356     std::complex<double> DOT(const int& n, const std::complex<double>* x, const int& incx, const std::complex<double>* y, const int& incy) const;
2357     double NRM2(const int& n, const std::complex<double>* x, const int& incx) const;
2358     void SCAL(const int& n, const std::complex<double> alpha, std::complex<double>* x, const int& incx) const;
2359     int IAMAX(const int& n, const std::complex<double>* x, const int& incx) const;
2360     void GEMV(ETransp trans, const int& m, const int& n, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, const std::complex<double>* x, const int& incx, const std::complex<double> beta, std::complex<double>* y, const int& incy) const;
2361     void TRMV(EUplo uplo, ETransp trans, EDiag diag, const int& n, const std::complex<double>* A, const int& lda, std::complex<double>* x, const int& incx) const;
2362     void GER(const int& m, const int& n, const std::complex<double> alpha, const std::complex<double>* x, const int& incx, const std::complex<double>* y, const int& incy, std::complex<double>* A, const int& lda) const;
2363     void GEMM(ETransp transa, ETransp transb, const int& m, const int& n, const int& k, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, const std::complex<double>* B, const int& ldb, const std::complex<double> beta, std::complex<double>* C, const int& ldc) const;
2364     void SWAP(const int& n, std::complex<double>* const x, const int& incx, std::complex<double>* const y, const int& incy) const;
2365     void SYMM(ESide side, EUplo uplo, const int& m, const int& n, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, const std::complex<double> *B, const int& ldb, const std::complex<double> beta, std::complex<double> *C, const int& ldc) const;
2366     void SYRK(EUplo uplo, ETransp trans, const int& n, const int& k, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, const std::complex<double> beta, std::complex<double>* C, const int& ldc) const;
2367     void HERK(EUplo uplo, ETransp trans, const int& n, const int& k, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, const std::complex<double> beta, std::complex<double>* C, const int& ldc) const;
2368     void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, std::complex<double>* B, const int& ldb) const;
2369     void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, std::complex<double>* B, const int& ldb) const;
2370   };
2371 
2372 } // namespace Teuchos
2373 
2374 #endif // _TEUCHOS_BLAS_HPP_
2375