1 /*
2  * $Id: cblasy.c,v 1.1 2005-09-18 22:04:52 dhmunro Exp $
3  * CBLAS routines used by yorick
4  */
5 /* Copyright (c) 2005, The Regents of the University of California.
6  * All rights reserved.
7  * This file is part of yorick (http://yorick.sourceforge.net).
8  * Read the accompanying LICENSE file for details.
9  */
10 
11 /* Basic Linear algebra Subprograms Technical (BLAST) Forum Standard
12  * August 21, 2001  (http://www.netlib.org/blas/blast-forum/)
13  * Appendix B: Legacy BLAS
14  *
15  * These routines were converted from the original Fortran source
16  * from www.netlib.org, with hand unrolling of the inner loops for
17  * the level 2 and 3 routines.  The unrolling helps not-very-bright
18  * or unaggressive C compilers, but may hurt good compilers.  (The
19  * level 1 Fortran source was already unrolled.)
20  */
21 
22 #include "cblasy.h"
23 
24 #define ABS(x) ((x)>=0?(x):-(x))
25 
26 /* ------------------------------------------------------------ level 1 */
27 
28 double
cblas_ddot(INT_IN n,const double * x,INT_IN incx,const double * y,INT_IN incy)29 cblas_ddot(INT_IN n, const double *x, INT_IN incx,
30            const double *y, INT_IN incy)
31 {
32   /*c
33     c     forms the dot product of two vectors.
34     c     uses unrolled loops for increments equal to one.
35     c     jack dongarra, linpack, 3/11/78.
36     c*/
37   double dtemp = 0.0;
38   long i;
39   if (n<=0) return dtemp;
40   if (incx!=1 || incy!=1) {
41     /*        unequal increments or equal increments not equal to 1 */
42     long iy, nn=n;
43     if (incx<0) x -= (n-1)*incx;
44     if (incy<0) y -= (n-1)*incx;
45     for (i=iy=0 ; nn-- ; i+=incx,iy+=incy) dtemp += y[iy]*x[i];
46   } else {
47     /*        both increments equal to 1 */
48     /* unroll loop */
49     long m = n%5;
50     for (i=0 ; i<m ; i++) dtemp += y[i]*x[i];
51     for (    ; i<n ; i+=5)
52       dtemp += y[i]*x[i] + y[i+1]*x[i+1] + y[i+2]*x[i+2] +
53         y[i+3]*x[i+3] + y[i+4]*x[i+4];
54   }
55   return dtemp;
56 }
57 
58 double
cblas_dnrm2(INT_IN n,const double * x,INT_IN incx)59 cblas_dnrm2(INT_IN n, const double *x, INT_IN incx)
60 {
61   /*c
62     c     euclidean norm of the n-vector stored in x_1() with storage
63     c     increment incx .
64     c     if    n .le. 0 return with result = 0.
65     c     if n .ge. 1 then incx must be .ge. 1
66     c
67     c           c.l.lawson, 1978 jan 08
68     c     modified to correct failure to update ix, 1/25/92.
69     c     modified 3/93 to return if incx .le. 0.
70     c
71     c     four phase method     using two built-in constants that are
72     c     hopefully applicable to all machines.
73     c         cutlo = maximum of  sqrt(u/eps)  over all known machines.
74     c         cuthi = minimum of  sqrt(v)      over all known machines.
75     c     where
76     c         eps = smallest no. such that eps + 1. .gt. 1.
77     c         u   = smallest positive no.   (underflow limit)
78     c         v   = largest  no.            (overflow  limit)
79     c
80     c     brief outline of algorithm..
81     c
82     c     phase 1    scans zero components.
83     c     move to phase 2 when a component is nonzero and .le. cutlo
84     c     move to phase 3 when a component is .gt. cutlo
85     c     move to phase 4 when a component is .ge. cuthi/m
86     c     where m = n for x() real and m = 2*n for complex.
87     c
88     c     values for cutlo and cuthi..
89     c     from the environmental parameters listed in the imsl converter
90     c     document the limiting values are as follows..
91     c     cutlo, s.p.   u/eps = 2**(-102) for  honeywell.  close seconds are
92     c                   univac and dec at 2**(-103)
93     c                   thus cutlo = 2**(-51) = 4.44089e-16
94     c     cuthi, s.p.   v = 2**127 for univac, honeywell, and dec.
95     c                   thus cuthi = 2**(63.5) = 1.30438e19
96     c     cutlo, d.p.   u/eps = 2**(-67) for honeywell and dec.
97     c                   thus cutlo = 2**(-33.5) = 8.23181d-11
98     c     cuthi, d.p.   same as s.p.  cuthi = 1.30438d19
99     c     data cutlo, cuthi / 8.232d-11,  1.304d19 /
100     c     data cutlo, cuthi / 4.441e-16,  1.304e19 /  */
101 #undef cutlo
102 #define cutlo 8.232e-11
103 #undef cuthi
104 #define cuthi 1.304e19
105   double  hitest, sum, adx, xmax;
106   long nn = n;
107   extern double sqrt(double);
108 
109   if ( nn<=0 || incx<=0 ) return 0.0;
110   hitest = cuthi/nn;  /* nn will change as calculation proceeds */
111 
112   adx = ABS(x[0]);
113 
114   /* phase 1 */
115   for (x+=incx,nn-- ; adx==0.0 && nn ; x+=incx,nn--) adx = ABS(x[0]);
116   if (!nn) return adx;  /* note that adx can be any size here */
117 
118   /* phase 2, adx is first non-zero element */
119   if (adx <= cutlo) {
120     xmax = adx;
121     sum = 1.0;
122     for (;;) {
123       adx = ABS(x[0]);
124       if (adx > cutlo) break;
125       if (adx > xmax) {
126         /* renormalize to new largest element */
127         sum = 1.0 + sum * (xmax/adx)*(xmax/adx);
128         xmax = adx;
129       } else {
130         /* continue with current normalization */
131         sum += (adx/xmax)*(adx/xmax);
132       }
133       nn--;
134       if (!nn) return xmax*sqrt(sum);
135       x += incx;
136     }
137     /* here, sum>=1.0, but xmax<=cutlo, so xmax*xmax might underflow */
138     sum *= xmax;  /* remove phase 2 normalization */
139     sum *= xmax;  /* don't want the compiler to do sum*(xmax*xmax) */
140     nn--;
141     x += incx;
142 
143   } else {
144     sum = 0.0;
145   }
146 
147   /* phase 3, adx is first element>cutlo */
148   if (adx<hitest) {
149     sum += adx*adx;
150     for (;;) {
151       if (!nn) return sqrt(sum);
152       adx = ABS(x[0]);
153       nn--;
154       x += incx;
155       if (adx>=hitest) break;
156       sum += adx*adx;
157     }
158   }
159 
160   /* phase 4, adx is first element greater than hitest */
161   xmax = adx;
162   sum /= xmax;   /* again, try to avoid sum/(xmax*xmax) */
163   sum /= xmax;
164   sum += 1.0;
165   while (nn) {
166     adx = ABS(x[0]);
167     if (adx > xmax) {
168       /* renormalize to new largest element */
169       sum = 1.0 + sum * (xmax/adx)*(xmax/adx);
170       xmax = adx;
171     } else {
172       /* continue with current normalization */
173       sum += (adx/xmax)*(adx/xmax);
174     }
175     nn--;
176     x += incx;
177   }
178   return xmax*sqrt(sum);
179 }
180 
181 double
cblas_dasum(INT_IN n,const double * x,INT_IN incx)182 cblas_dasum(INT_IN n, const double *x, INT_IN incx)
183 {
184   /*c
185     c     takes the sum of the absolute values.
186     c     jack dongarra, linpack, 3/11/78.
187     c     modified 3/93 to return if incx .le. 0.
188     c*/
189   double  dtemp = 0.0;
190   long i, nn=n;
191   if ( nn<=0 || incx<=0 ) return dtemp;
192   if (incx!=1) {
193     /*        increment not equal to 1 */
194     for (i=0 ; nn-- ; i+=incx) dtemp += ABS(x[i]);
195   } else {
196     /*        increment equal to 1 */
197     /* unroll loop */
198     long m = n%6;
199     for (i=0 ; i<m ; i++) dtemp += ABS(x[i]);
200     for (    ; i<n ; i+=6)
201       dtemp += ABS(x[i]) + ABS(x[i+1]) + ABS(x[i+2]) +
202         ABS(x[i+3]) + ABS(x[i+4]) + ABS(x[i+5]);
203   }
204   return dtemp;
205 }
206 
207 CBLAS_INDEX
cblas_idamax(INT_IN n,const double * x,INT_IN incx)208 cblas_idamax(INT_IN n, const double *x, INT_IN incx)
209 {
210   /*c
211     c     finds the index of element having max. absolute value.
212     c     jack dongarra, linpack, 3/11/78.
213     c     modified 3/93 to return if incx .le. 0.
214     c*/
215   CBLAS_INDEX iamax = 0;
216   double dmax;
217   long i, nn=n;
218   if (n<=0 || incx<=0) return iamax;
219   if (incx!=1) {
220     /*        increment not equal to 1 */
221     long ix;
222     dmax = ABS(x[0]);
223     for (i=1,ix=incx ; --nn ; i++,ix+=incx)
224       if (ABS(x[ix])>dmax) {
225         dmax = ABS(x[ix]);
226         iamax = i;
227       }
228   } else {
229     /*        increment equal to 1 */
230     dmax = ABS(x[0]);
231     for (i=1 ; --nn ; i++)
232       if (ABS(x[i])>dmax) {
233         dmax = ABS(x[i]);
234         iamax = i;
235       }
236   }
237   return iamax;
238 }
239 
240 void
cblas_dswap(INT_IN n,double * x,INT_IN incx,double * y,INT_IN incy)241 cblas_dswap(INT_IN n, double *x, INT_IN incx, double *y, INT_IN incy)
242 {
243   /*c
244     c     interchanges two vectors.
245     c     uses unrolled loops for increments equal one.
246     c     jack dongarra, linpack, 3/11/78.
247     c*/
248   double  dtemp;
249   long i;
250   if (n<=0) return;
251   if (incx!=1 || incy!=1) {
252     /*        unequal increments or equal increments not equal to 1 */
253     long iy, nn=n;
254     if (incx<0) x -= (n-1)*incx;
255     if (incy<0) y -= (n-1)*incx;
256     for (i=iy=0 ; nn-- ; i+=incx,iy+=incy) {
257       dtemp = x[i]; x[i] = y[iy]; y[iy] = dtemp;
258     }
259   } else {
260     /*        both increments equal to 1 */
261     /* unroll loop */
262     long m = n%3;
263     for (i=0 ; i<m ; i++) { dtemp = x[i]; x[i] = y[i]; y[i] = dtemp; }
264     for (    ; i<n ; i+=3) {
265       dtemp = x[i  ]; x[i  ] = y[i  ]; y[i  ] = dtemp;
266       dtemp = x[i+1]; x[i+1] = y[i+1]; y[i+1] = dtemp;
267       dtemp = x[i+2]; x[i+2] = y[i+2]; y[i+2] = dtemp;
268     }
269   }
270 }
271 
272 void
cblas_dcopy(INT_IN n,const double * x,INT_IN incx,double * y,INT_IN incy)273 cblas_dcopy(INT_IN n, const double *x, INT_IN incx,
274             double *y, INT_IN incy)
275 {
276   /*c
277     c     copies a vector, x, to a vector, y.
278     c     uses unrolled loops for increments equal to one.
279     c     jack dongarra, linpack, 3/11/78.
280     c*/
281   long i;
282   if (n<=0) return;
283   if (incx!=1 || incy!=1) {
284   /*        unequal increments or equal increments not equal to 1 */
285     long iy, nn=n;
286     if (incx<0) x -= (n-1)*incx;
287     if (incy<0) y -= (n-1)*incx;
288     for (i=iy=0 ; nn-- ; i+=incx,iy+=incy) y[iy] = x[i];
289   } else {
290     /*        both increments equal to 1 */
291     /* unroll loop */
292     long m= n%7;
293     for (i=0 ; i<m ; i++) y[i] = x[i];
294     for (    ; i<n ; i+=7) {
295       y[i  ] = x[i];
296       y[i+1] = x[i+1];
297       y[i+2] = x[i+2];
298       y[i+3] = x[i+3];
299       y[i+4] = x[i+4];
300       y[i+5] = x[i+5];
301       y[i+6] = x[i+6];
302     }
303   }
304 }
305 
306 void
cblas_daxpy(INT_IN n,const double alpha,const double * x,INT_IN incx,double * y,INT_IN incy)307 cblas_daxpy(INT_IN n, const double alpha, const double *x,
308             INT_IN incx, double *y, INT_IN incy)
309 {
310   /*c
311     c     constant times a vector plus a vector.
312     c     uses unrolled loops for increments equal to one.
313     c     jack dongarra, linpack, 3/11/78.
314     c*/
315   long i;
316   if (n<=0) return;
317   if (alpha == 0.0) return;
318   if (incx!=1 || incy!=1) {
319     /*        unequal increments or equal increments not equal to 1 */
320     long iy, nn=n;
321     if (incx<0) x -= (n-1)*incx;
322     if (incy<0) y -= (n-1)*incy;
323     for (i=iy=0 ; nn-- ; i+=incx,iy+=incy) y[iy] += alpha*x[i];
324   } else {
325     /*        both increments equal to 1 */
326     /* unroll loop */
327     long m = n%4;
328     for (i=0 ; i<m ; i++) y[i] += alpha*x[i];
329     for (    ; i<n ; i+=4) {
330       y[i  ] += alpha*x[i];
331       y[i+1] += alpha*x[i+1];
332       y[i+2] += alpha*x[i+2];
333       y[i+3] += alpha*x[i+3];
334     }
335   }
336 }
337 
338 void
cblas_drot(INT_IN n,double * x,INT_IN incx,double * y,INT_IN incy,const double c,const double s)339 cblas_drot(INT_IN n, double *x, INT_IN incx,
340            double *y, INT_IN incy, const double c, const double  s)
341 {
342   /*c
343     c     applies a plane rotation.
344     c     jack dongarra, linpack, 3/11/78.
345     c*/
346   double  dtemp;
347 
348   if (n<=0) return;
349   if (incx!=1 || incy!=1) {
350     /*       unequal increments or equal increments not equal to 1 */
351     long ix, iy, nn=n;
352     if (incx<0) x -= (n-1)*incx;
353     if (incy<0) y -= (n-1)*incy;
354     for (ix=iy=0 ; nn-- ; ix+=incx,iy+=incy) {
355       dtemp = c*x[ix] + s*y[iy];
356       y[iy] = c*y[iy] - s*x[ix];
357       x[ix] = dtemp;
358     }
359   } else {
360     /*       both increments equal to 1 */
361     long i;
362     for (i=0 ; i<n ; i++) {
363       dtemp = c*x[i] + s*y[i];
364       y[i] = c*y[i] - s*x[i];
365       x[i] = dtemp;
366     }
367   }
368   return;
369 }
370 
371 void
cblas_dscal(INT_IN n,const double alpha,double * x,INT_IN incx)372 cblas_dscal(INT_IN n, const double alpha, double *x, INT_IN incx)
373 {
374   /*c
375     c     scales a vector by a constant.
376     c     uses unrolled loops for increment equal to one.
377     c     jack dongarra, linpack, 3/11/78.
378     c     modified 3/93 to return if incx .le. 0.
379     c*/
380   long i, nn=n;
381   if (n<=0) return;
382   if (incx!=1) {
383   /*        increment not equal to 1 */
384     if (incx<0) x -= (n-1)*incx;
385     for (i=0 ; nn-- ; i+=incx) x[i] *= alpha;
386   } else {
387     /*        increment equal to 1 */
388     /* unroll loop */
389     long m= n%5;
390     for (i=0 ; i<m ; i++) x[i] *= alpha;
391     for (    ; i<n ; i+=5) {
392       x[i  ] *= alpha;
393       x[i+1] *= alpha;
394       x[i+2] *= alpha;
395       x[i+3] *= alpha;
396       x[i+4] *= alpha;
397     }
398   }
399 }
400 
401 /* ------------------------------------------------------------ level 2 */
402 
403 void
cblas_dgemv(const enum CBLAS_ORDER order,const enum CBLAS_TRANSPOSE transa,INT_IN m,INT_IN n,const double alpha,const double * a,INT_IN lda,const double * x,INT_IN incx,const double beta,double * y,INT_IN incy)404 cblas_dgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE transa,
405             INT_IN m, INT_IN n, const double alpha,
406             const double *a, INT_IN lda, const double *x, INT_IN incx,
407             const double beta, double *y, INT_IN incy)
408 {
409   /*
410    *  Purpose
411    *  =======
412    *
413    *  DGEMV  performs one of the matrix-vector operations
414    *
415    *     y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,
416    *
417    *  where alpha and beta are scalars, x and y are vectors and A is an
418    *  m by n matrix.
419    *
420    *  Parameters
421    *  ==========
422    *
423    *  ORDER   - enum CBLAS_ORDER (ignored)
424    *
425    *  TRANSA  - enum CBLAS_TRANSPOSE
426    *           On entry, TRANS specifies the operation to be performed as
427    *           follows:
428    *              TRANSA = CblasNoTrans   y := alpha*A*x + beta*y.
429    *              TRANSA = CblasTrans     y := alpha*A'*x + beta*y.
430    *              TRANSA = CblasConjTrans y := alpha*A'*x + beta*y.
431    *           Unchanged on exit.
432    *
433    *  M      - INTEGER.
434    *           On entry, M specifies the number of rows of the matrix A.
435    *           M must be at least zero.
436    *           Unchanged on exit.
437    *
438    *  N      - INTEGER.
439    *           On entry, N specifies the number of columns of the matrix A.
440    *           N must be at least zero.
441    *           Unchanged on exit.
442    *
443    *  ALPHA  - DOUBLE PRECISION.
444    *           On entry, ALPHA specifies the scalar alpha.
445    *           Unchanged on exit.
446    *
447    *  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
448    *           Before entry, the leading m by n part of the array A must
449    *           contain the matrix of coefficients.
450    *           Unchanged on exit.
451    *
452    *  LDA    - INTEGER.
453    *           On entry, LDA specifies the first dimension of A as declared
454    *           in the calling (sub) program. LDA must be at least
455    *           max( 1, m ).
456    *           Unchanged on exit.
457    *
458    *  X      - DOUBLE PRECISION array of DIMENSION at least
459    *           ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'
460    *           and at least
461    *           ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
462    *           Before entry, the incremented array X must contain the
463    *           vector x.
464    *           Unchanged on exit.
465    *
466    *  INCX   - INTEGER.
467    *           On entry, INCX specifies the increment for the elements of
468    *           X. INCX must not be zero.
469    *           Unchanged on exit.
470    *
471    *  BETA   - DOUBLE PRECISION.
472    *           On entry, BETA specifies the scalar beta. When BETA is
473    *           supplied as zero then Y need not be set on input.
474    *           Unchanged on exit.
475    *
476    *  Y      - DOUBLE PRECISION array of DIMENSION at least
477    *           ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
478    *           and at least
479    *           ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
480    *           Before entry with BETA non-zero, the incremented array Y
481    *           must contain the vector y. On exit, Y is overwritten by the
482    *           updated vector y.
483    *
484    *  INCY   - INTEGER.
485    *           On entry, INCY specifies the increment for the elements of
486    *           Y. INCY must not be zero.
487    *           Unchanged on exit.
488    */
489   /*
490    *  Level 2 Blas routine.
491    *
492    *  -- Written on 22-October-1986.
493    *     Jack Dongarra, Argonne National Lab.
494    *     Jeremy Du Croz, Nag Central Office.
495    *     Sven Hammarling, Nag Central Office.
496    *     Richard Hanson, Sandia National Labs.
497    */
498   double temp;
499   long i, info, j, lenx, leny, m3, nn=n, mm=m;
500   int trans = transa;
501   /*
502    *     Test the input parameters.
503    */
504   info = 0;
505   if (transa!=CblasNoTrans && transa!=CblasTrans && transa!=CblasConjTrans) {
506     info = 1;
507   } else if ( m<0 ) {
508     info = 2;
509   } else if ( n<0 ) {
510     info = 3;
511   } else if ( !lda || lda<m ) {
512     info = 6;
513   } else if ( incx==0 ) {
514     info = 8;
515   } else if ( incy==0 ) {
516     info = 11;
517   }
518   if ( info!=0 ) {
519     xerbla( "ydgemv ", info );
520     return;
521   }
522   /*
523    *     Quick return if possible.
524    */
525   if ( ( m==0 )||( n==0 ) || ( ( alpha==0.0 )&&( beta==1.0 ) ) )
526     return;
527   /*
528    *     Set  LENX  and  LENY, the lengths of the vectors x and y, and set
529    *     up the start points in  X  and  Y.
530    */
531   if ( trans==CblasNoTrans ) {
532     lenx = nn;
533     leny = mm;
534   } else {
535     lenx = mm;
536     leny = nn;
537   }
538   if ( incx<0 ) x -= (lenx-1)*incx;
539   if ( incy<0 ) y -= (lenx-1)*incy;
540   m3 = mm&3;
541   /*
542    *     Start the operations. In this version the elements of A are
543    *     accessed sequentially with one pass through A.
544    *
545    *     First form  y := beta*y.
546    */
547   if ( beta!=1.0 ) {
548     if ( incy==1 ) {
549       if ( beta==0.0 ) {
550         for (i=0 ; i<leny ; i++) y[i] = 0.0;
551       } else {
552         for (i=0 ; i<leny ; i++) y[i] *= beta;
553       }
554     } else {
555       if ( beta==0.0 ) {
556         for (i=0 ; leny-- ; i+=incy) y[i] = 0.0;
557       } else {
558         for (i=0 ; leny-- ; i+=incy) y[i] *= beta;
559       }
560     }
561   }
562   if ( alpha==0.0 )
563     return;
564   if ( trans==CblasNoTrans ) {
565     /*       Form  y := alpha*A*x + y.   */
566     if ( incy==1 ) {
567       for (j=0 ; nn-- ; j+=incx,a+=lda) {
568         if ( x[j]!=0.0 ) {
569           temp = alpha*x[j];
570           /* unroll innermost loop */
571           for (i=0 ; i<m3 ; i++) y[i] += temp*a[i];
572           for (    ; i<mm  ; i+=4) {
573             y[i  ] += temp*a[i];
574             y[i+1] += temp*a[i+1];
575             y[i+2] += temp*a[i+2];
576             y[i+3] += temp*a[i+3];
577           }
578         }
579       }
580     } else {
581       long iy;
582       for (j=0 ; nn-- ; j+=incx,a+=lda) {
583         if ( x[j]!=0.0 ) {
584           temp = alpha*x[j];
585           /* unroll innermost loop */
586           for (i=iy=0 ; i<m3 ; i++,iy+=incy) y[iy] += temp*a[i];
587           for (       ; i<mm  ; i+=4,iy+=4*incy) {
588             y[iy       ] += temp*a[i];
589             y[iy+  incy] += temp*a[i+1];
590             y[iy+2*incy] += temp*a[i+2];
591             y[iy+3*incy] += temp*a[i+3];
592           }
593         }
594       }
595     }
596   } else {
597     /*        Form  y := alpha*A'*x + y.  */
598     if ( incx==1 ) {
599       for (j=0 ; nn-- ; j+=incy,a+=lda) {
600         temp = 0.0;
601         /* unroll innermost loop */
602         for (i=0 ; i<m3 ; i++) temp += a[i]*x[i];
603         for (    ; i<mm  ; i+=4)
604           temp += a[i]*x[i] + a[i+1]*x[i+1] + a[i+2]*x[i+2] + a[i+3]*x[i+3];
605         y[j] += alpha*temp;
606       }
607     } else {
608       long ix;
609       for (j=0 ; nn-- ; j+=incy,a+=lda) {
610         temp = 0.0;
611         /* unroll innermost loop */
612         for (i=ix=0 ; i<m3 ; i++,ix+=incx) temp += a[i]*x[ix];
613         for (       ; i<mm  ; i+=4,ix+=4*incx)
614           temp += a[i]*x[ix] + a[i+1]*x[ix+incx] +
615             a[i+2]*x[ix+2*incx] + a[i+3]*x[ix+3*incx];
616         y[j] += alpha*temp;
617       }
618     }
619   }
620 
621   /*     End of DGEMV   */
622 }
623 
624 void
cblas_dtrmv(const enum CBLAS_ORDER order,const enum CBLAS_UPLO uplo,const enum CBLAS_TRANSPOSE transa,const enum CBLAS_DIAG diag,INT_IN n,const double * a,INT_IN lda,double * x,INT_IN incx)625 cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo,
626             const enum CBLAS_TRANSPOSE transa, const enum CBLAS_DIAG diag,
627             INT_IN n, const double *a, INT_IN lda,
628             double *x, INT_IN incx)
629 {
630   /*
631    *  Purpose
632    *  =======
633    *
634    *  DTRMV  performs one of the matrix-vector operations
635    *
636    *     x := A*x,   or   x := A'*x,
637    *
638    *  where x is an n element vector and  A is an n by n unit, or non-unit,
639    *  upper or lower triangular matrix.
640    *
641    *  Parameters
642    *  ==========
643    *
644    *  ORDER   - enum CBLAS_ORDER (ignored)
645    *
646    *  UPLO   - enum CBLAS_UPLO
647    *           On entry, UPLO specifies whether the matrix is an upper or
648    *           lower triangular matrix as follows:
649    *              UPLO = CblasUpper   A is an upper triangular matrix.
650    *              UPLO = CblasLower   A is a lower triangular matrix.
651    *
652    *  TRANSA  - enum CBLAS_TRANSPOSE
653    *           On entry, TRANS specifies the operation to be performed as
654    *           follows:
655    *              TRANS = CblasNoTrans   x := A*x.
656    *              TRANS = CblasTrans     x := A'*x.
657    *              TRANS = CblasConjTrans x := A'*x.
658    *
659    *  DIAG   - enum CBLAS_DIAG
660    *           On entry, DIAG specifies whether or not A is unit
661    *           triangular as follows:
662    *              DIAG = CblasUnit      A is assumed to be unit triangular.
663    *              DIAG = CblasNonUnit   A is not assumed to be unit
664    *                                    triangular.
665    *
666    *  N      - INTEGER.
667    *           On entry, N specifies the order of the matrix A.
668    *           N must be at least zero.
669    *
670    *  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
671    *           Before entry with  UPLO = 'U' or 'u', the leading n by n
672    *           upper triangular part of the array A must contain the upper
673    *           triangular matrix and the strictly lower triangular part of
674    *           A is not referenced.
675    *           Before entry with UPLO = 'L' or 'l', the leading n by n
676    *           lower triangular part of the array A must contain the lower
677    *           triangular matrix and the strictly upper triangular part of
678    *           A is not referenced.
679    *           Note that when  DIAG = 'U' or 'u', the diagonal elements of
680    *           A are not referenced either, but are assumed to be unity.
681    *           Unchanged on exit.
682    *
683    *  LDA    - INTEGER.
684    *           On entry, LDA specifies the first dimension of A as declared
685    *           in the calling (sub) program. LDA must be at least
686    *           max( 1, n ).
687    *           Unchanged on exit.
688    *
689    *  X      - DOUBLE PRECISION array of dimension at least
690    *           ( 1 + ( n - 1 )*abs( INCX ) ).
691    *           Before entry, the incremented array X must contain the n
692    *           element vector x. On exit, X is overwritten with the
693    *           tranformed vector x.
694    *
695    *  INCX   - INTEGER.
696    *           On entry, INCX specifies the increment for the elements of
697    *           X. INCX must not be zero.
698    *           Unchanged on exit.
699    */
700   /*
701    *  Level 2 Blas routine.
702    *
703    *  -- Written on 22-October-1986.
704    *     Jack Dongarra, Argonne National Lab.
705    *     Jeremy Du Croz, Nag Central Office.
706    *     Sven Hammarling, Nag Central Office.
707    *     Richard Hanson, Sandia National Labs.
708    */
709   double    temp;
710   long      i, info, ix, j, jx;
711   int       nounit, trans = transa;
712   /*
713    *     Test the input parameters.
714    */
715   info = 0;
716   if (uplo!=CblasUpper && uplo!=CblasLower ) {
717     info = 1;
718   } else if (transa!=CblasNoTrans && transa!=CblasTrans &&
719              transa!=CblasConjTrans) {
720     info = 2;
721   } else if (diag!=CblasNonUnit && diag!=CblasUnit) {
722     info = 3;
723   } else if ( n<0 ) {
724     info = 4;
725   } else if ( !lda || lda<n ) {
726     info = 6;
727   } else if ( incx==0 ) {
728     info = 8;
729   }
730   if ( info!=0 ) {
731     xerbla( "ydtrmv ", info );
732     return;
733   }
734   /*
735    *     Quick return if possible.
736    */
737   if ( n==0 ) return;
738 
739   nounit = (diag==CblasNonUnit);
740   /*
741    *     Set up the start point in X if the increment is negative.
742    */
743   if ( incx<=0 ) x -= (n-1)*incx;
744   /*
745    *     Start the operations. In this version the elements of A are
746    *     accessed sequentially with one pass through A.
747    */
748   if ( trans==CblasNoTrans ) {
749     /*      Form  x := A*x.  */
750     if ( uplo==CblasUpper ) {
751       if ( incx==1 ) {
752         for (j=0 ; j<n ; j++,a+=lda) {
753           if ( x[j]!=0.0 ) {
754             temp = x[j];
755             /* unroll inner loop */
756             for (i=0 ; i<(j&3) ; i++) x[i] += temp*a[i];
757             for (    ; i<j     ; i+=4) {
758               x[i  ] += temp*a[i];
759               x[i+1] += temp*a[i+1];
760               x[i+2] += temp*a[i+2];
761               x[i+3] += temp*a[i+3];
762             }
763             if ( nounit ) x[j] *= a[j];
764           }
765         }
766       } else {
767         for (j=jx=0 ; j<n ; j++,jx+=incx,a+=lda) {
768           if ( x[jx]!=0.0 ) {
769             temp = x[jx];
770             /* unroll inner loop */
771             for (i=ix=0 ; i<(j&3) ; i++,ix+=incx) x[ix] += temp*a[i];
772             for (       ; i<j ; i+=4,ix+=4*incx) {
773               x[ix       ] += temp*a[i];
774               x[ix+  incx] += temp*a[i+1];
775               x[ix+2*incx] += temp*a[i+2];
776               x[ix+3*incx] += temp*a[i+3];
777             }
778             if ( nounit ) x[jx] *= a[j];
779           }
780         }
781       }
782     } else {
783       long count;
784       if ( incx==1 ) {
785         for (j=n-1,a+=j*lda ; j>=0 ; j--,a-=lda) {
786           if ( x[j]!=0.0 ) {
787             temp = x[j];
788             /* unroll inner loop */
789             count = (n-1-j)&3;
790             for (i=n-1 ; count-- ; i--) x[i] += temp*a[i];
791             for (      ; i>j     ; i-=4) {
792               x[i  ] += temp*a[i];
793               x[i-1] += temp*a[i-1];
794               x[i-2] += temp*a[i-2];
795               x[i-3] += temp*a[i-3];
796             }
797             if ( nounit ) x[j] *= a[j];
798           }
799         }
800       } else {
801         for (j=n-1,jx=j*incx,a+=j*lda ; j>=0 ; j--,jx-=incx,a-=lda) {
802           if ( x[jx]!=0.0 ) {
803             temp = x[jx];
804             /* unroll inner loop */
805             count = (n-1-j)&3;
806             for (i=n-1,ix=i*incx ; count-- ; i--,ix-=incx) x[ix] += temp*a[i];
807             for (                ; i>j     ; i-=4,ix-=4*incx) {
808               x[ix       ] += temp*a[i];
809               x[ix-  incx] += temp*a[i-1];
810               x[ix-2*incx] += temp*a[i-2];
811               x[ix-3*incx] += temp*a[i-3];
812             }
813             if ( nounit ) x[jx] *= a[j];
814           }
815         }
816       }
817     }
818   } else {
819     /*        Form  x := A'*x.  */
820     long count;
821     if ( uplo==CblasUpper ) {
822       if ( incx==1 ) {
823         for (j=n-1,a+=j*lda ; j>=0 ; j--,a-=lda) {
824           temp = x[j];
825           if ( nounit ) temp*= a[j];
826           if (j) {
827             /* unroll inner loop */
828             count = (j-1)&3;
829             for (i=j-1 ; count-- ; i--) temp += a[i]*x[i];
830             for (      ; i>0 ; i-=4) temp += a[i]*x[i] +
831               a[i-1]*x[i-1] + a[i-2]*x[i-2] + a[i-3]*x[i-3];
832           }
833           x[j] = temp;
834         }
835       } else {
836         for (j=n-1,jx=j*incx,a+=j*lda ; j>=0 ; j--,jx-=incx,a-=lda) {
837           temp = x[jx];
838           if ( nounit ) temp *= a[j];
839           if (j) {
840             /* unroll inner loop */
841             count = (j-1)&3;
842             for (i=j-1,ix=i*incx ; count-- ; i--,ix-=incx) temp += a[i]*x[ix];
843             for (                ; i>0 ; i-=4,ix-=4*incx) temp += a[i]*x[ix] +
844               a[i-1]*x[i-incx] + a[i-2]*x[i-2*incx] + a[i-3]*x[i-3*incx];
845           }
846           x[jx] = temp;
847         }
848       }
849     } else {
850       if ( incx==1 ) {
851         for (j=0 ; j<n ; j++,a+=lda) {
852           temp = x[j];
853           if ( nounit ) temp *= a[j];
854           /* unroll inner loop */
855           count = (n-1-j)&3;
856           for (i=j+1 ; count-- ; i++) temp += a[i]*x[i];
857           for (      ; i<n     ; i+=4) temp += a[i]*x[i] +
858             a[i+1]*x[i+1] + a[i+2]*x[i+2] + a[i+3]*x[i+3];
859           x[j] = temp;
860         }
861       } else {
862         for (j=jx=0 ; j<n ; j++,jx+=incx,a+=lda) {
863           temp = x[jx];
864           if ( nounit ) temp *= a[j];
865           /* unroll inner loop */
866           count = (n-1-j)&3;
867           for (i=j+1,ix=i*incx ; count-- ; i++,ix+=incx) temp += a[i]*x[ix];
868           for (             ; i<n     ; i+=4,ix+=4*incx) temp += a[i]*x[ix] +
869             a[i+1]*x[i+incx] + a[i+2]*x[i+2*incx] + a[i+3]*x[i+3*incx];
870           x[jx] = temp;
871         }
872       }
873     }
874   }
875 
876   return;
877   /*     End of DTRMV  */
878 }
879 
880 void
cblas_dtrsv(const enum CBLAS_ORDER order,const enum CBLAS_UPLO uplo,const enum CBLAS_TRANSPOSE transa,const enum CBLAS_DIAG diag,INT_IN n,const double * a,INT_IN lda,double * x,INT_IN incx)881 cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo,
882             const enum CBLAS_TRANSPOSE transa, const enum CBLAS_DIAG diag,
883             INT_IN n, const double *a, INT_IN lda,
884             double *x, INT_IN incx)
885 {
886   /*
887    *  Purpose
888    *  =======
889    *
890    *  DTRSV  solves one of the systems of equations
891    *
892    *     A*x = b,   or   A'*x = b,
893    *
894    *  where b and x are n element vectors and A is an n by n unit, or
895    *  non-unit, upper or lower triangular matrix.
896    *
897    *  No test for singularity or near-singularity is included in this
898    *  routine. Such tests must be performed before calling this routine.
899    *
900    *  Parameters
901    *  ==========
902    *
903    *  ORDER   - enum CBLAS_ORDER (ignored)
904    *
905    *  UPLO   - enum CBLAS_UPLO
906    *           On entry, UPLO specifies whether the matrix is an upper or
907    *           lower triangular matrix as follows:
908    *              UPLO = CblasUpper   A is an upper triangular matrix.
909    *              UPLO = CblasLower   A is a lower triangular matrix.
910    *
911    *  TRANSA  - enum CBLAS_TRANSPOSE
912    *           On entry, TRANS specifies the operation to be performed as
913    *           follows:
914    *              TRANS = CblasNoTrans   x := A*x.
915    *              TRANS = CblasTrans     x := A'*x.
916    *              TRANS = CblasConjTrans x := A'*x.
917    *
918    *  DIAG   - enum CBLAS_DIAG
919    *           On entry, DIAG specifies whether or not A is unit
920    *           triangular as follows:
921    *              DIAG = CblasUnit      A is assumed to be unit triangular.
922    *              DIAG = CblasNonUnit   A is not assumed to be unit
923    *                                    triangular.
924    *
925    *  N      - INTEGER.
926    *           On entry, N specifies the order of the matrix A.
927    *           N must be at least zero.
928    *           Unchanged on exit.
929    *
930    *  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
931    *           Before entry with  UPLO = 'U' or 'u', the leading n by n
932    *           upper triangular part of the array A must contain the upper
933    *           triangular matrix and the strictly lower triangular part of
934    *           A is not referenced.
935    *           Before entry with UPLO = 'L' or 'l', the leading n by n
936    *           lower triangular part of the array A must contain the lower
937    *           triangular matrix and the strictly upper triangular part of
938    *           A is not referenced.
939    *           Note that when  DIAG = 'U' or 'u', the diagonal elements of
940    *           A are not referenced either, but are assumed to be unity.
941    *           Unchanged on exit.
942    *
943    *  LDA    - INTEGER.
944    *           On entry, LDA specifies the first dimension of A as declared
945    *           in the calling (sub) program. LDA must be at least
946    *           max( 1, n ).
947    *           Unchanged on exit.
948    *
949    *  X      - DOUBLE PRECISION array of dimension at least
950    *           ( 1 + ( n - 1 )*abs( INCX ) ).
951    *           Before entry, the incremented array X must contain the n
952    *           element right-hand side vector b. On exit, X is overwritten
953    *           with the solution vector x.
954    *
955    *  INCX   - INTEGER.
956    *           On entry, INCX specifies the increment for the elements of
957    *           X. INCX must not be zero.
958    *           Unchanged on exit.
959    */
960   /*
961    *  Level 2 Blas routine.
962    *
963    *  -- Written on 22-October-1986.
964    *     Jack Dongarra, Argonne National Lab.
965    *     Jeremy Du Croz, Nag Central Office.
966    *     Sven Hammarling, Nag Central Office.
967    *     Richard Hanson, Sandia National Labs.
968    */
969   double    temp;
970   long      i, info, ix, j, jx;
971   int       nounit, trans = transa;
972   /*     Test the input parameters  */
973   info = 0;
974   if (uplo!=CblasUpper && uplo!=CblasLower ) {
975     info = 1;
976   } else if (transa!=CblasNoTrans && transa!=CblasTrans &&
977              transa!=CblasConjTrans) {
978     info = 2;
979   } else if (diag!=CblasNonUnit && diag!=CblasUnit) {
980     info = 3;
981   } else if ( n<0 ) {
982     info = 4;
983   } else if ( !lda || lda<n ) {
984     info = 6;
985   } else if ( incx==0 ) {
986     info = 8;
987   }
988   if ( info!=0 ) {
989     xerbla( "ydtrsv ", info );
990     return;
991   }
992   /*     Quick return if possible  */
993   if ( n==0 ) return;
994 
995   nounit = (diag==CblasNonUnit);
996   /*
997    *     Set up the start point in X if the increment is not unity.
998    */
999   if ( incx<=0 ) x-= (n-1)*incx;
1000   /*
1001    *     Start the operations. In this version the elements of A are
1002    *     accessed sequentially with one pass through A.
1003    */
1004   if ( trans==CblasNoTrans ) {
1005     /*        Form  x := inv( A )*x  */
1006     if ( uplo==CblasUpper ) {
1007       long count;
1008       if ( incx==1 ) {
1009         for (j=n-1,a+=j*lda ; j>=0 ; j--,a-=lda) {
1010           if ( x[j]!=0.0 ) {
1011             if ( nounit ) x[j] /= a[j];
1012             if (!j) break;
1013             temp = x[j];
1014             /* unroll inner loop */
1015             count = (j-1)&3;
1016             for (i=j-1 ; count-- ; i--) x[i] -= temp*a[i];
1017             for (      ; i>0    ; i-=4) {
1018               x[i  ] -= temp*a[i];
1019               x[i-1] -= temp*a[i-1];
1020               x[i-2] -= temp*a[i-2];
1021               x[i-3] -= temp*a[i-3];
1022             }
1023           }
1024         }
1025       } else {
1026         for (j=n-1,jx=j*incx,a+=j*lda ; j>=0 ; j--,jx-=incx,a-=lda) {
1027           if ( x[jx]!=0.0 ) {
1028             if ( nounit ) x[jx] /= a[j];
1029             if (!j) break;
1030             temp = x[jx];
1031             /* unroll inner loop */
1032             count = (j-1)&3;
1033             for (i=j-1,ix=i*incx ; count-- ; i--,ix-=incx) x[ix] -= temp*a[i];
1034             for (                ; i>0    ; i-=4,ix-=4*incx) {
1035               x[ix       ] -= temp*a[i];
1036               x[ix-  incx] -= temp*a[i-1];
1037               x[ix-2*incx] -= temp*a[i-2];
1038               x[ix-3*incx] -= temp*a[i-3];
1039             }
1040           }
1041         }
1042       }
1043     } else {
1044       long count;
1045       if ( incx==1 ) {
1046         for (j=0 ; j<n ; j++,a+=lda) {
1047           if ( x[j]!=0.0 ) {
1048             if ( nounit ) x[j] /= a[j];
1049             temp = x[j];
1050             /* unroll inner loop */
1051             count = (n-1-j)&3;
1052             for (i=j+1 ; count-- ; i++) x[i] -= temp*a[i];
1053             for (      ; i<n     ; i+=4) {
1054               x[i  ] -= temp*a[i];
1055               x[i+1] -= temp*a[i+1];
1056               x[i+2] -= temp*a[i+2];
1057               x[i+3] -= temp*a[i+3];
1058             }
1059           }
1060         }
1061       } else {
1062         for (j=jx=0 ; j<n ; j++,jx+=incx,a+=lda) {
1063           if ( x[jx]!=0.0 ) {
1064             if ( nounit ) x[jx] /= a[j];
1065             temp = x[jx];
1066             /* unroll inner loop */
1067             count = (n-1-j)&3;
1068             for (i=j+1,ix=i*incx ; count-- ; i++,ix+=incx) x[ix] -= temp*a[i];
1069             for (                ; i<n     ; i+=4,ix+=4*incx) {
1070               x[ix       ] -= temp*a[i];
1071               x[ix+  incx] -= temp*a[i+1];
1072               x[ix+2*incx] -= temp*a[i+2];
1073               x[ix+3*incx] -= temp*a[i+3];
1074             }
1075           }
1076         }
1077       }
1078     }
1079   } else {
1080     /*        Form  x := inv( A' )*x   */
1081     if ( uplo==CblasUpper ) {
1082       if ( incx==1 ) {
1083         for (j=0 ; j<n ; j++,a+=lda) {
1084           temp = x[j];
1085           /* unroll inner loop */
1086           for (i=0 ; i<(j&3) ; i++) temp -= a[i]*x[i];
1087           for (    ; i<j     ; i+=4) temp -= a[i]*x[i] +
1088             a[i+1]*x[i+1] + a[i+2]*x[i+2] + a[i+3]*x[i+3];
1089           if ( nounit ) temp /= a[j];
1090           x[j] = temp;
1091         }
1092       } else {
1093         for (j=jx=0 ; j<n ; j++,jx+=incx,a+=lda) {
1094           temp = x[jx];
1095           /* unroll inner loop */
1096           for (i=ix=0 ; i<(j&3) ; i++,ix+=incx) temp -= a[i]*x[ix];
1097           for (       ; i<j     ; i+=4,ix+=4*incx) temp -= a[i]*x[ix] +
1098             a[i+1]*x[ix+incx] + a[i+2]*x[ix+2*incx] + a[i+3]*x[ix+3*incx];
1099           if ( nounit ) temp /= a[j];
1100           x[jx] = temp;
1101         }
1102       }
1103     } else {
1104       long count;
1105       if ( incx==1 ) {
1106         for (j=n-1,a+=j*lda ; j>=0 ; j--,a-=lda) {
1107           temp = x[j];
1108           /* unroll inner loop */
1109           count = (n-1-j)&3;
1110           for (i=n-1 ; count-- ; i--) temp -= a[i]*x[i];
1111           for (      ; i>j     ; i-=4) temp -= a[i]*x[i] +
1112             a[i-1]*x[i-1] + a[i-2]*x[i-2] + a[i-3]*x[i-3];
1113           if ( nounit ) temp /= a[j];
1114           x[j] = temp;
1115         }
1116       } else {
1117         for (j=n-1,jx=j*incx,a+=j*lda ; j>=0 ; j--,jx-=incx,a-=lda) {
1118           temp = x[jx];
1119           /* unroll inner loop */
1120           count = (n-1-j)&3;
1121           for (i=n-1,ix=i*incx ; count-- ; i--,ix-=incx) temp -= a[i]*x[ix];
1122           for (                ; i>j  ; i-=4,ix-=4*incx) temp -= a[i]*x[ix] +
1123             a[i-1]*x[ix-incx] + a[i-2]*x[ix-2*incx] + a[i-3]*x[ix-3*incx];
1124           if ( nounit ) temp /= a[j];
1125           x[jx] = temp;
1126         }
1127       }
1128     }
1129   }
1130 
1131   /*     End of DTRSV   */
1132 }
1133 
1134 void
cblas_dger(const enum CBLAS_ORDER order,INT_IN m,INT_IN n,const double alpha,const double * x,INT_IN incx,const double * y,INT_IN incy,double * a,INT_IN lda)1135 cblas_dger(const enum CBLAS_ORDER order, INT_IN m, INT_IN n,
1136            const double alpha, const double *x, INT_IN incx,
1137            const double *y, INT_IN incy, double *a, INT_IN lda)
1138 {
1139   /*
1140    *  Purpose
1141    *  =======
1142    *
1143    *  DGER   performs the rank 1 operation
1144    *
1145    *     A := alpha*x*y' + A,
1146    *
1147    *  where alpha is a scalar, x is an m element vector, y is an n element
1148    *  vector and A is an m by n matrix.
1149    *
1150    *  Parameters
1151    *  ==========
1152    *
1153    *  ORDER   - enum CBLAS_ORDER (ignored)
1154    *
1155    *  M      - INTEGER.
1156    *           On entry, M specifies the number of rows of the matrix A.
1157    *           M must be at least zero.
1158    *           Unchanged on exit.
1159    *
1160    *  N      - INTEGER.
1161    *           On entry, N specifies the number of columns of the matrix A.
1162    *           N must be at least zero.
1163    *           Unchanged on exit.
1164    *
1165    *  ALPHA  - DOUBLE PRECISION.
1166    *           On entry, ALPHA specifies the scalar alpha.
1167    *           Unchanged on exit.
1168    *
1169    *  X      - DOUBLE PRECISION array of dimension at least
1170    *           ( 1 + ( m - 1 )*abs( INCX ) ).
1171    *           Before entry, the incremented array X must contain the m
1172    *           element vector x.
1173    *           Unchanged on exit.
1174    *
1175    *  INCX   - INTEGER.
1176    *           On entry, INCX specifies the increment for the elements of
1177    *           X. INCX must not be zero.
1178    *           Unchanged on exit.
1179    *
1180    *  Y      - DOUBLE PRECISION array of dimension at least
1181    *           ( 1 + ( n - 1 )*abs( INCY ) ).
1182    *           Before entry, the incremented array Y must contain the n
1183    *           element vector y.
1184    *           Unchanged on exit.
1185    *
1186    *  INCY   - INTEGER.
1187    *           On entry, INCY specifies the increment for the elements of
1188    *           Y. INCY must not be zero.
1189    *           Unchanged on exit.
1190    *
1191    *  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
1192    *           Before entry, the leading m by n part of the array A must
1193    *           contain the matrix of coefficients. On exit, A is
1194    *           overwritten by the updated matrix.
1195    *
1196    *  LDA    - INTEGER.
1197    *           On entry, LDA specifies the first dimension of A as declared
1198    *           in the calling (sub) program. LDA must be at least
1199    *           max( 1, m ).
1200    *           Unchanged on exit.
1201    */
1202   /*
1203    *  Level 2 Blas routine.
1204    *
1205    *  -- Written on 22-October-1986.
1206    *     Jack Dongarra, Argonne National Lab.
1207    *     Jeremy Du Croz, Nag Central Office.
1208    *     Sven Hammarling, Nag Central Office.
1209    *     Richard Hanson, Sandia National Labs.
1210    */
1211   double    temp;
1212   long      i, info, j, m3, mm = m, nn = n, inc_x = incx, inc_y = incy;
1213   /*     Test the input parameters.  */
1214   info = 0;
1215   if     ( m<0 ) {
1216     info = 1;
1217   } else if ( n<0 ) {
1218     info = 2;
1219   } else if ( incx==0 ) {
1220     info = 5;
1221   } else if ( incy==0 ) {
1222     info = 7;
1223   } else if ( !lda || lda<m ) {
1224     info = 9;
1225   }
1226   if ( info!=0 ) {
1227     xerbla( "ydger  ", info );
1228     return;
1229   }
1230   /*     Quick return if possible.   */
1231   if ( ( m==0 ) || ( n==0 ) || ( alpha==0.0 ) )
1232     return;
1233   /*
1234    *     Start the operations. In this version the elements of A are
1235    *     accessed sequentially with one pass through A.
1236    */
1237   m3 = mm&3;
1238   if ( inc_y<0 ) y -= (nn-1)*inc_y;
1239   if ( inc_x==1 ) {
1240     for (j=0 ; nn-- ; j+=inc_y,a+=lda)
1241       if ( y[j]!=0.0 ) {
1242         temp = alpha*y[j];
1243         /* unroll innermost loop */
1244         for (i=0 ; i<m3 ; i++) a[i] += x[i]*temp;
1245         for (    ; i<mm  ; i+=4) {
1246           a[i  ] += x[i]*temp;
1247           a[i+1] += x[i+1]*temp;
1248           a[i+2] += x[i+2]*temp;
1249           a[i+3] += x[i+3]*temp;
1250         }
1251       }
1252   } else {
1253     long ix;
1254     if ( inc_x<0 ) x -= (mm-1)*inc_x;
1255     for (j=0 ; nn-- ; j+=inc_y,a+=lda) {
1256       if ( y[j]!=0.0 ) {
1257         temp = alpha*y[j];
1258         /* unroll innermost loop */
1259         for (i=ix=0 ; i<m3 ; i++,ix+=inc_x) a[i] += x[ix]*temp;
1260         for (       ; i<mm  ; i+=4,ix+=4*inc_x) {
1261           a[i  ] += x[ix]*temp;
1262           a[i+1] += x[ix+inc_x]*temp;
1263           a[i+2] += x[ix+2*inc_x]*temp;
1264           a[i+3] += x[ix+3*inc_x]*temp;
1265         }
1266       }
1267     }
1268   }
1269 
1270   /*     End of DGER  */
1271 }
1272 
1273 /* ------------------------------------------------------------ level 3 */
1274 
1275 void
cblas_dgemm(const enum CBLAS_ORDER order,const enum CBLAS_TRANSPOSE transa,const enum CBLAS_TRANSPOSE transb,INT_IN m,INT_IN n,INT_IN k,const double alpha,const double * a,INT_IN lda,const double * b,INT_IN ldb,const double beta,double * c,INT_IN ldc)1276 cblas_dgemm(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE transa,
1277             const enum CBLAS_TRANSPOSE transb, INT_IN m, INT_IN n,
1278             INT_IN k, const double alpha, const double *a,
1279             INT_IN lda, const double *b, INT_IN ldb,
1280             const double beta, double *c, INT_IN ldc)
1281 {
1282   /*
1283    *  Purpose
1284    *  =======
1285    *
1286    *  DGEMM  performs one of the matrix-matrix operations
1287    *
1288    *     C := alpha*op( A )*op( B ) + beta*C,
1289    *
1290    *  where  op( X ) is one of
1291    *
1292    *     op( X ) = X   or   op( X ) = X',
1293    *
1294    *  alpha and beta are scalars, and A, B and C are matrices, with op( A )
1295    *  an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
1296    *
1297    *  Parameters
1298    *  ==========
1299    *
1300    *  ORDER   - enum CBLAS_ORDER (ignored)
1301    *
1302    *  TRANSA  - enum CBLAS_TRANSPOSE
1303    *           On entry, TRANSA specifies the form of op( A ) to be used in
1304    *           the matrix multiplication as follows:
1305    *              TRANSA = CblasNoTrans,   op( A ) = A.
1306    *              TRANSA = CblasTrans,     op( A ) = A'.
1307    *              TRANSA = CblasConjTrans, op( A ) = A'.
1308    *
1309    *  TRANSB  - enum CBLAS_TRANSPOSE
1310    *           On entry, TRANSA specifies the form of op( B ) to be used in
1311    *           the matrix multiplication as follows:
1312    *              TRANSB = CblasNoTrans,   op( B ) = B.
1313    *              TRANSB = CblasTrans,     op( B ) = B'.
1314    *              TRANSB = CblasConjTrans, op( B ) = B'.
1315    *
1316    *  M      - INTEGER.
1317    *           On entry,  M  specifies  the number  of rows  of the  matrix
1318    *           op( A )  and of the  matrix  C.  M  must  be at least  zero.
1319    *           Unchanged on exit.
1320    *
1321    *  N      - INTEGER.
1322    *           On entry,  N  specifies the number  of columns of the matrix
1323    *           op( B ) and the number of columns of the matrix C. N must be
1324    *           at least zero.
1325    *           Unchanged on exit.
1326    *
1327    *  K      - INTEGER.
1328    *           On entry,  K  specifies  the number of columns of the matrix
1329    *           op( A ) and the number of rows of the matrix op( B ). K must
1330    *           be at least  zero.
1331    *           Unchanged on exit.
1332    *
1333    *  ALPHA  - DOUBLE PRECISION.
1334    *           On entry, ALPHA specifies the scalar alpha.
1335    *           Unchanged on exit.
1336    *
1337    *  A      - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is
1338    *           k  when  TRANSA = 'N' or 'n',  and is  m  otherwise.
1339    *           Before entry with  TRANSA = 'N' or 'n',  the leading  m by k
1340    *           part of the array  A  must contain the matrix  A,  otherwise
1341    *           the leading  k by m  part of the array  A  must contain  the
1342    *           matrix A.
1343    *           Unchanged on exit.
1344    *
1345    *  LDA    - INTEGER.
1346    *           On entry, LDA specifies the first dimension of A as declared
1347    *           in the calling (sub) program. When  TRANSA = 'N' or 'n' then
1348    *           LDA must be at least  max( 1, m ), otherwise  LDA must be at
1349    *           least  max( 1, k ).
1350    *           Unchanged on exit.
1351    *
1352    *  B      - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is
1353    *           n  when  TRANSB = 'N' or 'n',  and is  k  otherwise.
1354    *           Before entry with  TRANSB = 'N' or 'n',  the leading  k by n
1355    *           part of the array  B  must contain the matrix  B,  otherwise
1356    *           the leading  n by k  part of the array  B  must contain  the
1357    *           matrix B.
1358    *           Unchanged on exit.
1359    *
1360    *  LDB    - INTEGER.
1361    *           On entry, LDB specifies the first dimension of B as declared
1362    *           in the calling (sub) program. When  TRANSB = 'N' or 'n' then
1363    *           LDB must be at least  max( 1, k ), otherwise  LDB must be at
1364    *           least  max( 1, n ).
1365    *           Unchanged on exit.
1366    *
1367    *  BETA   - DOUBLE PRECISION.
1368    *           On entry,  BETA  specifies the scalar  beta.  When  BETA  is
1369    *           supplied as zero then C need not be set on input.
1370    *           Unchanged on exit.
1371    *
1372    *  C      - DOUBLE PRECISION array of DIMENSION ( LDC, n ).
1373    *           Before entry, the leading  m by n  part of the array  C must
1374    *           contain the matrix  C,  except when  beta  is zero, in which
1375    *           case C need not be set on entry.
1376    *           On exit, the array  C  is overwritten by the  m by n  matrix
1377    *           ( alpha*op( A )*op( B ) + beta*C ).
1378    *
1379    *  LDC    - INTEGER.
1380    *           On entry, LDC specifies the first dimension of C as declared
1381    *           in  the  calling  (sub)  program.   LDC  must  be  at  least
1382    *           max( 1, m ).
1383    *           Unchanged on exit.
1384    */
1385   /*
1386    *  Level 3 Blas routine.
1387    *
1388    *  -- Written on 8-February-1989.
1389    *     Jack Dongarra, Argonne National Laboratory.
1390    *     Iain Duff, AERE Harwell.
1391    *     Jeremy Du Croz, Numerical Algorithms Group Ltd.
1392    *     Sven Hammarling, Numerical Algorithms Group Ltd.
1393    */
1394   int    nota, notb;
1395   long   i, info, j, l, nrowa, nrowb, nn = n;
1396   double temp;
1397   /*
1398    *     Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not
1399    *     transposed and set  NROWA, NCOLA and  NROWB  as the number of rows
1400    *     and  columns of  A  and the  number of  rows  of  B  respectively.
1401    */
1402   nota  = transa==CblasNoTrans;
1403   notb  = transb==CblasNoTrans;
1404   nrowa = nota? m : k;
1405   nrowb = notb? k : n;
1406   /*     Test the input parameters.  */
1407   info = 0;
1408   if ( !nota && transa!=CblasTrans && transa!=CblasConjTrans ) {
1409     info = 1;
1410   } else if ( !notb && transb!=CblasTrans && transb!=CblasConjTrans ) {
1411     info = 2;
1412   } else if ( m<0 ) {
1413     info = 3;
1414   } else if ( n<0 ) {
1415     info = 4;
1416   } else if ( k<0 ) {
1417     info = 5;
1418   } else if ( !lda || lda<nrowa ) {
1419     info = 8;
1420   } else if ( !ldb || ldb<nrowb ) {
1421     info = 10;
1422   } else if ( !ldc || ldc<m ) {
1423     info = 13;
1424   }
1425   if ( info!=0 ) {
1426     xerbla( "ydgemm ", info );
1427     return;
1428   }
1429   /*     Quick return if possible.  */
1430   if ( ( m==0 )||( n==0 ) ||
1431        ( ( ( alpha==0.0 ) || ( k==0 ) ) && ( beta==1.0 ) ) )
1432     return;
1433   /*     And if  alpha.eq.0.0.  */
1434   if ( alpha==0.0 ) {
1435     if ( beta==0.0 ) {
1436       for ( ; nn-- ; c+=ldc)
1437         for (i=0 ; i<m ; i++) c[i] = 0.0;
1438     } else {
1439       for ( ; nn-- ; c+=ldc)
1440         for (i=0 ; i<m ; i++) c[i] *= beta;
1441     }
1442     return;
1443   }
1444   /*     Start the operations.  */
1445   if ( notb ) {
1446     if ( nota ) {
1447       /*           Form  C := alpha*A*B + beta*C  */
1448       const double *al;
1449       long m3 = m&3;
1450       for ( ; nn-- ; b+=ldb,c+=ldc) {
1451         if ( beta==0.0 ) {
1452           for (i=0 ; i<m ; i++) c[i] = 0.0;
1453         } else if ( beta!=1.0 ) {
1454           for (i=0 ; i<m ; i++) c[i] *= beta;
1455         }
1456         for (l=0,al=a ; l<k ; l++,al+=lda) {
1457           if ( b[l]!=0.0 ) {
1458             temp = alpha*b[l];
1459             /* unroll innermost loop */
1460             for (i=0 ; i<m3 ; i++) c[i] += temp*al[i];
1461             for (    ; i<m  ; i+=4) {
1462               c[i] += temp*al[i];
1463               c[i+1] += temp*al[i+1];
1464               c[i+2] += temp*al[i+2];
1465               c[i+3] += temp*al[i+3];
1466             }
1467           }
1468         }
1469       }
1470     } else {
1471       /*           Form  C := alpha*A'*B + beta*C  */
1472       const double *ai;
1473       long k3 = k&3;
1474       for ( ; nn-- ; b+=ldb,c+=ldc) {
1475         for (i=0,ai=a ; i<m ; i++,ai+=lda) {
1476           temp = 0.0;
1477           /* unroll innermost loop */
1478           for (l=0 ; l<k3 ; l++) temp += ai[l]*b[l];
1479           for (    ; l<k  ; l+=4)
1480             temp += ai[l]*b[l] + ai[l+1]*b[l+1] +
1481               ai[l+2]*b[l+2] + ai[l+3]*b[l+3];
1482           if ( beta==0.0 ) c[i] = alpha*temp;
1483           else c[i] = alpha*temp + beta*c[i];
1484         }
1485       }
1486     }
1487   } else {
1488     if ( nota ) {
1489       /*           Form  C := alpha*A*B' + beta*C  */
1490       const double *al, *bl;
1491       long m3 = m&3;
1492       for (j=0 ; j<nn ; j++,c+=ldc) {
1493         if ( beta==0.0 ) for (i=0 ; i<m ; i++) c[i] = 0.0;
1494         else if ( beta!=1.0 ) for (i=0 ; i<m ; i++) c[i] *= beta;
1495         for (l=0,al=a,bl=b ; l<k ; l++,al+=lda,bl+=ldb) {
1496           if ( bl[j]!=0.0 ) {
1497             temp = alpha*bl[j];
1498             /* unroll innermost loop */
1499             for (i=0 ; i<m3 ; i++) c[i] += temp*al[i];
1500             for (    ; i<m  ; i+=4) {
1501               c[i] += temp*al[i];
1502               c[i+1] += temp*al[i+1];
1503               c[i+2] += temp*al[i+2];
1504               c[i+3] += temp*al[i+3];
1505             }
1506           }
1507         }
1508       }
1509     } else {
1510       /*          Form  C := alpha*A'*B' + beta*C  */
1511       const double *ai, *bl;
1512       long k3 = k&3;
1513       for (j=0 ; j<nn ; j++,c+=ldc) {
1514         for (i=0,ai=a ; i<m ; i++,ai+=lda) {
1515           temp = 0.0;
1516           /* unroll innermost loop */
1517           for (l=0,bl=b+j ; l<k3 ; l++,bl+=ldb) temp += ai[l]*bl[0];
1518           for (           ; l<k  ; l +=4,bl+=4*ldb)
1519             temp += ai[l]*bl[0] + ai[l+1]*bl[ldb] +
1520               ai[l+2]*bl[2*ldb] + ai[l+3]*bl[3*ldb];
1521           if ( beta==0.0 ) c[i] = alpha*temp;
1522           else c[i] = alpha*temp + beta*c[i];
1523         }
1524       }
1525     }
1526   }
1527 
1528   /*     End of DGEMM   */
1529 }
1530 
1531 void
cblas_dtrmm(const enum CBLAS_ORDER order,const enum CBLAS_SIDE side,const enum CBLAS_UPLO uplo,const enum CBLAS_TRANSPOSE transa,const enum CBLAS_DIAG diag,INT_IN m,INT_IN n,const double alpha,const double * a,INT_IN lda,double * b,INT_IN ldb)1532 cblas_dtrmm(const enum CBLAS_ORDER order, const enum CBLAS_SIDE side,
1533             const enum CBLAS_UPLO uplo, const enum CBLAS_TRANSPOSE transa,
1534             const enum CBLAS_DIAG diag, INT_IN m, INT_IN n,
1535             const double alpha, const double *a, INT_IN lda,
1536             double *b, INT_IN ldb)
1537 {
1538   /*
1539    *  Purpose
1540    *  =======
1541    *
1542    *  DTRMM  performs one of the matrix-matrix operations
1543    *
1544    *     B := alpha*op( A )*B,   or   B := alpha*B*op( A ),
1545    *
1546    *  where  alpha  is a scalar,  B  is an m by n matrix,  A  is a unit, or
1547    *  non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
1548    *
1549    *     op( A ) = A   or   op( A ) = A'.
1550    *
1551    *  Parameters
1552    *  ==========
1553    *
1554    *  ORDER   - enum CBLAS_ORDER (ignored)
1555    *
1556    *  SIDE   - enum CBLAS_SIDE
1557    *           On entry,  SIDE specifies whether  op( A ) multiplies B from
1558    *           the left or right as follows:
1559    *              SIDE = CblasLeft    B := alpha*op( A )*B.
1560    *              SIDE = CblasRight   B := alpha*B*op( A ).
1561    *
1562    *  UPLO   - enum CBLAS_UPLO
1563    *           On entry, UPLO specifies whether the matrix is an upper or
1564    *           lower triangular matrix as follows:
1565    *              UPLO = CblasUpper   A is an upper triangular matrix.
1566    *              UPLO = CblasLower   A is a lower triangular matrix.
1567    *
1568    *  TRANSA  - enum CBLAS_TRANSPOSE
1569    *           On entry, TRANSA specifies the form of op( A ) to be used in
1570    *           the matrix multiplication as follows:
1571    *              TRANSA = CblasNoTrans,   op( A ) = A.
1572    *              TRANSA = CblasTrans,     op( A ) = A'.
1573    *              TRANSA = CblasConjTrans, op( A ) = A'.
1574    *
1575    *  DIAG   - enum CBLAS_DIAG
1576    *           On entry, DIAG specifies whether or not A is unit
1577    *           triangular as follows:
1578    *              DIAG = CblasUnit      A is assumed to be unit triangular.
1579    *              DIAG = CblasNonUnit   A is not assumed to be unit
1580    *                                    triangular.
1581    *
1582    *  M      - INTEGER.
1583    *           On entry, M specifies the number of rows of B. M must be at
1584    *           least zero.
1585    *           Unchanged on exit.
1586    *
1587    *  N      - INTEGER.
1588    *           On entry, N specifies the number of columns of B.  N must be
1589    *           at least zero.
1590    *           Unchanged on exit.
1591    *
1592    *  ALPHA  - DOUBLE PRECISION.
1593    *           On entry,  ALPHA specifies the scalar  alpha. When  alpha is
1594    *           zero then  A is not referenced and  B need not be set before
1595    *           entry.
1596    *           Unchanged on exit.
1597    *
1598    *  A      - DOUBLE PRECISION array of DIMENSION ( LDA, k ), where k is m
1599    *           when  SIDE = 'L' or 'l'  and is  n  when  SIDE = 'R' or 'r'.
1600    *           Before entry  with  UPLO = 'U' or 'u',  the  leading  k by k
1601    *           upper triangular part of the array  A must contain the upper
1602    *           triangular matrix  and the strictly lower triangular part of
1603    *           A is not referenced.
1604    *           Before entry  with  UPLO = 'L' or 'l',  the  leading  k by k
1605    *           lower triangular part of the array  A must contain the lower
1606    *           triangular matrix  and the strictly upper triangular part of
1607    *           A is not referenced.
1608    *           Note that when  DIAG = 'U' or 'u',  the diagonal elements of
1609    *           A  are not referenced either,  but are assumed to be  unity.
1610    *           Unchanged on exit.
1611    *
1612    *  LDA    - INTEGER.
1613    *           On entry, LDA specifies the first dimension of A as declared
1614    *           in the calling (sub) program.  When  SIDE = 'L' or 'l'  then
1615    *           LDA  must be at least  max( 1, m ),  when  SIDE = 'R' or 'r'
1616    *           then LDA must be at least max( 1, n ).
1617    *           Unchanged on exit.
1618    *
1619    *  B      - DOUBLE PRECISION array of DIMENSION ( LDB, n ).
1620    *           Before entry,  the leading  m by n part of the array  B must
1621    *           contain the matrix  B,  and  on exit  is overwritten  by the
1622    *           transformed matrix.
1623    *
1624    *  LDB    - INTEGER.
1625    *           On entry, LDB specifies the first dimension of B as declared
1626    *           in  the  calling  (sub)  program.   LDB  must  be  at  least
1627    *           max( 1, m ).
1628    *           Unchanged on exit.
1629    */
1630   /*
1631    *  Level 3 Blas routine.
1632    *
1633    *  -- Written on 8-February-1989.
1634    *     Jack Dongarra, Argonne National Laboratory.
1635    *     Iain Duff, AERE Harwell.
1636    *     Jeremy Du Croz, Numerical Algorithms Group Ltd.
1637    *     Sven Hammarling, Numerical Algorithms Group Ltd.
1638    */
1639   int     lside, nounit, upper;
1640   long    i, info, j, k, nrowa, nn = n;
1641   double  temp;
1642   /*     Test the input parameters  */
1643   lside  = side==CblasLeft;
1644   nrowa = lside? m : n;
1645   nounit = diag==CblasNonUnit;
1646   upper  = uplo==CblasUpper;
1647 
1648   info   = 0;
1649   if ( !lside && side!=CblasRight ) {
1650     info = 1;
1651   } else if ( !upper && uplo!=CblasLower ) {
1652     info = 2;
1653   } else if (transa!=CblasNoTrans && transa!=CblasTrans &&
1654              transa!=CblasConjTrans) {
1655     info = 3;
1656   } else if ( diag!=CblasNonUnit && diag!=CblasUnit ) {
1657     info = 4;
1658   } else if ( m<0 ) {
1659     info = 5;
1660   } else if ( n<0 ) {
1661     info = 6;
1662   } else if ( !lda || lda<nrowa ) {
1663     info = 9;
1664   } else if ( !ldb || ldb<m ) {
1665     info = 11;
1666   }
1667   if ( info!=0 ) {
1668     xerbla( "ydtrmm ", info );
1669     return;
1670   }
1671   /*     Quick return if possible.  */
1672   if ( n==0 )
1673     return;
1674   /*     And when  alpha.eq.0.0.  */
1675   if ( alpha==0.0 ) {
1676     for ( ; nn-- ; b+=ldb)
1677       for (i=0 ; i<m ; i++) b[i] = 0.0;
1678     return;
1679   }
1680   /*     Start the operations.  */
1681   if ( lside ) {
1682     if ( transa==CblasNoTrans ) {
1683       /*           Form  B := alpha*A*B.  */
1684       const double *ak;
1685       if ( upper ) {
1686         for ( ; nn-- ; b+=ldb) {
1687           for (k=0,ak=a ; k<m ; k++,ak+=lda) {
1688             if ( b[k]!=0.0 ) {
1689               temp = alpha*b[k];
1690               /* hand unroll inner loop */
1691               for (i=0 ; i<(k&3) ; i++) b[i] += temp*ak[i];
1692               for (    ; i<k     ; i+=4) {
1693                 b[i  ] += temp*ak[i];
1694                 b[i+1] += temp*ak[i+1];
1695                 b[i+2] += temp*ak[i+2];
1696                 b[i+3] += temp*ak[i+3];
1697               }
1698               if ( nounit ) temp *= ak[k];
1699               b[k] = temp;
1700             }
1701           }
1702         }
1703       } else {
1704         long count;
1705         a += (m-1)*lda;
1706         for ( ; nn-- ; b+=ldb) {
1707           for (k=m-1,ak=a ; k>=0 ; k--,ak-=lda) {
1708             if ( b[k]!=0.0 ) {
1709               temp = alpha*b[k];
1710               b[k] = temp;
1711               if ( nounit ) b[k] *= ak[k];
1712               /* hand unroll inner loop */
1713               count = (m-1-k)&3;
1714               for (i=k+1 ; count-- ; i++) b[i] += temp*ak[i];
1715               for (      ; i<m     ; i+=4) {
1716                 b[i  ] += temp*ak[i];
1717                 b[i+1] += temp*ak[i+1];
1718                 b[i+2] += temp*ak[i+2];
1719                 b[i+3] += temp*ak[i+3];
1720               }
1721             }
1722           }
1723         }
1724       }
1725     } else {
1726       /*           Form  B := alpha*B*A'.  */
1727       const double *ai;
1728       if ( upper ) {
1729         a += (m-1)*lda;
1730         for ( ; nn-- ; b+=ldb) {
1731           for (i=m-1,ai=a ; i>=0 ; i--,a-=lda) {
1732             temp = b[i];
1733             if ( nounit ) temp *= ai[i];
1734             /* hand unroll inner loop */
1735             for (k=0 ; k<(i&3) ; k++) temp += ai[k]*b[k];
1736             for (    ; k<i     ; k+=4)
1737               temp += ai[k]*b[k] + ai[k+1]*b[k+1] + ai[k+2]*b[k+2] +
1738                 ai[k+3]*b[k+3];
1739             b[i] = alpha*temp;
1740           }
1741         }
1742       } else {
1743         long count;
1744         for ( ; nn-- ; b+=ldb) {
1745           for (i=0,ai=a ; i<m ; i++,a+=lda) {
1746             temp= b[i];
1747             if ( nounit ) temp *= ai[i];
1748             /* hand unroll inner loop */
1749             count = (m-1-i)&3;
1750             for (k=i+1 ; count-- ; k++) temp += ai[k]*b[k];
1751             for (      ; k<m     ; k+=4)
1752               temp += ai[k]*b[k] + ai[k+1]*b[k+1] + ai[k+2]*b[k+2] +
1753                 ai[k+3]*b[k+3];
1754             b[i] = alpha*temp;
1755           }
1756         }
1757       }
1758     }
1759   } else {
1760     if ( transa==CblasNoTrans ) {
1761       /*           Form  B := alpha*B*A.  */
1762       double *bj, *bk;
1763       if ( upper ) {
1764         for (j=nn-1,a+=j*lda,bj=b+j*ldb ; j>=0 ; j--,a-=lda,bj-=ldb) {
1765           temp = alpha;
1766           if ( nounit ) temp *= a[j];
1767           for (i=0 ; i<m ; i++) bj[i] *= temp;
1768           for (k=0,bk=b ; k<j ; k++,bk+=ldb) {
1769             if ( a[k]!=0.0 ) {
1770               temp = alpha*a[k];
1771               /* unroll inner loop */
1772               for (i=0 ; i<(m&3) ; i++) bj[i] += temp*bk[i];
1773               for (    ; i<m     ; i+=4) {
1774                 bj[i  ] += temp*bk[i];
1775                 bj[i+1] += temp*bk[i+1];
1776                 bj[i+2] += temp*bk[i+2];
1777                 bj[i+3] += temp*bk[i+3];
1778               }
1779             }
1780           }
1781         }
1782       } else {
1783         for (j=0,bj=b ; j<nn ; j++,a+=lda,bj+=ldb) {
1784           temp = alpha;
1785           if ( nounit ) temp *= a[j];
1786           for (i=0 ; i<m ; i++) bj[i] *= temp;
1787           for (k=j+1,bk=b+k*ldb ; k<nn ; k++,bk+=ldb) {
1788             if ( a[k]!=0.0 ) {
1789               temp = alpha*a[k];
1790               /* unroll inner loop */
1791               for (i=0 ; i<(m&3) ; i++) bj[i] += temp*bk[i];
1792               for (    ; i<m     ; i+=4) {
1793                 bj[i  ] += temp*bk[i];
1794                 bj[i+1] += temp*bk[i+1];
1795                 bj[i+2] += temp*bk[i+2];
1796                 bj[i+3] += temp*bk[i+3];
1797               }
1798             }
1799           }
1800         }
1801       }
1802     } else {
1803       /*           Form  B := alpha*B*A'.  */
1804       double *bj, *bk;
1805       if ( upper ) {
1806         for (k=0,bk=b ; k<nn ; k++,a+=lda,bk+=ldb) {
1807           for (j=0,bj=b ; j<k ; j++,bj+=ldb) {
1808             if ( a[j]!=0.0 ) {
1809               temp = alpha*a[j];
1810               /* unroll inner loop */
1811               for (i=0 ; i<(m&3) ; i++) bj[i] += temp*bk[i];
1812               for (    ; i<m     ; i+=4) {
1813                 bj[i  ] += temp*bk[i];
1814                 bj[i+1] += temp*bk[i+1];
1815                 bj[i+2] += temp*bk[i+2];
1816                 bj[i+3] += temp*bk[i+3];
1817               }
1818             }
1819           }
1820           temp = alpha;
1821           if ( nounit ) temp *= a[k];
1822           if ( temp!=1.0 ) for (i=0 ; i<m ; i++) bk[i] *= temp;
1823         }
1824       } else {
1825         for (k=nn-1,a+=k*lda,bk=b+k*ldb ; k>=0 ; k--,a-=lda,bk-=ldb) {
1826           for (j=k+1,bj=b+j*ldb ; j<nn ; j++,bj+=ldb) {
1827             if ( a[j]!=0.0 ) {
1828               temp = alpha*a[j];
1829               /* unroll inner loop */
1830               for (i=0 ; i<(m&3) ; i++) bj[i] += temp*bk[i];
1831               for (    ; i<m     ; i+=4) {
1832                 bj[i  ] += temp*bk[i];
1833                 bj[i+1] += temp*bk[i+1];
1834                 bj[i+2] += temp*bk[i+2];
1835                 bj[i+3] += temp*bk[i+3];
1836               }
1837             }
1838           }
1839           temp = alpha;
1840           if ( nounit ) temp *= a[k];
1841           if ( temp!=1.0 ) for (i=0 ; i<m ; i++) bk[i] *= temp;
1842         }
1843       }
1844     }
1845   }
1846 
1847   return;
1848   /*     End of DTRMM   */
1849 }
1850 
1851 void
cblas_dtrsm(const enum CBLAS_ORDER order,const enum CBLAS_SIDE side,const enum CBLAS_UPLO uplo,const enum CBLAS_TRANSPOSE transa,const enum CBLAS_DIAG diag,INT_IN m,INT_IN n,const double alpha,const double * a,INT_IN lda,double * b,INT_IN ldb)1852 cblas_dtrsm(const enum CBLAS_ORDER order, const enum CBLAS_SIDE side,
1853             const enum CBLAS_UPLO uplo, const enum CBLAS_TRANSPOSE transa,
1854             const enum CBLAS_DIAG diag, INT_IN m, INT_IN n,
1855             const double alpha, const double *a, INT_IN lda,
1856             double *b, INT_IN ldb)
1857 {
1858   /*
1859    *  Purpose
1860    *  =======
1861    *
1862    *  DTRSM  solves one of the matrix equations
1863    *
1864    *     op( A )*X = alpha*B,   or   X*op( A ) = alpha*B,
1865    *
1866    *  where alpha is a scalar, X and B are m by n matrices, A is a unit, or
1867    *  non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
1868    *
1869    *     op( A ) = A   or   op( A ) = A'.
1870    *
1871    *  The matrix X is overwritten on B.
1872    *
1873    *  Parameters
1874    *  ==========
1875    *
1876    *  ORDER   - enum CBLAS_ORDER (ignored)
1877    *
1878    *  SIDE   - enum CBLAS_SIDE
1879    *           On entry,  SIDE specifies whether  op( A ) appears on the left
1880    *           or right of X as follows:
1881    *              SIDE = CblasLeft    op( A )*X = alpha*B.
1882    *              SIDE = CblasRight   X*op( A ) = alpha*B.
1883    *
1884    *  UPLO   - enum CBLAS_UPLO
1885    *           On entry, UPLO specifies whether the matrix A is an upper or
1886    *           lower triangular matrix as follows:
1887    *              UPLO = CblasUpper   A is an upper triangular matrix.
1888    *              UPLO = CblasLower   A is a lower triangular matrix.
1889    *
1890    *  TRANSA  - enum CBLAS_TRANSPOSE
1891    *           On entry, TRANSA specifies the form of op( A ) to be used in
1892    *           the matrix multiplication as follows:
1893    *              TRANSA = CblasNoTrans,   op( A ) = A.
1894    *              TRANSA = CblasTrans,     op( A ) = A'.
1895    *              TRANSA = CblasConjTrans, op( A ) = A'.
1896    *
1897    *  DIAG   - enum CBLAS_DIAG
1898    *           On entry, DIAG specifies whether or not A is unit
1899    *           triangular as follows:
1900    *              DIAG = CblasUnit      A is assumed to be unit triangular.
1901    *              DIAG = CblasNonUnit   A is not assumed to be unit
1902    *                                    triangular.
1903    *
1904    *  M      - INTEGER.
1905    *           On entry, M specifies the number of rows of B. M must be at
1906    *           least zero.
1907    *           Unchanged on exit.
1908    *
1909    *  N      - INTEGER.
1910    *           On entry, N specifies the number of columns of B.  N must be
1911    *           at least zero.
1912    *           Unchanged on exit.
1913    *
1914    *  ALPHA  - DOUBLE PRECISION.
1915    *           On entry,  ALPHA specifies the scalar  alpha. When  alpha is
1916    *           zero then  A is not referenced and  B need not be set before
1917    *           entry.
1918    *           Unchanged on exit.
1919    *
1920    *  A      - DOUBLE PRECISION array of DIMENSION ( LDA, k ), where k is m
1921    *           when  SIDE = 'L' or 'l'  and is  n  when  SIDE = 'R' or 'r'.
1922    *           Before entry  with  UPLO = 'U' or 'u',  the  leading  k by k
1923    *           upper triangular part of the array  A must contain the upper
1924    *           triangular matrix  and the strictly lower triangular part of
1925    *           A is not referenced.
1926    *           Before entry  with  UPLO = 'L' or 'l',  the  leading  k by k
1927    *           lower triangular part of the array  A must contain the lower
1928    *           triangular matrix  and the strictly upper triangular part of
1929    *           A is not referenced.
1930    *           Note that when  DIAG = 'U' or 'u',  the diagonal elements of
1931    *           A  are not referenced either,  but are assumed to be  unity.
1932    *           Unchanged on exit.
1933    *
1934    *  LDA    - INTEGER.
1935    *           On entry, LDA specifies the first dimension of A as declared
1936    *           in the calling (sub) program.  When  SIDE = 'L' or 'l'  then
1937    *           LDA  must be at least  max( 1, m ),  when  SIDE = 'R' or 'r'
1938    *           then LDA must be at least max( 1, n ).
1939    *           Unchanged on exit.
1940    *
1941    *  B      - DOUBLE PRECISION array of DIMENSION ( LDB, n ).
1942    *           Before entry,  the leading  m by n part of the array  B must
1943    *           contain  the  right-hand  side  matrix  B,  and  on exit  is
1944    *           overwritten by the solution matrix  X.
1945    *
1946    *  LDB    - INTEGER.
1947    *           On entry, LDB specifies the first dimension of B as declared
1948    *           in  the  calling  (sub)  program.   LDB  must  be  at  least
1949    *           max( 1, m ).
1950    *           Unchanged on exit.
1951    */
1952   /*
1953    *  Level 3 Blas routine.
1954    */
1955   /**
1956    *  -- Written on 8-February-1989.
1957    *     Jack Dongarra, Argonne National Laboratory.
1958    *     Iain Duff, AERE Harwell.
1959    *     Jeremy Du Croz, Numerical Algorithms Group Ltd.
1960    *     Sven Hammarling, Numerical Algorithms Group Ltd.
1961    */
1962   int            lside, nounit, upper;
1963   long            i, info, j, k, nrowa;
1964   double    temp;
1965   /*     Test the input parameters.  */
1966   lside  = side==CblasLeft;
1967   nrowa = lside? m : n;
1968   nounit = diag==CblasNonUnit;
1969   upper  = uplo==CblasUpper;
1970 
1971   info   = 0;
1972   if ( !lside && side!=CblasRight ) {
1973     info = 1;
1974   } else if ( !upper && uplo!=CblasLower ) {
1975     info = 2;
1976   } else if (transa!=CblasNoTrans && transa!=CblasTrans &&
1977              transa!=CblasConjTrans) {
1978     info = 3;
1979   } else if ( diag!=CblasNonUnit && diag!=CblasUnit ) {
1980     info = 4;
1981   } else if ( m<0 ) {
1982     info = 5;
1983   } else if ( n<0 ) {
1984     info = 6;
1985   } else if ( !lda || lda<nrowa ) {
1986     info = 9;
1987   } else if ( !ldb || ldb<m ) {
1988     info = 11;
1989   }
1990   if ( info!=0 ) {
1991     xerbla( "ydtrsm ", info );
1992     return;
1993   }
1994   /*     Quick return if possible.  */
1995   if ( n==0 ) return;
1996   /*     And when  alpha.eq.0.0.  */
1997   if ( alpha==0.0 ) {
1998     long nn = n;
1999     for ( ; nn-- ; b+=ldb) {
2000       for (i=0 ; i<m ; i++) b[i] = 0.0;
2001     }
2002     return;
2003   }
2004   /*     Start the operations.  */
2005   if ( lside ) {
2006     if ( transa==CblasNoTrans ) {
2007       /*           Form  B := alpha*inv( A )*B  */
2008       const double *ak;
2009       if ( upper ) {
2010         for (j=0 ; j<n ; j++,b+=ldb) {
2011           if ( alpha!=1.0 ) for (i=0 ; i<m ; i++) b[i] *= alpha;
2012           for (k=m-1,ak=a+k*lda ; k>=0 ; k--,ak-=lda) {
2013             if ( b[k]!=0.0 ) {
2014               if ( nounit ) b[k] /= ak[k];
2015               /* unroll inner loop */
2016               for (i=0 ; i<(k&3) ; i++) b[i] -= b[k]*ak[i];
2017               for (    ; i<k     ; i+=4) {
2018                 b[i  ] -= b[k]*ak[i];
2019                 b[i+1] -= b[k]*ak[i+1];
2020                 b[i+2] -= b[k]*ak[i+2];
2021                 b[i+3] -= b[k]*ak[i+3];
2022               }
2023             }
2024           }
2025         }
2026       } else {
2027         long count;
2028         for (j=0 ; j<n ; j++,b+=ldb) {
2029           if ( alpha!=1.0 ) for (i=0 ; i<m ; i++) b[i] *= alpha;
2030           for (k=0,ak=a ; k<m ; k++,ak+=lda) {
2031             if ( b[k]!=0.0 ) {
2032               if ( nounit ) b[k] /= ak[k];
2033               /* unroll inner loop */
2034               count = (m-1-k)&3;
2035               for (i=k+1 ; count-- ; i++) b[i] -= b[k]*ak[i];
2036               for (      ; i<m     ; i+=4) {
2037                 b[i  ] -= b[k]*ak[i];
2038                 b[i+1] -= b[k]*ak[i+1];
2039                 b[i+2] -= b[k]*ak[i+2];
2040                 b[i+3] -= b[k]*ak[i+3];
2041               }
2042             }
2043           }
2044         }
2045       }
2046     } else {
2047       /*           Form  B := alpha*inv( A' )*B  */
2048       const double *ai;
2049       if ( upper ) {
2050         for (j=0 ; j<n ; j++,b+=ldb) {
2051           for (i=0,ai=a ; i<m ; i++,ai+=lda) {
2052             temp = alpha*b[i];
2053             /* unroll inner loop */
2054             for (k=0 ; k<(i&3) ; k++) temp -= ai[k]*b[k];
2055             for (    ; k<i     ; k+=4) temp -= ai[k]*b[k] +
2056                ai[k+1]*b[k+1] + ai[k+2]*b[k+2] + ai[k+3]*b[k+3];
2057             if ( nounit ) temp /= ai[i];
2058             b[i] = temp;
2059           }
2060         }
2061       } else {
2062         long count;
2063         for (j=0 ; j<n ; j++,b+=ldb) {
2064           for (i=m-1,ai=a+i*lda ; i>=0 ; i--,ai-=lda) {
2065             temp = alpha*b[i];
2066             /* unroll inner loop */
2067             count = (m-1-i)&3;
2068             for (k=i+1 ; count-- ; k++) temp -= ai[k]*b[k];
2069             for (      ; k<m     ; k+=4) temp -= ai[k]*b[k] +
2070                ai[k+1]*b[k+1] + ai[k+2]*b[k+2] + ai[k+3]*b[k+3];
2071             if ( nounit ) temp/= ai[i];
2072             b[i] = temp;
2073           }
2074         }
2075       }
2076     }
2077   } else {
2078     if ( transa==CblasNoTrans ) {
2079       /*           Form  B := alpha*B*inv( A )  */
2080       double *bj, *bk;
2081       if ( upper ) {
2082         for (j=0,bj=b ; j<n ; j++,a+=lda,bj+=ldb) {
2083           if ( alpha!=1.0 ) for (i=0 ; i<m ; i++) bj[i] *= alpha;
2084           for (k=0,bk=b ; k<j ; k++,bk+=ldb) {
2085             if ( a[k]!=0.0 ) {
2086               /* unroll inner loop */
2087               for (i=0 ; i<(m&3) ; i++) bj[i] -= a[k]*bk[i];
2088               for (    ; i<m     ; i+=4) {
2089                 bj[i  ] -= a[k]*bk[i];
2090                 bj[i+1] -= a[k]*bk[i+1];
2091                 bj[i+2] -= a[k]*bk[i+2];
2092                 bj[i+3] -= a[k]*bk[i+3];
2093               }
2094             }
2095           }
2096           if ( nounit ) {
2097             temp= 1.0/a[j];
2098             for (i=0 ; i<m ; i++) bj[i] *= temp;
2099           }
2100         }
2101       } else {
2102         for (j=n-1,a+=j*lda,bj=b+j*ldb ; j>=0 ; j--,a-=lda,bj-=ldb) {
2103           if ( alpha!=1.0 ) for (i=0 ; i<m ; i++) bj[i] *= alpha;
2104           for (k=j+1,bk=b+k*ldb ; k<n ; k++,bk+=ldb) {
2105             if ( a[k]!=0.0 ) {
2106               /* unroll inner loop */
2107               for (i=0 ; i<(m&3) ; i++) bj[i] -= a[k]*bk[i];
2108               for (    ; i<m     ; i+=4) {
2109                 bj[i  ] -= a[k]*bk[i];
2110                 bj[i+1] -= a[k]*bk[i+1];
2111                 bj[i+2] -= a[k]*bk[i+2];
2112                 bj[i+3] -= a[k]*bk[i+3];
2113               }
2114             }
2115           }
2116           if ( nounit ) {
2117             temp = 1.0/a[j];
2118             for (i=0 ; i<m ; i++) bj[i] *= temp;
2119           }
2120         }
2121       }
2122     } else {
2123       /*           Form  B := alpha*B*inv( A' )  */
2124       double *bj, *bk;
2125       if ( upper ) {
2126         for (k=n-1,a+=k*lda,bk=b+k*ldb ; k>=0 ; k--,a-=lda,bk-=ldb) {
2127           if ( nounit ) {
2128             temp = 1.0/a[k];
2129             for (i=0 ; i<m ; i++) bk[i] *= temp;
2130           }
2131           for (j=0,bj=b ; j<k ; j++,bj+=ldb) {
2132             if ( a[j]!=0.0 ) {
2133               temp = a[j];
2134               /* unroll inner loop */
2135               for (i=0 ; i<(m&3) ; i++) bj[i] -= temp*bk[i];
2136               for (    ; i<m     ; i+=4) {
2137                 bj[i  ] -= temp*bk[i];
2138                 bj[i+1] -= temp*bk[i+1];
2139                 bj[i+2] -= temp*bk[i+2];
2140                 bj[i+3] -= temp*bk[i+3];
2141               }
2142             }
2143           }
2144           if ( alpha!=1.0 ) for (i=0 ; i<m ; i++) bk[i] *= alpha;
2145         }
2146       } else {
2147         for (k=0,bk=b ; k<n ; k++,a+=lda,bk+=ldb) {
2148           if ( nounit ) {
2149             temp = 1.0/a[k];
2150             for (i=0 ; i<m ; i++) bk[i] *= temp;
2151           }
2152           for (j=k+1,bj=b+j*ldb ; j<n ; j++,bj+=ldb) {
2153             if ( a[j]!=0.0 ) {
2154               temp = a[j];
2155               /* unroll inner loop */
2156               for (i=0 ; i<(m&3) ; i++) bj[i] -= temp*bk[i];
2157               for (    ; i<m     ; i+=4) {
2158                 bj[i  ] -= temp*bk[i];
2159                 bj[i+1] -= temp*bk[i+1];
2160                 bj[i+2] -= temp*bk[i+2];
2161                 bj[i+3] -= temp*bk[i+3];
2162               }
2163             }
2164           }
2165           if ( alpha!=1.0 ) for (i=0 ; i<m ; i++) bk[i] *= alpha;
2166         }
2167       }
2168     }
2169   }
2170 
2171   /*     End of DTRSM   */
2172 }
2173