1 /*
2 * $Id: cblasy.c,v 1.1 2005-09-18 22:04:52 dhmunro Exp $
3 * CBLAS routines used by yorick
4 */
5 /* Copyright (c) 2005, The Regents of the University of California.
6 * All rights reserved.
7 * This file is part of yorick (http://yorick.sourceforge.net).
8 * Read the accompanying LICENSE file for details.
9 */
10
11 /* Basic Linear algebra Subprograms Technical (BLAST) Forum Standard
12 * August 21, 2001 (http://www.netlib.org/blas/blast-forum/)
13 * Appendix B: Legacy BLAS
14 *
15 * These routines were converted from the original Fortran source
16 * from www.netlib.org, with hand unrolling of the inner loops for
17 * the level 2 and 3 routines. The unrolling helps not-very-bright
18 * or unaggressive C compilers, but may hurt good compilers. (The
19 * level 1 Fortran source was already unrolled.)
20 */
21
22 #include "cblasy.h"
23
24 #define ABS(x) ((x)>=0?(x):-(x))
25
26 /* ------------------------------------------------------------ level 1 */
27
28 double
cblas_ddot(INT_IN n,const double * x,INT_IN incx,const double * y,INT_IN incy)29 cblas_ddot(INT_IN n, const double *x, INT_IN incx,
30 const double *y, INT_IN incy)
31 {
32 /*c
33 c forms the dot product of two vectors.
34 c uses unrolled loops for increments equal to one.
35 c jack dongarra, linpack, 3/11/78.
36 c*/
37 double dtemp = 0.0;
38 long i;
39 if (n<=0) return dtemp;
40 if (incx!=1 || incy!=1) {
41 /* unequal increments or equal increments not equal to 1 */
42 long iy, nn=n;
43 if (incx<0) x -= (n-1)*incx;
44 if (incy<0) y -= (n-1)*incx;
45 for (i=iy=0 ; nn-- ; i+=incx,iy+=incy) dtemp += y[iy]*x[i];
46 } else {
47 /* both increments equal to 1 */
48 /* unroll loop */
49 long m = n%5;
50 for (i=0 ; i<m ; i++) dtemp += y[i]*x[i];
51 for ( ; i<n ; i+=5)
52 dtemp += y[i]*x[i] + y[i+1]*x[i+1] + y[i+2]*x[i+2] +
53 y[i+3]*x[i+3] + y[i+4]*x[i+4];
54 }
55 return dtemp;
56 }
57
58 double
cblas_dnrm2(INT_IN n,const double * x,INT_IN incx)59 cblas_dnrm2(INT_IN n, const double *x, INT_IN incx)
60 {
61 /*c
62 c euclidean norm of the n-vector stored in x_1() with storage
63 c increment incx .
64 c if n .le. 0 return with result = 0.
65 c if n .ge. 1 then incx must be .ge. 1
66 c
67 c c.l.lawson, 1978 jan 08
68 c modified to correct failure to update ix, 1/25/92.
69 c modified 3/93 to return if incx .le. 0.
70 c
71 c four phase method using two built-in constants that are
72 c hopefully applicable to all machines.
73 c cutlo = maximum of sqrt(u/eps) over all known machines.
74 c cuthi = minimum of sqrt(v) over all known machines.
75 c where
76 c eps = smallest no. such that eps + 1. .gt. 1.
77 c u = smallest positive no. (underflow limit)
78 c v = largest no. (overflow limit)
79 c
80 c brief outline of algorithm..
81 c
82 c phase 1 scans zero components.
83 c move to phase 2 when a component is nonzero and .le. cutlo
84 c move to phase 3 when a component is .gt. cutlo
85 c move to phase 4 when a component is .ge. cuthi/m
86 c where m = n for x() real and m = 2*n for complex.
87 c
88 c values for cutlo and cuthi..
89 c from the environmental parameters listed in the imsl converter
90 c document the limiting values are as follows..
91 c cutlo, s.p. u/eps = 2**(-102) for honeywell. close seconds are
92 c univac and dec at 2**(-103)
93 c thus cutlo = 2**(-51) = 4.44089e-16
94 c cuthi, s.p. v = 2**127 for univac, honeywell, and dec.
95 c thus cuthi = 2**(63.5) = 1.30438e19
96 c cutlo, d.p. u/eps = 2**(-67) for honeywell and dec.
97 c thus cutlo = 2**(-33.5) = 8.23181d-11
98 c cuthi, d.p. same as s.p. cuthi = 1.30438d19
99 c data cutlo, cuthi / 8.232d-11, 1.304d19 /
100 c data cutlo, cuthi / 4.441e-16, 1.304e19 / */
101 #undef cutlo
102 #define cutlo 8.232e-11
103 #undef cuthi
104 #define cuthi 1.304e19
105 double hitest, sum, adx, xmax;
106 long nn = n;
107 extern double sqrt(double);
108
109 if ( nn<=0 || incx<=0 ) return 0.0;
110 hitest = cuthi/nn; /* nn will change as calculation proceeds */
111
112 adx = ABS(x[0]);
113
114 /* phase 1 */
115 for (x+=incx,nn-- ; adx==0.0 && nn ; x+=incx,nn--) adx = ABS(x[0]);
116 if (!nn) return adx; /* note that adx can be any size here */
117
118 /* phase 2, adx is first non-zero element */
119 if (adx <= cutlo) {
120 xmax = adx;
121 sum = 1.0;
122 for (;;) {
123 adx = ABS(x[0]);
124 if (adx > cutlo) break;
125 if (adx > xmax) {
126 /* renormalize to new largest element */
127 sum = 1.0 + sum * (xmax/adx)*(xmax/adx);
128 xmax = adx;
129 } else {
130 /* continue with current normalization */
131 sum += (adx/xmax)*(adx/xmax);
132 }
133 nn--;
134 if (!nn) return xmax*sqrt(sum);
135 x += incx;
136 }
137 /* here, sum>=1.0, but xmax<=cutlo, so xmax*xmax might underflow */
138 sum *= xmax; /* remove phase 2 normalization */
139 sum *= xmax; /* don't want the compiler to do sum*(xmax*xmax) */
140 nn--;
141 x += incx;
142
143 } else {
144 sum = 0.0;
145 }
146
147 /* phase 3, adx is first element>cutlo */
148 if (adx<hitest) {
149 sum += adx*adx;
150 for (;;) {
151 if (!nn) return sqrt(sum);
152 adx = ABS(x[0]);
153 nn--;
154 x += incx;
155 if (adx>=hitest) break;
156 sum += adx*adx;
157 }
158 }
159
160 /* phase 4, adx is first element greater than hitest */
161 xmax = adx;
162 sum /= xmax; /* again, try to avoid sum/(xmax*xmax) */
163 sum /= xmax;
164 sum += 1.0;
165 while (nn) {
166 adx = ABS(x[0]);
167 if (adx > xmax) {
168 /* renormalize to new largest element */
169 sum = 1.0 + sum * (xmax/adx)*(xmax/adx);
170 xmax = adx;
171 } else {
172 /* continue with current normalization */
173 sum += (adx/xmax)*(adx/xmax);
174 }
175 nn--;
176 x += incx;
177 }
178 return xmax*sqrt(sum);
179 }
180
181 double
cblas_dasum(INT_IN n,const double * x,INT_IN incx)182 cblas_dasum(INT_IN n, const double *x, INT_IN incx)
183 {
184 /*c
185 c takes the sum of the absolute values.
186 c jack dongarra, linpack, 3/11/78.
187 c modified 3/93 to return if incx .le. 0.
188 c*/
189 double dtemp = 0.0;
190 long i, nn=n;
191 if ( nn<=0 || incx<=0 ) return dtemp;
192 if (incx!=1) {
193 /* increment not equal to 1 */
194 for (i=0 ; nn-- ; i+=incx) dtemp += ABS(x[i]);
195 } else {
196 /* increment equal to 1 */
197 /* unroll loop */
198 long m = n%6;
199 for (i=0 ; i<m ; i++) dtemp += ABS(x[i]);
200 for ( ; i<n ; i+=6)
201 dtemp += ABS(x[i]) + ABS(x[i+1]) + ABS(x[i+2]) +
202 ABS(x[i+3]) + ABS(x[i+4]) + ABS(x[i+5]);
203 }
204 return dtemp;
205 }
206
207 CBLAS_INDEX
cblas_idamax(INT_IN n,const double * x,INT_IN incx)208 cblas_idamax(INT_IN n, const double *x, INT_IN incx)
209 {
210 /*c
211 c finds the index of element having max. absolute value.
212 c jack dongarra, linpack, 3/11/78.
213 c modified 3/93 to return if incx .le. 0.
214 c*/
215 CBLAS_INDEX iamax = 0;
216 double dmax;
217 long i, nn=n;
218 if (n<=0 || incx<=0) return iamax;
219 if (incx!=1) {
220 /* increment not equal to 1 */
221 long ix;
222 dmax = ABS(x[0]);
223 for (i=1,ix=incx ; --nn ; i++,ix+=incx)
224 if (ABS(x[ix])>dmax) {
225 dmax = ABS(x[ix]);
226 iamax = i;
227 }
228 } else {
229 /* increment equal to 1 */
230 dmax = ABS(x[0]);
231 for (i=1 ; --nn ; i++)
232 if (ABS(x[i])>dmax) {
233 dmax = ABS(x[i]);
234 iamax = i;
235 }
236 }
237 return iamax;
238 }
239
240 void
cblas_dswap(INT_IN n,double * x,INT_IN incx,double * y,INT_IN incy)241 cblas_dswap(INT_IN n, double *x, INT_IN incx, double *y, INT_IN incy)
242 {
243 /*c
244 c interchanges two vectors.
245 c uses unrolled loops for increments equal one.
246 c jack dongarra, linpack, 3/11/78.
247 c*/
248 double dtemp;
249 long i;
250 if (n<=0) return;
251 if (incx!=1 || incy!=1) {
252 /* unequal increments or equal increments not equal to 1 */
253 long iy, nn=n;
254 if (incx<0) x -= (n-1)*incx;
255 if (incy<0) y -= (n-1)*incx;
256 for (i=iy=0 ; nn-- ; i+=incx,iy+=incy) {
257 dtemp = x[i]; x[i] = y[iy]; y[iy] = dtemp;
258 }
259 } else {
260 /* both increments equal to 1 */
261 /* unroll loop */
262 long m = n%3;
263 for (i=0 ; i<m ; i++) { dtemp = x[i]; x[i] = y[i]; y[i] = dtemp; }
264 for ( ; i<n ; i+=3) {
265 dtemp = x[i ]; x[i ] = y[i ]; y[i ] = dtemp;
266 dtemp = x[i+1]; x[i+1] = y[i+1]; y[i+1] = dtemp;
267 dtemp = x[i+2]; x[i+2] = y[i+2]; y[i+2] = dtemp;
268 }
269 }
270 }
271
272 void
cblas_dcopy(INT_IN n,const double * x,INT_IN incx,double * y,INT_IN incy)273 cblas_dcopy(INT_IN n, const double *x, INT_IN incx,
274 double *y, INT_IN incy)
275 {
276 /*c
277 c copies a vector, x, to a vector, y.
278 c uses unrolled loops for increments equal to one.
279 c jack dongarra, linpack, 3/11/78.
280 c*/
281 long i;
282 if (n<=0) return;
283 if (incx!=1 || incy!=1) {
284 /* unequal increments or equal increments not equal to 1 */
285 long iy, nn=n;
286 if (incx<0) x -= (n-1)*incx;
287 if (incy<0) y -= (n-1)*incx;
288 for (i=iy=0 ; nn-- ; i+=incx,iy+=incy) y[iy] = x[i];
289 } else {
290 /* both increments equal to 1 */
291 /* unroll loop */
292 long m= n%7;
293 for (i=0 ; i<m ; i++) y[i] = x[i];
294 for ( ; i<n ; i+=7) {
295 y[i ] = x[i];
296 y[i+1] = x[i+1];
297 y[i+2] = x[i+2];
298 y[i+3] = x[i+3];
299 y[i+4] = x[i+4];
300 y[i+5] = x[i+5];
301 y[i+6] = x[i+6];
302 }
303 }
304 }
305
306 void
cblas_daxpy(INT_IN n,const double alpha,const double * x,INT_IN incx,double * y,INT_IN incy)307 cblas_daxpy(INT_IN n, const double alpha, const double *x,
308 INT_IN incx, double *y, INT_IN incy)
309 {
310 /*c
311 c constant times a vector plus a vector.
312 c uses unrolled loops for increments equal to one.
313 c jack dongarra, linpack, 3/11/78.
314 c*/
315 long i;
316 if (n<=0) return;
317 if (alpha == 0.0) return;
318 if (incx!=1 || incy!=1) {
319 /* unequal increments or equal increments not equal to 1 */
320 long iy, nn=n;
321 if (incx<0) x -= (n-1)*incx;
322 if (incy<0) y -= (n-1)*incy;
323 for (i=iy=0 ; nn-- ; i+=incx,iy+=incy) y[iy] += alpha*x[i];
324 } else {
325 /* both increments equal to 1 */
326 /* unroll loop */
327 long m = n%4;
328 for (i=0 ; i<m ; i++) y[i] += alpha*x[i];
329 for ( ; i<n ; i+=4) {
330 y[i ] += alpha*x[i];
331 y[i+1] += alpha*x[i+1];
332 y[i+2] += alpha*x[i+2];
333 y[i+3] += alpha*x[i+3];
334 }
335 }
336 }
337
338 void
cblas_drot(INT_IN n,double * x,INT_IN incx,double * y,INT_IN incy,const double c,const double s)339 cblas_drot(INT_IN n, double *x, INT_IN incx,
340 double *y, INT_IN incy, const double c, const double s)
341 {
342 /*c
343 c applies a plane rotation.
344 c jack dongarra, linpack, 3/11/78.
345 c*/
346 double dtemp;
347
348 if (n<=0) return;
349 if (incx!=1 || incy!=1) {
350 /* unequal increments or equal increments not equal to 1 */
351 long ix, iy, nn=n;
352 if (incx<0) x -= (n-1)*incx;
353 if (incy<0) y -= (n-1)*incy;
354 for (ix=iy=0 ; nn-- ; ix+=incx,iy+=incy) {
355 dtemp = c*x[ix] + s*y[iy];
356 y[iy] = c*y[iy] - s*x[ix];
357 x[ix] = dtemp;
358 }
359 } else {
360 /* both increments equal to 1 */
361 long i;
362 for (i=0 ; i<n ; i++) {
363 dtemp = c*x[i] + s*y[i];
364 y[i] = c*y[i] - s*x[i];
365 x[i] = dtemp;
366 }
367 }
368 return;
369 }
370
371 void
cblas_dscal(INT_IN n,const double alpha,double * x,INT_IN incx)372 cblas_dscal(INT_IN n, const double alpha, double *x, INT_IN incx)
373 {
374 /*c
375 c scales a vector by a constant.
376 c uses unrolled loops for increment equal to one.
377 c jack dongarra, linpack, 3/11/78.
378 c modified 3/93 to return if incx .le. 0.
379 c*/
380 long i, nn=n;
381 if (n<=0) return;
382 if (incx!=1) {
383 /* increment not equal to 1 */
384 if (incx<0) x -= (n-1)*incx;
385 for (i=0 ; nn-- ; i+=incx) x[i] *= alpha;
386 } else {
387 /* increment equal to 1 */
388 /* unroll loop */
389 long m= n%5;
390 for (i=0 ; i<m ; i++) x[i] *= alpha;
391 for ( ; i<n ; i+=5) {
392 x[i ] *= alpha;
393 x[i+1] *= alpha;
394 x[i+2] *= alpha;
395 x[i+3] *= alpha;
396 x[i+4] *= alpha;
397 }
398 }
399 }
400
401 /* ------------------------------------------------------------ level 2 */
402
403 void
cblas_dgemv(const enum CBLAS_ORDER order,const enum CBLAS_TRANSPOSE transa,INT_IN m,INT_IN n,const double alpha,const double * a,INT_IN lda,const double * x,INT_IN incx,const double beta,double * y,INT_IN incy)404 cblas_dgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE transa,
405 INT_IN m, INT_IN n, const double alpha,
406 const double *a, INT_IN lda, const double *x, INT_IN incx,
407 const double beta, double *y, INT_IN incy)
408 {
409 /*
410 * Purpose
411 * =======
412 *
413 * DGEMV performs one of the matrix-vector operations
414 *
415 * y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y,
416 *
417 * where alpha and beta are scalars, x and y are vectors and A is an
418 * m by n matrix.
419 *
420 * Parameters
421 * ==========
422 *
423 * ORDER - enum CBLAS_ORDER (ignored)
424 *
425 * TRANSA - enum CBLAS_TRANSPOSE
426 * On entry, TRANS specifies the operation to be performed as
427 * follows:
428 * TRANSA = CblasNoTrans y := alpha*A*x + beta*y.
429 * TRANSA = CblasTrans y := alpha*A'*x + beta*y.
430 * TRANSA = CblasConjTrans y := alpha*A'*x + beta*y.
431 * Unchanged on exit.
432 *
433 * M - INTEGER.
434 * On entry, M specifies the number of rows of the matrix A.
435 * M must be at least zero.
436 * Unchanged on exit.
437 *
438 * N - INTEGER.
439 * On entry, N specifies the number of columns of the matrix A.
440 * N must be at least zero.
441 * Unchanged on exit.
442 *
443 * ALPHA - DOUBLE PRECISION.
444 * On entry, ALPHA specifies the scalar alpha.
445 * Unchanged on exit.
446 *
447 * A - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
448 * Before entry, the leading m by n part of the array A must
449 * contain the matrix of coefficients.
450 * Unchanged on exit.
451 *
452 * LDA - INTEGER.
453 * On entry, LDA specifies the first dimension of A as declared
454 * in the calling (sub) program. LDA must be at least
455 * max( 1, m ).
456 * Unchanged on exit.
457 *
458 * X - DOUBLE PRECISION array of DIMENSION at least
459 * ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'
460 * and at least
461 * ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
462 * Before entry, the incremented array X must contain the
463 * vector x.
464 * Unchanged on exit.
465 *
466 * INCX - INTEGER.
467 * On entry, INCX specifies the increment for the elements of
468 * X. INCX must not be zero.
469 * Unchanged on exit.
470 *
471 * BETA - DOUBLE PRECISION.
472 * On entry, BETA specifies the scalar beta. When BETA is
473 * supplied as zero then Y need not be set on input.
474 * Unchanged on exit.
475 *
476 * Y - DOUBLE PRECISION array of DIMENSION at least
477 * ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
478 * and at least
479 * ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
480 * Before entry with BETA non-zero, the incremented array Y
481 * must contain the vector y. On exit, Y is overwritten by the
482 * updated vector y.
483 *
484 * INCY - INTEGER.
485 * On entry, INCY specifies the increment for the elements of
486 * Y. INCY must not be zero.
487 * Unchanged on exit.
488 */
489 /*
490 * Level 2 Blas routine.
491 *
492 * -- Written on 22-October-1986.
493 * Jack Dongarra, Argonne National Lab.
494 * Jeremy Du Croz, Nag Central Office.
495 * Sven Hammarling, Nag Central Office.
496 * Richard Hanson, Sandia National Labs.
497 */
498 double temp;
499 long i, info, j, lenx, leny, m3, nn=n, mm=m;
500 int trans = transa;
501 /*
502 * Test the input parameters.
503 */
504 info = 0;
505 if (transa!=CblasNoTrans && transa!=CblasTrans && transa!=CblasConjTrans) {
506 info = 1;
507 } else if ( m<0 ) {
508 info = 2;
509 } else if ( n<0 ) {
510 info = 3;
511 } else if ( !lda || lda<m ) {
512 info = 6;
513 } else if ( incx==0 ) {
514 info = 8;
515 } else if ( incy==0 ) {
516 info = 11;
517 }
518 if ( info!=0 ) {
519 xerbla( "ydgemv ", info );
520 return;
521 }
522 /*
523 * Quick return if possible.
524 */
525 if ( ( m==0 )||( n==0 ) || ( ( alpha==0.0 )&&( beta==1.0 ) ) )
526 return;
527 /*
528 * Set LENX and LENY, the lengths of the vectors x and y, and set
529 * up the start points in X and Y.
530 */
531 if ( trans==CblasNoTrans ) {
532 lenx = nn;
533 leny = mm;
534 } else {
535 lenx = mm;
536 leny = nn;
537 }
538 if ( incx<0 ) x -= (lenx-1)*incx;
539 if ( incy<0 ) y -= (lenx-1)*incy;
540 m3 = mm&3;
541 /*
542 * Start the operations. In this version the elements of A are
543 * accessed sequentially with one pass through A.
544 *
545 * First form y := beta*y.
546 */
547 if ( beta!=1.0 ) {
548 if ( incy==1 ) {
549 if ( beta==0.0 ) {
550 for (i=0 ; i<leny ; i++) y[i] = 0.0;
551 } else {
552 for (i=0 ; i<leny ; i++) y[i] *= beta;
553 }
554 } else {
555 if ( beta==0.0 ) {
556 for (i=0 ; leny-- ; i+=incy) y[i] = 0.0;
557 } else {
558 for (i=0 ; leny-- ; i+=incy) y[i] *= beta;
559 }
560 }
561 }
562 if ( alpha==0.0 )
563 return;
564 if ( trans==CblasNoTrans ) {
565 /* Form y := alpha*A*x + y. */
566 if ( incy==1 ) {
567 for (j=0 ; nn-- ; j+=incx,a+=lda) {
568 if ( x[j]!=0.0 ) {
569 temp = alpha*x[j];
570 /* unroll innermost loop */
571 for (i=0 ; i<m3 ; i++) y[i] += temp*a[i];
572 for ( ; i<mm ; i+=4) {
573 y[i ] += temp*a[i];
574 y[i+1] += temp*a[i+1];
575 y[i+2] += temp*a[i+2];
576 y[i+3] += temp*a[i+3];
577 }
578 }
579 }
580 } else {
581 long iy;
582 for (j=0 ; nn-- ; j+=incx,a+=lda) {
583 if ( x[j]!=0.0 ) {
584 temp = alpha*x[j];
585 /* unroll innermost loop */
586 for (i=iy=0 ; i<m3 ; i++,iy+=incy) y[iy] += temp*a[i];
587 for ( ; i<mm ; i+=4,iy+=4*incy) {
588 y[iy ] += temp*a[i];
589 y[iy+ incy] += temp*a[i+1];
590 y[iy+2*incy] += temp*a[i+2];
591 y[iy+3*incy] += temp*a[i+3];
592 }
593 }
594 }
595 }
596 } else {
597 /* Form y := alpha*A'*x + y. */
598 if ( incx==1 ) {
599 for (j=0 ; nn-- ; j+=incy,a+=lda) {
600 temp = 0.0;
601 /* unroll innermost loop */
602 for (i=0 ; i<m3 ; i++) temp += a[i]*x[i];
603 for ( ; i<mm ; i+=4)
604 temp += a[i]*x[i] + a[i+1]*x[i+1] + a[i+2]*x[i+2] + a[i+3]*x[i+3];
605 y[j] += alpha*temp;
606 }
607 } else {
608 long ix;
609 for (j=0 ; nn-- ; j+=incy,a+=lda) {
610 temp = 0.0;
611 /* unroll innermost loop */
612 for (i=ix=0 ; i<m3 ; i++,ix+=incx) temp += a[i]*x[ix];
613 for ( ; i<mm ; i+=4,ix+=4*incx)
614 temp += a[i]*x[ix] + a[i+1]*x[ix+incx] +
615 a[i+2]*x[ix+2*incx] + a[i+3]*x[ix+3*incx];
616 y[j] += alpha*temp;
617 }
618 }
619 }
620
621 /* End of DGEMV */
622 }
623
624 void
cblas_dtrmv(const enum CBLAS_ORDER order,const enum CBLAS_UPLO uplo,const enum CBLAS_TRANSPOSE transa,const enum CBLAS_DIAG diag,INT_IN n,const double * a,INT_IN lda,double * x,INT_IN incx)625 cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo,
626 const enum CBLAS_TRANSPOSE transa, const enum CBLAS_DIAG diag,
627 INT_IN n, const double *a, INT_IN lda,
628 double *x, INT_IN incx)
629 {
630 /*
631 * Purpose
632 * =======
633 *
634 * DTRMV performs one of the matrix-vector operations
635 *
636 * x := A*x, or x := A'*x,
637 *
638 * where x is an n element vector and A is an n by n unit, or non-unit,
639 * upper or lower triangular matrix.
640 *
641 * Parameters
642 * ==========
643 *
644 * ORDER - enum CBLAS_ORDER (ignored)
645 *
646 * UPLO - enum CBLAS_UPLO
647 * On entry, UPLO specifies whether the matrix is an upper or
648 * lower triangular matrix as follows:
649 * UPLO = CblasUpper A is an upper triangular matrix.
650 * UPLO = CblasLower A is a lower triangular matrix.
651 *
652 * TRANSA - enum CBLAS_TRANSPOSE
653 * On entry, TRANS specifies the operation to be performed as
654 * follows:
655 * TRANS = CblasNoTrans x := A*x.
656 * TRANS = CblasTrans x := A'*x.
657 * TRANS = CblasConjTrans x := A'*x.
658 *
659 * DIAG - enum CBLAS_DIAG
660 * On entry, DIAG specifies whether or not A is unit
661 * triangular as follows:
662 * DIAG = CblasUnit A is assumed to be unit triangular.
663 * DIAG = CblasNonUnit A is not assumed to be unit
664 * triangular.
665 *
666 * N - INTEGER.
667 * On entry, N specifies the order of the matrix A.
668 * N must be at least zero.
669 *
670 * A - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
671 * Before entry with UPLO = 'U' or 'u', the leading n by n
672 * upper triangular part of the array A must contain the upper
673 * triangular matrix and the strictly lower triangular part of
674 * A is not referenced.
675 * Before entry with UPLO = 'L' or 'l', the leading n by n
676 * lower triangular part of the array A must contain the lower
677 * triangular matrix and the strictly upper triangular part of
678 * A is not referenced.
679 * Note that when DIAG = 'U' or 'u', the diagonal elements of
680 * A are not referenced either, but are assumed to be unity.
681 * Unchanged on exit.
682 *
683 * LDA - INTEGER.
684 * On entry, LDA specifies the first dimension of A as declared
685 * in the calling (sub) program. LDA must be at least
686 * max( 1, n ).
687 * Unchanged on exit.
688 *
689 * X - DOUBLE PRECISION array of dimension at least
690 * ( 1 + ( n - 1 )*abs( INCX ) ).
691 * Before entry, the incremented array X must contain the n
692 * element vector x. On exit, X is overwritten with the
693 * tranformed vector x.
694 *
695 * INCX - INTEGER.
696 * On entry, INCX specifies the increment for the elements of
697 * X. INCX must not be zero.
698 * Unchanged on exit.
699 */
700 /*
701 * Level 2 Blas routine.
702 *
703 * -- Written on 22-October-1986.
704 * Jack Dongarra, Argonne National Lab.
705 * Jeremy Du Croz, Nag Central Office.
706 * Sven Hammarling, Nag Central Office.
707 * Richard Hanson, Sandia National Labs.
708 */
709 double temp;
710 long i, info, ix, j, jx;
711 int nounit, trans = transa;
712 /*
713 * Test the input parameters.
714 */
715 info = 0;
716 if (uplo!=CblasUpper && uplo!=CblasLower ) {
717 info = 1;
718 } else if (transa!=CblasNoTrans && transa!=CblasTrans &&
719 transa!=CblasConjTrans) {
720 info = 2;
721 } else if (diag!=CblasNonUnit && diag!=CblasUnit) {
722 info = 3;
723 } else if ( n<0 ) {
724 info = 4;
725 } else if ( !lda || lda<n ) {
726 info = 6;
727 } else if ( incx==0 ) {
728 info = 8;
729 }
730 if ( info!=0 ) {
731 xerbla( "ydtrmv ", info );
732 return;
733 }
734 /*
735 * Quick return if possible.
736 */
737 if ( n==0 ) return;
738
739 nounit = (diag==CblasNonUnit);
740 /*
741 * Set up the start point in X if the increment is negative.
742 */
743 if ( incx<=0 ) x -= (n-1)*incx;
744 /*
745 * Start the operations. In this version the elements of A are
746 * accessed sequentially with one pass through A.
747 */
748 if ( trans==CblasNoTrans ) {
749 /* Form x := A*x. */
750 if ( uplo==CblasUpper ) {
751 if ( incx==1 ) {
752 for (j=0 ; j<n ; j++,a+=lda) {
753 if ( x[j]!=0.0 ) {
754 temp = x[j];
755 /* unroll inner loop */
756 for (i=0 ; i<(j&3) ; i++) x[i] += temp*a[i];
757 for ( ; i<j ; i+=4) {
758 x[i ] += temp*a[i];
759 x[i+1] += temp*a[i+1];
760 x[i+2] += temp*a[i+2];
761 x[i+3] += temp*a[i+3];
762 }
763 if ( nounit ) x[j] *= a[j];
764 }
765 }
766 } else {
767 for (j=jx=0 ; j<n ; j++,jx+=incx,a+=lda) {
768 if ( x[jx]!=0.0 ) {
769 temp = x[jx];
770 /* unroll inner loop */
771 for (i=ix=0 ; i<(j&3) ; i++,ix+=incx) x[ix] += temp*a[i];
772 for ( ; i<j ; i+=4,ix+=4*incx) {
773 x[ix ] += temp*a[i];
774 x[ix+ incx] += temp*a[i+1];
775 x[ix+2*incx] += temp*a[i+2];
776 x[ix+3*incx] += temp*a[i+3];
777 }
778 if ( nounit ) x[jx] *= a[j];
779 }
780 }
781 }
782 } else {
783 long count;
784 if ( incx==1 ) {
785 for (j=n-1,a+=j*lda ; j>=0 ; j--,a-=lda) {
786 if ( x[j]!=0.0 ) {
787 temp = x[j];
788 /* unroll inner loop */
789 count = (n-1-j)&3;
790 for (i=n-1 ; count-- ; i--) x[i] += temp*a[i];
791 for ( ; i>j ; i-=4) {
792 x[i ] += temp*a[i];
793 x[i-1] += temp*a[i-1];
794 x[i-2] += temp*a[i-2];
795 x[i-3] += temp*a[i-3];
796 }
797 if ( nounit ) x[j] *= a[j];
798 }
799 }
800 } else {
801 for (j=n-1,jx=j*incx,a+=j*lda ; j>=0 ; j--,jx-=incx,a-=lda) {
802 if ( x[jx]!=0.0 ) {
803 temp = x[jx];
804 /* unroll inner loop */
805 count = (n-1-j)&3;
806 for (i=n-1,ix=i*incx ; count-- ; i--,ix-=incx) x[ix] += temp*a[i];
807 for ( ; i>j ; i-=4,ix-=4*incx) {
808 x[ix ] += temp*a[i];
809 x[ix- incx] += temp*a[i-1];
810 x[ix-2*incx] += temp*a[i-2];
811 x[ix-3*incx] += temp*a[i-3];
812 }
813 if ( nounit ) x[jx] *= a[j];
814 }
815 }
816 }
817 }
818 } else {
819 /* Form x := A'*x. */
820 long count;
821 if ( uplo==CblasUpper ) {
822 if ( incx==1 ) {
823 for (j=n-1,a+=j*lda ; j>=0 ; j--,a-=lda) {
824 temp = x[j];
825 if ( nounit ) temp*= a[j];
826 if (j) {
827 /* unroll inner loop */
828 count = (j-1)&3;
829 for (i=j-1 ; count-- ; i--) temp += a[i]*x[i];
830 for ( ; i>0 ; i-=4) temp += a[i]*x[i] +
831 a[i-1]*x[i-1] + a[i-2]*x[i-2] + a[i-3]*x[i-3];
832 }
833 x[j] = temp;
834 }
835 } else {
836 for (j=n-1,jx=j*incx,a+=j*lda ; j>=0 ; j--,jx-=incx,a-=lda) {
837 temp = x[jx];
838 if ( nounit ) temp *= a[j];
839 if (j) {
840 /* unroll inner loop */
841 count = (j-1)&3;
842 for (i=j-1,ix=i*incx ; count-- ; i--,ix-=incx) temp += a[i]*x[ix];
843 for ( ; i>0 ; i-=4,ix-=4*incx) temp += a[i]*x[ix] +
844 a[i-1]*x[i-incx] + a[i-2]*x[i-2*incx] + a[i-3]*x[i-3*incx];
845 }
846 x[jx] = temp;
847 }
848 }
849 } else {
850 if ( incx==1 ) {
851 for (j=0 ; j<n ; j++,a+=lda) {
852 temp = x[j];
853 if ( nounit ) temp *= a[j];
854 /* unroll inner loop */
855 count = (n-1-j)&3;
856 for (i=j+1 ; count-- ; i++) temp += a[i]*x[i];
857 for ( ; i<n ; i+=4) temp += a[i]*x[i] +
858 a[i+1]*x[i+1] + a[i+2]*x[i+2] + a[i+3]*x[i+3];
859 x[j] = temp;
860 }
861 } else {
862 for (j=jx=0 ; j<n ; j++,jx+=incx,a+=lda) {
863 temp = x[jx];
864 if ( nounit ) temp *= a[j];
865 /* unroll inner loop */
866 count = (n-1-j)&3;
867 for (i=j+1,ix=i*incx ; count-- ; i++,ix+=incx) temp += a[i]*x[ix];
868 for ( ; i<n ; i+=4,ix+=4*incx) temp += a[i]*x[ix] +
869 a[i+1]*x[i+incx] + a[i+2]*x[i+2*incx] + a[i+3]*x[i+3*incx];
870 x[jx] = temp;
871 }
872 }
873 }
874 }
875
876 return;
877 /* End of DTRMV */
878 }
879
880 void
cblas_dtrsv(const enum CBLAS_ORDER order,const enum CBLAS_UPLO uplo,const enum CBLAS_TRANSPOSE transa,const enum CBLAS_DIAG diag,INT_IN n,const double * a,INT_IN lda,double * x,INT_IN incx)881 cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO uplo,
882 const enum CBLAS_TRANSPOSE transa, const enum CBLAS_DIAG diag,
883 INT_IN n, const double *a, INT_IN lda,
884 double *x, INT_IN incx)
885 {
886 /*
887 * Purpose
888 * =======
889 *
890 * DTRSV solves one of the systems of equations
891 *
892 * A*x = b, or A'*x = b,
893 *
894 * where b and x are n element vectors and A is an n by n unit, or
895 * non-unit, upper or lower triangular matrix.
896 *
897 * No test for singularity or near-singularity is included in this
898 * routine. Such tests must be performed before calling this routine.
899 *
900 * Parameters
901 * ==========
902 *
903 * ORDER - enum CBLAS_ORDER (ignored)
904 *
905 * UPLO - enum CBLAS_UPLO
906 * On entry, UPLO specifies whether the matrix is an upper or
907 * lower triangular matrix as follows:
908 * UPLO = CblasUpper A is an upper triangular matrix.
909 * UPLO = CblasLower A is a lower triangular matrix.
910 *
911 * TRANSA - enum CBLAS_TRANSPOSE
912 * On entry, TRANS specifies the operation to be performed as
913 * follows:
914 * TRANS = CblasNoTrans x := A*x.
915 * TRANS = CblasTrans x := A'*x.
916 * TRANS = CblasConjTrans x := A'*x.
917 *
918 * DIAG - enum CBLAS_DIAG
919 * On entry, DIAG specifies whether or not A is unit
920 * triangular as follows:
921 * DIAG = CblasUnit A is assumed to be unit triangular.
922 * DIAG = CblasNonUnit A is not assumed to be unit
923 * triangular.
924 *
925 * N - INTEGER.
926 * On entry, N specifies the order of the matrix A.
927 * N must be at least zero.
928 * Unchanged on exit.
929 *
930 * A - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
931 * Before entry with UPLO = 'U' or 'u', the leading n by n
932 * upper triangular part of the array A must contain the upper
933 * triangular matrix and the strictly lower triangular part of
934 * A is not referenced.
935 * Before entry with UPLO = 'L' or 'l', the leading n by n
936 * lower triangular part of the array A must contain the lower
937 * triangular matrix and the strictly upper triangular part of
938 * A is not referenced.
939 * Note that when DIAG = 'U' or 'u', the diagonal elements of
940 * A are not referenced either, but are assumed to be unity.
941 * Unchanged on exit.
942 *
943 * LDA - INTEGER.
944 * On entry, LDA specifies the first dimension of A as declared
945 * in the calling (sub) program. LDA must be at least
946 * max( 1, n ).
947 * Unchanged on exit.
948 *
949 * X - DOUBLE PRECISION array of dimension at least
950 * ( 1 + ( n - 1 )*abs( INCX ) ).
951 * Before entry, the incremented array X must contain the n
952 * element right-hand side vector b. On exit, X is overwritten
953 * with the solution vector x.
954 *
955 * INCX - INTEGER.
956 * On entry, INCX specifies the increment for the elements of
957 * X. INCX must not be zero.
958 * Unchanged on exit.
959 */
960 /*
961 * Level 2 Blas routine.
962 *
963 * -- Written on 22-October-1986.
964 * Jack Dongarra, Argonne National Lab.
965 * Jeremy Du Croz, Nag Central Office.
966 * Sven Hammarling, Nag Central Office.
967 * Richard Hanson, Sandia National Labs.
968 */
969 double temp;
970 long i, info, ix, j, jx;
971 int nounit, trans = transa;
972 /* Test the input parameters */
973 info = 0;
974 if (uplo!=CblasUpper && uplo!=CblasLower ) {
975 info = 1;
976 } else if (transa!=CblasNoTrans && transa!=CblasTrans &&
977 transa!=CblasConjTrans) {
978 info = 2;
979 } else if (diag!=CblasNonUnit && diag!=CblasUnit) {
980 info = 3;
981 } else if ( n<0 ) {
982 info = 4;
983 } else if ( !lda || lda<n ) {
984 info = 6;
985 } else if ( incx==0 ) {
986 info = 8;
987 }
988 if ( info!=0 ) {
989 xerbla( "ydtrsv ", info );
990 return;
991 }
992 /* Quick return if possible */
993 if ( n==0 ) return;
994
995 nounit = (diag==CblasNonUnit);
996 /*
997 * Set up the start point in X if the increment is not unity.
998 */
999 if ( incx<=0 ) x-= (n-1)*incx;
1000 /*
1001 * Start the operations. In this version the elements of A are
1002 * accessed sequentially with one pass through A.
1003 */
1004 if ( trans==CblasNoTrans ) {
1005 /* Form x := inv( A )*x */
1006 if ( uplo==CblasUpper ) {
1007 long count;
1008 if ( incx==1 ) {
1009 for (j=n-1,a+=j*lda ; j>=0 ; j--,a-=lda) {
1010 if ( x[j]!=0.0 ) {
1011 if ( nounit ) x[j] /= a[j];
1012 if (!j) break;
1013 temp = x[j];
1014 /* unroll inner loop */
1015 count = (j-1)&3;
1016 for (i=j-1 ; count-- ; i--) x[i] -= temp*a[i];
1017 for ( ; i>0 ; i-=4) {
1018 x[i ] -= temp*a[i];
1019 x[i-1] -= temp*a[i-1];
1020 x[i-2] -= temp*a[i-2];
1021 x[i-3] -= temp*a[i-3];
1022 }
1023 }
1024 }
1025 } else {
1026 for (j=n-1,jx=j*incx,a+=j*lda ; j>=0 ; j--,jx-=incx,a-=lda) {
1027 if ( x[jx]!=0.0 ) {
1028 if ( nounit ) x[jx] /= a[j];
1029 if (!j) break;
1030 temp = x[jx];
1031 /* unroll inner loop */
1032 count = (j-1)&3;
1033 for (i=j-1,ix=i*incx ; count-- ; i--,ix-=incx) x[ix] -= temp*a[i];
1034 for ( ; i>0 ; i-=4,ix-=4*incx) {
1035 x[ix ] -= temp*a[i];
1036 x[ix- incx] -= temp*a[i-1];
1037 x[ix-2*incx] -= temp*a[i-2];
1038 x[ix-3*incx] -= temp*a[i-3];
1039 }
1040 }
1041 }
1042 }
1043 } else {
1044 long count;
1045 if ( incx==1 ) {
1046 for (j=0 ; j<n ; j++,a+=lda) {
1047 if ( x[j]!=0.0 ) {
1048 if ( nounit ) x[j] /= a[j];
1049 temp = x[j];
1050 /* unroll inner loop */
1051 count = (n-1-j)&3;
1052 for (i=j+1 ; count-- ; i++) x[i] -= temp*a[i];
1053 for ( ; i<n ; i+=4) {
1054 x[i ] -= temp*a[i];
1055 x[i+1] -= temp*a[i+1];
1056 x[i+2] -= temp*a[i+2];
1057 x[i+3] -= temp*a[i+3];
1058 }
1059 }
1060 }
1061 } else {
1062 for (j=jx=0 ; j<n ; j++,jx+=incx,a+=lda) {
1063 if ( x[jx]!=0.0 ) {
1064 if ( nounit ) x[jx] /= a[j];
1065 temp = x[jx];
1066 /* unroll inner loop */
1067 count = (n-1-j)&3;
1068 for (i=j+1,ix=i*incx ; count-- ; i++,ix+=incx) x[ix] -= temp*a[i];
1069 for ( ; i<n ; i+=4,ix+=4*incx) {
1070 x[ix ] -= temp*a[i];
1071 x[ix+ incx] -= temp*a[i+1];
1072 x[ix+2*incx] -= temp*a[i+2];
1073 x[ix+3*incx] -= temp*a[i+3];
1074 }
1075 }
1076 }
1077 }
1078 }
1079 } else {
1080 /* Form x := inv( A' )*x */
1081 if ( uplo==CblasUpper ) {
1082 if ( incx==1 ) {
1083 for (j=0 ; j<n ; j++,a+=lda) {
1084 temp = x[j];
1085 /* unroll inner loop */
1086 for (i=0 ; i<(j&3) ; i++) temp -= a[i]*x[i];
1087 for ( ; i<j ; i+=4) temp -= a[i]*x[i] +
1088 a[i+1]*x[i+1] + a[i+2]*x[i+2] + a[i+3]*x[i+3];
1089 if ( nounit ) temp /= a[j];
1090 x[j] = temp;
1091 }
1092 } else {
1093 for (j=jx=0 ; j<n ; j++,jx+=incx,a+=lda) {
1094 temp = x[jx];
1095 /* unroll inner loop */
1096 for (i=ix=0 ; i<(j&3) ; i++,ix+=incx) temp -= a[i]*x[ix];
1097 for ( ; i<j ; i+=4,ix+=4*incx) temp -= a[i]*x[ix] +
1098 a[i+1]*x[ix+incx] + a[i+2]*x[ix+2*incx] + a[i+3]*x[ix+3*incx];
1099 if ( nounit ) temp /= a[j];
1100 x[jx] = temp;
1101 }
1102 }
1103 } else {
1104 long count;
1105 if ( incx==1 ) {
1106 for (j=n-1,a+=j*lda ; j>=0 ; j--,a-=lda) {
1107 temp = x[j];
1108 /* unroll inner loop */
1109 count = (n-1-j)&3;
1110 for (i=n-1 ; count-- ; i--) temp -= a[i]*x[i];
1111 for ( ; i>j ; i-=4) temp -= a[i]*x[i] +
1112 a[i-1]*x[i-1] + a[i-2]*x[i-2] + a[i-3]*x[i-3];
1113 if ( nounit ) temp /= a[j];
1114 x[j] = temp;
1115 }
1116 } else {
1117 for (j=n-1,jx=j*incx,a+=j*lda ; j>=0 ; j--,jx-=incx,a-=lda) {
1118 temp = x[jx];
1119 /* unroll inner loop */
1120 count = (n-1-j)&3;
1121 for (i=n-1,ix=i*incx ; count-- ; i--,ix-=incx) temp -= a[i]*x[ix];
1122 for ( ; i>j ; i-=4,ix-=4*incx) temp -= a[i]*x[ix] +
1123 a[i-1]*x[ix-incx] + a[i-2]*x[ix-2*incx] + a[i-3]*x[ix-3*incx];
1124 if ( nounit ) temp /= a[j];
1125 x[jx] = temp;
1126 }
1127 }
1128 }
1129 }
1130
1131 /* End of DTRSV */
1132 }
1133
1134 void
cblas_dger(const enum CBLAS_ORDER order,INT_IN m,INT_IN n,const double alpha,const double * x,INT_IN incx,const double * y,INT_IN incy,double * a,INT_IN lda)1135 cblas_dger(const enum CBLAS_ORDER order, INT_IN m, INT_IN n,
1136 const double alpha, const double *x, INT_IN incx,
1137 const double *y, INT_IN incy, double *a, INT_IN lda)
1138 {
1139 /*
1140 * Purpose
1141 * =======
1142 *
1143 * DGER performs the rank 1 operation
1144 *
1145 * A := alpha*x*y' + A,
1146 *
1147 * where alpha is a scalar, x is an m element vector, y is an n element
1148 * vector and A is an m by n matrix.
1149 *
1150 * Parameters
1151 * ==========
1152 *
1153 * ORDER - enum CBLAS_ORDER (ignored)
1154 *
1155 * M - INTEGER.
1156 * On entry, M specifies the number of rows of the matrix A.
1157 * M must be at least zero.
1158 * Unchanged on exit.
1159 *
1160 * N - INTEGER.
1161 * On entry, N specifies the number of columns of the matrix A.
1162 * N must be at least zero.
1163 * Unchanged on exit.
1164 *
1165 * ALPHA - DOUBLE PRECISION.
1166 * On entry, ALPHA specifies the scalar alpha.
1167 * Unchanged on exit.
1168 *
1169 * X - DOUBLE PRECISION array of dimension at least
1170 * ( 1 + ( m - 1 )*abs( INCX ) ).
1171 * Before entry, the incremented array X must contain the m
1172 * element vector x.
1173 * Unchanged on exit.
1174 *
1175 * INCX - INTEGER.
1176 * On entry, INCX specifies the increment for the elements of
1177 * X. INCX must not be zero.
1178 * Unchanged on exit.
1179 *
1180 * Y - DOUBLE PRECISION array of dimension at least
1181 * ( 1 + ( n - 1 )*abs( INCY ) ).
1182 * Before entry, the incremented array Y must contain the n
1183 * element vector y.
1184 * Unchanged on exit.
1185 *
1186 * INCY - INTEGER.
1187 * On entry, INCY specifies the increment for the elements of
1188 * Y. INCY must not be zero.
1189 * Unchanged on exit.
1190 *
1191 * A - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
1192 * Before entry, the leading m by n part of the array A must
1193 * contain the matrix of coefficients. On exit, A is
1194 * overwritten by the updated matrix.
1195 *
1196 * LDA - INTEGER.
1197 * On entry, LDA specifies the first dimension of A as declared
1198 * in the calling (sub) program. LDA must be at least
1199 * max( 1, m ).
1200 * Unchanged on exit.
1201 */
1202 /*
1203 * Level 2 Blas routine.
1204 *
1205 * -- Written on 22-October-1986.
1206 * Jack Dongarra, Argonne National Lab.
1207 * Jeremy Du Croz, Nag Central Office.
1208 * Sven Hammarling, Nag Central Office.
1209 * Richard Hanson, Sandia National Labs.
1210 */
1211 double temp;
1212 long i, info, j, m3, mm = m, nn = n, inc_x = incx, inc_y = incy;
1213 /* Test the input parameters. */
1214 info = 0;
1215 if ( m<0 ) {
1216 info = 1;
1217 } else if ( n<0 ) {
1218 info = 2;
1219 } else if ( incx==0 ) {
1220 info = 5;
1221 } else if ( incy==0 ) {
1222 info = 7;
1223 } else if ( !lda || lda<m ) {
1224 info = 9;
1225 }
1226 if ( info!=0 ) {
1227 xerbla( "ydger ", info );
1228 return;
1229 }
1230 /* Quick return if possible. */
1231 if ( ( m==0 ) || ( n==0 ) || ( alpha==0.0 ) )
1232 return;
1233 /*
1234 * Start the operations. In this version the elements of A are
1235 * accessed sequentially with one pass through A.
1236 */
1237 m3 = mm&3;
1238 if ( inc_y<0 ) y -= (nn-1)*inc_y;
1239 if ( inc_x==1 ) {
1240 for (j=0 ; nn-- ; j+=inc_y,a+=lda)
1241 if ( y[j]!=0.0 ) {
1242 temp = alpha*y[j];
1243 /* unroll innermost loop */
1244 for (i=0 ; i<m3 ; i++) a[i] += x[i]*temp;
1245 for ( ; i<mm ; i+=4) {
1246 a[i ] += x[i]*temp;
1247 a[i+1] += x[i+1]*temp;
1248 a[i+2] += x[i+2]*temp;
1249 a[i+3] += x[i+3]*temp;
1250 }
1251 }
1252 } else {
1253 long ix;
1254 if ( inc_x<0 ) x -= (mm-1)*inc_x;
1255 for (j=0 ; nn-- ; j+=inc_y,a+=lda) {
1256 if ( y[j]!=0.0 ) {
1257 temp = alpha*y[j];
1258 /* unroll innermost loop */
1259 for (i=ix=0 ; i<m3 ; i++,ix+=inc_x) a[i] += x[ix]*temp;
1260 for ( ; i<mm ; i+=4,ix+=4*inc_x) {
1261 a[i ] += x[ix]*temp;
1262 a[i+1] += x[ix+inc_x]*temp;
1263 a[i+2] += x[ix+2*inc_x]*temp;
1264 a[i+3] += x[ix+3*inc_x]*temp;
1265 }
1266 }
1267 }
1268 }
1269
1270 /* End of DGER */
1271 }
1272
1273 /* ------------------------------------------------------------ level 3 */
1274
1275 void
cblas_dgemm(const enum CBLAS_ORDER order,const enum CBLAS_TRANSPOSE transa,const enum CBLAS_TRANSPOSE transb,INT_IN m,INT_IN n,INT_IN k,const double alpha,const double * a,INT_IN lda,const double * b,INT_IN ldb,const double beta,double * c,INT_IN ldc)1276 cblas_dgemm(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE transa,
1277 const enum CBLAS_TRANSPOSE transb, INT_IN m, INT_IN n,
1278 INT_IN k, const double alpha, const double *a,
1279 INT_IN lda, const double *b, INT_IN ldb,
1280 const double beta, double *c, INT_IN ldc)
1281 {
1282 /*
1283 * Purpose
1284 * =======
1285 *
1286 * DGEMM performs one of the matrix-matrix operations
1287 *
1288 * C := alpha*op( A )*op( B ) + beta*C,
1289 *
1290 * where op( X ) is one of
1291 *
1292 * op( X ) = X or op( X ) = X',
1293 *
1294 * alpha and beta are scalars, and A, B and C are matrices, with op( A )
1295 * an m by k matrix, op( B ) a k by n matrix and C an m by n matrix.
1296 *
1297 * Parameters
1298 * ==========
1299 *
1300 * ORDER - enum CBLAS_ORDER (ignored)
1301 *
1302 * TRANSA - enum CBLAS_TRANSPOSE
1303 * On entry, TRANSA specifies the form of op( A ) to be used in
1304 * the matrix multiplication as follows:
1305 * TRANSA = CblasNoTrans, op( A ) = A.
1306 * TRANSA = CblasTrans, op( A ) = A'.
1307 * TRANSA = CblasConjTrans, op( A ) = A'.
1308 *
1309 * TRANSB - enum CBLAS_TRANSPOSE
1310 * On entry, TRANSA specifies the form of op( B ) to be used in
1311 * the matrix multiplication as follows:
1312 * TRANSB = CblasNoTrans, op( B ) = B.
1313 * TRANSB = CblasTrans, op( B ) = B'.
1314 * TRANSB = CblasConjTrans, op( B ) = B'.
1315 *
1316 * M - INTEGER.
1317 * On entry, M specifies the number of rows of the matrix
1318 * op( A ) and of the matrix C. M must be at least zero.
1319 * Unchanged on exit.
1320 *
1321 * N - INTEGER.
1322 * On entry, N specifies the number of columns of the matrix
1323 * op( B ) and the number of columns of the matrix C. N must be
1324 * at least zero.
1325 * Unchanged on exit.
1326 *
1327 * K - INTEGER.
1328 * On entry, K specifies the number of columns of the matrix
1329 * op( A ) and the number of rows of the matrix op( B ). K must
1330 * be at least zero.
1331 * Unchanged on exit.
1332 *
1333 * ALPHA - DOUBLE PRECISION.
1334 * On entry, ALPHA specifies the scalar alpha.
1335 * Unchanged on exit.
1336 *
1337 * A - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is
1338 * k when TRANSA = 'N' or 'n', and is m otherwise.
1339 * Before entry with TRANSA = 'N' or 'n', the leading m by k
1340 * part of the array A must contain the matrix A, otherwise
1341 * the leading k by m part of the array A must contain the
1342 * matrix A.
1343 * Unchanged on exit.
1344 *
1345 * LDA - INTEGER.
1346 * On entry, LDA specifies the first dimension of A as declared
1347 * in the calling (sub) program. When TRANSA = 'N' or 'n' then
1348 * LDA must be at least max( 1, m ), otherwise LDA must be at
1349 * least max( 1, k ).
1350 * Unchanged on exit.
1351 *
1352 * B - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is
1353 * n when TRANSB = 'N' or 'n', and is k otherwise.
1354 * Before entry with TRANSB = 'N' or 'n', the leading k by n
1355 * part of the array B must contain the matrix B, otherwise
1356 * the leading n by k part of the array B must contain the
1357 * matrix B.
1358 * Unchanged on exit.
1359 *
1360 * LDB - INTEGER.
1361 * On entry, LDB specifies the first dimension of B as declared
1362 * in the calling (sub) program. When TRANSB = 'N' or 'n' then
1363 * LDB must be at least max( 1, k ), otherwise LDB must be at
1364 * least max( 1, n ).
1365 * Unchanged on exit.
1366 *
1367 * BETA - DOUBLE PRECISION.
1368 * On entry, BETA specifies the scalar beta. When BETA is
1369 * supplied as zero then C need not be set on input.
1370 * Unchanged on exit.
1371 *
1372 * C - DOUBLE PRECISION array of DIMENSION ( LDC, n ).
1373 * Before entry, the leading m by n part of the array C must
1374 * contain the matrix C, except when beta is zero, in which
1375 * case C need not be set on entry.
1376 * On exit, the array C is overwritten by the m by n matrix
1377 * ( alpha*op( A )*op( B ) + beta*C ).
1378 *
1379 * LDC - INTEGER.
1380 * On entry, LDC specifies the first dimension of C as declared
1381 * in the calling (sub) program. LDC must be at least
1382 * max( 1, m ).
1383 * Unchanged on exit.
1384 */
1385 /*
1386 * Level 3 Blas routine.
1387 *
1388 * -- Written on 8-February-1989.
1389 * Jack Dongarra, Argonne National Laboratory.
1390 * Iain Duff, AERE Harwell.
1391 * Jeremy Du Croz, Numerical Algorithms Group Ltd.
1392 * Sven Hammarling, Numerical Algorithms Group Ltd.
1393 */
1394 int nota, notb;
1395 long i, info, j, l, nrowa, nrowb, nn = n;
1396 double temp;
1397 /*
1398 * Set NOTA and NOTB as true if A and B respectively are not
1399 * transposed and set NROWA, NCOLA and NROWB as the number of rows
1400 * and columns of A and the number of rows of B respectively.
1401 */
1402 nota = transa==CblasNoTrans;
1403 notb = transb==CblasNoTrans;
1404 nrowa = nota? m : k;
1405 nrowb = notb? k : n;
1406 /* Test the input parameters. */
1407 info = 0;
1408 if ( !nota && transa!=CblasTrans && transa!=CblasConjTrans ) {
1409 info = 1;
1410 } else if ( !notb && transb!=CblasTrans && transb!=CblasConjTrans ) {
1411 info = 2;
1412 } else if ( m<0 ) {
1413 info = 3;
1414 } else if ( n<0 ) {
1415 info = 4;
1416 } else if ( k<0 ) {
1417 info = 5;
1418 } else if ( !lda || lda<nrowa ) {
1419 info = 8;
1420 } else if ( !ldb || ldb<nrowb ) {
1421 info = 10;
1422 } else if ( !ldc || ldc<m ) {
1423 info = 13;
1424 }
1425 if ( info!=0 ) {
1426 xerbla( "ydgemm ", info );
1427 return;
1428 }
1429 /* Quick return if possible. */
1430 if ( ( m==0 )||( n==0 ) ||
1431 ( ( ( alpha==0.0 ) || ( k==0 ) ) && ( beta==1.0 ) ) )
1432 return;
1433 /* And if alpha.eq.0.0. */
1434 if ( alpha==0.0 ) {
1435 if ( beta==0.0 ) {
1436 for ( ; nn-- ; c+=ldc)
1437 for (i=0 ; i<m ; i++) c[i] = 0.0;
1438 } else {
1439 for ( ; nn-- ; c+=ldc)
1440 for (i=0 ; i<m ; i++) c[i] *= beta;
1441 }
1442 return;
1443 }
1444 /* Start the operations. */
1445 if ( notb ) {
1446 if ( nota ) {
1447 /* Form C := alpha*A*B + beta*C */
1448 const double *al;
1449 long m3 = m&3;
1450 for ( ; nn-- ; b+=ldb,c+=ldc) {
1451 if ( beta==0.0 ) {
1452 for (i=0 ; i<m ; i++) c[i] = 0.0;
1453 } else if ( beta!=1.0 ) {
1454 for (i=0 ; i<m ; i++) c[i] *= beta;
1455 }
1456 for (l=0,al=a ; l<k ; l++,al+=lda) {
1457 if ( b[l]!=0.0 ) {
1458 temp = alpha*b[l];
1459 /* unroll innermost loop */
1460 for (i=0 ; i<m3 ; i++) c[i] += temp*al[i];
1461 for ( ; i<m ; i+=4) {
1462 c[i] += temp*al[i];
1463 c[i+1] += temp*al[i+1];
1464 c[i+2] += temp*al[i+2];
1465 c[i+3] += temp*al[i+3];
1466 }
1467 }
1468 }
1469 }
1470 } else {
1471 /* Form C := alpha*A'*B + beta*C */
1472 const double *ai;
1473 long k3 = k&3;
1474 for ( ; nn-- ; b+=ldb,c+=ldc) {
1475 for (i=0,ai=a ; i<m ; i++,ai+=lda) {
1476 temp = 0.0;
1477 /* unroll innermost loop */
1478 for (l=0 ; l<k3 ; l++) temp += ai[l]*b[l];
1479 for ( ; l<k ; l+=4)
1480 temp += ai[l]*b[l] + ai[l+1]*b[l+1] +
1481 ai[l+2]*b[l+2] + ai[l+3]*b[l+3];
1482 if ( beta==0.0 ) c[i] = alpha*temp;
1483 else c[i] = alpha*temp + beta*c[i];
1484 }
1485 }
1486 }
1487 } else {
1488 if ( nota ) {
1489 /* Form C := alpha*A*B' + beta*C */
1490 const double *al, *bl;
1491 long m3 = m&3;
1492 for (j=0 ; j<nn ; j++,c+=ldc) {
1493 if ( beta==0.0 ) for (i=0 ; i<m ; i++) c[i] = 0.0;
1494 else if ( beta!=1.0 ) for (i=0 ; i<m ; i++) c[i] *= beta;
1495 for (l=0,al=a,bl=b ; l<k ; l++,al+=lda,bl+=ldb) {
1496 if ( bl[j]!=0.0 ) {
1497 temp = alpha*bl[j];
1498 /* unroll innermost loop */
1499 for (i=0 ; i<m3 ; i++) c[i] += temp*al[i];
1500 for ( ; i<m ; i+=4) {
1501 c[i] += temp*al[i];
1502 c[i+1] += temp*al[i+1];
1503 c[i+2] += temp*al[i+2];
1504 c[i+3] += temp*al[i+3];
1505 }
1506 }
1507 }
1508 }
1509 } else {
1510 /* Form C := alpha*A'*B' + beta*C */
1511 const double *ai, *bl;
1512 long k3 = k&3;
1513 for (j=0 ; j<nn ; j++,c+=ldc) {
1514 for (i=0,ai=a ; i<m ; i++,ai+=lda) {
1515 temp = 0.0;
1516 /* unroll innermost loop */
1517 for (l=0,bl=b+j ; l<k3 ; l++,bl+=ldb) temp += ai[l]*bl[0];
1518 for ( ; l<k ; l +=4,bl+=4*ldb)
1519 temp += ai[l]*bl[0] + ai[l+1]*bl[ldb] +
1520 ai[l+2]*bl[2*ldb] + ai[l+3]*bl[3*ldb];
1521 if ( beta==0.0 ) c[i] = alpha*temp;
1522 else c[i] = alpha*temp + beta*c[i];
1523 }
1524 }
1525 }
1526 }
1527
1528 /* End of DGEMM */
1529 }
1530
1531 void
cblas_dtrmm(const enum CBLAS_ORDER order,const enum CBLAS_SIDE side,const enum CBLAS_UPLO uplo,const enum CBLAS_TRANSPOSE transa,const enum CBLAS_DIAG diag,INT_IN m,INT_IN n,const double alpha,const double * a,INT_IN lda,double * b,INT_IN ldb)1532 cblas_dtrmm(const enum CBLAS_ORDER order, const enum CBLAS_SIDE side,
1533 const enum CBLAS_UPLO uplo, const enum CBLAS_TRANSPOSE transa,
1534 const enum CBLAS_DIAG diag, INT_IN m, INT_IN n,
1535 const double alpha, const double *a, INT_IN lda,
1536 double *b, INT_IN ldb)
1537 {
1538 /*
1539 * Purpose
1540 * =======
1541 *
1542 * DTRMM performs one of the matrix-matrix operations
1543 *
1544 * B := alpha*op( A )*B, or B := alpha*B*op( A ),
1545 *
1546 * where alpha is a scalar, B is an m by n matrix, A is a unit, or
1547 * non-unit, upper or lower triangular matrix and op( A ) is one of
1548 *
1549 * op( A ) = A or op( A ) = A'.
1550 *
1551 * Parameters
1552 * ==========
1553 *
1554 * ORDER - enum CBLAS_ORDER (ignored)
1555 *
1556 * SIDE - enum CBLAS_SIDE
1557 * On entry, SIDE specifies whether op( A ) multiplies B from
1558 * the left or right as follows:
1559 * SIDE = CblasLeft B := alpha*op( A )*B.
1560 * SIDE = CblasRight B := alpha*B*op( A ).
1561 *
1562 * UPLO - enum CBLAS_UPLO
1563 * On entry, UPLO specifies whether the matrix is an upper or
1564 * lower triangular matrix as follows:
1565 * UPLO = CblasUpper A is an upper triangular matrix.
1566 * UPLO = CblasLower A is a lower triangular matrix.
1567 *
1568 * TRANSA - enum CBLAS_TRANSPOSE
1569 * On entry, TRANSA specifies the form of op( A ) to be used in
1570 * the matrix multiplication as follows:
1571 * TRANSA = CblasNoTrans, op( A ) = A.
1572 * TRANSA = CblasTrans, op( A ) = A'.
1573 * TRANSA = CblasConjTrans, op( A ) = A'.
1574 *
1575 * DIAG - enum CBLAS_DIAG
1576 * On entry, DIAG specifies whether or not A is unit
1577 * triangular as follows:
1578 * DIAG = CblasUnit A is assumed to be unit triangular.
1579 * DIAG = CblasNonUnit A is not assumed to be unit
1580 * triangular.
1581 *
1582 * M - INTEGER.
1583 * On entry, M specifies the number of rows of B. M must be at
1584 * least zero.
1585 * Unchanged on exit.
1586 *
1587 * N - INTEGER.
1588 * On entry, N specifies the number of columns of B. N must be
1589 * at least zero.
1590 * Unchanged on exit.
1591 *
1592 * ALPHA - DOUBLE PRECISION.
1593 * On entry, ALPHA specifies the scalar alpha. When alpha is
1594 * zero then A is not referenced and B need not be set before
1595 * entry.
1596 * Unchanged on exit.
1597 *
1598 * A - DOUBLE PRECISION array of DIMENSION ( LDA, k ), where k is m
1599 * when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'.
1600 * Before entry with UPLO = 'U' or 'u', the leading k by k
1601 * upper triangular part of the array A must contain the upper
1602 * triangular matrix and the strictly lower triangular part of
1603 * A is not referenced.
1604 * Before entry with UPLO = 'L' or 'l', the leading k by k
1605 * lower triangular part of the array A must contain the lower
1606 * triangular matrix and the strictly upper triangular part of
1607 * A is not referenced.
1608 * Note that when DIAG = 'U' or 'u', the diagonal elements of
1609 * A are not referenced either, but are assumed to be unity.
1610 * Unchanged on exit.
1611 *
1612 * LDA - INTEGER.
1613 * On entry, LDA specifies the first dimension of A as declared
1614 * in the calling (sub) program. When SIDE = 'L' or 'l' then
1615 * LDA must be at least max( 1, m ), when SIDE = 'R' or 'r'
1616 * then LDA must be at least max( 1, n ).
1617 * Unchanged on exit.
1618 *
1619 * B - DOUBLE PRECISION array of DIMENSION ( LDB, n ).
1620 * Before entry, the leading m by n part of the array B must
1621 * contain the matrix B, and on exit is overwritten by the
1622 * transformed matrix.
1623 *
1624 * LDB - INTEGER.
1625 * On entry, LDB specifies the first dimension of B as declared
1626 * in the calling (sub) program. LDB must be at least
1627 * max( 1, m ).
1628 * Unchanged on exit.
1629 */
1630 /*
1631 * Level 3 Blas routine.
1632 *
1633 * -- Written on 8-February-1989.
1634 * Jack Dongarra, Argonne National Laboratory.
1635 * Iain Duff, AERE Harwell.
1636 * Jeremy Du Croz, Numerical Algorithms Group Ltd.
1637 * Sven Hammarling, Numerical Algorithms Group Ltd.
1638 */
1639 int lside, nounit, upper;
1640 long i, info, j, k, nrowa, nn = n;
1641 double temp;
1642 /* Test the input parameters */
1643 lside = side==CblasLeft;
1644 nrowa = lside? m : n;
1645 nounit = diag==CblasNonUnit;
1646 upper = uplo==CblasUpper;
1647
1648 info = 0;
1649 if ( !lside && side!=CblasRight ) {
1650 info = 1;
1651 } else if ( !upper && uplo!=CblasLower ) {
1652 info = 2;
1653 } else if (transa!=CblasNoTrans && transa!=CblasTrans &&
1654 transa!=CblasConjTrans) {
1655 info = 3;
1656 } else if ( diag!=CblasNonUnit && diag!=CblasUnit ) {
1657 info = 4;
1658 } else if ( m<0 ) {
1659 info = 5;
1660 } else if ( n<0 ) {
1661 info = 6;
1662 } else if ( !lda || lda<nrowa ) {
1663 info = 9;
1664 } else if ( !ldb || ldb<m ) {
1665 info = 11;
1666 }
1667 if ( info!=0 ) {
1668 xerbla( "ydtrmm ", info );
1669 return;
1670 }
1671 /* Quick return if possible. */
1672 if ( n==0 )
1673 return;
1674 /* And when alpha.eq.0.0. */
1675 if ( alpha==0.0 ) {
1676 for ( ; nn-- ; b+=ldb)
1677 for (i=0 ; i<m ; i++) b[i] = 0.0;
1678 return;
1679 }
1680 /* Start the operations. */
1681 if ( lside ) {
1682 if ( transa==CblasNoTrans ) {
1683 /* Form B := alpha*A*B. */
1684 const double *ak;
1685 if ( upper ) {
1686 for ( ; nn-- ; b+=ldb) {
1687 for (k=0,ak=a ; k<m ; k++,ak+=lda) {
1688 if ( b[k]!=0.0 ) {
1689 temp = alpha*b[k];
1690 /* hand unroll inner loop */
1691 for (i=0 ; i<(k&3) ; i++) b[i] += temp*ak[i];
1692 for ( ; i<k ; i+=4) {
1693 b[i ] += temp*ak[i];
1694 b[i+1] += temp*ak[i+1];
1695 b[i+2] += temp*ak[i+2];
1696 b[i+3] += temp*ak[i+3];
1697 }
1698 if ( nounit ) temp *= ak[k];
1699 b[k] = temp;
1700 }
1701 }
1702 }
1703 } else {
1704 long count;
1705 a += (m-1)*lda;
1706 for ( ; nn-- ; b+=ldb) {
1707 for (k=m-1,ak=a ; k>=0 ; k--,ak-=lda) {
1708 if ( b[k]!=0.0 ) {
1709 temp = alpha*b[k];
1710 b[k] = temp;
1711 if ( nounit ) b[k] *= ak[k];
1712 /* hand unroll inner loop */
1713 count = (m-1-k)&3;
1714 for (i=k+1 ; count-- ; i++) b[i] += temp*ak[i];
1715 for ( ; i<m ; i+=4) {
1716 b[i ] += temp*ak[i];
1717 b[i+1] += temp*ak[i+1];
1718 b[i+2] += temp*ak[i+2];
1719 b[i+3] += temp*ak[i+3];
1720 }
1721 }
1722 }
1723 }
1724 }
1725 } else {
1726 /* Form B := alpha*B*A'. */
1727 const double *ai;
1728 if ( upper ) {
1729 a += (m-1)*lda;
1730 for ( ; nn-- ; b+=ldb) {
1731 for (i=m-1,ai=a ; i>=0 ; i--,a-=lda) {
1732 temp = b[i];
1733 if ( nounit ) temp *= ai[i];
1734 /* hand unroll inner loop */
1735 for (k=0 ; k<(i&3) ; k++) temp += ai[k]*b[k];
1736 for ( ; k<i ; k+=4)
1737 temp += ai[k]*b[k] + ai[k+1]*b[k+1] + ai[k+2]*b[k+2] +
1738 ai[k+3]*b[k+3];
1739 b[i] = alpha*temp;
1740 }
1741 }
1742 } else {
1743 long count;
1744 for ( ; nn-- ; b+=ldb) {
1745 for (i=0,ai=a ; i<m ; i++,a+=lda) {
1746 temp= b[i];
1747 if ( nounit ) temp *= ai[i];
1748 /* hand unroll inner loop */
1749 count = (m-1-i)&3;
1750 for (k=i+1 ; count-- ; k++) temp += ai[k]*b[k];
1751 for ( ; k<m ; k+=4)
1752 temp += ai[k]*b[k] + ai[k+1]*b[k+1] + ai[k+2]*b[k+2] +
1753 ai[k+3]*b[k+3];
1754 b[i] = alpha*temp;
1755 }
1756 }
1757 }
1758 }
1759 } else {
1760 if ( transa==CblasNoTrans ) {
1761 /* Form B := alpha*B*A. */
1762 double *bj, *bk;
1763 if ( upper ) {
1764 for (j=nn-1,a+=j*lda,bj=b+j*ldb ; j>=0 ; j--,a-=lda,bj-=ldb) {
1765 temp = alpha;
1766 if ( nounit ) temp *= a[j];
1767 for (i=0 ; i<m ; i++) bj[i] *= temp;
1768 for (k=0,bk=b ; k<j ; k++,bk+=ldb) {
1769 if ( a[k]!=0.0 ) {
1770 temp = alpha*a[k];
1771 /* unroll inner loop */
1772 for (i=0 ; i<(m&3) ; i++) bj[i] += temp*bk[i];
1773 for ( ; i<m ; i+=4) {
1774 bj[i ] += temp*bk[i];
1775 bj[i+1] += temp*bk[i+1];
1776 bj[i+2] += temp*bk[i+2];
1777 bj[i+3] += temp*bk[i+3];
1778 }
1779 }
1780 }
1781 }
1782 } else {
1783 for (j=0,bj=b ; j<nn ; j++,a+=lda,bj+=ldb) {
1784 temp = alpha;
1785 if ( nounit ) temp *= a[j];
1786 for (i=0 ; i<m ; i++) bj[i] *= temp;
1787 for (k=j+1,bk=b+k*ldb ; k<nn ; k++,bk+=ldb) {
1788 if ( a[k]!=0.0 ) {
1789 temp = alpha*a[k];
1790 /* unroll inner loop */
1791 for (i=0 ; i<(m&3) ; i++) bj[i] += temp*bk[i];
1792 for ( ; i<m ; i+=4) {
1793 bj[i ] += temp*bk[i];
1794 bj[i+1] += temp*bk[i+1];
1795 bj[i+2] += temp*bk[i+2];
1796 bj[i+3] += temp*bk[i+3];
1797 }
1798 }
1799 }
1800 }
1801 }
1802 } else {
1803 /* Form B := alpha*B*A'. */
1804 double *bj, *bk;
1805 if ( upper ) {
1806 for (k=0,bk=b ; k<nn ; k++,a+=lda,bk+=ldb) {
1807 for (j=0,bj=b ; j<k ; j++,bj+=ldb) {
1808 if ( a[j]!=0.0 ) {
1809 temp = alpha*a[j];
1810 /* unroll inner loop */
1811 for (i=0 ; i<(m&3) ; i++) bj[i] += temp*bk[i];
1812 for ( ; i<m ; i+=4) {
1813 bj[i ] += temp*bk[i];
1814 bj[i+1] += temp*bk[i+1];
1815 bj[i+2] += temp*bk[i+2];
1816 bj[i+3] += temp*bk[i+3];
1817 }
1818 }
1819 }
1820 temp = alpha;
1821 if ( nounit ) temp *= a[k];
1822 if ( temp!=1.0 ) for (i=0 ; i<m ; i++) bk[i] *= temp;
1823 }
1824 } else {
1825 for (k=nn-1,a+=k*lda,bk=b+k*ldb ; k>=0 ; k--,a-=lda,bk-=ldb) {
1826 for (j=k+1,bj=b+j*ldb ; j<nn ; j++,bj+=ldb) {
1827 if ( a[j]!=0.0 ) {
1828 temp = alpha*a[j];
1829 /* unroll inner loop */
1830 for (i=0 ; i<(m&3) ; i++) bj[i] += temp*bk[i];
1831 for ( ; i<m ; i+=4) {
1832 bj[i ] += temp*bk[i];
1833 bj[i+1] += temp*bk[i+1];
1834 bj[i+2] += temp*bk[i+2];
1835 bj[i+3] += temp*bk[i+3];
1836 }
1837 }
1838 }
1839 temp = alpha;
1840 if ( nounit ) temp *= a[k];
1841 if ( temp!=1.0 ) for (i=0 ; i<m ; i++) bk[i] *= temp;
1842 }
1843 }
1844 }
1845 }
1846
1847 return;
1848 /* End of DTRMM */
1849 }
1850
1851 void
cblas_dtrsm(const enum CBLAS_ORDER order,const enum CBLAS_SIDE side,const enum CBLAS_UPLO uplo,const enum CBLAS_TRANSPOSE transa,const enum CBLAS_DIAG diag,INT_IN m,INT_IN n,const double alpha,const double * a,INT_IN lda,double * b,INT_IN ldb)1852 cblas_dtrsm(const enum CBLAS_ORDER order, const enum CBLAS_SIDE side,
1853 const enum CBLAS_UPLO uplo, const enum CBLAS_TRANSPOSE transa,
1854 const enum CBLAS_DIAG diag, INT_IN m, INT_IN n,
1855 const double alpha, const double *a, INT_IN lda,
1856 double *b, INT_IN ldb)
1857 {
1858 /*
1859 * Purpose
1860 * =======
1861 *
1862 * DTRSM solves one of the matrix equations
1863 *
1864 * op( A )*X = alpha*B, or X*op( A ) = alpha*B,
1865 *
1866 * where alpha is a scalar, X and B are m by n matrices, A is a unit, or
1867 * non-unit, upper or lower triangular matrix and op( A ) is one of
1868 *
1869 * op( A ) = A or op( A ) = A'.
1870 *
1871 * The matrix X is overwritten on B.
1872 *
1873 * Parameters
1874 * ==========
1875 *
1876 * ORDER - enum CBLAS_ORDER (ignored)
1877 *
1878 * SIDE - enum CBLAS_SIDE
1879 * On entry, SIDE specifies whether op( A ) appears on the left
1880 * or right of X as follows:
1881 * SIDE = CblasLeft op( A )*X = alpha*B.
1882 * SIDE = CblasRight X*op( A ) = alpha*B.
1883 *
1884 * UPLO - enum CBLAS_UPLO
1885 * On entry, UPLO specifies whether the matrix A is an upper or
1886 * lower triangular matrix as follows:
1887 * UPLO = CblasUpper A is an upper triangular matrix.
1888 * UPLO = CblasLower A is a lower triangular matrix.
1889 *
1890 * TRANSA - enum CBLAS_TRANSPOSE
1891 * On entry, TRANSA specifies the form of op( A ) to be used in
1892 * the matrix multiplication as follows:
1893 * TRANSA = CblasNoTrans, op( A ) = A.
1894 * TRANSA = CblasTrans, op( A ) = A'.
1895 * TRANSA = CblasConjTrans, op( A ) = A'.
1896 *
1897 * DIAG - enum CBLAS_DIAG
1898 * On entry, DIAG specifies whether or not A is unit
1899 * triangular as follows:
1900 * DIAG = CblasUnit A is assumed to be unit triangular.
1901 * DIAG = CblasNonUnit A is not assumed to be unit
1902 * triangular.
1903 *
1904 * M - INTEGER.
1905 * On entry, M specifies the number of rows of B. M must be at
1906 * least zero.
1907 * Unchanged on exit.
1908 *
1909 * N - INTEGER.
1910 * On entry, N specifies the number of columns of B. N must be
1911 * at least zero.
1912 * Unchanged on exit.
1913 *
1914 * ALPHA - DOUBLE PRECISION.
1915 * On entry, ALPHA specifies the scalar alpha. When alpha is
1916 * zero then A is not referenced and B need not be set before
1917 * entry.
1918 * Unchanged on exit.
1919 *
1920 * A - DOUBLE PRECISION array of DIMENSION ( LDA, k ), where k is m
1921 * when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'.
1922 * Before entry with UPLO = 'U' or 'u', the leading k by k
1923 * upper triangular part of the array A must contain the upper
1924 * triangular matrix and the strictly lower triangular part of
1925 * A is not referenced.
1926 * Before entry with UPLO = 'L' or 'l', the leading k by k
1927 * lower triangular part of the array A must contain the lower
1928 * triangular matrix and the strictly upper triangular part of
1929 * A is not referenced.
1930 * Note that when DIAG = 'U' or 'u', the diagonal elements of
1931 * A are not referenced either, but are assumed to be unity.
1932 * Unchanged on exit.
1933 *
1934 * LDA - INTEGER.
1935 * On entry, LDA specifies the first dimension of A as declared
1936 * in the calling (sub) program. When SIDE = 'L' or 'l' then
1937 * LDA must be at least max( 1, m ), when SIDE = 'R' or 'r'
1938 * then LDA must be at least max( 1, n ).
1939 * Unchanged on exit.
1940 *
1941 * B - DOUBLE PRECISION array of DIMENSION ( LDB, n ).
1942 * Before entry, the leading m by n part of the array B must
1943 * contain the right-hand side matrix B, and on exit is
1944 * overwritten by the solution matrix X.
1945 *
1946 * LDB - INTEGER.
1947 * On entry, LDB specifies the first dimension of B as declared
1948 * in the calling (sub) program. LDB must be at least
1949 * max( 1, m ).
1950 * Unchanged on exit.
1951 */
1952 /*
1953 * Level 3 Blas routine.
1954 */
1955 /**
1956 * -- Written on 8-February-1989.
1957 * Jack Dongarra, Argonne National Laboratory.
1958 * Iain Duff, AERE Harwell.
1959 * Jeremy Du Croz, Numerical Algorithms Group Ltd.
1960 * Sven Hammarling, Numerical Algorithms Group Ltd.
1961 */
1962 int lside, nounit, upper;
1963 long i, info, j, k, nrowa;
1964 double temp;
1965 /* Test the input parameters. */
1966 lside = side==CblasLeft;
1967 nrowa = lside? m : n;
1968 nounit = diag==CblasNonUnit;
1969 upper = uplo==CblasUpper;
1970
1971 info = 0;
1972 if ( !lside && side!=CblasRight ) {
1973 info = 1;
1974 } else if ( !upper && uplo!=CblasLower ) {
1975 info = 2;
1976 } else if (transa!=CblasNoTrans && transa!=CblasTrans &&
1977 transa!=CblasConjTrans) {
1978 info = 3;
1979 } else if ( diag!=CblasNonUnit && diag!=CblasUnit ) {
1980 info = 4;
1981 } else if ( m<0 ) {
1982 info = 5;
1983 } else if ( n<0 ) {
1984 info = 6;
1985 } else if ( !lda || lda<nrowa ) {
1986 info = 9;
1987 } else if ( !ldb || ldb<m ) {
1988 info = 11;
1989 }
1990 if ( info!=0 ) {
1991 xerbla( "ydtrsm ", info );
1992 return;
1993 }
1994 /* Quick return if possible. */
1995 if ( n==0 ) return;
1996 /* And when alpha.eq.0.0. */
1997 if ( alpha==0.0 ) {
1998 long nn = n;
1999 for ( ; nn-- ; b+=ldb) {
2000 for (i=0 ; i<m ; i++) b[i] = 0.0;
2001 }
2002 return;
2003 }
2004 /* Start the operations. */
2005 if ( lside ) {
2006 if ( transa==CblasNoTrans ) {
2007 /* Form B := alpha*inv( A )*B */
2008 const double *ak;
2009 if ( upper ) {
2010 for (j=0 ; j<n ; j++,b+=ldb) {
2011 if ( alpha!=1.0 ) for (i=0 ; i<m ; i++) b[i] *= alpha;
2012 for (k=m-1,ak=a+k*lda ; k>=0 ; k--,ak-=lda) {
2013 if ( b[k]!=0.0 ) {
2014 if ( nounit ) b[k] /= ak[k];
2015 /* unroll inner loop */
2016 for (i=0 ; i<(k&3) ; i++) b[i] -= b[k]*ak[i];
2017 for ( ; i<k ; i+=4) {
2018 b[i ] -= b[k]*ak[i];
2019 b[i+1] -= b[k]*ak[i+1];
2020 b[i+2] -= b[k]*ak[i+2];
2021 b[i+3] -= b[k]*ak[i+3];
2022 }
2023 }
2024 }
2025 }
2026 } else {
2027 long count;
2028 for (j=0 ; j<n ; j++,b+=ldb) {
2029 if ( alpha!=1.0 ) for (i=0 ; i<m ; i++) b[i] *= alpha;
2030 for (k=0,ak=a ; k<m ; k++,ak+=lda) {
2031 if ( b[k]!=0.0 ) {
2032 if ( nounit ) b[k] /= ak[k];
2033 /* unroll inner loop */
2034 count = (m-1-k)&3;
2035 for (i=k+1 ; count-- ; i++) b[i] -= b[k]*ak[i];
2036 for ( ; i<m ; i+=4) {
2037 b[i ] -= b[k]*ak[i];
2038 b[i+1] -= b[k]*ak[i+1];
2039 b[i+2] -= b[k]*ak[i+2];
2040 b[i+3] -= b[k]*ak[i+3];
2041 }
2042 }
2043 }
2044 }
2045 }
2046 } else {
2047 /* Form B := alpha*inv( A' )*B */
2048 const double *ai;
2049 if ( upper ) {
2050 for (j=0 ; j<n ; j++,b+=ldb) {
2051 for (i=0,ai=a ; i<m ; i++,ai+=lda) {
2052 temp = alpha*b[i];
2053 /* unroll inner loop */
2054 for (k=0 ; k<(i&3) ; k++) temp -= ai[k]*b[k];
2055 for ( ; k<i ; k+=4) temp -= ai[k]*b[k] +
2056 ai[k+1]*b[k+1] + ai[k+2]*b[k+2] + ai[k+3]*b[k+3];
2057 if ( nounit ) temp /= ai[i];
2058 b[i] = temp;
2059 }
2060 }
2061 } else {
2062 long count;
2063 for (j=0 ; j<n ; j++,b+=ldb) {
2064 for (i=m-1,ai=a+i*lda ; i>=0 ; i--,ai-=lda) {
2065 temp = alpha*b[i];
2066 /* unroll inner loop */
2067 count = (m-1-i)&3;
2068 for (k=i+1 ; count-- ; k++) temp -= ai[k]*b[k];
2069 for ( ; k<m ; k+=4) temp -= ai[k]*b[k] +
2070 ai[k+1]*b[k+1] + ai[k+2]*b[k+2] + ai[k+3]*b[k+3];
2071 if ( nounit ) temp/= ai[i];
2072 b[i] = temp;
2073 }
2074 }
2075 }
2076 }
2077 } else {
2078 if ( transa==CblasNoTrans ) {
2079 /* Form B := alpha*B*inv( A ) */
2080 double *bj, *bk;
2081 if ( upper ) {
2082 for (j=0,bj=b ; j<n ; j++,a+=lda,bj+=ldb) {
2083 if ( alpha!=1.0 ) for (i=0 ; i<m ; i++) bj[i] *= alpha;
2084 for (k=0,bk=b ; k<j ; k++,bk+=ldb) {
2085 if ( a[k]!=0.0 ) {
2086 /* unroll inner loop */
2087 for (i=0 ; i<(m&3) ; i++) bj[i] -= a[k]*bk[i];
2088 for ( ; i<m ; i+=4) {
2089 bj[i ] -= a[k]*bk[i];
2090 bj[i+1] -= a[k]*bk[i+1];
2091 bj[i+2] -= a[k]*bk[i+2];
2092 bj[i+3] -= a[k]*bk[i+3];
2093 }
2094 }
2095 }
2096 if ( nounit ) {
2097 temp= 1.0/a[j];
2098 for (i=0 ; i<m ; i++) bj[i] *= temp;
2099 }
2100 }
2101 } else {
2102 for (j=n-1,a+=j*lda,bj=b+j*ldb ; j>=0 ; j--,a-=lda,bj-=ldb) {
2103 if ( alpha!=1.0 ) for (i=0 ; i<m ; i++) bj[i] *= alpha;
2104 for (k=j+1,bk=b+k*ldb ; k<n ; k++,bk+=ldb) {
2105 if ( a[k]!=0.0 ) {
2106 /* unroll inner loop */
2107 for (i=0 ; i<(m&3) ; i++) bj[i] -= a[k]*bk[i];
2108 for ( ; i<m ; i+=4) {
2109 bj[i ] -= a[k]*bk[i];
2110 bj[i+1] -= a[k]*bk[i+1];
2111 bj[i+2] -= a[k]*bk[i+2];
2112 bj[i+3] -= a[k]*bk[i+3];
2113 }
2114 }
2115 }
2116 if ( nounit ) {
2117 temp = 1.0/a[j];
2118 for (i=0 ; i<m ; i++) bj[i] *= temp;
2119 }
2120 }
2121 }
2122 } else {
2123 /* Form B := alpha*B*inv( A' ) */
2124 double *bj, *bk;
2125 if ( upper ) {
2126 for (k=n-1,a+=k*lda,bk=b+k*ldb ; k>=0 ; k--,a-=lda,bk-=ldb) {
2127 if ( nounit ) {
2128 temp = 1.0/a[k];
2129 for (i=0 ; i<m ; i++) bk[i] *= temp;
2130 }
2131 for (j=0,bj=b ; j<k ; j++,bj+=ldb) {
2132 if ( a[j]!=0.0 ) {
2133 temp = a[j];
2134 /* unroll inner loop */
2135 for (i=0 ; i<(m&3) ; i++) bj[i] -= temp*bk[i];
2136 for ( ; i<m ; i+=4) {
2137 bj[i ] -= temp*bk[i];
2138 bj[i+1] -= temp*bk[i+1];
2139 bj[i+2] -= temp*bk[i+2];
2140 bj[i+3] -= temp*bk[i+3];
2141 }
2142 }
2143 }
2144 if ( alpha!=1.0 ) for (i=0 ; i<m ; i++) bk[i] *= alpha;
2145 }
2146 } else {
2147 for (k=0,bk=b ; k<n ; k++,a+=lda,bk+=ldb) {
2148 if ( nounit ) {
2149 temp = 1.0/a[k];
2150 for (i=0 ; i<m ; i++) bk[i] *= temp;
2151 }
2152 for (j=k+1,bj=b+j*ldb ; j<n ; j++,bj+=ldb) {
2153 if ( a[j]!=0.0 ) {
2154 temp = a[j];
2155 /* unroll inner loop */
2156 for (i=0 ; i<(m&3) ; i++) bj[i] -= temp*bk[i];
2157 for ( ; i<m ; i+=4) {
2158 bj[i ] -= temp*bk[i];
2159 bj[i+1] -= temp*bk[i+1];
2160 bj[i+2] -= temp*bk[i+2];
2161 bj[i+3] -= temp*bk[i+3];
2162 }
2163 }
2164 }
2165 if ( alpha!=1.0 ) for (i=0 ; i<m ; i++) bk[i] *= alpha;
2166 }
2167 }
2168 }
2169 }
2170
2171 /* End of DTRSM */
2172 }
2173