1 /*********************************************************************/
2 /*                                                                   */
3 /*             Optimized BLAS libraries                              */
4 /*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5 /*                                                                   */
6 /* Copyright (c) The University of Texas, 2009. All rights reserved. */
7 /* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8 /* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9 /* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10 /* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11 /* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12 /* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13 /* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14 /* Under no circumstances shall University be liable for incidental, */
15 /* special, indirect, direct or consequential damages or loss of     */
16 /* profits, interruption of business, or related expenses which may  */
17 /* arise from use of Software or Documentation, including but not    */
18 /* limited to those resulting from defects in Software and/or        */
19 /* Documentation, or loss or inaccuracy of data of any kind.         */
20 /*********************************************************************/
21 
22 #include <stdio.h>
23 #include "common.h"
24 
25 #ifdef UNIT
26 #define TRTI2	TRTI2_LU
27 #define TRMM	TRMM_LNLU
28 #define TRSM	TRSM_RNLU
29 #else
30 #define TRTI2	TRTI2_LN
31 #define TRMM	TRMM_LNLN
32 #define TRSM	TRSM_RNLN
33 #endif
34 
CNAME(blas_arg_t * args,BLASLONG * range_m,BLASLONG * range_n,FLOAT * sa,FLOAT * sb,BLASLONG mypos)35 blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) {
36 
37   BLASLONG n, info;
38   BLASLONG bk, i, blocking, start_i;
39   int mode;
40   BLASLONG lda, range_N[2];
41   blas_arg_t newarg;
42   FLOAT *a;
43   FLOAT alpha[2] = { ONE, ZERO};
44   FLOAT beta [2] = {-ONE, ZERO};
45 
46 #ifndef COMPLEX
47 #ifdef XDOUBLE
48   mode  =  BLAS_XDOUBLE | BLAS_REAL;
49 #elif defined(DOUBLE)
50   mode  =  BLAS_DOUBLE  | BLAS_REAL;
51 #else
52   mode  =  BLAS_SINGLE  | BLAS_REAL;
53 #endif
54 #else
55 #ifdef XDOUBLE
56   mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
57 #elif defined(DOUBLE)
58   mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
59 #else
60   mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
61 #endif
62 #endif
63 
64   n = args -> n;
65   a  = (FLOAT *)args -> a;
66   lda = args -> lda;
67 
68   if (range_n)  n = range_n[1] - range_n[0];
69 
70   if (n <= DTB_ENTRIES) {
71     info = TRTI2(args, NULL, range_n, sa, sb, 0);
72     return info;
73   }
74 
75   blocking = GEMM_Q;
76   if (n < 4 * GEMM_Q) blocking = (n + 3) / 4;
77 
78   start_i = 0;
79   while (start_i < n) start_i += blocking;
80   start_i -= blocking;
81 
82   for (i = start_i; i >= 0; i -= blocking) {
83     bk = n - i;
84     if (bk > blocking) bk = blocking;
85 
86     range_N[0] = i;
87     range_N[1] = i + bk;
88 
89     newarg.lda = lda;
90     newarg.ldb = lda;
91     newarg.ldc = lda;
92     newarg.alpha = alpha;
93 
94     newarg.m = n - bk - i;
95     newarg.n = bk;
96     newarg.a = a + ( i       + i * lda) * COMPSIZE;
97     newarg.b = a + ((i + bk) + i * lda) * COMPSIZE;
98 
99     newarg.beta  = beta;
100     newarg.nthreads = args -> nthreads;
101 
102     gemm_thread_m(mode, &newarg, NULL, NULL, TRSM, sa, sb, args -> nthreads);
103 
104     newarg.m = bk;
105     newarg.n = bk;
106 
107     newarg.a = a + (i + i * lda) * COMPSIZE;
108 
109     CNAME  (&newarg, NULL, NULL, sa, sb, 0);
110 
111     newarg.m = n - bk - i;
112     newarg.n = i;
113     newarg.k = bk;
114 
115     newarg.a = a + (i + bk + i * lda) * COMPSIZE;
116     newarg.b = a + (i               ) * COMPSIZE;
117     newarg.c = a + (i + bk          ) * COMPSIZE;
118 
119     newarg.beta  = NULL;
120 
121     gemm_thread_n(mode, &newarg, NULL, NULL, GEMM_NN, sa, sb, args -> nthreads);
122 
123     newarg.a = a + (i      + i * lda) * COMPSIZE;
124     newarg.b = a + (i               ) * COMPSIZE;
125 
126     newarg.m = bk;
127     newarg.n = i;
128 
129     gemm_thread_n(mode, &newarg, NULL, NULL, TRMM, sa, sb, args -> nthreads);
130   }
131 
132 
133   return 0;
134 }
135