1 /* ---------------------------------------------------------------------
2 *
3 *  -- PBLAS auxiliary routine (version 2.0) --
4 *     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 *     and University of California, Berkeley.
6 *     April 1, 1998
7 *
8 *  ---------------------------------------------------------------------
9 */
10 /*
11 *  Include files
12 */
13 #include "../pblas.h"
14 #include "../PBpblas.h"
15 #include "../PBtools.h"
16 #include "../PBblacs.h"
17 #include "../PBblas.h"
18 
19 #ifdef __STDC__
PB_Cptrsm(PBTYP_T * TYPE,int FBCAST,char * SIDE,char * UPLO,char * TRANS,char * DIAG,int M,int N,char * ALPHA,char * A,int IA,int JA,int * DESCA,char * BC,int LDBC,char * BR,int LDBR)20 void PB_Cptrsm( PBTYP_T * TYPE, int FBCAST, char * SIDE, char * UPLO,
21                 char * TRANS, char * DIAG, int M, int N, char * ALPHA,
22                 char * A, int IA, int JA, int * DESCA, char * BC,
23                 int LDBC, char * BR, int LDBR )
24 #else
25 void PB_Cptrsm( TYPE, FBCAST, SIDE, UPLO, TRANS, DIAG, M, N, ALPHA,
26                 A, IA, JA, DESCA, BC, LDBC, BR, LDBR )
27 /*
28 *  .. Scalar Arguments ..
29 */
30    char           * ALPHA, * DIAG, * SIDE, * TRANS, * UPLO;
31    int            FBCAST, IA, JA, LDBC, LDBR, M, N;
32    PBTYP_T        * TYPE;
33 /*
34 *  .. Array Arguments ..
35 */
36    int            * DESCA;
37    char           * A, * BC, * BR;
38 #endif
39 {
40 /*
41 *  Purpose
42 *  =======
43 *
44 *  PB_Cptrsm  solves one of the matrix equations
45 *
46 *     op( sub( A ) ) * X = B,   or    X * op( sub( A ) ) = alpha * B,
47 *
48 *  where
49 *
50 *     sub( A ) denotes   A(IA:IA+M-1,JA:JA+M-1)  if SIDE = 'L',
51 *                        A(IA:IA+N-1,JA:JA+N-1)  if SIDE = 'R'.
52 *
53 *  X and B are m by n submatrices, sub( A ) is a unit, or non-unit,
54 *  upper or lower triangular submatrix and op( Y ) is one of
55 *
56 *     op( Y ) = Y   or   op( Y ) = Y'   or   op( Y ) = conjg( Y' ).
57 *
58 *  The submatrix X is overwritten on B.
59 *
60 *  No test for  singularity  or  near-singularity  is included  in  this
61 *  routine. Such tests must be performed before calling this routine.
62 *
63 *  Notes
64 *  =====
65 *
66 *  A description  vector  is associated with each 2D block-cyclicly dis-
67 *  tributed matrix.  This  vector  stores  the  information  required to
68 *  establish the  mapping  between a  matrix entry and its corresponding
69 *  process and memory location.
70 *
71 *  In  the  following  comments,   the character _  should  be  read  as
72 *  "of  the  distributed  matrix".  Let  A  be a generic term for any 2D
73 *  block cyclicly distributed matrix.  Its description vector is DESC_A:
74 *
75 *  NOTATION         STORED IN       EXPLANATION
76 *  ---------------- --------------- ------------------------------------
77 *  DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
78 *  CTXT_A  (global) DESCA[ CTXT_  ] The BLACS context handle, indicating
79 *                                   the NPROW x NPCOL BLACS process grid
80 *                                   A  is  distributed over. The context
81 *                                   itself  is  global,  but  the handle
82 *                                   (the integer value) may vary.
83 *  M_A     (global) DESCA[ M_     ] The  number of rows in the distribu-
84 *                                   ted matrix A, M_A >= 0.
85 *  N_A     (global) DESCA[ N_     ] The number of columns in the distri-
86 *                                   buted matrix A, N_A >= 0.
87 *  IMB_A   (global) DESCA[ IMB_   ] The number of rows of the upper left
88 *                                   block of the matrix A, IMB_A > 0.
89 *  INB_A   (global) DESCA[ INB_   ] The  number  of columns of the upper
90 *                                   left   block   of   the  matrix   A,
91 *                                   INB_A > 0.
92 *  MB_A    (global) DESCA[ MB_    ] The blocking factor used to  distri-
93 *                                   bute the last  M_A-IMB_A  rows of A,
94 *                                   MB_A > 0.
95 *  NB_A    (global) DESCA[ NB_    ] The blocking factor used to  distri-
96 *                                   bute the last  N_A-INB_A  columns of
97 *                                   A, NB_A > 0.
98 *  RSRC_A  (global) DESCA[ RSRC_  ] The process row over which the first
99 *                                   row of the matrix  A is distributed,
100 *                                   NPROW > RSRC_A >= 0.
101 *  CSRC_A  (global) DESCA[ CSRC_  ] The  process column  over  which the
102 *                                   first column of  A  is  distributed.
103 *                                   NPCOL > CSRC_A >= 0.
104 *  LLD_A   (local)  DESCA[ LLD_   ] The  leading dimension  of the local
105 *                                   array  storing  the  local blocks of
106 *                                   the distributed matrix A,
107 *                                   IF( Lc( 1, N_A ) > 0 )
108 *                                      LLD_A >= MAX( 1, Lr( 1, M_A ) )
109 *                                   ELSE
110 *                                      LLD_A >= 1.
111 *
112 *  Let K be the number of  rows of a matrix A starting at the global in-
113 *  dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
114 *  that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
115 *  receive if these K rows were distributed over NPROW processes.  If  K
116 *  is the number of columns of a matrix  A  starting at the global index
117 *  JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number  of co-
118 *  lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would  receive if
119 *  these K columns were distributed over NPCOL processes.
120 *
121 *  The values of Lr() and Lc() may be determined via a call to the func-
122 *  tion PB_Cnumroc:
123 *  Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
124 *  Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
125 *
126 *  Arguments
127 *  =========
128 *
129 *  TYPE    (local input) pointer to a PBTYP_T structure
130 *          On entry,  TYPE  is a pointer to a structure of type PBTYP_T,
131 *          that contains type information (See pblas.h).
132 *
133 *  FBCAST  (global input) INTEGER
134 *          On entry, FBCAST specifies whether the transposed of the vec-
135 *          tor solution should be broadcast or not when there is a  pos-
136 *          sible ambiguity, i.e. when sub( A ) is just one  block.  When
137 *          FBCAST is zero, the solution vector is not broadcast, and the
138 *          the solution vector is broadcast otherwise.
139 *
140 *  SIDE    (global input) pointer to CHAR
141 *          On entry,  SIDE  specifies  whether op( sub( A ) ) appears on
142 *          the left or right of X as follows:
143 *
144 *             SIDE = 'L' or 'l'   op( sub( A ) ) * X = B,
145 *
146 *             SIDE = 'R' or 'r'   X * op( sub( A ) ) = B.
147 *
148 *  UPLO    (global input) pointer to CHAR
149 *          On entry,  UPLO  specifies whether the submatrix  sub( A ) is
150 *          an upper or lower triangular submatrix as follows:
151 *
152 *             UPLO = 'U' or 'u'   sub( A ) is an upper triangular
153 *                                 submatrix,
154 *
155 *             UPLO = 'L' or 'l'   sub( A ) is a  lower triangular
156 *                                 submatrix.
157 *
158 *  TRANS   (global input) pointer to CHAR
159 *          On entry,  TRANS  specifies the  operation to be performed as
160 *          follows:
161 *
162 *             TRANS = 'N' or 'n'   sub( A )  * X = B,
163 *
164 *             TRANS = 'T' or 't'   sub( A )' * X = B,
165 *
166 *             TRANS = 'C' or 'c'   conjg( sub( A )' ) * X = B.
167 *
168 *  DIAG    (global input) pointer to CHAR
169 *          On entry,  DIAG  specifies  whether or not  sub( A )  is unit
170 *          triangular as follows:
171 *
172 *             DIAG = 'U' or 'u'  sub( A )  is  assumed to be unit trian-
173 *                                gular,
174 *
175 *             DIAG = 'N' or 'n'  sub( A ) is not assumed to be unit tri-
176 *                                angular.
177 *
178 *  M       (global input) INTEGER
179 *          On entry, M  specifies the number of rows of the submatrix B.
180 *          M  must be at least zero.
181 *
182 *  N       (global input) INTEGER
183 *          On entry, N  specifies the number of columns of the submatrix
184 *          B. N  must be at least zero.
185 *
186 *  A       (local input) pointer to CHAR
187 *          On entry, A is an array of dimension (LLD_A, Ka), where Ka is
188 *          at  least  Lc( 0, JA+M-1 )  when  SIDE = 'L' or 'l' and is at
189 *          least  Lc( 0, JA+N-1 ) otherwise.  Before  entry, this  array
190 *          contains the local entries of the matrix A.
191 *          Before entry with  UPLO = 'U' or 'u', this array contains the
192 *          local entries corresponding to  the entries of the upper tri-
193 *          angular submatrix  sub( A ), and the local entries correspon-
194 *          ding to the entries of the strictly lower triangular part  of
195 *          the submatrix  sub( A )  are not referenced.
196 *          Before entry with  UPLO = 'L' or 'l', this array contains the
197 *          local entries corresponding to  the entries of the lower tri-
198 *          angular submatrix  sub( A ), and the local entries correspon-
199 *          ding to the entries of the strictly upper triangular part  of
200 *          the submatrix  sub( A )  are not referenced.
201 *          Note  that  when DIAG = 'U' or 'u', the local entries corres-
202 *          ponding to the  diagonal elements  of the submatrix  sub( A )
203 *          are not referenced either, but are assumed to be unity.
204 *
205 *  IA      (global input) INTEGER
206 *          On entry, IA  specifies A's global row index, which points to
207 *          the beginning of the submatrix sub( A ).
208 *
209 *  JA      (global input) INTEGER
210 *          On entry, JA  specifies A's global column index, which points
211 *          to the beginning of the submatrix sub( A ).
212 *
213 *  DESCA   (global and local input) INTEGER array
214 *          On entry, DESCA  is an integer array of dimension DLEN_. This
215 *          is the array descriptor for the matrix A.
216 *
217 *  BC      (local input/local output) pointer to CHAR
218 *          On entry, BC is  an array of dimension (LDBC,Kbc), where  Kbc
219 *          is at least N when  SIDE is 'L' or 'l' and at least M  other-
220 *          wise. Before entry, when SIDE is 'L' or  'l' and TRANS is 'N'
221 *          or  'n'  or  SIDE is 'R' or 'r' and TRANS  is not 'N' or 'n',
222 *          this  array contains the local entries of the right-hand-side
223 *          matrix B. Otherwise, the entries of BC  should  be  zero.  On
224 *          exit, this  array contains the partial  solution matrix X.
225 *
226 *  LDBC    (local input) INTEGER
227 *          On entry,  LDBC  specifies the leading dimension of the array
228 *          BC. LDBC must  be  at  least MAX( 1, Lr( IA, M ) ) when  SIDE
229 *          is 'L' or 'l' and at  least MAX( 1, Lr( IA, N ) ) otherwise.
230 *
231 *  BR      (local input/local output) pointer to CHAR
232 *          On entry, BR is  an array of dimension (LDBR,Kbr), where  Kbr
233 *          is at least Lc( JA, M ) when  SIDE is 'L' or 'l' and at least
234 *          Lc( JA, N ) otherwise. Before entry, when SIDE is 'L' or  'l'
235 *          and  TRANS  is 'N' or 'n' or SIDE is 'R' or 'r' and TRANS  is
236 *          not 'N' or 'n', the entries of BR should be zero.  Otherwise,
237 *          this  array contains the local entries of the right-hand-side
238 *          matrix B. On exit, this  array contains the partial  solution
239 *          matrix X.
240 *
241 *  LDBR    (local input) INTEGER
242 *          On entry,  LDBR  specifies the leading dimension of the array
243 *          BR. LDBR must be at least MAX( 1, N ) when SIDE is 'L' or 'l'
244 *          and at least MAX( 1, M ) otherwise.
245 *
246 *  -- Written on April 1, 1998 by
247 *     Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
248 *
249 *  ---------------------------------------------------------------------
250 */
251 /*
252 *  .. Local Scalars ..
253 */
254    char           btop, * negone, * one, * talpha1, * talpha2, * zero;
255    int            Acol, Aii, Aimb1, Ainb1, Ais1Col, Ais1Row, AisColRep,
256                   AisRowRep, Ajj, Alcol, Ald, Alrow, Amb, Anpprev, Anb, Anp,
257                   Anq, Arow, Asrc, ChangeRoc=0, LNorRT, Na, Nb, bcst, ctxt,
258                   izero=0, k=0, kb, kbprev=0, kbsize, lside, mb1, mycol, myrow,
259                   n1, n1last, n1p, n1pprev=0, nb1, nlast, notran, npcol, nprow,
260                   rocprev, size, tmp1, tmp2;
261    MMADD_T        add, tadd;
262    TZPAD_T        pad;
263    GEMM_T         gemm;
264    TRSM_T         trsm;
265    GESD2D_T       send;
266    GERV2D_T       recv;
267    GEBS2D_T       bsend;
268    GEBR2D_T       brecv;
269 /*
270 *  .. Local Arrays ..
271 */
272    char           * Aprev = NULL, * Bd    = NULL, * Bdprev = NULL,
273                   * Bprev = NULL, * work  = NULL;
274 /* ..
275 *  .. Executable Statements ..
276 *
277 */
278    if( ( M <= 0 ) || ( N <= 0 ) ) return;
279 /*
280 *  Retrieve process grid information
281 */
282    Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
283 
284    lside  = ( Mupcase( SIDE [0] ) ==   CLEFT );
285    notran = ( Mupcase( TRANS[0] ) == CNOTRAN );
286    LNorRT = ( lside && notran ) || ( !( lside ) && !( notran ) );
287    if( LNorRT ) { Na = M; Nb = N; } else { Na = N; Nb = M; }
288 /*
289 *  Retrieve sub( A )'s local information: Aii, Ajj, Arow, Acol ...
290 */
291    PB_Cinfog2l( IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Arow,
292                 &Acol );
293 /*
294 *  Determine if sub( A ) spans more than one process row, and/or more than one
295 *  process column.
296 */
297    Amb     = DESCA[MB_]; Anb = DESCA[NB_]; Ald = DESCA[LLD_ ];
298    Aimb1   = PB_Cfirstnb( Na, IA, DESCA[IMB_], Amb );
299    Anp     = PB_Cnumroc( Na, 0, Aimb1, Amb, myrow, Arow, nprow );
300    Ais1Row = !( PB_Cspan( Na, 0, Aimb1, Amb, Arow, nprow ) );
301    Ainb1   = PB_Cfirstnb( Na, JA, DESCA[INB_], Anb );
302    Anq     = PB_Cnumroc( Na, 0, Ainb1, Anb, mycol, Acol, npcol );
303    Ais1Col = !( PB_Cspan( Na, 0, Ainb1, Anb, Acol, npcol ) );
304 /*
305 *  When sub( A ) spans only one process, solve the system locally and return.
306 */
307    if( Ais1Row && Ais1Col )
308    {
309       if( LNorRT )
310       {
311          if( Anq > 0 )
312          {
313             if( Anp > 0 )
314             {
315                TYPE->Ftrsm( C2F_CHAR( ( notran ? SIDE : ( lside ? RIGHT :
316                             LEFT ) ) ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ),
317                             C2F_CHAR( DIAG ), &M, &N, ALPHA, Mptr( A, Aii, Ajj,
318                             Ald, TYPE->size ), &Ald, BC, &LDBC );
319                TYPE->Fmmtadd( &M, &N, TYPE->one, BC, &LDBC, TYPE->zero, BR,
320                               &LDBR );
321             }
322             if( ( Arow >= 0 ) && FBCAST )
323             {
324                btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
325                if( myrow == Arow )
326                   TYPE->Cgebs2d( ctxt, COLUMN, &btop, N, M, BR, LDBR );
327                else
328                   TYPE->Cgebr2d( ctxt, COLUMN, &btop, N, M, BR, LDBR, Arow,
329                                  mycol );
330             }
331          }
332       }
333       else
334       {
335          if( Anp > 0 )
336          {
337             if( Anq > 0 )
338             {
339                TYPE->Ftrsm( C2F_CHAR( ( notran ? SIDE : ( lside ? RIGHT :
340                             LEFT ) ) ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ),
341                             C2F_CHAR( DIAG ), &M, &N, ALPHA, Mptr( A, Aii, Ajj,
342                             Ald, TYPE->size ), &Ald, BR, &LDBR );
343                TYPE->Fmmtadd( &M, &N, TYPE->one, BR, &LDBR, TYPE->zero, BC,
344                               &LDBC );
345             }
346             if( ( Acol >= 0 ) && FBCAST )
347             {
348                btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
349                if( mycol == Acol )
350                   TYPE->Cgebs2d( ctxt, ROW, &btop, N, M, BC, LDBC );
351                else
352                   TYPE->Cgebr2d( ctxt, ROW, &btop, N, M, BC, LDBC, myrow,
353                                  Acol );
354             }
355          }
356       }
357       return;
358    }
359 /*
360 *  Retrieve from TYPE structure useful BLAS and BLACS functions.
361 */
362    size   = TYPE->size;
363    negone = TYPE->negone;  one   = TYPE->one;     zero = TYPE->zero;
364    add    = TYPE->Fmmadd;  tadd  = TYPE->Fmmtadd; pad  = TYPE->Ftzpad;
365    gemm   = TYPE->Fgemm;   trsm  = TYPE->Ftrsm;
366    send   = TYPE->Cgesd2d; recv  = TYPE->Cgerv2d;
367    bsend  = TYPE->Cgebs2d; brecv = TYPE->Cgebr2d;
368 
369    if( ( Anp > 0 ) && ( Anq > 0 ) ) A = Mptr( A, Aii, Ajj, Ald, size );
370 
371    if( LNorRT )
372    {
373 /*
374 *  Left - No tran  or  Right - (co)Trans
375 */
376       if( ( Anq <= 0 ) || ( Ais1Row && ( ( Arow >= 0 ) && !( FBCAST ) &&
377                                         ( myrow != Arow ) ) ) ) return;
378       btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
379       bcst = ( ( !Ais1Row ) || ( Ais1Row && ( Arow >= 0 ) && FBCAST ) );
380       AisRowRep = ( ( Arow < 0 ) || ( nprow == 1 ) );
381 
382       if( Mupcase( UPLO[0] ) == CUPPER )
383       {
384 /*
385 *  Initiate lookahead
386 */
387          nlast   = ( npcol - 1 ) * Anb;
388          n1      = MAX( nlast, Anb );
389          nlast  += Ainb1;
390          n1last  = n1 - Anb + MAX( Ainb1, Anb );
391          work    = PB_Cmalloc( Nb * MIN( n1last, Anp ) * size );
392          tmp1    = Na-1;
393          Alrow   = PB_Cindxg2p( tmp1, Aimb1, Amb, Arow, Arow, nprow );
394          Alcol   = PB_Cindxg2p( tmp1, Ainb1, Anb, Acol, Acol, npcol );
395          rocprev = Alcol; Anpprev = Anp; Bprev = BC; Bdprev = BR;
396          Aprev   = A = Mptr( A, 0, Anq, Ald, size );
397          mb1     = PB_Clastnb( Na, 0, Aimb1, Amb );
398          nb1     = PB_Clastnb( Na, 0, Ainb1, Anb );
399          tmp1    = Na - ( kb = MIN( mb1, nb1 ) );
400          n1      = ( ( Ais1Col || ( Na - nb1 < nlast ) ) ? n1last : n1 );
401          tmp2    = n1 + nb1 - kb; tmp1 -= ( tmp2 = MIN( tmp1, tmp2 ) );
402          Asrc    = Arow;
403          n1p     = PB_Cnumroc( tmp2, MAX( 0, tmp1 ), Aimb1, Amb, myrow, Asrc,
404                                nprow );
405          talpha1 = talpha2 = ( ( Ais1Col || ( mycol == Alcol ) ) ?
406                                ALPHA : one );
407          while( Na > 0 )
408          {
409             kbsize = kb * size;
410 
411             if( Ais1Col || ( mycol == Alcol ) )
412             { A -= Ald*kbsize; Anq -= kb; Bd = Mptr( BR, 0, Anq, LDBR, size ); }
413             if( ( Arow < 0 ) || ( myrow == Alrow ) ) { Anp -= kb; }
414 /*
415 *  Partial update of previous block
416 */
417             if( n1pprev > 0 )
418             {
419                if( ( Ais1Col || ( mycol == rocprev ) ) && ( kbprev > 0 ) )
420                {
421                   tmp1 = ( Anpprev - n1pprev ) * size;
422                   gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &n1pprev, &Nb,
423                         &kbprev, negone, Aprev+tmp1, &Ald, Bdprev, &LDBR,
424                         talpha1, Bprev+tmp1, &LDBC );
425                }
426 /*
427 *  Send partial updated result to current column
428 */
429                if( !( Ais1Col ) && ChangeRoc )
430                {
431                   if( mycol == rocprev )
432                   {
433                      send( ctxt, n1pprev, Nb, Mptr( Bprev, Anpprev-n1pprev, 0,
434                            LDBC, size ), LDBC, myrow, Alcol );
435                   }
436                   else if( mycol == Alcol )
437                   {
438                      recv( ctxt, n1pprev, Nb, work, n1pprev, myrow, rocprev );
439                      add( &n1pprev, &Nb, one, work, &n1pprev, one, Mptr( Bprev,
440                           Anpprev-n1pprev, 0, LDBC, size ), &LDBC );
441                   }
442                }
443             }
444 /*
445 *  Solve current diagonal block
446 */
447             if( Ais1Col || ( mycol == Alcol ) )
448             {
449                if( AisRowRep || ( myrow == Alrow ) )
450                {
451                   trsm( C2F_CHAR( LEFT ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ),
452                         C2F_CHAR( DIAG ), &kb, &Nb, talpha2,  Mptr( A, Anp, 0,
453                         Ald, size ), &Ald, Mptr( BC, Anp, 0, LDBC, size ),
454                         &LDBC );
455                   tadd( &kb, &Nb, one, Mptr( BC, Anp, 0, LDBC, size ), &LDBC,
456                         zero, Mptr( BR, 0, Anq, LDBR, size ), &LDBR );
457                }
458                if( bcst )
459                {
460                   if( myrow == Alrow )
461                      bsend( ctxt, COLUMN, &btop, Nb, kb, Mptr( BR, 0, Anq, LDBR,
462                             size ), LDBR );
463                   else
464                      brecv( ctxt, COLUMN, &btop, Nb, kb, Mptr( BR, 0, Anq, LDBR,
465                             size ), LDBR, Alrow, mycol );
466                }
467                talpha2 = one;
468             }
469             else
470             {
471                if( !( Ais1Col ) && ( AisRowRep || ( myrow == Alrow ) ) )
472                   pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &kb, &Nb, &izero,
473                        zero, zero, Mptr( BC, Anp, 0, LDBC, size ), &LDBC );
474             }
475 /*
476 *  Finish previous update
477 */
478             if( ( Ais1Col || ( mycol == rocprev ) ) && ( kbprev > 0 ) )
479             {
480                if( ( tmp1 = Anpprev - n1pprev ) > 0 )
481                   gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &tmp1, &Nb,
482                         &kbprev, negone, Aprev, &Ald, Bdprev, &LDBR, talpha1,
483                         Bprev, &LDBC );
484                talpha1 = one;
485             }
486 /*
487 *  Save info of current step and update info for the next step
488 */
489             if( Ais1Col   || ( mycol == Alcol ) ) { Bdprev   = Bd; Aprev = A; }
490             if( AisRowRep || ( myrow == Alrow ) ) { Anpprev -= kb; }
491 
492             n1pprev = n1p;
493             rocprev = Alcol;
494             kbprev  = kb;
495             k      += kb;
496             Na     -= kb;
497 
498             mb1    -= kb;
499             if( mb1 == 0 )
500             {
501                if( !( Ais1Row ) && ( Alrow >= 0 ) )
502                   Alrow = MModSub1( Alrow, nprow );
503                mb1 = ( Na > Aimb1 ? Amb : Aimb1 );
504             }
505 
506             nb1      -= kb;
507             ChangeRoc = ( nb1 == 0 );
508 
509             if( ChangeRoc )
510             {
511                if( !( Ais1Col ) && ( Alcol >= 0 ) )
512                   Alcol = MModSub1( Alcol, npcol );
513                nb1 = ( Na > Ainb1 ? Anb : Ainb1 );
514             }
515             tmp1 = Na - ( kb = MIN( mb1, nb1 ) );
516             n1   = ( ( Ais1Col || ( Na-nb1 < nlast ) ) ? n1last : n1 );
517             tmp2 = n1 + nb1 - kb; tmp1 -= ( tmp2 = MIN( tmp1, tmp2 ) );
518             n1p  = PB_Cnumroc( tmp2, MAX( 0, tmp1 ), Aimb1, Amb, myrow, Asrc,
519                                nprow );
520          }
521       }
522       else
523       {
524 /*
525 *  Initiate lookahead
526 */
527          n1    = ( MAX( npcol, 2 ) - 1 ) * Anb;
528          work  = PB_Cmalloc( Nb*MIN( n1, Anp )*size );
529          Aprev = A; Bprev = BC, Bdprev = BR; Anpprev = Anp;
530          mb1   = Aimb1; nb1 = Ainb1; rocprev = Acol;
531          tmp1  = Na - ( kb = MIN( mb1, nb1 ) ); tmp2 = n1 + nb1 - kb;
532          Asrc  = Arow;
533          n1p   = PB_Cnumroc( MIN( tmp1, tmp2 ), kb, Aimb1, Amb, myrow, Asrc,
534                              nprow );
535          talpha1 = talpha2 = ( ( Ais1Col || ( mycol == Acol ) ) ?
536                                ALPHA : one );
537          while( kb > 0 )
538          {
539             kbsize = kb * size;
540 /*
541 *  Partial update of previous block
542 */
543             if( n1pprev > 0 )
544             {
545                if( ( Ais1Col || ( mycol == rocprev ) ) && ( kbprev > 0 ) )
546                   gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &n1pprev, &Nb,
547                         &kbprev, negone, Aprev, &Ald, Bdprev, &LDBR, talpha1,
548                         Bprev, &LDBC );
549 /*
550 *  Send partial updated result to current column
551 */
552                if( !( Ais1Col ) && ChangeRoc )
553                {
554                   if( mycol == rocprev )
555                   {
556                      send( ctxt, n1pprev, Nb, Bprev, LDBC, myrow, Acol );
557                   }
558                   else if( mycol == Acol )
559                   {
560                      recv( ctxt, n1pprev, Nb, work, n1pprev, myrow, rocprev );
561                      add( &n1pprev, &Nb, one, work, &n1pprev, one, Bprev,
562                           &LDBC );
563                   }
564                }
565             }
566 /*
567 *  Solve current diagonal block
568 */
569             if( Ais1Col || ( mycol == Acol ) )
570             {
571                if( AisRowRep || ( myrow == Arow ) )
572                {
573                   trsm( C2F_CHAR( LEFT ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ),
574                         C2F_CHAR( DIAG ), &kb, &Nb, talpha2, A, &Ald, BC,
575                         &LDBC );
576                   tadd( &kb, &Nb, one, BC, &LDBC, zero, BR, &LDBR );
577                }
578                if( bcst )
579                {
580                   if( myrow == Arow )
581                      bsend( ctxt, COLUMN, &btop, Nb, kb, BR, LDBR );
582                   else
583                      brecv( ctxt, COLUMN, &btop, Nb, kb, BR, LDBR, Arow,
584                             mycol );
585                }
586                talpha2 = one;
587             }
588             else
589             {
590                if( !( Ais1Col ) && ( AisRowRep || ( myrow == Arow ) ) )
591                   pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &kb, &Nb, &izero,
592                        zero, zero, BC, &LDBC );
593             }
594 /*
595 *  Finish previous update
596 */
597             if( ( Ais1Col || ( mycol == rocprev ) ) && ( kbprev > 0 ) )
598             {
599                if( ( tmp1 = Anpprev - n1pprev ) > 0 )
600                {
601                   tmp2 = n1pprev * size;
602                   gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &tmp1, &Nb,
603                         &kbprev, negone, Aprev+tmp2, &Ald, Bdprev, &LDBR,
604                         talpha1, Bprev+tmp2, &LDBC );
605                }
606                Aprev += Ald * kbprev * size; talpha1 = one;
607             }
608 /*
609 *  Save info of current step and update info for the next step
610 */
611             if( Ais1Col || ( mycol == Acol ) )
612             { A += Ald*kbsize; Bdprev = Bd = BR; BR += LDBR*kbsize; }
613             if( AisRowRep || ( myrow == Arow ) )
614             {
615                Bprev   = ( BC += kbsize );
616                A      += kbsize;
617                Aprev  += kbsize;
618                Anpprev = ( Anp -= kb );
619             }
620             n1pprev = n1p;
621             rocprev = Acol;
622             kbprev  = kb;
623             k      += kb;
624             Na     -= kb;
625 
626             mb1    -= kb;
627             if( mb1 == 0 )
628             {
629                if( !( Ais1Row ) && ( Arow >= 0 ) )
630                   Arow = MModAdd1( Arow, nprow );
631                mb1 = MIN( Amb, Na );
632             }
633 
634             nb1      -= kb;
635             ChangeRoc = ( nb1 == 0 );
636 
637             if( ChangeRoc )
638             {
639                if( !( Ais1Col ) && ( Acol >= 0 ) )
640                   Acol = MModAdd1( Acol, npcol );
641                nb1 = MIN( Anb, Na );
642             }
643             tmp1 = Na - ( kb = MIN( mb1, nb1 ) ); tmp2 = n1 + nb1 - kb;
644             n1p  = PB_Cnumroc( MIN( tmp2, tmp1 ), k + kb, Aimb1, Amb, myrow,
645                                Asrc, nprow );
646          }
647       }
648    }
649    else
650    {
651 /*
652 *  Right - No tran  or  Left - (co)Trans
653 */
654       if( ( Anp <= 0 ) || ( Ais1Col && ( ( Acol >= 0 ) && !( FBCAST ) &&
655                                          ( mycol != Acol ) ) ) ) return;
656       btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
657       bcst = ( ( !Ais1Col ) || ( Ais1Col && ( Acol >= 0 ) && FBCAST ) );
658       AisColRep = ( ( Acol < 0 ) || ( npcol == 1 ) );
659 
660       if( Mupcase( UPLO[0] ) == CUPPER )
661       {
662 /*
663 *  Initiate lookahead
664 */
665          n1    = ( MAX( nprow, 2 ) - 1 ) * Amb;
666          work  = PB_Cmalloc( Nb*MIN( n1, Anq )*size );
667          Aprev = A; Bprev = BR, Bdprev = BC; Anpprev = Anq;
668          mb1   = Aimb1; nb1 = Ainb1; rocprev = Arow;
669          tmp1  = Na - ( kb = MIN( mb1, nb1 ) ); tmp2 = n1 + mb1 - kb;
670          Asrc  = Acol;
671          n1p   = PB_Cnumroc( MIN( tmp1, tmp2 ), kb, Ainb1, Anb, mycol, Asrc,
672                              npcol );
673          talpha1 = talpha2 = ( ( Ais1Row || ( myrow == Arow ) ) ?
674                                ALPHA : one );
675          while( kb > 0 )
676          {
677             kbsize = kb * size;
678 /*
679 *  Partial update of previous block
680 */
681             if( n1pprev > 0 )
682             {
683                if( ( Ais1Row || ( myrow == rocprev ) ) && ( kbprev > 0 ) )
684                   gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &Nb, &n1pprev,
685                         &kbprev, negone, Bdprev, &LDBC, Aprev, &Ald, talpha1,
686                         Bprev, &LDBR );
687 /*
688 *  Send partial updated result to current row
689 */
690                if( !( Ais1Row ) && ChangeRoc )
691                {
692                   if( myrow == rocprev )
693                   {
694                      send( ctxt, Nb, n1pprev, Bprev, LDBR, Arow, mycol );
695                   }
696                   else if( myrow == Arow )
697                   {
698                      recv( ctxt, Nb, n1pprev, work, Nb, rocprev, mycol );
699                      add( &Nb, &n1pprev, one, work, &Nb, one, Bprev, &LDBR );
700                   }
701                }
702             }
703 /*
704 *  Solve current diagonal block
705 */
706             if( Ais1Row || ( myrow == Arow ) )
707             {
708                if( AisColRep || ( mycol == Acol ) )
709                {
710                   trsm( C2F_CHAR( RIGHT ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ),
711                         C2F_CHAR( DIAG ), &Nb, &kb, talpha2, A, &Ald, BR,
712                         &LDBR );
713                   tadd( &Nb, &kb, one, BR, &LDBR, zero, BC, &LDBC );
714                }
715                if( bcst )
716                {
717                   if( mycol == Acol )
718                      bsend( ctxt, ROW, &btop, kb, Nb, BC, LDBC );
719                   else
720                      brecv( ctxt, ROW, &btop, kb, Nb, BC, LDBC, myrow, Acol );
721                }
722                talpha2 = one;
723             }
724             else
725             {
726                if( !( Ais1Row ) && ( AisColRep || ( mycol == Acol ) ) )
727                   pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Nb, &kb, &izero,
728                        zero, zero, BR, &LDBR );
729             }
730 /*
731 *  Finish previous update
732 */
733             if( ( Ais1Row || ( myrow == rocprev ) ) && ( kbprev > 0 ) )
734             {
735                if( ( tmp1 = Anpprev - n1pprev ) > 0  )
736                {
737                   tmp2 = n1pprev * size;
738                   gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &Nb, &tmp1,
739                         &kbprev, negone, Bdprev, &LDBC, Aprev+Ald*tmp2, &Ald,
740                         talpha1, Bprev+LDBR*tmp2, &LDBR );
741                }
742                Aprev  += kbprev * size; talpha1 = one;
743             }
744 /*
745 *  Save info of current step and update info for the next step
746 */
747             if( Ais1Row || ( myrow == Arow ) )
748             { A += kbsize; Bdprev = Bd = BC; BC += kbsize; }
749             if( AisColRep || ( mycol == Acol ) )
750             {
751                Bprev   = ( BR += LDBR * kbsize );
752                A      += Ald * kbsize;
753                Anpprev = ( Anq -= kb );
754                Aprev  += Ald * kbsize;
755             }
756             n1pprev = n1p;
757             rocprev = Arow;
758             kbprev  = kb;
759             k      += kb;
760             Na     -= kb;
761 
762             nb1    -= kb;
763             if( nb1 == 0 )
764             {
765                if( !( Ais1Col ) && ( Acol >= 0 ) )
766                   Acol = MModAdd1( Acol, npcol );
767                nb1 = MIN( Anb, Na );
768             }
769 
770             mb1      -= kb;
771             ChangeRoc = ( mb1 == 0 );
772 
773             if( ChangeRoc )
774             {
775                if( !( Ais1Row ) && ( Arow >= 0 ) )
776                   Arow = MModAdd1( Arow, nprow );
777                mb1 = MIN( Amb, Na );
778             }
779             tmp1 = Na - ( kb = MIN( mb1, nb1 ) ); tmp2 = n1 + mb1 - kb;
780             n1p  = PB_Cnumroc( MIN( tmp2, tmp1 ), k + kb, Ainb1, Anb, mycol,
781                                Asrc, npcol );
782          }
783       }
784       else
785       {
786 /*
787 *  Initiate lookahead
788 */
789          nlast   = ( nprow - 1 ) * Amb;
790          n1      = MAX( nlast, Amb );
791          nlast  += Aimb1;
792          n1last  = n1 - Amb + MAX( Aimb1, Amb );
793          work    = PB_Cmalloc( Nb * MIN( n1last, Anq ) * size );
794          tmp1    = Na-1;
795          Alrow   = PB_Cindxg2p( tmp1, Aimb1, Amb, Arow, Arow, nprow );
796          Alcol   = PB_Cindxg2p( tmp1, Ainb1, Anb, Acol, Acol, npcol );
797          rocprev = Alrow; Anpprev = Anq; Bprev = BR; Bdprev = BC;
798          Aprev   = A = Mptr( A, Anp, 0, Ald, size );
799          mb1     = PB_Clastnb( Na, 0, Aimb1, Amb );
800          nb1     = PB_Clastnb( Na, 0, Ainb1, Anb );
801          tmp1    = Na - ( kb = MIN( mb1, nb1 ) );
802          n1      = ( ( Ais1Row || ( Na-mb1 < nlast ) ) ? n1last : n1 );
803          tmp2    = n1 + mb1 - kb; tmp1 -= ( tmp2 = MIN( tmp1, tmp2 ) );
804          Asrc    = Acol;
805          n1p     = PB_Cnumroc( tmp2, MAX( 0, tmp1 ), Ainb1, Anb, mycol, Asrc,
806                                npcol );
807          talpha1 = talpha2 = ( ( Ais1Row || ( myrow == Alrow ) ) ?
808                                ALPHA : one );
809          while( Na > 0 )
810          {
811             kbsize = kb * size;
812 
813             if( Ais1Row || ( myrow == Alrow ) )
814             { A -= kbsize; Anp -= kb; Bd = Mptr( BC, Anp, 0, LDBC, size ); }
815             if( ( Acol < 0 ) || ( mycol == Alcol ) ) { Anq -= kb; }
816 /*
817 *  Partial update of previous block
818 */
819             if( n1pprev > 0 )
820             {
821                if( ( Ais1Row || ( myrow == rocprev ) ) && ( kbprev > 0 ) )
822                {
823                   tmp1 = ( Anpprev - n1pprev ) * size;
824                   TYPE->Fgemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ),
825                                &Nb, &n1pprev, &kbprev, negone, Bdprev,
826                                &LDBC, Aprev+Ald*tmp1, &Ald, talpha1,
827                                Bprev+LDBR*tmp1, &LDBR );
828                }
829 /*
830 *  Send partial updated result to current row
831 */
832                if( !( Ais1Row ) && ChangeRoc )
833                {
834                   if( myrow == rocprev )
835                   {
836                      send( ctxt, Nb, n1pprev, Mptr( Bprev, 0, Anpprev-n1pprev,
837                            LDBR, size ), LDBR, Alrow, mycol );
838                   }
839                   else if( myrow == Alrow )
840                   {
841                      recv( ctxt, Nb, n1pprev, work, Nb, rocprev, mycol );
842                      add( &Nb, &n1pprev, one, work, &Nb, one, Mptr( Bprev, 0,
843                           Anpprev-n1pprev, LDBR, size ), &LDBR );
844                   }
845                }
846             }
847 /*
848 *  Solve current diagonal block
849 */
850             if( Ais1Row || ( myrow == Alrow ) )
851             {
852                if( AisColRep || ( mycol == Alcol ) )
853                {
854                   trsm( C2F_CHAR( RIGHT ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ),
855                         C2F_CHAR( DIAG ), &Nb, &kb, talpha2, Mptr( A, 0, Anq,
856                         Ald, size ), &Ald, Mptr( BR, 0, Anq, LDBR, size ),
857                         &LDBR );
858                   tadd( &Nb, &kb, one, Mptr( BR, 0, Anq, LDBR, size ), &LDBR,
859                         zero, Mptr( BC, Anp, 0, LDBC, size ), &LDBC );
860                }
861                if( bcst )
862                {
863                   if( mycol == Alcol )
864                      bsend( ctxt, ROW, &btop, kb, Nb, Mptr( BC, Anp, 0, LDBC,
865                             size ), LDBC );
866                   else
867                      brecv( ctxt, ROW, &btop, kb, Nb, Mptr( BC, Anp, 0, LDBC,
868                             size ), LDBC, myrow, Alcol );
869                }
870                talpha2 = one;
871             }
872             else
873             {
874                if( !( Ais1Row ) && ( AisColRep || ( mycol == Alcol ) ) )
875                   pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Nb, &kb, &izero,
876                        zero, zero, Mptr( BR, 0, Anq, LDBR, size ), &LDBR );
877             }
878 /*
879 *  Finish previous update
880 */
881             if( ( Ais1Row || ( myrow == rocprev ) ) && ( kbprev > 0 ) )
882             {
883                if( ( tmp1 = Anpprev - n1pprev ) > 0 )
884                   gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &Nb, &tmp1,
885                         &kbprev, negone, Bdprev, &LDBC, Aprev, &Ald, talpha1,
886                         Bprev, &LDBR );
887                talpha1 = one;
888             }
889 /*
890 *  Save info of current step and update info for the next step
891 */
892             if(  Ais1Row  || ( myrow == Alrow ) ) { Bdprev = Bd; Aprev = A; }
893             if( AisColRep || ( mycol == Alcol ) ) { Anpprev -= kb; }
894 
895             n1pprev = n1p;
896             rocprev = Alrow;
897             kbprev  = kb;
898             k      += kb;
899             Na     -= kb;
900 
901             nb1    -= kb;
902             if( nb1 == 0 )
903             {
904                if( !( Ais1Col ) && ( Alcol >= 0 ) )
905                   Alcol = MModSub1( Alcol, npcol );
906                nb1 = ( Na > Ainb1 ? Anb : Ainb1 );
907             }
908 
909             mb1      -= kb;
910             ChangeRoc = ( mb1 == 0 );
911 
912             if( ChangeRoc )
913             {
914                if( !( Ais1Row ) && ( Alrow >= 0 ) )
915                   Alrow = MModSub1( Alrow, nprow );
916                mb1 = ( Na > Aimb1 ? Amb : Aimb1 );
917             }
918             tmp1 = Na - ( kb = MIN( mb1, nb1 ) );
919             n1   = ( ( Ais1Row || ( Na-mb1 < nlast ) ) ? n1last : n1 );
920             tmp2 = n1 + mb1 - kb; tmp1 -= ( tmp2 = MIN( tmp1, tmp2 ) );
921             n1p  = PB_Cnumroc( tmp2, MAX( 0, tmp1 ), Ainb1, Anb, mycol, Asrc,
922                                npcol );
923          }
924       }
925    }
926    if( work ) free( work );
927 /*
928 *  End of PB_Cptrsm
929 */
930 }
931