SRC/PTOOLS/PB_Cptrsm.c

/* ---------------------------------------------------------------------
*
*  -- PBLAS auxiliary routine (version 2.0) --
*     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
*     and University of California, Berkeley.
*     April 1, 1998
*
*  ---------------------------------------------------------------------
*/
/*
*  Include files
*/
#include "../pblas.h"
#include "../PBpblas.h"
#include "../PBtools.h"
#include "../PBblacs.h"
#include "../PBblas.h"

#ifdef __STDC__
void PB_Cptrsm( PBTYP_T * TYPE, int FBCAST, char * SIDE, char * UPLO,
                char * TRANS, char * DIAG, int M, int N, char * ALPHA,
                char * A, int IA, int JA, int * DESCA, char * BC,
                int LDBC, char * BR, int LDBR )
#else
void PB_Cptrsm( TYPE, FBCAST, SIDE, UPLO, TRANS, DIAG, M, N, ALPHA,
                A, IA, JA, DESCA, BC, LDBC, BR, LDBR )
/*
*  .. Scalar Arguments ..
*/
   char           * ALPHA, * DIAG, * SIDE, * TRANS, * UPLO;
   int            FBCAST, IA, JA, LDBC, LDBR, M, N;
   PBTYP_T        * TYPE;
/*
*  .. Array Arguments ..
*/
   int            * DESCA;
   char           * A, * BC, * BR;
#endif
{
/*
*  Purpose
*  =======
*
*  PB_Cptrsm  solves one of the matrix equations
*
*     op( sub( A ) ) * X = B,   or    X * op( sub( A ) ) = alpha * B,
*
*  where
*
*     sub( A ) denotes   A(IA:IA+M-1,JA:JA+M-1)  if SIDE = 'L',
*                        A(IA:IA+N-1,JA:JA+N-1)  if SIDE = 'R'.
*
*  X and B are m by n submatrices, sub( A ) is a unit, or non-unit,
*  upper or lower triangular submatrix and op( Y ) is one of
*
*     op( Y ) = Y   or   op( Y ) = Y'   or   op( Y ) = conjg( Y' ).
*
*  The submatrix X is overwritten on B.
*
*  No test for  singularity  or  near-singularity  is included  in  this
*  routine. Such tests must be performed before calling this routine.
*
*  Notes
*  =====
*
*  A description  vector  is associated with each 2D block-cyclicly dis-
*  tributed matrix.  This  vector  stores  the  information  required to
*  establish the  mapping  between a  matrix entry and its corresponding
*  process and memory location.
*
*  In  the  following  comments,   the character _  should  be  read  as
*  "of  the  distributed  matrix".  Let  A  be a generic term for any 2D
*  block cyclicly distributed matrix.  Its description vector is DESC_A:
*
*  NOTATION         STORED IN       EXPLANATION
*  ---------------- --------------- ------------------------------------
*  DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
*  CTXT_A  (global) DESCA[ CTXT_  ] The BLACS context handle, indicating
*                                   the NPROW x NPCOL BLACS process grid
*                                   A  is  distributed over. The context
*                                   itself  is  global,  but  the handle
*                                   (the integer value) may vary.
*  M_A     (global) DESCA[ M_     ] The  number of rows in the distribu-
*                                   ted matrix A, M_A >= 0.
*  N_A     (global) DESCA[ N_     ] The number of columns in the distri-
*                                   buted matrix A, N_A >= 0.
*  IMB_A   (global) DESCA[ IMB_   ] The number of rows of the upper left
*                                   block of the matrix A, IMB_A > 0.
*  INB_A   (global) DESCA[ INB_   ] The  number  of columns of the upper
*                                   left   block   of   the  matrix   A,
*                                   INB_A > 0.
*  MB_A    (global) DESCA[ MB_    ] The blocking factor used to  distri-
*                                   bute the last  M_A-IMB_A  rows of A,
*                                   MB_A > 0.
*  NB_A    (global) DESCA[ NB_    ] The blocking factor used to  distri-
*                                   bute the last  N_A-INB_A  columns of
*                                   A, NB_A > 0.
*  RSRC_A  (global) DESCA[ RSRC_  ] The process row over which the first
*                                   row of the matrix  A is distributed,
*                                   NPROW > RSRC_A >= 0.
*  CSRC_A  (global) DESCA[ CSRC_  ] The  process column  over  which the
*                                   first column of  A  is  distributed.
*                                   NPCOL > CSRC_A >= 0.
*  LLD_A   (local)  DESCA[ LLD_   ] The  leading dimension  of the local
*                                   array  storing  the  local blocks of
*                                   the distributed matrix A,
*                                   IF( Lc( 1, N_A ) > 0 )
*                                      LLD_A >= MAX( 1, Lr( 1, M_A ) )
*                                   ELSE
*                                      LLD_A >= 1.
*
*  Let K be the number of  rows of a matrix A starting at the global in-
*  dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
*  that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
*  receive if these K rows were distributed over NPROW processes.  If  K
*  is the number of columns of a matrix  A  starting at the global index
*  JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number  of co-
*  lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would  receive if
*  these K columns were distributed over NPCOL processes.
*
*  The values of Lr() and Lc() may be determined via a call to the func-
*  tion PB_Cnumroc:
*  Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
*  Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
*
*  Arguments
*  =========
*
*  TYPE    (local input) pointer to a PBTYP_T structure
*          On entry,  TYPE  is a pointer to a structure of type PBTYP_T,
*          that contains type information (See pblas.h).
*
*  FBCAST  (global input) INTEGER
*          On entry, FBCAST specifies whether the transposed of the vec-
*          tor solution should be broadcast or not when there is a  pos-
*          sible ambiguity, i.e. when sub( A ) is just one  block.  When
*          FBCAST is zero, the solution vector is not broadcast, and the
*          the solution vector is broadcast otherwise.
*
*  SIDE    (global input) pointer to CHAR
*          On entry,  SIDE  specifies  whether op( sub( A ) ) appears on
*          the left or right of X as follows:
*
*             SIDE = 'L' or 'l'   op( sub( A ) ) * X = B,
*
*             SIDE = 'R' or 'r'   X * op( sub( A ) ) = B.
*
*  UPLO    (global input) pointer to CHAR
*          On entry,  UPLO  specifies whether the submatrix  sub( A ) is
*          an upper or lower triangular submatrix as follows:
*
*             UPLO = 'U' or 'u'   sub( A ) is an upper triangular
*                                 submatrix,
*
*             UPLO = 'L' or 'l'   sub( A ) is a  lower triangular
*                                 submatrix.
*
*  TRANS   (global input) pointer to CHAR
*          On entry,  TRANS  specifies the  operation to be performed as
*          follows:
*
*             TRANS = 'N' or 'n'   sub( A )  * X = B,
*
*             TRANS = 'T' or 't'   sub( A )' * X = B,
*
*             TRANS = 'C' or 'c'   conjg( sub( A )' ) * X = B.
*
*  DIAG    (global input) pointer to CHAR
*          On entry,  DIAG  specifies  whether or not  sub( A )  is unit
*          triangular as follows:
*
*             DIAG = 'U' or 'u'  sub( A )  is  assumed to be unit trian-
*                                gular,
*
*             DIAG = 'N' or 'n'  sub( A ) is not assumed to be unit tri-
*                                angular.
*
*  M       (global input) INTEGER
*          On entry, M  specifies the number of rows of the submatrix B.
*          M  must be at least zero.
*
*  N       (global input) INTEGER
*          On entry, N  specifies the number of columns of the submatrix
*          B. N  must be at least zero.
*
*  A       (local input) pointer to CHAR
*          On entry, A is an array of dimension (LLD_A, Ka), where Ka is
*          at  least  Lc( 0, JA+M-1 )  when  SIDE = 'L' or 'l' and is at
*          least  Lc( 0, JA+N-1 ) otherwise.  Before  entry, this  array
*          contains the local entries of the matrix A.
*          Before entry with  UPLO = 'U' or 'u', this array contains the
*          local entries corresponding to  the entries of the upper tri-
*          angular submatrix  sub( A ), and the local entries correspon-
*          ding to the entries of the strictly lower triangular part  of
*          the submatrix  sub( A )  are not referenced.
*          Before entry with  UPLO = 'L' or 'l', this array contains the
*          local entries corresponding to  the entries of the lower tri-
*          angular submatrix  sub( A ), and the local entries correspon-
*          ding to the entries of the strictly upper triangular part  of
*          the submatrix  sub( A )  are not referenced.
*          Note  that  when DIAG = 'U' or 'u', the local entries corres-
*          ponding to the  diagonal elements  of the submatrix  sub( A )
*          are not referenced either, but are assumed to be unity.
*
*  IA      (global input) INTEGER
*          On entry, IA  specifies A's global row index, which points to
*          the beginning of the submatrix sub( A ).
*
*  JA      (global input) INTEGER
*          On entry, JA  specifies A's global column index, which points
*          to the beginning of the submatrix sub( A ).
*
*  DESCA   (global and local input) INTEGER array
*          On entry, DESCA  is an integer array of dimension DLEN_. This
*          is the array descriptor for the matrix A.
*
*  BC      (local input/local output) pointer to CHAR
*          On entry, BC is  an array of dimension (LDBC,Kbc), where  Kbc
*          is at least N when  SIDE is 'L' or 'l' and at least M  other-
*          wise. Before entry, when SIDE is 'L' or  'l' and TRANS is 'N'
*          or  'n'  or  SIDE is 'R' or 'r' and TRANS  is not 'N' or 'n',
*          this  array contains the local entries of the right-hand-side
*          matrix B. Otherwise, the entries of BC  should  be  zero.  On
*          exit, this  array contains the partial  solution matrix X.
*
*  LDBC    (local input) INTEGER
*          On entry,  LDBC  specifies the leading dimension of the array
*          BC. LDBC must  be  at  least MAX( 1, Lr( IA, M ) ) when  SIDE
*          is 'L' or 'l' and at  least MAX( 1, Lr( IA, N ) ) otherwise.
*
*  BR      (local input/local output) pointer to CHAR
*          On entry, BR is  an array of dimension (LDBR,Kbr), where  Kbr
*          is at least Lc( JA, M ) when  SIDE is 'L' or 'l' and at least
*          Lc( JA, N ) otherwise. Before entry, when SIDE is 'L' or  'l'
*          and  TRANS  is 'N' or 'n' or SIDE is 'R' or 'r' and TRANS  is
*          not 'N' or 'n', the entries of BR should be zero.  Otherwise,
*          this  array contains the local entries of the right-hand-side
*          matrix B. On exit, this  array contains the partial  solution
*          matrix X.
*
*  LDBR    (local input) INTEGER
*          On entry,  LDBR  specifies the leading dimension of the array
*          BR. LDBR must be at least MAX( 1, N ) when SIDE is 'L' or 'l'
*          and at least MAX( 1, M ) otherwise.
*
*  -- Written on April 1, 1998 by
*     Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
*
*  ---------------------------------------------------------------------
*/
/*
*  .. Local Scalars ..
*/
   char           btop, * negone, * one, * talpha1, * talpha2, * zero;
   int            Acol, Aii, Aimb1, Ainb1, Ais1Col, Ais1Row, AisColRep,
                  AisRowRep, Ajj, Alcol, Ald, Alrow, Amb, Anpprev, Anb, Anp,
                  Anq, Arow, Asrc, ChangeRoc=0, LNorRT, Na, Nb, bcst, ctxt,
                  izero=0, k=0, kb, kbprev=0, kbsize, lside, mb1, mycol, myrow,
                  n1, n1last, n1p, n1pprev=0, nb1, nlast, notran, npcol, nprow,
                  rocprev, size, tmp1, tmp2;
   MMADD_T        add, tadd;
   TZPAD_T        pad;
   GEMM_T         gemm;
   TRSM_T         trsm;
   GESD2D_T       send;
   GERV2D_T       recv;
   GEBS2D_T       bsend;
   GEBR2D_T       brecv;
/*
*  .. Local Arrays ..
*/
   char           * Aprev = NULL, * Bd    = NULL, * Bdprev = NULL,
                  * Bprev = NULL, * work  = NULL;
/* ..
*  .. Executable Statements ..
*
*/
   if( ( M <= 0 ) || ( N <= 0 ) ) return;
/*
*  Retrieve process grid information
*/
   Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol );

   lside  = ( Mupcase( SIDE [0] ) ==   CLEFT );
   notran = ( Mupcase( TRANS[0] ) == CNOTRAN );
   LNorRT = ( lside && notran ) || ( !( lside ) && !( notran ) );
   if( LNorRT ) { Na = M; Nb = N; } else { Na = N; Nb = M; }
/*
*  Retrieve sub( A )'s local information: Aii, Ajj, Arow, Acol ...
*/
   PB_Cinfog2l( IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Arow,
                &Acol );
/*
*  Determine if sub( A ) spans more than one process row, and/or more than one
*  process column.
*/
   Amb     = DESCA[MB_]; Anb = DESCA[NB_]; Ald = DESCA[LLD_ ];
   Aimb1   = PB_Cfirstnb( Na, IA, DESCA[IMB_], Amb );
   Anp     = PB_Cnumroc( Na, 0, Aimb1, Amb, myrow, Arow, nprow );
   Ais1Row = !( PB_Cspan( Na, 0, Aimb1, Amb, Arow, nprow ) );
   Ainb1   = PB_Cfirstnb( Na, JA, DESCA[INB_], Anb );
   Anq     = PB_Cnumroc( Na, 0, Ainb1, Anb, mycol, Acol, npcol );
   Ais1Col = !( PB_Cspan( Na, 0, Ainb1, Anb, Acol, npcol ) );
/*
*  When sub( A ) spans only one process, solve the system locally and return.
*/
   if( Ais1Row && Ais1Col )
   {
      if( LNorRT )
      {
         if( Anq > 0 )
         {
            if( Anp > 0 )
            {
               TYPE->Ftrsm( C2F_CHAR( ( notran ? SIDE : ( lside ? RIGHT :
                            LEFT ) ) ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ),
                            C2F_CHAR( DIAG ), &M, &N, ALPHA, Mptr( A, Aii, Ajj,
                            Ald, TYPE->size ), &Ald, BC, &LDBC );
               TYPE->Fmmtadd( &M, &N, TYPE->one, BC, &LDBC, TYPE->zero, BR,
                              &LDBR );
            }
            if( ( Arow >= 0 ) && FBCAST )
            {
               btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
               if( myrow == Arow )
                  TYPE->Cgebs2d( ctxt, COLUMN, &btop, N, M, BR, LDBR );
               else
                  TYPE->Cgebr2d( ctxt, COLUMN, &btop, N, M, BR, LDBR, Arow,
                                 mycol );
            }
         }
      }
      else
      {
         if( Anp > 0 )
         {
            if( Anq > 0 )
            {
               TYPE->Ftrsm( C2F_CHAR( ( notran ? SIDE : ( lside ? RIGHT :
                            LEFT ) ) ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ),
                            C2F_CHAR( DIAG ), &M, &N, ALPHA, Mptr( A, Aii, Ajj,
                            Ald, TYPE->size ), &Ald, BR, &LDBR );
               TYPE->Fmmtadd( &M, &N, TYPE->one, BR, &LDBR, TYPE->zero, BC,
                              &LDBC );
            }
            if( ( Acol >= 0 ) && FBCAST )
            {
               btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
               if( mycol == Acol )
                  TYPE->Cgebs2d( ctxt, ROW, &btop, N, M, BC, LDBC );
               else
                  TYPE->Cgebr2d( ctxt, ROW, &btop, N, M, BC, LDBC, myrow,
                                 Acol );
            }
         }
      }
      return;
   }
/*
*  Retrieve from TYPE structure useful BLAS and BLACS functions.
*/
   size   = TYPE->size;
   negone = TYPE->negone;  one   = TYPE->one;     zero = TYPE->zero;
   add    = TYPE->Fmmadd;  tadd  = TYPE->Fmmtadd; pad  = TYPE->Ftzpad;
   gemm   = TYPE->Fgemm;   trsm  = TYPE->Ftrsm;
   send   = TYPE->Cgesd2d; recv  = TYPE->Cgerv2d;
   bsend  = TYPE->Cgebs2d; brecv = TYPE->Cgebr2d;

   if( ( Anp > 0 ) && ( Anq > 0 ) ) A = Mptr( A, Aii, Ajj, Ald, size );

   if( LNorRT )
   {
/*
*  Left - No tran  or  Right - (co)Trans
*/
      if( ( Anq <= 0 ) || ( Ais1Row && ( ( Arow >= 0 ) && !( FBCAST ) &&
                                        ( myrow != Arow ) ) ) ) return;
      btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
      bcst = ( ( !Ais1Row ) || ( Ais1Row && ( Arow >= 0 ) && FBCAST ) );
      AisRowRep = ( ( Arow < 0 ) || ( nprow == 1 ) );

      if( Mupcase( UPLO[0] ) == CUPPER )
      {
/*
*  Initiate lookahead
*/
         nlast   = ( npcol - 1 ) * Anb;
         n1      = MAX( nlast, Anb );
         nlast  += Ainb1;
         n1last  = n1 - Anb + MAX( Ainb1, Anb );
         work    = PB_Cmalloc( Nb * MIN( n1last, Anp ) * size );
         tmp1    = Na-1;
         Alrow   = PB_Cindxg2p( tmp1, Aimb1, Amb, Arow, Arow, nprow );
         Alcol   = PB_Cindxg2p( tmp1, Ainb1, Anb, Acol, Acol, npcol );
         rocprev = Alcol; Anpprev = Anp; Bprev = BC; Bdprev = BR;
         Aprev   = A = Mptr( A, 0, Anq, Ald, size );
         mb1     = PB_Clastnb( Na, 0, Aimb1, Amb );
         nb1     = PB_Clastnb( Na, 0, Ainb1, Anb );
         tmp1    = Na - ( kb = MIN( mb1, nb1 ) );
         n1      = ( ( Ais1Col || ( Na - nb1 < nlast ) ) ? n1last : n1 );
         tmp2    = n1 + nb1 - kb; tmp1 -= ( tmp2 = MIN( tmp1, tmp2 ) );
         Asrc    = Arow;
         n1p     = PB_Cnumroc( tmp2, MAX( 0, tmp1 ), Aimb1, Amb, myrow, Asrc,
                               nprow );
         talpha1 = talpha2 = ( ( Ais1Col || ( mycol == Alcol ) ) ?
                               ALPHA : one );
         while( Na > 0 )
         {
            kbsize = kb * size;

            if( Ais1Col || ( mycol == Alcol ) )
            { A -= Ald*kbsize; Anq -= kb; Bd = Mptr( BR, 0, Anq, LDBR, size ); }
            if( ( Arow < 0 ) || ( myrow == Alrow ) ) { Anp -= kb; }
/*
*  Partial update of previous block
*/
            if( n1pprev > 0 )
            {
               if( ( Ais1Col || ( mycol == rocprev ) ) && ( kbprev > 0 ) )
               {
                  tmp1 = ( Anpprev - n1pprev ) * size;
                  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &n1pprev, &Nb,
                        &kbprev, negone, Aprev+tmp1, &Ald, Bdprev, &LDBR,
                        talpha1, Bprev+tmp1, &LDBC );
               }
/*
*  Send partial updated result to current column
*/
               if( !( Ais1Col ) && ChangeRoc )
               {
                  if( mycol == rocprev )
                  {
                     send( ctxt, n1pprev, Nb, Mptr( Bprev, Anpprev-n1pprev, 0,
                           LDBC, size ), LDBC, myrow, Alcol );
                  }
                  else if( mycol == Alcol )
                  {
                     recv( ctxt, n1pprev, Nb, work, n1pprev, myrow, rocprev );
                     add( &n1pprev, &Nb, one, work, &n1pprev, one, Mptr( Bprev,
                          Anpprev-n1pprev, 0, LDBC, size ), &LDBC );
                  }
               }
            }
/*
*  Solve current diagonal block
*/
            if( Ais1Col || ( mycol == Alcol ) )
            {
               if( AisRowRep || ( myrow == Alrow ) )
               {
                  trsm( C2F_CHAR( LEFT ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ),
                        C2F_CHAR( DIAG ), &kb, &Nb, talpha2,  Mptr( A, Anp, 0,
                        Ald, size ), &Ald, Mptr( BC, Anp, 0, LDBC, size ),
                        &LDBC );
                  tadd( &kb, &Nb, one, Mptr( BC, Anp, 0, LDBC, size ), &LDBC,
                        zero, Mptr( BR, 0, Anq, LDBR, size ), &LDBR );
               }
               if( bcst )
               {
                  if( myrow == Alrow )
                     bsend( ctxt, COLUMN, &btop, Nb, kb, Mptr( BR, 0, Anq, LDBR,
                            size ), LDBR );
                  else
                     brecv( ctxt, COLUMN, &btop, Nb, kb, Mptr( BR, 0, Anq, LDBR,
                            size ), LDBR, Alrow, mycol );
               }
               talpha2 = one;
            }
            else
            {
               if( !( Ais1Col ) && ( AisRowRep || ( myrow == Alrow ) ) )
                  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &kb, &Nb, &izero,
                       zero, zero, Mptr( BC, Anp, 0, LDBC, size ), &LDBC );
            }
/*
*  Finish previous update
*/
            if( ( Ais1Col || ( mycol == rocprev ) ) && ( kbprev > 0 ) )
            {
               if( ( tmp1 = Anpprev - n1pprev ) > 0 )
                  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &tmp1, &Nb,
                        &kbprev, negone, Aprev, &Ald, Bdprev, &LDBR, talpha1,
                        Bprev, &LDBC );
               talpha1 = one;
            }
/*
*  Save info of current step and update info for the next step
*/
            if( Ais1Col   || ( mycol == Alcol ) ) { Bdprev   = Bd; Aprev = A; }
            if( AisRowRep || ( myrow == Alrow ) ) { Anpprev -= kb; }

            n1pprev = n1p;
            rocprev = Alcol;
            kbprev  = kb;
            k      += kb;
            Na     -= kb;

            mb1    -= kb;
            if( mb1 == 0 )
            {
               if( !( Ais1Row ) && ( Alrow >= 0 ) )
                  Alrow = MModSub1( Alrow, nprow );
               mb1 = ( Na > Aimb1 ? Amb : Aimb1 );
            }

            nb1      -= kb;
            ChangeRoc = ( nb1 == 0 );

            if( ChangeRoc )
            {
               if( !( Ais1Col ) && ( Alcol >= 0 ) )
                  Alcol = MModSub1( Alcol, npcol );
               nb1 = ( Na > Ainb1 ? Anb : Ainb1 );
            }
            tmp1 = Na - ( kb = MIN( mb1, nb1 ) );
            n1   = ( ( Ais1Col || ( Na-nb1 < nlast ) ) ? n1last : n1 );
            tmp2 = n1 + nb1 - kb; tmp1 -= ( tmp2 = MIN( tmp1, tmp2 ) );
            n1p  = PB_Cnumroc( tmp2, MAX( 0, tmp1 ), Aimb1, Amb, myrow, Asrc,
                               nprow );
         }
      }
      else
      {
/*
*  Initiate lookahead
*/
         n1    = ( MAX( npcol, 2 ) - 1 ) * Anb;
         work  = PB_Cmalloc( Nb*MIN( n1, Anp )*size );
         Aprev = A; Bprev = BC, Bdprev = BR; Anpprev = Anp;
         mb1   = Aimb1; nb1 = Ainb1; rocprev = Acol;
         tmp1  = Na - ( kb = MIN( mb1, nb1 ) ); tmp2 = n1 + nb1 - kb;
         Asrc  = Arow;
         n1p   = PB_Cnumroc( MIN( tmp1, tmp2 ), kb, Aimb1, Amb, myrow, Asrc,
                             nprow );
         talpha1 = talpha2 = ( ( Ais1Col || ( mycol == Acol ) ) ?
                               ALPHA : one );
         while( kb > 0 )
         {
            kbsize = kb * size;
/*
*  Partial update of previous block
*/
            if( n1pprev > 0 )
            {
               if( ( Ais1Col || ( mycol == rocprev ) ) && ( kbprev > 0 ) )
                  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &n1pprev, &Nb,
                        &kbprev, negone, Aprev, &Ald, Bdprev, &LDBR, talpha1,
                        Bprev, &LDBC );
/*
*  Send partial updated result to current column
*/
               if( !( Ais1Col ) && ChangeRoc )
               {
                  if( mycol == rocprev )
                  {
                     send( ctxt, n1pprev, Nb, Bprev, LDBC, myrow, Acol );
                  }
                  else if( mycol == Acol )
                  {
                     recv( ctxt, n1pprev, Nb, work, n1pprev, myrow, rocprev );
                     add( &n1pprev, &Nb, one, work, &n1pprev, one, Bprev,
                          &LDBC );
                  }
               }
            }
/*
*  Solve current diagonal block
*/
            if( Ais1Col || ( mycol == Acol ) )
            {
               if( AisRowRep || ( myrow == Arow ) )
               {
                  trsm( C2F_CHAR( LEFT ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ),
                        C2F_CHAR( DIAG ), &kb, &Nb, talpha2, A, &Ald, BC,
                        &LDBC );
                  tadd( &kb, &Nb, one, BC, &LDBC, zero, BR, &LDBR );
               }
               if( bcst )
               {
                  if( myrow == Arow )
                     bsend( ctxt, COLUMN, &btop, Nb, kb, BR, LDBR );
                  else
                     brecv( ctxt, COLUMN, &btop, Nb, kb, BR, LDBR, Arow,
                            mycol );
               }
               talpha2 = one;
            }
            else
            {
               if( !( Ais1Col ) && ( AisRowRep || ( myrow == Arow ) ) )
                  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &kb, &Nb, &izero,
                       zero, zero, BC, &LDBC );
            }
/*
*  Finish previous update
*/
            if( ( Ais1Col || ( mycol == rocprev ) ) && ( kbprev > 0 ) )
            {
               if( ( tmp1 = Anpprev - n1pprev ) > 0 )
               {
                  tmp2 = n1pprev * size;
                  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &tmp1, &Nb,
                        &kbprev, negone, Aprev+tmp2, &Ald, Bdprev, &LDBR,
                        talpha1, Bprev+tmp2, &LDBC );
               }
               Aprev += Ald * kbprev * size; talpha1 = one;
            }
/*
*  Save info of current step and update info for the next step
*/
            if( Ais1Col || ( mycol == Acol ) )
            { A += Ald*kbsize; Bdprev = Bd = BR; BR += LDBR*kbsize; }
            if( AisRowRep || ( myrow == Arow ) )
            {
               Bprev   = ( BC += kbsize );
               A      += kbsize;
               Aprev  += kbsize;
               Anpprev = ( Anp -= kb );
            }
            n1pprev = n1p;
            rocprev = Acol;
            kbprev  = kb;
            k      += kb;
            Na     -= kb;

            mb1    -= kb;
            if( mb1 == 0 )
            {
               if( !( Ais1Row ) && ( Arow >= 0 ) )
                  Arow = MModAdd1( Arow, nprow );
               mb1 = MIN( Amb, Na );
            }

            nb1      -= kb;
            ChangeRoc = ( nb1 == 0 );

            if( ChangeRoc )
            {
               if( !( Ais1Col ) && ( Acol >= 0 ) )
                  Acol = MModAdd1( Acol, npcol );
               nb1 = MIN( Anb, Na );
            }
            tmp1 = Na - ( kb = MIN( mb1, nb1 ) ); tmp2 = n1 + nb1 - kb;
            n1p  = PB_Cnumroc( MIN( tmp2, tmp1 ), k + kb, Aimb1, Amb, myrow,
                               Asrc, nprow );
         }
      }
   }
   else
   {
/*
*  Right - No tran  or  Left - (co)Trans
*/
      if( ( Anp <= 0 ) || ( Ais1Col && ( ( Acol >= 0 ) && !( FBCAST ) &&
                                         ( mycol != Acol ) ) ) ) return;
      btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
      bcst = ( ( !Ais1Col ) || ( Ais1Col && ( Acol >= 0 ) && FBCAST ) );
      AisColRep = ( ( Acol < 0 ) || ( npcol == 1 ) );

      if( Mupcase( UPLO[0] ) == CUPPER )
      {
/*
*  Initiate lookahead
*/
         n1    = ( MAX( nprow, 2 ) - 1 ) * Amb;
         work  = PB_Cmalloc( Nb*MIN( n1, Anq )*size );
         Aprev = A; Bprev = BR, Bdprev = BC; Anpprev = Anq;
         mb1   = Aimb1; nb1 = Ainb1; rocprev = Arow;
         tmp1  = Na - ( kb = MIN( mb1, nb1 ) ); tmp2 = n1 + mb1 - kb;
         Asrc  = Acol;
         n1p   = PB_Cnumroc( MIN( tmp1, tmp2 ), kb, Ainb1, Anb, mycol, Asrc,
                             npcol );
         talpha1 = talpha2 = ( ( Ais1Row || ( myrow == Arow ) ) ?
                               ALPHA : one );
         while( kb > 0 )
         {
            kbsize = kb * size;
/*
*  Partial update of previous block
*/
            if( n1pprev > 0 )
            {
               if( ( Ais1Row || ( myrow == rocprev ) ) && ( kbprev > 0 ) )
                  gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &Nb, &n1pprev,
                        &kbprev, negone, Bdprev, &LDBC, Aprev, &Ald, talpha1,
                        Bprev, &LDBR );
/*
*  Send partial updated result to current row
*/
               if( !( Ais1Row ) && ChangeRoc )
               {
                  if( myrow == rocprev )
                  {
                     send( ctxt, Nb, n1pprev, Bprev, LDBR, Arow, mycol );
                  }
                  else if( myrow == Arow )
                  {
                     recv( ctxt, Nb, n1pprev, work, Nb, rocprev, mycol );
                     add( &Nb, &n1pprev, one, work, &Nb, one, Bprev, &LDBR );
                  }
               }
            }
/*
*  Solve current diagonal block
*/
            if( Ais1Row || ( myrow == Arow ) )
            {
               if( AisColRep || ( mycol == Acol ) )
               {
                  trsm( C2F_CHAR( RIGHT ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ),
                        C2F_CHAR( DIAG ), &Nb, &kb, talpha2, A, &Ald, BR,
                        &LDBR );
                  tadd( &Nb, &kb, one, BR, &LDBR, zero, BC, &LDBC );
               }
               if( bcst )
               {
                  if( mycol == Acol )
                     bsend( ctxt, ROW, &btop, kb, Nb, BC, LDBC );
                  else
                     brecv( ctxt, ROW, &btop, kb, Nb, BC, LDBC, myrow, Acol );
               }
               talpha2 = one;
            }
            else
            {
               if( !( Ais1Row ) && ( AisColRep || ( mycol == Acol ) ) )
                  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Nb, &kb, &izero,
                       zero, zero, BR, &LDBR );
            }
/*
*  Finish previous update
*/
            if( ( Ais1Row || ( myrow == rocprev ) ) && ( kbprev > 0 ) )
            {
               if( ( tmp1 = Anpprev - n1pprev ) > 0  )
               {
                  tmp2 = n1pprev * size;
                  gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &Nb, &tmp1,
                        &kbprev, negone, Bdprev, &LDBC, Aprev+Ald*tmp2, &Ald,
                        talpha1, Bprev+LDBR*tmp2, &LDBR );
               }
               Aprev  += kbprev * size; talpha1 = one;
            }
/*
*  Save info of current step and update info for the next step
*/
            if( Ais1Row || ( myrow == Arow ) )
            { A += kbsize; Bdprev = Bd = BC; BC += kbsize; }
            if( AisColRep || ( mycol == Acol ) )
            {
               Bprev   = ( BR += LDBR * kbsize );
               A      += Ald * kbsize;
               Anpprev = ( Anq -= kb );
               Aprev  += Ald * kbsize;
            }
            n1pprev = n1p;
            rocprev = Arow;
            kbprev  = kb;
            k      += kb;
            Na     -= kb;

            nb1    -= kb;
            if( nb1 == 0 )
            {
               if( !( Ais1Col ) && ( Acol >= 0 ) )
                  Acol = MModAdd1( Acol, npcol );
               nb1 = MIN( Anb, Na );
            }

            mb1      -= kb;
            ChangeRoc = ( mb1 == 0 );

            if( ChangeRoc )
            {
               if( !( Ais1Row ) && ( Arow >= 0 ) )
                  Arow = MModAdd1( Arow, nprow );
               mb1 = MIN( Amb, Na );
            }
            tmp1 = Na - ( kb = MIN( mb1, nb1 ) ); tmp2 = n1 + mb1 - kb;
            n1p  = PB_Cnumroc( MIN( tmp2, tmp1 ), k + kb, Ainb1, Anb, mycol,
                               Asrc, npcol );
         }
      }
      else
      {
/*
*  Initiate lookahead
*/
         nlast   = ( nprow - 1 ) * Amb;
         n1      = MAX( nlast, Amb );
         nlast  += Aimb1;
         n1last  = n1 - Amb + MAX( Aimb1, Amb );
         work    = PB_Cmalloc( Nb * MIN( n1last, Anq ) * size );
         tmp1    = Na-1;
         Alrow   = PB_Cindxg2p( tmp1, Aimb1, Amb, Arow, Arow, nprow );
         Alcol   = PB_Cindxg2p( tmp1, Ainb1, Anb, Acol, Acol, npcol );
         rocprev = Alrow; Anpprev = Anq; Bprev = BR; Bdprev = BC;
         Aprev   = A = Mptr( A, Anp, 0, Ald, size );
         mb1     = PB_Clastnb( Na, 0, Aimb1, Amb );
         nb1     = PB_Clastnb( Na, 0, Ainb1, Anb );
         tmp1    = Na - ( kb = MIN( mb1, nb1 ) );
         n1      = ( ( Ais1Row || ( Na-mb1 < nlast ) ) ? n1last : n1 );
         tmp2    = n1 + mb1 - kb; tmp1 -= ( tmp2 = MIN( tmp1, tmp2 ) );
         Asrc    = Acol;
         n1p     = PB_Cnumroc( tmp2, MAX( 0, tmp1 ), Ainb1, Anb, mycol, Asrc,
                               npcol );
         talpha1 = talpha2 = ( ( Ais1Row || ( myrow == Alrow ) ) ?
                               ALPHA : one );
         while( Na > 0 )
         {
            kbsize = kb * size;

            if( Ais1Row || ( myrow == Alrow ) )
            { A -= kbsize; Anp -= kb; Bd = Mptr( BC, Anp, 0, LDBC, size ); }
            if( ( Acol < 0 ) || ( mycol == Alcol ) ) { Anq -= kb; }
/*
*  Partial update of previous block
*/
            if( n1pprev > 0 )
            {
               if( ( Ais1Row || ( myrow == rocprev ) ) && ( kbprev > 0 ) )
               {
                  tmp1 = ( Anpprev - n1pprev ) * size;
                  TYPE->Fgemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ),
                               &Nb, &n1pprev, &kbprev, negone, Bdprev,
                               &LDBC, Aprev+Ald*tmp1, &Ald, talpha1,
                               Bprev+LDBR*tmp1, &LDBR );
               }
/*
*  Send partial updated result to current row
*/
               if( !( Ais1Row ) && ChangeRoc )
               {
                  if( myrow == rocprev )
                  {
                     send( ctxt, Nb, n1pprev, Mptr( Bprev, 0, Anpprev-n1pprev,
                           LDBR, size ), LDBR, Alrow, mycol );
                  }
                  else if( myrow == Alrow )
                  {
                     recv( ctxt, Nb, n1pprev, work, Nb, rocprev, mycol );
                     add( &Nb, &n1pprev, one, work, &Nb, one, Mptr( Bprev, 0,
                          Anpprev-n1pprev, LDBR, size ), &LDBR );
                  }
               }
            }
/*
*  Solve current diagonal block
*/
            if( Ais1Row || ( myrow == Alrow ) )
            {
               if( AisColRep || ( mycol == Alcol ) )
               {
                  trsm( C2F_CHAR( RIGHT ), C2F_CHAR( UPLO ), C2F_CHAR( NOTRAN ),
                        C2F_CHAR( DIAG ), &Nb, &kb, talpha2, Mptr( A, 0, Anq,
                        Ald, size ), &Ald, Mptr( BR, 0, Anq, LDBR, size ),
                        &LDBR );
                  tadd( &Nb, &kb, one, Mptr( BR, 0, Anq, LDBR, size ), &LDBR,
                        zero, Mptr( BC, Anp, 0, LDBC, size ), &LDBC );
               }
               if( bcst )
               {
                  if( mycol == Alcol )
                     bsend( ctxt, ROW, &btop, kb, Nb, Mptr( BC, Anp, 0, LDBC,
                            size ), LDBC );
                  else
                     brecv( ctxt, ROW, &btop, kb, Nb, Mptr( BC, Anp, 0, LDBC,
                            size ), LDBC, myrow, Alcol );
               }
               talpha2 = one;
            }
            else
            {
               if( !( Ais1Row ) && ( AisColRep || ( mycol == Alcol ) ) )
                  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Nb, &kb, &izero,
                       zero, zero, Mptr( BR, 0, Anq, LDBR, size ), &LDBR );
            }
/*
*  Finish previous update
*/
            if( ( Ais1Row || ( myrow == rocprev ) ) && ( kbprev > 0 ) )
            {
               if( ( tmp1 = Anpprev - n1pprev ) > 0 )
                  gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &Nb, &tmp1,
                        &kbprev, negone, Bdprev, &LDBC, Aprev, &Ald, talpha1,
                        Bprev, &LDBR );
               talpha1 = one;
            }
/*
*  Save info of current step and update info for the next step
*/
            if(  Ais1Row  || ( myrow == Alrow ) ) { Bdprev = Bd; Aprev = A; }
            if( AisColRep || ( mycol == Alcol ) ) { Anpprev -= kb; }

            n1pprev = n1p;
            rocprev = Alrow;
            kbprev  = kb;
            k      += kb;
            Na     -= kb;

            nb1    -= kb;
            if( nb1 == 0 )
            {
               if( !( Ais1Col ) && ( Alcol >= 0 ) )
                  Alcol = MModSub1( Alcol, npcol );
               nb1 = ( Na > Ainb1 ? Anb : Ainb1 );
            }

            mb1      -= kb;
            ChangeRoc = ( mb1 == 0 );

            if( ChangeRoc )
            {
               if( !( Ais1Row ) && ( Alrow >= 0 ) )
                  Alrow = MModSub1( Alrow, nprow );
               mb1 = ( Na > Aimb1 ? Amb : Aimb1 );
            }
            tmp1 = Na - ( kb = MIN( mb1, nb1 ) );
            n1   = ( ( Ais1Row || ( Na-mb1 < nlast ) ) ? n1last : n1 );
            tmp2 = n1 + mb1 - kb; tmp1 -= ( tmp2 = MIN( tmp1, tmp2 ) );
            n1p  = PB_Cnumroc( tmp2, MAX( 0, tmp1 ), Ainb1, Anb, mycol, Asrc,
                               npcol );
         }
      }
   }
   if( work ) free( work );
/*
*  End of PB_Cptrsm
*/
}