/* SRC/PTOOLS/PB_Cainfog2l.c */

/* ---------------------------------------------------------------------
*
*  -- PBLAS auxiliary routine (version 2.0) --
*     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
*     and University of California, Berkeley.
*     April 1, 1998
*
*  ---------------------------------------------------------------------
*/
/*
*  Include files
*/
#include "../pblas.h"
#include "../PBpblas.h"
#include "../PBtools.h"
#include "../PBblacs.h"
#include "../PBblas.h"

#ifdef __STDC__
static void PB_Cainfog2l_1d( int len, int gidx, int blk1, int blk, int src,
                             int nprocs, int me, int * fb_out,
                             int * nloc_out, int * iloc_out, int * own_out,
                             int * rel_out )
#else
static void PB_Cainfog2l_1d( len, gidx, blk1, blk, src, nprocs, me, fb_out,
                             nloc_out, iloc_out, own_out, rel_out )
/*
*  .. Scalar Arguments ..
*/
   int            blk, blk1, * fb_out, gidx, * iloc_out, len, me,
                  * nloc_out, nprocs, * own_out, * rel_out, src;
#endif
{
/*
*  Purpose
*  =======
*
*  PB_Cainfog2l_1d  is the one-dimensional kernel of  PB_Cainfog2l.  It is
*  called once for the rows and once for the columns, since both dimen-
*  sions use exactly the same arithmetic.
*
*  Arguments
*  =========
*
*  len      (input)  number of global entries of the sub-range (M or N).
*  gidx     (input)  global starting index of the sub-range (I or J).
*  blk1     (input)  size of the first block of the matrix (IMB or INB).
*  blk      (input)  blocking factor of the other blocks (MB or NB).
*  src      (input)  process owning the first matrix entry (RSRC or
*                    CSRC), or -1 when the dimension is not distributed.
*  nprocs   (input)  number of processes in this dimension.
*  me       (input)  coordinate of the calling process.
*  fb_out   (output) size of the first block of the sub-range, capped at
*                    len (IMB1 or INB1).
*  nloc_out (output) number of entries of the sub-range owned by me  (MP
*                    or NQ).
*  iloc_out (output) local starting index (II or JJ).
*  own_out  (output) process owning entry gidx (PROW or PCOL); this is
*                    src itself when src is -1 or nprocs is 1.
*  rel_out  (output) ( me - owner ) modulo nprocs (RPROW or RPCOL); zero
*                    when src is -1 or nprocs is 1.
*
*  All results are computed in local variables and stored through the
*  output pointers at the very end.
*
*  ---------------------------------------------------------------------
*/
/*
*  .. Local Scalars ..
*/
   int            first, iloc, ilocblk, mydist, nblocks, nloc, off, own,
                  rel;
/* ..
*  .. Executable Statements ..
*
*/
   if( ( src == -1 ) || ( nprocs == 1 ) )
   {
/*
*  The dimension is not distributed, or there is just one process in it.
*  Local and global indexes are the same, as well as the local and global
*  number of entries, and the relative process coordinate is zero. Note
*  that the size of the first block can be zero only if len is zero.
*/
      if( ( first = blk1 - gidx ) <= 0 )
         first += ( ( -first ) / blk + 1 ) * blk;

      *iloc_out = gidx;
      *fb_out   = MIN( first, len );
      *nloc_out = len;
      *own_out  = src;
      *rel_out  = 0;
      return;
   }
/*
*  Figure out the owner, the local index and the first block size first.
*/
   first = blk1;

   if( gidx < first )
   {
/*
*  gidx is in the first block: the owner is src, the local index is gidx
*  in this process and zero elsewhere, and the size of the first block is
*  the blk1 complement.
*/
      own    = src;
      iloc   = ( ( me == own ) ? gidx : 0 );
      first -= gidx;
   }
   else
   {
/*
*  gidx is not in the first block. nblocks is the global coordinate of
*  the block gidx belongs to (the first block being block zero), ilocblk
*  is the minimum number of such blocks every process has.
*/
      off     = gidx - first;
      nblocks = off / blk + 1;
      own     = ( src + nblocks ) % nprocs;
      ilocblk = nblocks / nprocs;

      if( me == src )
      {
/*
*  I am the source process (mydist = 0). Either nprocs divides nblocks,
*  in which case the local index is gidx + ( ilocblk - nblocks ) * blk if
*  I own gidx and blk1 + ( ilocblk - 1 ) * blk otherwise; or it does not,
*  and I own the first block plus ilocblk full blocks before gidx. When
*  nblocks < nprocs, ilocblk is zero and the local index is just blk1.
*/
         if( ilocblk * nprocs >= nblocks )
         {
            iloc = ( ( me == own ) ? gidx + ( ilocblk - nblocks ) * blk :
                     first + ( ilocblk - 1 ) * blk );
         }
         else
         {
            iloc = first + ilocblk * blk;
         }
      }
      else
      {
/*
*  Compute my distance from the source process, then compare it with the
*  number of blocks left after every process received ilocblk of them.
*  When it is smaller, I own ilocblk + 1 full blocks before gidx. Other-
*  wise I own ilocblk full blocks, plus the leading part of the block of
*  gidx when I am its owner.
*/
         if( ( mydist = me - src ) < 0 ) mydist += nprocs;
         mydist -= nblocks - ilocblk * nprocs;

         iloc = ( ( mydist < 0 ) ? ( ilocblk + 1 ) * blk :
                  ( ( me == own ) ?
                    ( ilocblk - nblocks + 1 ) * blk + off :
                    ilocblk * blk ) );
      }
/*
*  Update the size of first block
*/
      first = nblocks * blk - off;
   }
/*
*  Now everything is just like len entries starting at index 0 with first
*  block size first, blocking factor blk and source process own. Figure
*  out the local number of entries.
*/
   if( len <= first )
   {
/*
*  Everything fits in the first block: the owner gets len, others zero.
*/
      nloc = ( ( me == own ) ? len : 0 );
   }
   else
   {
/*
*  Find out how many full blocks are globally (nblocks) and locally
*  (ilocblk) in those len entries.
*/
      nblocks = ( len - first ) / blk + 1;
      ilocblk = nblocks / nprocs;

      if( me == own )
      {
/*
*  When nprocs divides nblocks, the owner also holds the last (possibly
*  partial) block: nloc = len + ( ilocblk - nblocks ) * blk. Otherwise it
*  holds the first block and ilocblk full blocks.
*/
         nloc = ( ( nblocks - ilocblk * nprocs ) ?
                  first + ilocblk * blk :
                  len + ( ilocblk - nblocks ) * blk );
      }
      else
      {
/*
*  mydist < 0: I own ilocblk + 1 full blocks; mydist > 0: I own ilocblk
*  full blocks; mydist = 0: I own ilocblk full blocks plus the last block
*  of size len - first - ( nblocks - 1 ) * blk (possibly zero).
*/
         if( ( mydist = me - own ) < 0 ) mydist += nprocs;
         mydist -= nblocks - ilocblk * nprocs;

         nloc = ( ( mydist < 0 ) ? ( ilocblk + 1 ) * blk :
                  ( ( mydist > 0 ) ? ilocblk * blk :
                    len - first + blk * ( ilocblk - nblocks + 1 ) ) );
      }
   }
/*
*  Finally figure out the relative coordinate. Note that the first block
*  size can be zero when len = 0.
*/
   if( ( rel = me - own ) < 0 ) rel += nprocs;

   *iloc_out = iloc;
   *fb_out   = MIN( first, len );
   *nloc_out = nloc;
   *own_out  = own;
   *rel_out  = rel;
/*
*  End of PB_Cainfog2l_1d
*/
}

#ifdef __STDC__
void PB_Cainfog2l( int M, int N, int I, int J, int * DESC, int NPROW,
                   int NPCOL, int MYROW, int MYCOL, int * IMB1,
                   int * INB1, int * MP, int * NQ, int * II, int * JJ,
                   int * PROW, int * PCOL, int * RPROW, int * RPCOL )
#else
void PB_Cainfog2l( M, N, I, J, DESC, NPROW, NPCOL, MYROW, MYCOL, IMB1,
                   INB1, MP, NQ, II, JJ, PROW, PCOL, RPROW, RPCOL )
/*
*  .. Scalar Arguments ..
*/
   int            I, * II, * IMB1, * INB1, J, * JJ, M, * MP, MYCOL,
                  MYROW, N, NPCOL, NPROW, * NQ, * PCOL, * PROW, * RPCOL,
                  * RPROW;
/*
*  .. Array Arguments ..
*/
   int            * DESC;
#endif
{
/*
*  Purpose
*  =======
*
*  PB_Cainfog2l computes the  starting  local row and column indexes II,
*  JJ  corresponding to  the  submatrix  starting  globally at the entry
*  pointed by I,  J. This routine returns the coordinates in the grid of
*  the  process owning  the  matrix entry of global indexes I, J, namely
*  PROW  and  PCOL. In addition, this routine computes the quantities MP
*  and  NQ,  which are respectively the local number of rows and columns
*  owned by the process of coordinate  MYROW, MYCOL corresponding to the
*  global submatrix A(I:I+M-1,J:J+N-1).  Finally, the size  of the first
*  partial block and the relative process coordinates  are also returned
*  respectively in IMB1, INB1 and RPROW, RPCOL.
*
*  Notes
*  =====
*
*  A description  vector  is associated with each 2D block-cyclicly dis-
*  tributed matrix.  This  vector  stores  the  information  required to
*  establish the  mapping  between a  matrix entry and its corresponding
*  process and memory location.
*
*  In  the  following  comments,   the character _  should  be  read  as
*  "of  the  distributed  matrix".  Let  A  be a generic term for any 2D
*  block cyclicly distributed matrix.  Its description vector is DESC_A:
*
*  NOTATION         STORED IN       EXPLANATION
*  ---------------- --------------- ------------------------------------
*  DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
*  CTXT_A  (global) DESCA[ CTXT_  ] The BLACS context handle, indicating
*                                   the NPROW x NPCOL BLACS process grid
*                                   A  is  distributed over. The context
*                                   itself  is  global,  but  the handle
*                                   (the integer value) may vary.
*  M_A     (global) DESCA[ M_     ] The  number of rows in the distribu-
*                                   ted matrix A, M_A >= 0.
*  N_A     (global) DESCA[ N_     ] The number of columns in the distri-
*                                   buted matrix A, N_A >= 0.
*  IMB_A   (global) DESCA[ IMB_   ] The number of rows of the upper left
*                                   block of the matrix A, IMB_A > 0.
*  INB_A   (global) DESCA[ INB_   ] The  number  of columns of the upper
*                                   left   block   of   the  matrix   A,
*                                   INB_A > 0.
*  MB_A    (global) DESCA[ MB_    ] The blocking factor used to  distri-
*                                   bute the last  M_A-IMB_A  rows of A,
*                                   MB_A > 0.
*  NB_A    (global) DESCA[ NB_    ] The blocking factor used to  distri-
*                                   bute the last  N_A-INB_A  columns of
*                                   A, NB_A > 0.
*  RSRC_A  (global) DESCA[ RSRC_  ] The process row over which the first
*                                   row of the matrix  A is distributed,
*                                   NPROW > RSRC_A >= 0.
*  CSRC_A  (global) DESCA[ CSRC_  ] The  process column  over  which the
*                                   first column of  A  is  distributed.
*                                   NPCOL > CSRC_A >= 0.
*  LLD_A   (local)  DESCA[ LLD_   ] The  leading dimension  of the local
*                                   array  storing  the  local blocks of
*                                   the distributed matrix A,
*                                   IF( Lc( 1, N_A ) > 0 )
*                                      LLD_A >= MAX( 1, Lr( 1, M_A ) )
*                                   ELSE
*                                      LLD_A >= 1.
*
*  Let K be the number of  rows of a matrix A starting at the global in-
*  dex IA, i.e., A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
*  that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
*  receive if these K rows were distributed over NPROW processes.  If  K
*  is the number of columns of a matrix  A  starting at the global index
*  JA, i.e., A( :, JA:JA+K-1 ),  Lc( JA, K )  denotes the number  of  co-
*  lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would  receive if
*  these K columns were distributed over NPCOL processes.
*
*  The values of Lr() and Lc() may be determined via a call to the func-
*  tion PB_Cnumroc:
*  Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
*  Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
*
*  Arguments
*  =========
*
*  M       (global input) INTEGER
*          On entry, M specifies the global number of rows of the subma-
*          trix. M must be at least zero.
*
*  N       (global input) INTEGER
*          On entry, N specifies  the  global  number  of columns of the
*          submatrix. N must be at least zero.
*
*  I       (global input) INTEGER
*          On entry, I  specifies  the  global starting row index of the
*          submatrix. I must be at least zero.
*
*  J       (global input) INTEGER
*          On entry, J  specifies  the global starting column  index  of
*          the submatrix. J must be at least zero.
*
*  DESC    (global and local input) INTEGER array
*          On entry,  DESC is an integer array of dimension DLEN_.  This
*          is the array descriptor of the underlying matrix.
*
*  NPROW   (global input) INTEGER
*          On entry,  NPROW   specifies the total number of process rows
*          over which the matrix is distributed.  NPROW must be at least
*          one.
*
*  NPCOL   (global input) INTEGER
*          On entry, NPCOL specifies the total number of process columns
*          over which the matrix is distributed.  NPCOL must be at least
*          one.
*
*  MYROW   (local input) INTEGER
*          On entry,  MYROW  specifies the row coordinate of the process
*          whose local index  II  is determined.  MYROW must be at least
*          zero and strictly less than NPROW.
*
*  MYCOL   (local input) INTEGER
*          On entry,  MYCOL  specifies the column coordinate of the pro-
*          cess whose local index  JJ  is determined.  MYCOL  must be at
*          least zero and strictly less than NPCOL.
*
*  IMB1    (global output) INTEGER
*          On exit, IMB1 specifies the number of rows of the upper  left
*          block of the submatrix. On exit,  IMB1 is less or equal  than
*          M and greater or equal than MIN( 1, M ).
*
*  INB1    (global output) INTEGER
*          On exit, INB1 specifies  the number  of  columns of the upper
*          left block of the submatrix. On exit,  INB1 is  less or equal
*          than N and greater or equal than MIN( 1, N ).
*
*  MP      (local output) INTEGER
*          On exit, MP specifies the local number of rows of the  subma-
*          trix, that the processes of row coordinate MYROW own.  MP  is
*          at least zero.
*
*  NQ      (local output) INTEGER
*          On exit, NQ specifies  the  local  number  of columns  of the
*          submatrix,  that  the processes  of column  coordinate  MYCOL
*          own. NQ is at least zero.
*
*  II      (local output) INTEGER
*          On exit, II  specifies the  local  starting  row index of the
*          submatrix. On exit, II is at least zero.
*
*  JJ      (local output) INTEGER
*          On exit, JJ  specifies the  local  starting  column index  of
*          the submatrix. On exit, JJ is at least zero.
*
*  PROW    (global output) INTEGER
*          On exit,  PROW  specifies the row coordinate of  the  process
*          that possesses the first row of the submatrix. On exit,  PROW
*          is -1 if DESC(RSRC_)  is -1 on input, and, at least zero  and
*          strictly less than NPROW otherwise.
*
*  PCOL    (global output) INTEGER
*          On exit, PCOL  specifies the column coordinate of the process
*          that possesses the first column of the  submatrix.  On  exit,
*          PCOL is -1 if DESC(CSRC_)  is -1 on input, and, at least zero
*          and strictly less than NPCOL otherwise.
*
*  RPROW   (global output) INTEGER
*          On exit, RPROW specifies  the  relative row coordinate of the
*          process that possesses the first row  I  of the submatrix. On
*          exit, RPROW is zero if DESC(RSRC_) is -1 on input or if NPROW
*          is one, and, at least zero and  strictly less than NPROW  in
*          all cases.
*
*  RPCOL   (global output) INTEGER
*          On exit, RPCOL specifies  the  relative column  coordinate of
*          the process that possesses the first column  J  of the subma-
*          trix. On exit, RPCOL is zero if DESC(CSRC_) is -1 on input or
*          if NPCOL is one, and, at least zero and  strictly  less  than
*          NPCOL in all cases.
*
*  -- Written on April 1, 1998 by
*     Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
*
*  ---------------------------------------------------------------------
*/
/* ..
*  .. Executable Statements ..
*
*/
/*
*  Rows: M entries starting at I, distributed with IMB_, MB_ and RSRC_
*  over the NPROW process rows, as seen by process row MYROW.
*/
   PB_Cainfog2l_1d( M, I, DESC[ IMB_ ], DESC[ MB_ ], DESC[ RSRC_ ], NPROW,
                    MYROW, IMB1, MP, II, PROW, RPROW );
/*
*  Idem for the columns
*/
   PB_Cainfog2l_1d( N, J, DESC[ INB_ ], DESC[ NB_ ], DESC[ CSRC_ ], NPCOL,
                    MYCOL, INB1, NQ, JJ, PCOL, RPCOL );
/*
*  End of PB_Cainfog2l
*/
}