      SUBROUTINE PSPBTRF( UPLO, N, BW, A, JA, DESCA, AF, LAF, WORK,
     $                    LWORK, INFO )
*
*  -- ScaLAPACK routine (version 2.0.2) --
*     Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
*     May 1 2012
*
*     .. Scalar Arguments ..
      CHARACTER          UPLO
      INTEGER            BW, INFO, JA, LAF, LWORK, N
*     ..
*     .. Array Arguments ..
      INTEGER            DESCA( * )
      REAL               A( * ), AF( * ), WORK( * )
*     ..
*
*
*  Purpose
*  =======
*
*  PSPBTRF computes a Cholesky factorization
*  of an N-by-N real banded
*  symmetric positive definite distributed matrix
*  with bandwidth BW: A(1:N, JA:JA+N-1).
*  Reordering is used to increase parallelism in the factorization.
*  This reordering results in factors that are DIFFERENT from those
*  produced by equivalent sequential codes. These factors cannot
*  be used directly by users; however, they can be used in
*  subsequent calls to PSPBTRS to solve linear systems.
*
*  The factorization has the form
*
*          P A(1:N, JA:JA+N-1) P^T = U' U ,  if UPLO = 'U', or
*
*          P A(1:N, JA:JA+N-1) P^T = L L', if UPLO = 'L'
*
*  where U is a banded upper triangular matrix and L is banded
*  lower triangular, and P is a permutation matrix.
*
*  =====================================================================
*
*  Arguments
*  =========
*
*  UPLO    (global input) CHARACTER
*          = 'U':  Upper triangle of A(1:N, JA:JA+N-1) is stored;
*          = 'L':  Lower triangle of A(1:N, JA:JA+N-1) is stored.
*
*  N       (global input) INTEGER
*          The number of rows and columns to be operated on, i.e. the
*          order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
*
*  BW      (global input) INTEGER
*          Number of subdiagonals in L or U. 0 <= BW <= N-1.
*
*  A       (local input/local output) REAL pointer into
*          local memory to an array with first dimension
*          LLD_A >= (BW+1) (stored in DESCA).
*          On entry, this array contains the local pieces of the
*          N-by-N symmetric banded distributed matrix
*          A(1:N, JA:JA+N-1) to be factored.
*          This local portion is stored in the packed banded format
*            used in LAPACK. Please see the Notes below and the
*            ScaLAPACK manual for more detail on the format of
*            distributed matrices.
*          On exit, this array contains details of the factorization.
*          Note that permutations are performed on the matrix, so that
*            the factors returned are different from those returned
*            by LAPACK.
*
*  JA      (global input) INTEGER
*          The index in the global array A that points to the start of
*          the matrix to be operated on (which may be either all of A
*          or a submatrix of A).
*
*  DESCA   (global and local input) INTEGER array of dimension DLEN.
*          if 1D type (DTYPE_A=501), DLEN >= 7;
*          if 2D type (DTYPE_A=1), DLEN >= 9.
*          The array descriptor for the distributed matrix A.
*          Contains information of mapping of A to memory. Please
*          see NOTES below for full description and options.
*
*  AF      (local output) REAL array, dimension LAF.
*          Auxiliary Fillin Space.
*          Fillin is created during the factorization routine
*          PSPBTRF and this is stored in AF. If a linear system
*          is to be solved using PSPBTRS after the factorization
*          routine, AF *must not be altered* after the factorization.
*
*  LAF     (local input) INTEGER
*          Size of user-input Auxiliary Fillin space AF. Must be >=
*          (NB+2*BW)*BW.
*          If LAF is not large enough, an error code will be returned
*          and the minimum acceptable size will be returned in AF( 1 ).
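*          For example, NB = 64 and BW = 8 require
*          LAF >= ( 64 + 2*8 )*8 = 640 elements per process.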
*
*  WORK    (local workspace/local output)
*          REAL temporary workspace. This space may
*          be overwritten in between calls to routines. WORK must be
*          the size given in LWORK.
*          On exit, WORK( 1 ) contains the minimal LWORK.
*
*  LWORK   (local input or global input) INTEGER
*          Size of user-input workspace WORK.
*          If LWORK is too small, the minimal acceptable size will be
*          returned in WORK( 1 ) and an error code is returned.
*          LWORK >= BW*BW.
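*          For example, BW = 8 requires LWORK >= 64.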
*
*  INFO    (global output) INTEGER
*          = 0:  successful exit
*          < 0:  If the i-th argument is an array and the j-th entry
*                had an illegal value, then INFO = -(i*100+j); if the
*                i-th argument is a scalar and had an illegal value,
*                then INFO = -i.
*          > 0:  If INFO = K <= NPROCS, the submatrix stored on
*                processor INFO and factored locally was not
*                positive definite, and
*                the factorization was not completed.
*                If INFO = K > NPROCS, the submatrix stored on
*                processor INFO-NPROCS representing interactions
*                with other processors was not
*                positive definite,
*                and the factorization was not completed.
*
*  =====================================================================
*
*
*  Restrictions
*  ============
*
*  The following are restrictions on the input parameters. Some of these
*    are temporary and will be removed in future releases, while others
*    may reflect fundamental technical limitations.
*
*    Non-cyclic restriction: VERY IMPORTANT!
*      P*NB >= mod(JA-1,NB)+N.
*      The mapping for matrices must be blocked, reflecting the nature
*      of the divide and conquer algorithm as a task-parallel algorithm.
*      This formula in words is: no processor may have more than one
*      chunk of the matrix.
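*      For example, with P = 4 processors, NB = 64 and JA = 1, the
*      restriction reads 4*64 >= mod(0,64) + N, so N may be at most
*      256.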
*
*    Blocksize cannot be too small:
*      If the matrix spans more than one processor, the following
*      restriction on NB, the size of each block on each processor,
*      must hold:
*      NB >= 2*BW
*      The bulk of parallel computation is done on the matrix of size
*      O(NB) on each processor. If this is too small, divide and conquer
*      is a poor choice of algorithm.
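*      For example, BW = 8 requires NB >= 16 whenever the matrix
*      spans more than one processor.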
*
*    Submatrix reference:
*      JA = IB
*      Alignment restriction that prevents unnecessary communication.
*
*
*  =====================================================================
*
*
*  Notes
*  =====
*
*  If the factorization routine and the solve routine are to be called
*    separately (to solve various sets of righthand sides using the same
*    coefficient matrix), the auxiliary space AF *must not be altered*
*    between calls to the factorization routine and the solve routine.
*
*  The best algorithm for solving banded and tridiagonal linear systems
*    depends on a variety of parameters, especially the bandwidth.
*    Currently, only algorithms designed for the case N/P >> bw are
*    implemented. These go by many names, including Divide and Conquer,
*    Partitioning, domain decomposition-type, etc.
*
*  Algorithm description: Divide and Conquer
*
*    The Divide and Conquer algorithm assumes the matrix is narrowly
*      banded compared with the number of equations. In this situation,
*      it is best to distribute the input matrix A one-dimensionally,
*      with columns atomic and rows divided amongst the processes.
*      The basic algorithm divides the banded matrix up into
*      P pieces with one stored on each processor,
*      and then proceeds in 2 phases for the factorization or 3 for the
*      solution of a linear system.
*      1) Local Phase:
*         The individual pieces are factored independently and in
*         parallel. These factors are applied to the matrix creating
*         fillin, which is stored in a non-inspectable way in auxiliary
*         space AF. Mathematically, this is equivalent to reordering
*         the matrix A as P A P^T and then factoring the principal
*         leading submatrix of size equal to the sum of the sizes of
*         the matrices factored on each processor. The factors of
*         these submatrices overwrite the corresponding parts of A
*         in memory.
*      2) Reduced System Phase:
*         A small (BW*(P-1)) system is formed representing
*         interaction of the larger blocks, and is stored (as are its
*         factors) in the space AF. A parallel Block Cyclic Reduction
*         algorithm is used. For a linear system, a parallel front solve
*         followed by an analogous backsolve, both using the structure
*         of the factored matrix, are performed.
*      3) Backsubstitution Phase:
*         For a linear system, a local backsubstitution is performed on
*         each processor in parallel.
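*
*      For example, on P = 4 processors with bandwidth BW = 8, the
*      reduced system formed in phase 2 has order BW*(P-1) = 24.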
*
*
*  Descriptors
*  ===========
*
*  Descriptors now have *types* and differ from ScaLAPACK 1.0.
*
*  Note: banded codes can use either the old two dimensional
*    or new one-dimensional descriptors, though the processor grid in
*    both cases *must be one-dimensional*. We describe both types below.
*
*  Each global data object is described by an associated description
*  vector.  This vector stores the information required to establish
*  the mapping between an object element and its corresponding process
*  and memory location.
*
*  Let A be a generic term for any 2D block cyclically distributed array.
*  Such a global array has an associated description vector DESCA.
*  In the following comments, the character _ should be read as
*  "of the global array".
*
*  NOTATION        STORED IN      EXPLANATION
*  --------------- -------------- --------------------------------------
*  DTYPE_A(global) DESCA( DTYPE_ )The descriptor type.  In this case,
*                                 DTYPE_A = 1.
*  CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
*                                 the BLACS process grid A is distribu-
*                                 ted over. The context itself is glo-
*                                 bal, but the handle (the integer
*                                 value) may vary.
*  M_A    (global) DESCA( M_ )    The number of rows in the global
*                                 array A.
*  N_A    (global) DESCA( N_ )    The number of columns in the global
*                                 array A.
*  MB_A   (global) DESCA( MB_ )   The blocking factor used to distribute
*                                 the rows of the array.
*  NB_A   (global) DESCA( NB_ )   The blocking factor used to distribute
*                                 the columns of the array.
*  RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
*                                 row of the array A is distributed.
*  CSRC_A (global) DESCA( CSRC_ ) The process column over which the
*                                 first column of the array A is
*                                 distributed.
*  LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local
*                                 array.  LLD_A >= MAX(1,LOCr(M_A)).
*
*  Let K be the number of rows or columns of a distributed matrix,
*  and assume that its process grid has dimension p x q.
*  LOCr( K ) denotes the number of elements of K that a process
*  would receive if K were distributed over the p processes of its
*  process column.
*  Similarly, LOCc( K ) denotes the number of elements of K that a
*  process would receive if K were distributed over the q processes of
*  its process row.
*  The values of LOCr() and LOCc() may be determined via a call to the
*  ScaLAPACK tool function, NUMROC:
*          LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
*          LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
*  An upper bound for these quantities may be computed by:
*          LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
*          LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
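*  For example, M = 100, MB_A = 64 and NPROW = 4 give the bound
*          LOCr( 100 ) <= ceil( ceil(100/64)/4 )*64 = 64.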
*
*
*  One-dimensional descriptors:
*
*  One-dimensional descriptors are a new addition to ScaLAPACK since
*    version 1.0. They simplify and shorten the descriptor for 1D
*    arrays.
*
*  Since ScaLAPACK supports two-dimensional arrays as the fundamental
*    object, we allow 1D arrays to be distributed either over the
*    first dimension of the array (as if the grid were P-by-1) or the
*    2nd dimension (as if the grid were 1-by-P). This choice is
*    indicated by the descriptor type (501 or 502)
*    as described below.
*
*    IMPORTANT NOTE: the actual BLACS grid represented by the
*    CTXT entry in the descriptor may be *either*  P-by-1 or 1-by-P
*    irrespective of which one-dimensional descriptor type
*    (501 or 502) is input.
*    This routine will interpret the grid properly either way.
*    ScaLAPACK routines *do not support intercontext operations* so that
*    the grid passed to a single ScaLAPACK routine *must be the same*
*    for all array descriptors passed to that routine.
*
*    NOTE: In all cases where 1D descriptors are used, 2D descriptors
*    may also be used, since a one-dimensional array is a special case
*    of a two-dimensional array with one dimension of size unity.
*    The two-dimensional array used in this case *must* be of the
*    proper orientation:
*      If the appropriate one-dimensional descriptor is DTYPE_A=501
*      (1 by P type), then the two dimensional descriptor must
*      have a CTXT value that refers to a 1 by P BLACS grid;
*      If the appropriate one-dimensional descriptor is DTYPE_A=502
*      (P by 1 type), then the two dimensional descriptor must
*      have a CTXT value that refers to a P by 1 BLACS grid.
*
*
*  Summary of allowed descriptors, types, and BLACS grids:
*  DTYPE           501         502         1         1
*  BLACS grid      1xP or Px1  1xP or Px1  1xP       Px1
*  -----------------------------------------------------
*  A               OK          NO          OK        NO
*  B               NO          OK          NO        OK
*
*  Let A be a generic term for any 1D block cyclically distributed array.
*  Such a global array has an associated description vector DESCA.
*  In the following comments, the character _ should be read as
*  "of the global array".
*
*  NOTATION        STORED IN  EXPLANATION
*  --------------- ---------- ------------------------------------------
*  DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
*                                TYPE_A = 501: 1-by-P grid.
*                                TYPE_A = 502: P-by-1 grid.
*  CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
*                                the BLACS process grid A is distribu-
*                                ted over. The context itself is glo-
*                                bal, but the handle (the integer
*                                value) may vary.
*  N_A    (global) DESCA( 3 ) The size of the array dimension being
*                                distributed.
*  NB_A   (global) DESCA( 4 ) The blocking factor used to distribute
*                                the distributed dimension of the array.
*  SRC_A  (global) DESCA( 5 ) The process row or column over which the
*                                first row or column of the array
*                                is distributed.
*  LLD_A  (local)  DESCA( 6 ) The leading dimension of the local array
*                                storing the local blocks of the distri-
*                                buted array A. Minimum value of LLD_A
*                                depends on TYPE_A.
*                                TYPE_A = 501: LLD_A >=
*                                   max( size of undistributed
*                                   dimension, 1 ).
*                                TYPE_A = 502: LLD_A >= max( NB_A, 1 ).
*  Reserved        DESCA( 7 ) Reserved for future use.
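*
*  As an illustration (a sketch only; ICTXT, N, NB and BW are taken
*  as already defined), a type-501 descriptor for the banded matrix A
*  of this routine could be set up as:
*
*      DESCA( 1 ) = 501
*      DESCA( 2 ) = ICTXT
*      DESCA( 3 ) = N
*      DESCA( 4 ) = NB
*      DESCA( 5 ) = 0
*      DESCA( 6 ) = BW+1
*      DESCA( 7 ) = 0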
*
*
*
*  =====================================================================
*
*  Code Developer: Andrew J. Cleary, University of Tennessee.
*    Current address: Lawrence Livermore National Labs.
*
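*  Typical calling sequence (a sketch only: declarations, BLACS grid
*  setup and error handling omitted; B, DESCB and NRHS refer to a
*  right-hand-side matrix for the companion solver PSPBTRS):
*
*      CALL PSPBTRF( 'L', N, BW, A, 1, DESCA, AF, LAF, WORK, LWORK,
*     $              INFO )
*      IF( INFO.EQ.0 )
*     $   CALL PSPBTRS( 'L', N, BW, NRHS, A, 1, DESCA, B, 1, DESCB,
*     $                 AF, LAF, WORK, LWORK, INFO )
*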
*  =====================================================================
*
*     .. Parameters ..
      REAL               ONE
      PARAMETER          ( ONE = 1.0E+0 )
      REAL               ZERO
      PARAMETER          ( ZERO = 0.0E+0 )
      INTEGER            INT_ONE
      PARAMETER          ( INT_ONE = 1 )
      INTEGER            DESCMULT, BIGNUM
      PARAMETER          ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT )
      INTEGER            BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
     $                   LLD_, MB_, M_, NB_, N_, RSRC_
      PARAMETER          ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1,
     $                     CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6,
     $                     RSRC_ = 7, CSRC_ = 8, LLD_ = 9 )
*     ..
*     .. Local Scalars ..
      INTEGER            COMM_PROC, CSRC, FIRST_PROC, I, ICTXT,
     $                   ICTXT_NEW, ICTXT_SAVE, IDUM1, IDUM3, JA_NEW,
     $                   LAF_MIN, LEVEL_DIST, LLDA, MBW2, MYCOL, MYROW,
     $                   MY_NUM_COLS, NB, NEXT_TRI_SIZE_M,
     $                   NEXT_TRI_SIZE_N, NP, NPCOL, NPROW, NP_SAVE,
     $                   ODD_SIZE, OFST, PART_OFFSET, PART_SIZE,
     $                   PREV_TRI_SIZE_M, PREV_TRI_SIZE_N, RETURN_CODE,
     $                   STORE_N_A, WORK_SIZE_MIN
*     ..
*     .. Local Arrays ..
      INTEGER            DESCA_1XP( 7 ), PARAM_CHECK( 9, 3 )
*     ..
*     .. External Subroutines ..
      EXTERNAL           BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT,
     $                   GLOBCHK, IGAMX2D, IGEBR2D, IGEBS2D, PXERBLA,
     $                   RESHAPE, SAXPY, SGEMM, SGERV2D, SGESD2D,
     $                   SLAMOV, SLATCPY, SPBTRF, SPOTRF, SSYRK, STBTRS,
     $                   STRMM, STRRV2D, STRSD2D, STRSM, STRTRS
*     ..
*     .. External Functions ..
      LOGICAL            LSAME
      INTEGER            NUMROC
      EXTERNAL           LSAME, NUMROC
*     ..
*     .. Intrinsic Functions ..
      INTRINSIC          ICHAR, MIN, MOD
*     ..
*     .. Executable Statements ..
*
*     Test the input parameters
*
      INFO = 0
*
*     Convert descriptor into standard form for easy access to
*        parameters, check that grid is of right shape.
*
      DESCA_1XP( 1 ) = 501
*
      CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE )
*
      IF( RETURN_CODE.NE.0 ) THEN
         INFO = -( 6*100+2 )
      END IF
*
*     Get values out of descriptor for use in code.
*
      ICTXT = DESCA_1XP( 2 )
      CSRC = DESCA_1XP( 5 )
      NB = DESCA_1XP( 4 )
      LLDA = DESCA_1XP( 6 )
      STORE_N_A = DESCA_1XP( 3 )
*
*     Get grid parameters
*
*
*     Pre-calculate bw^2
*
      MBW2 = BW*BW
*
      CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
      NP = NPROW*NPCOL
*
*
*
      IF( LSAME( UPLO, 'U' ) ) THEN
         IDUM1 = ICHAR( 'U' )
      ELSE IF( LSAME( UPLO, 'L' ) ) THEN
         IDUM1 = ICHAR( 'L' )
      ELSE
         INFO = -1
      END IF
*
      IF( LWORK.LT.-1 ) THEN
         INFO = -10
      ELSE IF( LWORK.EQ.-1 ) THEN
         IDUM3 = -1
      ELSE
         IDUM3 = 1
      END IF
*
      IF( N.LT.0 ) THEN
         INFO = -2
      END IF
*
      IF( N+JA-1.GT.STORE_N_A ) THEN
         INFO = -( 6*100+6 )
      END IF
*
      IF( ( BW.GT.N-1 ) .OR. ( BW.LT.0 ) ) THEN
         INFO = -3
      END IF
*
      IF( LLDA.LT.( BW+1 ) ) THEN
         INFO = -( 6*100+6 )
      END IF
*
      IF( NB.LE.0 ) THEN
         INFO = -( 6*100+4 )
      END IF
*
*     Argument checking that is specific to Divide & Conquer routine
*
      IF( NPROW.NE.1 ) THEN
         INFO = -( 6*100+2 )
      END IF
*
      IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN
         INFO = -( 2 )
         CALL PXERBLA( ICTXT, 'PSPBTRF, D&C alg.: only 1 block per proc'
     $                 , -INFO )
         RETURN
      END IF
*
      IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*BW ) ) THEN
         INFO = -( 6*100+4 )
         CALL PXERBLA( ICTXT, 'PSPBTRF, D&C alg.: NB too small', -INFO )
         RETURN
      END IF
*
*
*     Check auxiliary storage size
*
      LAF_MIN = ( NB+2*BW )*BW
*
      IF( LAF.LT.LAF_MIN ) THEN
         INFO = -8
*        put minimum value of laf into AF( 1 )
         AF( 1 ) = LAF_MIN
         CALL PXERBLA( ICTXT, 'PSPBTRF: auxiliary storage error ',
     $                 -INFO )
         RETURN
      END IF
*
*     Check worksize
*
      WORK_SIZE_MIN = BW*BW
*
      WORK( 1 ) = WORK_SIZE_MIN
*
      IF( LWORK.LT.WORK_SIZE_MIN ) THEN
         IF( LWORK.NE.-1 ) THEN
            INFO = -10
            CALL PXERBLA( ICTXT, 'PSPBTRF: worksize error ', -INFO )
         END IF
         RETURN
      END IF
*
*     Pack params and positions into arrays for global consistency check
*
      PARAM_CHECK( 9, 1 ) = DESCA( 5 )
      PARAM_CHECK( 8, 1 ) = DESCA( 4 )
      PARAM_CHECK( 7, 1 ) = DESCA( 3 )
      PARAM_CHECK( 6, 1 ) = DESCA( 1 )
      PARAM_CHECK( 5, 1 ) = JA
      PARAM_CHECK( 4, 1 ) = BW
      PARAM_CHECK( 3, 1 ) = N
      PARAM_CHECK( 2, 1 ) = IDUM3
      PARAM_CHECK( 1, 1 ) = IDUM1
*
      PARAM_CHECK( 9, 2 ) = 605
      PARAM_CHECK( 8, 2 ) = 604
      PARAM_CHECK( 7, 2 ) = 603
      PARAM_CHECK( 6, 2 ) = 601
      PARAM_CHECK( 5, 2 ) = 5
      PARAM_CHECK( 4, 2 ) = 3
      PARAM_CHECK( 3, 2 ) = 2
      PARAM_CHECK( 2, 2 ) = 10
      PARAM_CHECK( 1, 2 ) = 1
*
*     Want to find errors with MIN( ), so if no error, set it to a big
*     number. If there already is an error, multiply by the
*     descriptor multiplier.
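*     (For example, a scalar error INFO = -3 is carried as 300, while
*     a descriptor-entry error INFO = -(6*100+4) is carried as 604.)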
*
      IF( INFO.GE.0 ) THEN
         INFO = BIGNUM
      ELSE IF( INFO.LT.-DESCMULT ) THEN
         INFO = -INFO
      ELSE
         INFO = -INFO*DESCMULT
      END IF
*
*     Check consistency across processors
*
      CALL GLOBCHK( ICTXT, 9, PARAM_CHECK, 9, PARAM_CHECK( 1, 3 ),
     $              INFO )
*
*     Prepare output: set info = 0 if no error, and divide by DESCMULT
*     if error is not in a descriptor entry.
*
      IF( INFO.EQ.BIGNUM ) THEN
         INFO = 0
      ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN
         INFO = -INFO / DESCMULT
      ELSE
         INFO = -INFO
      END IF
*
      IF( INFO.LT.0 ) THEN
         CALL PXERBLA( ICTXT, 'PSPBTRF', -INFO )
         RETURN
      END IF
*
*     Quick return if possible
*
      IF( N.EQ.0 )
     $   RETURN
*
*
*     Adjust addressing into matrix space to properly get into
*        the beginning part of the relevant data
*
      PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) )
*
      IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN
         PART_OFFSET = PART_OFFSET + NB
      END IF
*
      IF( MYCOL.LT.CSRC ) THEN
         PART_OFFSET = PART_OFFSET - NB
      END IF
*
*     Form a new BLACS grid (the "standard form" grid) with only procs
*        holding part of the matrix, of size 1xNP where NP is adjusted,
*        starting at csrc=0, with JA modified to reflect dropped procs.
*
*     First processor to hold part of the matrix:
*
      FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL )
*
*     Calculate the new JA while dropping off unused processors.
*
      JA_NEW = MOD( JA-1, NB ) + 1
*
*     Save and compute new value of NP
*
      NP_SAVE = NP
      NP = ( JA_NEW+N-2 ) / NB + 1
*
*     Call utility routine that forms "standard-form" grid
*
      CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC,
     $              INT_ONE, NP )
*
*     Use new context from standard grid as context.
*
      ICTXT_SAVE = ICTXT
      ICTXT = ICTXT_NEW
      DESCA_1XP( 2 ) = ICTXT_NEW
*
*     Get information about new grid.
*
      CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
*
*     Drop out processors that do not have part of the matrix.
*
      IF( MYROW.LT.0 ) THEN
         GO TO 120
      END IF
*
*     ********************************
*     Values reused throughout routine
*
*     User-input value of partition size
*
      PART_SIZE = NB
*
*     Number of columns in each processor
*
      MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL )
*
*     Offset in columns to beginning of main partition in each proc
*
      IF( MYCOL.EQ.0 ) THEN
         PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE )
         MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE )
      END IF
*
*     Offset in elements
*
      OFST = PART_OFFSET*LLDA
*
*     Size of main (or odd) partition in each processor
*
      ODD_SIZE = MY_NUM_COLS
      IF( MYCOL.LT.NP-1 ) THEN
         ODD_SIZE = ODD_SIZE - BW
      END IF
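*
*     (For example, with NB = 64 and BW = 8, each interior processor
*     works on a main partition of ODD_SIZE = 56 columns.)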
*
*
*       Zero out space for fillin
*
      DO 10 I = 1, LAF_MIN
         AF( I ) = ZERO
   10 CONTINUE
*
*       Zero out space for work
*
      DO 20 I = 1, WORK_SIZE_MIN
         WORK( I ) = ZERO
   20 CONTINUE
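*
*     Layout of AF as used below (a reading of the offsets in this
*     code, not part of the user interface): the first ODD_SIZE*BW
*     elements hold the "spike" fillin G_i; the next MBW2 elements
*     hold the off-diagonal block of the reduced system; the next
*     MBW2 elements hold the diagonal block of the reduced system;
*     the final MBW2 elements serve as a receive/scratch buffer
*     (and hold E_i).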
*
*     Begin main code
*
      IF( LSAME( UPLO, 'L' ) ) THEN
*
********************************************************************
*       PHASE 1: Local computation phase.
********************************************************************
*
*
*       Sizes of the extra triangles communicated between processors
*
         IF( MYCOL.GT.0 ) THEN
            PREV_TRI_SIZE_M = MIN( BW, NUMROC( N, PART_SIZE, MYCOL, 0,
     $                        NPCOL ) )
            PREV_TRI_SIZE_N = MIN( BW, NUMROC( N, PART_SIZE, MYCOL-1, 0,
     $                        NPCOL ) )
         END IF
*
         IF( MYCOL.LT.NPCOL-1 ) THEN
            NEXT_TRI_SIZE_M = MIN( BW, NUMROC( N, PART_SIZE, MYCOL+1, 0,
     $                        NPCOL ) )
            NEXT_TRI_SIZE_N = MIN( BW, NUMROC( N, PART_SIZE, MYCOL, 0,
     $                        NPCOL ) )
         END IF
*
         IF( MYCOL.LT.NP-1 ) THEN
*         Transfer last triangle D_i of local matrix to next processor
*         which needs it to calculate fillin due to factorization of
*         its main (odd) block A_i.
*         Overlap the send with the factorization of A_i.
*
            CALL STRSD2D( ICTXT, 'U', 'N', NEXT_TRI_SIZE_M,
     $                    NEXT_TRI_SIZE_N, A( OFST+ODD_SIZE*LLDA+( BW+
     $                    1 ) ), LLDA-1, 0, MYCOL+1 )
*
         END IF
*
*
*       Factor main partition A_i = L_i {L_i}^T in each processor
*
         CALL SPBTRF( UPLO, ODD_SIZE, BW, A( OFST+1 ), LLDA, INFO )
*
         IF( INFO.NE.0 ) THEN
            INFO = MYCOL + 1
            GO TO 30
         END IF
*
*
         IF( MYCOL.LT.NP-1 ) THEN
*         Apply factorization to odd-even connection block B_i
*
*         transpose the connection block in preparation.
*
            CALL SLATCPY( 'U', BW, BW, A( ( OFST+( BW+1 )+( ODD_SIZE-
     $                    BW )*LLDA ) ), LLDA-1,
     $                    AF( ODD_SIZE*BW+2*MBW2+1+BW-BW ), BW )
*
*         Perform the triangular system solve {L_i}{{B'}_i}^T = {B_i}^T
*
            CALL STRTRS( 'L', 'N', 'N', BW, BW,
     $                   A( OFST+1+( ODD_SIZE-BW )*LLDA ), LLDA-1,
     $                   AF( ODD_SIZE*BW+2*MBW2+1 ), BW, INFO )
*
*
*         transpose resulting block to its location
*           in main storage.
*
            CALL SLATCPY( 'L', BW, BW, AF( ODD_SIZE*BW+2*MBW2+1+BW-BW ),
     $                    BW, A( ( OFST+( BW+1 )+( ODD_SIZE-BW )*
     $                    LLDA ) ), LLDA-1 )
*
*
*         Compute contribution to diagonal block(s) of reduced system.
*          {C'}_i = {C_i}-{{B'}_i}{{B'}_i}^T
*
*         The following method uses more flops than necessary but
*           does not necessitate the writing of a new BLAS routine.
*
*
            CALL SSYRK( UPLO, 'T', BW, BW, -ONE,
     $                  AF( ODD_SIZE*BW+2*MBW2+1 ), BW, ONE,
     $                  A( OFST+1+ODD_SIZE*LLDA ), LLDA-1 )
*
         END IF
*       End of "if ( MYCOL .lt. NP-1 )..." block
*
*
   30    CONTINUE
*       If the processor could not locally factor, it jumps here.
*
         IF( MYCOL.NE.0 ) THEN
*         Discard temporary matrix stored beginning in
*           AF( (odd_size+2*bw)*bw+1 ) and use for
*           off_diagonal block of reduced system.
*
*         Receive previously transmitted matrix section, which forms
*         the right-hand-side for the triangular solve that calculates
*         the "spike" fillin.
*
*
            CALL STRRV2D( ICTXT, 'U', 'N', PREV_TRI_SIZE_M,
     $                    PREV_TRI_SIZE_N, AF( 1 ), ODD_SIZE, 0,
     $                    MYCOL-1 )
*
            IF( INFO.EQ.0 ) THEN
*
*         Calculate the "spike" fillin, ${L_i} {{G}_i}^T = {D_i}$ .
*
               CALL STBTRS( 'L', 'N', 'N', ODD_SIZE, BW, BW,
     $                      A( OFST+1 ), LLDA, AF( 1 ), ODD_SIZE, INFO )
*
*
*         Calculate the update block for previous proc, E_i = G_i{G_i}^T
*
               CALL SSYRK( 'L', 'T', BW, ODD_SIZE, -ONE, AF( 1 ),
     $                     ODD_SIZE, ZERO, AF( 1+( ODD_SIZE+2*BW )*BW ),
     $                     BW )
*
*
*         Initiate send of E_i to previous processor to overlap
*           with next computation.
*
               CALL SGESD2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+2*MBW2+1 ),
     $                       BW, 0, MYCOL-1 )
*
*
               IF( MYCOL.LT.NP-1 ) THEN
*
*           Calculate off-diagonal block(s) of reduced system.
*           Note: for ease of use in solution of reduced system, store
*           L's off-diagonal block in transpose form.
*           {F_i}^T =  {H_i}{{B'}_i}^T
*
*           Copy matrix H_i (the last bw cols of G_i) to AF storage
*             as per requirements of BLAS routine STRMM.
*             Since we have G_i^T stored, transpose
*             H_i^T to H_i.
*
                  CALL SLATCPY( 'N', BW, BW, AF( ODD_SIZE-BW+1 ),
     $                          ODD_SIZE, AF( ( ODD_SIZE )*BW+1 ), BW )
*
                  CALL STRMM( 'R', 'U', 'T', 'N', BW, BW, -ONE,
     $                        A( ( OFST+( BW+1 )+( ODD_SIZE-BW )*
     $                        LLDA ) ), LLDA-1, AF( ( ODD_SIZE )*BW+1 ),
     $                        BW )
*
*
               END IF
*
            END IF
*         End of "if ( info.eq.0 ) then"
*
         END IF
*       End of "if ( MYCOL .ne. 0 )..."
*
*
*       Check to make sure no processors have found errors
*
         CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1,
     $                 0, 0 )
*
         IF( MYCOL.EQ.0 ) THEN
            CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 )
         ELSE
            CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 )
         END IF
*
         IF( INFO.NE.0 ) THEN
            GO TO 110
         END IF
*       No errors found, continue
*
*
********************************************************************
*       PHASE 2: Formation and factorization of Reduced System.
********************************************************************
*
*       Gather up local sections of reduced system
*
*
*     The last processor does not participate in the factorization of
*       the reduced system, having sent its E_i already.
         IF( MYCOL.EQ.NPCOL-1 ) THEN
            GO TO 60
         END IF
*
*       Initiate send of off-diag block(s) to overlap with next part.
*       Off-diagonal block needed on neighboring processor to start
*       algorithm.
*
         IF( ( MOD( MYCOL+1, 2 ).EQ.0 ) .AND. ( MYCOL.GT.0 ) ) THEN
*
            CALL SGESD2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+1 ), BW, 0,
     $                    MYCOL-1 )
*
         END IF
*
*       Copy last diagonal block into AF storage for subsequent
*         operations.
*
         CALL SLAMOV( 'N', BW, BW, A( OFST+ODD_SIZE*LLDA+1 ), LLDA-1,
     $                AF( ODD_SIZE*BW+MBW2+1 ), BW )
*
*       Receive contribution to diagonal block stored on this proc.
*
         IF( MYCOL.LT.NPCOL-1 ) THEN
*
            CALL SGERV2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+2*MBW2+1 ), BW,
     $                    0, MYCOL+1 )
*
*          Add contribution to diagonal block
*
            CALL SAXPY( MBW2, ONE, AF( ODD_SIZE*BW+2*MBW2+1 ), 1,
     $                  AF( ODD_SIZE*BW+MBW2+1 ), 1 )
*
         END IF
*
*
*       *************************************
*       Modification Loop
*
*       The distance for sending and receiving for each level starts
*         at 1 for the first level.
         LEVEL_DIST = 1
*
*       Do until this proc is needed to modify other procs' equations
*
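*         (For example, with NPCOL = 8, process MYCOL = 3 receives
*         contributions at LEVEL_DIST = 1 and 2 and exits this loop
*         when LEVEL_DIST reaches 4.)
*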
   40    CONTINUE
         IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 )
     $      GO TO 50
*
*         Receive and add contribution to diagonal block from the left
*
         IF( MYCOL-LEVEL_DIST.GE.0 ) THEN
            CALL SGERV2D( ICTXT, BW, BW, WORK( 1 ), BW, 0,
     $                    MYCOL-LEVEL_DIST )
*
            CALL SAXPY( MBW2, ONE, WORK( 1 ), 1,
     $                  AF( ODD_SIZE*BW+MBW2+1 ), 1 )
*
         END IF
*
*         Receive and add contribution to diagonal block from the right
*
         IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN
            CALL SGERV2D( ICTXT, BW, BW, WORK( 1 ), BW, 0,
     $                    MYCOL+LEVEL_DIST )
*
            CALL SAXPY( MBW2, ONE, WORK( 1 ), 1,
     $                  AF( ODD_SIZE*BW+MBW2+1 ), 1 )
*
         END IF
*
         LEVEL_DIST = LEVEL_DIST*2
*
         GO TO 40
   50    CONTINUE
*       [End of GOTO Loop]
*
*
*       *********************************
*       Calculate and use this proc's blocks to modify other procs'...
*
*       Factor diagonal block
*
         CALL SPOTRF( 'L', BW, AF( ODD_SIZE*BW+MBW2+1 ), BW, INFO )
*
         IF( INFO.NE.0 ) THEN
            INFO = NPCOL + MYCOL
         END IF
*
*       ****************************************************************
*       Receive offdiagonal block from processor to right.
*         If this is the first group of processors, the receive comes
*         from a different processor than otherwise.
*
         IF( LEVEL_DIST.EQ.1 ) THEN
            COMM_PROC = MYCOL + 1
*
*           Move block into place that it will be expected to be for
*             calcs.
*
            CALL SLAMOV( 'N', BW, BW, AF( ODD_SIZE*BW+1 ), BW,
     $                   AF( ODD_SIZE*BW+2*MBW2+1 ), BW )
*
         ELSE
            COMM_PROC = MYCOL + LEVEL_DIST / 2
         END IF
*
         IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN
*
            CALL SGERV2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+1 ), BW, 0,
     $                    COMM_PROC )
*
            IF( INFO.EQ.0 ) THEN
*
*
*         Modify upper off_diagonal block with diagonal block
*
*
               CALL STRSM( 'L', 'L', 'N', 'N', BW, BW, ONE,
     $                     AF( ODD_SIZE*BW+MBW2+1 ), BW,
     $                     AF( ODD_SIZE*BW+1 ), BW )
*
            END IF
*         End of "if ( info.eq.0 ) then"
*
*         Calculate contribution from this block to next diagonal block
*
            CALL SSYRK( 'L', 'T', BW, BW, -ONE, AF( ( ODD_SIZE )*BW+1 ),
     $                  BW, ZERO, WORK( 1 ), BW )
*
*         Send contribution to diagonal block's owning processor.
*
            CALL SGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, 0,
     $                    MYCOL+LEVEL_DIST )
*
         END IF
*       End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
*
*
*       ****************************************************************
*       Receive off_diagonal block from left and use to finish with this
*         processor.
*
         IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND.
     $       ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) THEN
*
            IF( LEVEL_DIST.GT.1 ) THEN
*
*           Receive offdiagonal block(s) from proc level_dist/2 to the
*           left
*
               CALL SGERV2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+2*MBW2+1 ),
     $                       BW, 0, MYCOL-LEVEL_DIST / 2 )
*
            END IF
*
*
            IF( INFO.EQ.0 ) THEN
*
*         Use diagonal block(s) to modify this offdiagonal block
*
               CALL STRSM( 'R', 'L', 'T', 'N', BW, BW, ONE,
     $                     AF( ODD_SIZE*BW+MBW2+1 ), BW,
     $                     AF( ODD_SIZE*BW+2*MBW2+1 ), BW )
*
            END IF
*         End of "if( info.eq.0 ) then"
*
*         Use offdiag block(s) to calculate modification to diag block
*           of processor to the left
*
            CALL SSYRK( 'L', 'N', BW, BW, -ONE,
     $                  AF( ( ODD_SIZE+2*BW )*BW+1 ), BW, ZERO,
     $                  WORK( 1 ), BW )
*
*         Send contribution to diagonal block's owning processor.
*
            CALL SGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, 0,
     $                    MYCOL-LEVEL_DIST )
*
*         *******************************************************
*
            IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN
*
*           Decide which processor offdiagonal block(s) goes to
*
               IF( ( MOD( MYCOL / ( 2*LEVEL_DIST ), 2 ) ).EQ.0 ) THEN
                  COMM_PROC = MYCOL + LEVEL_DIST
               ELSE
                  COMM_PROC = MYCOL - LEVEL_DIST
               END IF
*
*           Use offdiagonal blocks to calculate offdiag
*             block to send to neighboring processor. Depending
*             on circumstances, may need to transpose the matrix.
*
               CALL SGEMM( 'N', 'N', BW, BW, BW, -ONE,
     $                     AF( ODD_SIZE*BW+2*MBW2+1 ), BW,
     $                     AF( ODD_SIZE*BW+1 ), BW, ZERO, WORK( 1 ),
     $                     BW )
*
*           Send contribution to offdiagonal block's owning processor.
*
               CALL SGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, 0,
     $                       COMM_PROC )
*
            END IF
*
         END IF
*       End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
*
   60    CONTINUE
*
      ELSE
*
*       CASE UPLO = 'U'
*
********************************************************************
*       PHASE 1: Local computation phase.
********************************************************************
*
*
*       Sizes of the extra triangles communicated between processors
*
         IF( MYCOL.GT.0 ) THEN
            PREV_TRI_SIZE_M = MIN( BW, NUMROC( N, PART_SIZE, MYCOL, 0,
     $                        NPCOL ) )
            PREV_TRI_SIZE_N = MIN( BW, NUMROC( N, PART_SIZE, MYCOL-1, 0,
     $                        NPCOL ) )
         END IF
*
         IF( MYCOL.LT.NPCOL-1 ) THEN
            NEXT_TRI_SIZE_M = MIN( BW, NUMROC( N, PART_SIZE, MYCOL+1, 0,
     $                        NPCOL ) )
            NEXT_TRI_SIZE_N = MIN( BW, NUMROC( N, PART_SIZE, MYCOL, 0,
     $                        NPCOL ) )
         END IF
*
*
*
*       Factor main partition A_i = {U_i}^T U_i in each processor
*
         CALL SPBTRF( UPLO, ODD_SIZE, BW, A( OFST+1 ), LLDA, INFO )
*
         IF( INFO.NE.0 ) THEN
            INFO = MYCOL + 1
            GO TO 70
         END IF
*
*
         IF( MYCOL.LT.NP-1 ) THEN
*         Apply factorization to odd-even connection block B_i
*
*         Move the connection block in preparation.
*
            CALL SLAMOV( 'L', BW, BW, A( ( OFST+1+ODD_SIZE*LLDA ) ),
     $                   LLDA-1, AF( ODD_SIZE*BW+2*MBW2+1+BW-BW ), BW )
*
*
*         Perform the triangular solve {L_i}{{B'}_i}^T = {B_i}^T
*
            CALL STRTRS( 'U', 'T', 'N', BW, BW,
     $                   A( OFST+BW+1+( ODD_SIZE-BW )*LLDA ), LLDA-1,
     $                   AF( ODD_SIZE*BW+2*MBW2+1 ), BW, INFO )
*
*         Move the resulting block back to its location in main storage.
*
            CALL SLAMOV( 'L', BW, BW, AF( ODD_SIZE*BW+2*MBW2+1+BW-BW ),
     $                   BW, A( ( OFST+1+ODD_SIZE*LLDA ) ), LLDA-1 )
*
*
*         Compute contribution to diagonal block(s) of reduced system.
*          {C'}_i^T = {C_i}^T-{{B'}_i}^T{{B'}_i}
*
*         The following method uses more flops than necessary but
*           does not necessitate the writing of a new BLAS routine.
*
*
            CALL SSYRK( UPLO, 'T', BW, BW, -ONE,
     $                  AF( ODD_SIZE*BW+2*MBW2+1 ), BW, ONE,
     $                  A( OFST+BW+1+ODD_SIZE*LLDA ), LLDA-1 )
*
         END IF
*       End of "if ( MYCOL .lt. NP-1 )..." block
*
*
   70    CONTINUE
*       If the processor could not locally factor, it jumps here.
*
         IF( MYCOL.NE.0 ) THEN
*         Discard temporary matrix stored beginning in
*           AF( (odd_size+2*bw)*bw+1 ) and use for
*           off_diagonal block of reduced system.
*
*         Calculate the "spike" fillin, ${L_i} {{G}_i}^T = {D_i}$ .
*
*
*         Copy D block into AF storage for solve.
*
            CALL SLATCPY( 'L', PREV_TRI_SIZE_N, PREV_TRI_SIZE_M,
     $                    A( OFST+1 ), LLDA-1, AF( 1 ), ODD_SIZE )
*
            IF( INFO.EQ.0 ) THEN
*
               CALL STBTRS( 'U', 'T', 'N', ODD_SIZE, BW, BW,
     $                      A( OFST+1 ), LLDA, AF( 1 ), ODD_SIZE, INFO )
*
*
*         Calculate the update block for previous proc, E_i = G_i{G_i}^T
*
               CALL SSYRK( 'L', 'T', BW, ODD_SIZE, -ONE, AF( 1 ),
     $                     ODD_SIZE, ZERO, AF( 1+( ODD_SIZE+2*BW )*BW ),
     $                     BW )
*
*
*         Initiate send of E_i to previous processor to overlap
*           with next computation.
*
               CALL SGESD2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+2*MBW2+1 ),
     $                       BW, 0, MYCOL-1 )
*
*
               IF( MYCOL.LT.NP-1 ) THEN
*
*           Calculate off-diagonal block(s) of reduced system.
*           Note: for ease of use in solution of reduced system, store
*           L's off-diagonal block in transpose form.
*           {F_i}^T =  {H_i}{{B'}_i}^T
*
*           Copy matrix H_i (the last bw cols of G_i) to AF storage
*             as per requirements of BLAS routine STRMM.
*             Since we have G_i^T stored, transpose
*             H_i^T to H_i.
*
                  CALL SLATCPY( 'N', BW, BW, AF( ODD_SIZE-BW+1 ),
     $                          ODD_SIZE, AF( ( ODD_SIZE )*BW+1 ), BW )
*
                  CALL STRMM( 'R', 'L', 'N', 'N', BW, BW, -ONE,
     $                        A( ( OFST+1+ODD_SIZE*LLDA ) ), LLDA-1,
     $                        AF( ( ODD_SIZE )*BW+1 ), BW )
*
               END IF
*
            END IF
*         End of "if ( info.eq.0 ) then"
*
         END IF
*       End of "if ( MYCOL .ne. 0 )..."
*
*
*       Check to make sure no processors have found errors
*
         CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1,
     $                 0, 0 )
*
         IF( MYCOL.EQ.0 ) THEN
            CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 )
         ELSE
            CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 )
         END IF
*
         IF( INFO.NE.0 ) THEN
            GO TO 110
         END IF
*       No errors found, continue
*
*
********************************************************************
*       PHASE 2: Formation and factorization of Reduced System.
********************************************************************
*
*       Gather up local sections of reduced system
*
*
*     The last processor does not participate in the factorization of
*       the reduced system, having sent its E_i already.
         IF( MYCOL.EQ.NPCOL-1 ) THEN
            GO TO 100
         END IF
*
*       Initiate send of off-diag block(s) to overlap with next part.
*       Off-diagonal block needed on neighboring processor to start
*       algorithm.
*
         IF( ( MOD( MYCOL+1, 2 ).EQ.0 ) .AND. ( MYCOL.GT.0 ) ) THEN
*
            CALL SGESD2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+1 ), BW, 0,
     $                    MYCOL-1 )
*
         END IF
*
*       Transpose last diagonal block into AF storage for subsequent
*         operations.
*
         CALL SLATCPY( 'U', BW, BW, A( OFST+ODD_SIZE*LLDA+1+BW ),
     $                 LLDA-1, AF( ODD_SIZE*BW+MBW2+1 ), BW )
*
*       Receive contribution to diagonal block stored on this proc.
*
         IF( MYCOL.LT.NPCOL-1 ) THEN
*
            CALL SGERV2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+2*MBW2+1 ), BW,
     $                    0, MYCOL+1 )
*
*          Add contribution to diagonal block
*
            CALL SAXPY( MBW2, ONE, AF( ODD_SIZE*BW+2*MBW2+1 ), 1,
     $                  AF( ODD_SIZE*BW+MBW2+1 ), 1 )
*
         END IF
*
*
*       *************************************
*       Modification Loop
*
*       The distance for sending and receiving for each level starts
*         at 1 for the first level.
         LEVEL_DIST = 1
*
*       Do until this proc is needed to modify other procs' equations
*
   80    CONTINUE
         IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 )
     $      GO TO 90
*
*         Receive and add contribution to diagonal block from the left
*
         IF( MYCOL-LEVEL_DIST.GE.0 ) THEN
            CALL SGERV2D( ICTXT, BW, BW, WORK( 1 ), BW, 0,
     $                    MYCOL-LEVEL_DIST )
*
            CALL SAXPY( MBW2, ONE, WORK( 1 ), 1,
     $                  AF( ODD_SIZE*BW+MBW2+1 ), 1 )
*
         END IF
*
*         Receive and add contribution to diagonal block from the right
*
         IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN
            CALL SGERV2D( ICTXT, BW, BW, WORK( 1 ), BW, 0,
     $                    MYCOL+LEVEL_DIST )
*
            CALL SAXPY( MBW2, ONE, WORK( 1 ), 1,
     $                  AF( ODD_SIZE*BW+MBW2+1 ), 1 )
*
         END IF
*
         LEVEL_DIST = LEVEL_DIST*2
*
         GO TO 80
   90    CONTINUE
*       [End of GOTO Loop]
*
*
*       *********************************
*       Calculate and use this proc's blocks to modify other procs'...
*
*       Factor diagonal block
*
         CALL SPOTRF( 'L', BW, AF( ODD_SIZE*BW+MBW2+1 ), BW, INFO )
*
         IF( INFO.NE.0 ) THEN
            INFO = NPCOL + MYCOL
         END IF
*
*       ****************************************************************
*       Receive offdiagonal block from processor to right.
*         If this is the first group of processors, the receive comes
*         from a different processor than otherwise.
*
         IF( LEVEL_DIST.EQ.1 ) THEN
            COMM_PROC = MYCOL + 1
*
*           Move block into place that it will be expected to be for
*             calcs.
*
            CALL SLAMOV( 'N', BW, BW, AF( ODD_SIZE*BW+1 ), BW,
     $                   AF( ODD_SIZE*BW+2*MBW2+1 ), BW )
*
         ELSE
            COMM_PROC = MYCOL + LEVEL_DIST / 2
         END IF
*
         IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN
*
            CALL SGERV2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+1 ), BW, 0,
     $                    COMM_PROC )
*
            IF( INFO.EQ.0 ) THEN
*
*
*         Modify upper off_diagonal block with diagonal block
*
*
               CALL STRSM( 'L', 'L', 'N', 'N', BW, BW, ONE,
     $                     AF( ODD_SIZE*BW+MBW2+1 ), BW,
     $                     AF( ODD_SIZE*BW+1 ), BW )
*
            END IF
*         End of "if ( info.eq.0 ) then"
*
*         Calculate contribution from this block to next diagonal block
*
            CALL SSYRK( 'L', 'T', BW, BW, -ONE, AF( ( ODD_SIZE )*BW+1 ),
     $                  BW, ZERO, WORK( 1 ), BW )
*
*         Send contribution to diagonal block's owning processor.
*
            CALL SGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, 0,
     $                    MYCOL+LEVEL_DIST )
*
         END IF
*       End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
*
*
*       ****************************************************************
*       Receive off_diagonal block from left and use to finish with this
*         processor.
*
         IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND.
     $       ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) THEN
*
            IF( LEVEL_DIST.GT.1 ) THEN
*
*           Receive offdiagonal block(s) from proc level_dist/2 to the
*           left
*
               CALL SGERV2D( ICTXT, BW, BW, AF( ODD_SIZE*BW+2*MBW2+1 ),
     $                       BW, 0, MYCOL-LEVEL_DIST / 2 )
*
            END IF
*
*
            IF( INFO.EQ.0 ) THEN
*
*         Use diagonal block(s) to modify this offdiagonal block
*
               CALL STRSM( 'R', 'L', 'T', 'N', BW, BW, ONE,
     $                     AF( ODD_SIZE*BW+MBW2+1 ), BW,
     $                     AF( ODD_SIZE*BW+2*MBW2+1 ), BW )
*
            END IF
*         End of "if( info.eq.0 ) then"
*
*         Use offdiag block(s) to calculate modification to diag block
*           of processor to the left
*
            CALL SSYRK( 'L', 'N', BW, BW, -ONE,
     $                  AF( ( ODD_SIZE+2*BW )*BW+1 ), BW, ZERO,
     $                  WORK( 1 ), BW )
*
*         Send contribution to diagonal block's owning processor.
*
            CALL SGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, 0,
     $                    MYCOL-LEVEL_DIST )
*
*         *******************************************************
*
            IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN
*
*           Decide which processor offdiagonal block(s) goes to
*
               IF( ( MOD( MYCOL / ( 2*LEVEL_DIST ), 2 ) ).EQ.0 ) THEN
                  COMM_PROC = MYCOL + LEVEL_DIST
               ELSE
                  COMM_PROC = MYCOL - LEVEL_DIST
               END IF
*
*           Use offdiagonal blocks to calculate offdiag
*             block to send to neighboring processor. Depending
*             on circumstances, may need to transpose the matrix.
*
               CALL SGEMM( 'N', 'N', BW, BW, BW, -ONE,
     $                     AF( ODD_SIZE*BW+2*MBW2+1 ), BW,
     $                     AF( ODD_SIZE*BW+1 ), BW, ZERO, WORK( 1 ),
     $                     BW )
*
*           Send contribution to offdiagonal block's owning processor.
*
               CALL SGESD2D( ICTXT, BW, BW, WORK( 1 ), BW, 0,
     $                       COMM_PROC )
*
            END IF
*
         END IF
*       End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
*
  100    CONTINUE
*
      END IF
*
  110 CONTINUE
*
*
*     Free BLACS space used to hold standard-form grid.
*
      IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN
         CALL BLACS_GRIDEXIT( ICTXT_NEW )
      END IF
*
  120 CONTINUE
*
*     Restore saved input parameters
*
      ICTXT = ICTXT_SAVE
      NP = NP_SAVE
*
*     Output minimum worksize
*
      WORK( 1 ) = WORK_SIZE_MIN
*
*     Make INFO consistent across processors
*
      CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, 0,
     $              0 )
*
      IF( MYCOL.EQ.0 ) THEN
         CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 )
      ELSE
         CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 )
      END IF
*
*
      RETURN
*
*     End of PSPBTRF
*
      END