      SUBROUTINE PSPBTRSV( UPLO, TRANS, N, BW, NRHS, A, JA, DESCA, B,
     $                     IB, DESCB, AF, LAF, WORK, LWORK, INFO )
*
*  -- ScaLAPACK routine (version 2.0.2) --
*     Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
*     May 1 2012
*
*     .. Scalar Arguments ..
      CHARACTER          TRANS, UPLO
      INTEGER            BW, IB, INFO, JA, LAF, LWORK, N, NRHS
*     ..
*     .. Array Arguments ..
      INTEGER            DESCA( * ), DESCB( * )
      REAL               A( * ), AF( * ), B( * ), WORK( * )
*     ..
*
*
*  Purpose
*  =======
*
*  PSPBTRSV solves a banded triangular system of linear equations
*
*                      A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS)
*                                 or
*                      A(1:N, JA:JA+N-1)^T * X = B(IB:IB+N-1, 1:NRHS)
*
*  where A(1:N, JA:JA+N-1) is a banded
*  triangular matrix factor produced by the
*  Cholesky factorization code PSPBTRF
*  and is stored in A(1:N,JA:JA+N-1) and AF.
*  The matrix stored in A(1:N, JA:JA+N-1) is either
*  upper or lower triangular according to UPLO,
*  and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^T
*  is dictated by the user via the parameter TRANS.
*
*  Routine PSPBTRF MUST be called first.
*
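*  As an illustration only (a minimal sketch, assuming the
*  descriptors, workspace sizes and process grid have been set up as
*  described in the Arguments and Notes sections below), a complete
*  solve of A*x = b with the lower Cholesky factor L computed by
*  PSPBTRF consists of a frontsolve followed by a backsolve:
*
*     CALL PSPBTRF( 'L', N, BW, A, JA, DESCA, AF, LAF, WORK, LWORK,
*                   INFO )
*     CALL PSPBTRSV( 'L', 'N', N, BW, NRHS, A, JA, DESCA, B, IB,
*                    DESCB, AF, LAF, WORK, LWORK, INFO )
*     CALL PSPBTRSV( 'L', 'T', N, BW, NRHS, A, JA, DESCA, B, IB,
*                    DESCB, AF, LAF, WORK, LWORK, INFO )
*
*  (The driver PSPBTRS performs this pair of triangular solves.)
*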
*  =====================================================================
*
*  Arguments
*  =========
*
*  UPLO    (global input) CHARACTER
*          = 'U':  Upper triangle of A(1:N, JA:JA+N-1) is stored;
*          = 'L':  Lower triangle of A(1:N, JA:JA+N-1) is stored.
*
*  TRANS   (global input) CHARACTER
*          = 'N':  Solve with A(1:N, JA:JA+N-1);
*          = 'T' or 'C':  Solve with A(1:N, JA:JA+N-1)^T.
*
*  N       (global input) INTEGER
*          The number of rows and columns to be operated on, i.e. the
*          order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
*
*  BW      (global input) INTEGER
*          Number of subdiagonals in L or U. 0 <= BW <= N-1.
*
*  NRHS    (global input) INTEGER
*          The number of right hand sides, i.e., the number of columns
*          of the distributed submatrix B(IB:IB+N-1, 1:NRHS).
*          NRHS >= 0.
*
*  A       (local input/local output) REAL pointer into
*          local memory to an array with first dimension
*          LLD_A >= (bw+1) (stored in DESCA).
*          On entry, this array contains the local pieces of the
*          N-by-N symmetric banded distributed Cholesky factor L or
*          L^T of A(1:N, JA:JA+N-1).
*          This local portion is stored in the packed banded format
*          used in LAPACK. Please see the Notes below and the
*          ScaLAPACK manual for more detail on the format of
*          distributed matrices.
*
*  JA      (global input) INTEGER
*          The index in the global array A that points to the start of
*          the matrix to be operated on (which may be either all of A
*          or a submatrix of A).
*
*  DESCA   (global and local input) INTEGER array of dimension DLEN.
*          if 1D type (DTYPE_A=501), DLEN >= 7;
*          if 2D type (DTYPE_A=1), DLEN >= 9.
*          The array descriptor for the distributed matrix A.
*          Contains information on the mapping of A to memory. Please
*          see NOTES below for a full description and options.
*
*  B       (local input/local output) REAL pointer into
*          local memory to an array of local leading dimension
*          LLD_B >= NB.
*          On entry, this array contains the
*          local pieces of the right hand sides
*          B(IB:IB+N-1, 1:NRHS).
*          On exit, this contains the local piece of the solution
*          distributed matrix X.
*
*  IB      (global input) INTEGER
*          The row index in the global array B that points to the first
*          row of the matrix to be operated on (which may be either
*          all of B or a submatrix of B).
*
*  DESCB   (global and local input) INTEGER array of dimension DLEN.
*          if 1D type (DTYPE_B=502), DLEN >= 7;
*          if 2D type (DTYPE_B=1), DLEN >= 9.
*          The array descriptor for the distributed matrix B.
*          Contains information on the mapping of B to memory. Please
*          see NOTES below for a full description and options.
*
*  AF      (local output) REAL array, dimension LAF.
*          Auxiliary Fillin Space.
*          Fillin is created during the factorization routine
*          PSPBTRF and this is stored in AF. If a linear system
*          is to be solved using PSPBTRS after the factorization
*          routine, AF *must not be altered* after the factorization.
*
*  LAF     (local input) INTEGER
*          Size of user-input Auxiliary Fillin space AF. Must be >=
*          (NB+2*bw)*bw.
*          If LAF is not large enough, an error code will be returned
*          and the minimum acceptable size will be returned in AF( 1 ).
*
*  WORK    (local workspace/local output)
*          REAL temporary workspace. This space may
*          be overwritten in between calls to routines. WORK must be
*          the size given in LWORK.
*          On exit, WORK( 1 ) contains the minimal LWORK.
*
*  LWORK   (local input or global input) INTEGER
*          Size of user-input workspace WORK.
*          If LWORK is too small, the minimal acceptable size will be
*          returned in WORK( 1 ) and an error code is returned.
*          LWORK >= (bw*NRHS).
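*          As an illustrative calculation only: with NB = 64, BW = 4
*          and NRHS = 3, the formulas above give
*          LAF >= (64+2*4)*4 = 288 and LWORK >= 4*3 = 12.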
*
*  INFO    (global output) INTEGER
*          = 0:  successful exit
*          < 0:  If the i-th argument is an array and the j-th entry
*                had an illegal value, then INFO = -(i*100+j); if the
*                i-th argument is a scalar and had an illegal value,
*                then INFO = -i.
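*          For example, INFO = -805 would indicate an illegal value
*          in entry 5 of the 8th argument, DESCA, while INFO = -4
*          would indicate an illegal value of the scalar BW.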
*
*  =====================================================================
*
*
*  Restrictions
*  ============
*
*  The following are restrictions on the input parameters. Some of these
*    are temporary and will be removed in future releases, while others
*    may reflect fundamental technical limitations.
*
*    Non-cyclic restriction: VERY IMPORTANT!
*      P*NB >= mod(JA-1,NB)+N.
*      The mapping for matrices must be blocked, reflecting the nature
*      of the divide and conquer algorithm as a task-parallel algorithm.
*      This formula in words is: no processor may have more than one
*      chunk of the matrix.
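*      As an illustrative instance of the formula: with P = 4
*      processors, block size NB = 100 and JA = 1, the restriction
*      reads 4*100 >= N, so at most 400 equations can be handled,
*      one block of 100 per processor.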
*
*    Blocksize cannot be too small:
*      If the matrix spans more than one processor, the following
*      restriction on NB, the size of each block on each processor,
*      must hold:
*      NB >= 2*BW
*      The bulk of parallel computation is done on the matrix of size
*      O(NB) on each processor. If this is too small, divide and conquer
*      is a poor choice of algorithm.
*
*    Submatrix reference:
*      JA = IB
*      Alignment restriction that prevents unnecessary communication.
*
*
*  =====================================================================
*
*
*  Notes
*  =====
*
*  If the factorization routine and the solve routine are to be called
*    separately (to solve various sets of right-hand sides using the
*    same coefficient matrix), the auxiliary space AF *must not be
*    altered* between calls to the factorization routine and the solve
*    routine.
*
*  The best algorithm for solving banded and tridiagonal linear systems
*    depends on a variety of parameters, especially the bandwidth.
*    Currently, only algorithms designed for the case N/P >> bw are
*    implemented. These go by many names, including Divide and Conquer,
*    Partitioning, domain decomposition-type, etc.
*
*  Algorithm description: Divide and Conquer
*
*    The Divide and Conquer algorithm assumes the matrix is narrowly
*      banded compared with the number of equations. In this situation,
*      it is best to distribute the input matrix A one-dimensionally,
*      with columns atomic and rows divided amongst the processes.
*      The basic algorithm divides the banded matrix up into
*      P pieces with one stored on each processor,
*      and then proceeds in 2 phases for the factorization or 3 for the
*      solution of a linear system.
*      1) Local Phase:
*         The individual pieces are factored independently and in
*         parallel. These factors are applied to the matrix creating
*         fillin, which is stored in a non-inspectable way in auxiliary
*         space AF. Mathematically, this is equivalent to reordering
*         the matrix A as P A P^T and then factoring the principal
*         leading submatrix of size equal to the sum of the sizes of
*         the matrices factored on each processor. The factors of
*         these submatrices overwrite the corresponding parts of A
*         in memory.
*      2) Reduced System Phase:
*         A small (BW*(P-1)) system is formed representing
*         interaction of the larger blocks, and is stored (as are its
*         factors) in the space AF. A parallel Block Cyclic Reduction
*         algorithm is used. For a linear system, a parallel front solve
*         followed by an analogous backsolve, both using the structure
*         of the factored matrix, are performed.
*      3) Backsubstitution Phase:
*         For a linear system, a local backsubstitution is performed on
*         each processor in parallel.
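*
*      As an illustration of the sizes involved: with P = 4
*      processors and bandwidth BW = 2, phases 1 and 3 work on the
*      O(NB)-sized local pieces in parallel, while phase 2 solves a
*      reduced system of only BW*(P-1) = 6 equations coupling the
*      pieces.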
*
*
*  Descriptors
*  ===========
*
*  Descriptors now have *types* and differ from ScaLAPACK 1.0.
*
*  Note: banded codes can use either the old two dimensional
*    or new one-dimensional descriptors, though the processor grid in
*    both cases *must be one-dimensional*. We describe both types below.
*
*  Each global data object is described by an associated description
*  vector.  This vector stores the information required to establish
*  the mapping between an object element and its corresponding process
*  and memory location.
*
*  Let A be a generic term for any 2D block cyclically distributed
*  array. Such a global array has an associated description vector
*  DESCA. In the following comments, the character _ should be read as
*  "of the global array".
*
*  NOTATION        STORED IN      EXPLANATION
*  --------------- -------------- --------------------------------------
*  DTYPE_A(global) DESCA( DTYPE_ ) The descriptor type.  In this case,
*                                 DTYPE_A = 1.
*  CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
*                                 the BLACS process grid A is distribu-
*                                 ted over. The context itself is glo-
*                                 bal, but the handle (the integer
*                                 value) may vary.
*  M_A    (global) DESCA( M_ )    The number of rows in the global
*                                 array A.
*  N_A    (global) DESCA( N_ )    The number of columns in the global
*                                 array A.
*  MB_A   (global) DESCA( MB_ )   The blocking factor used to distribute
*                                 the rows of the array.
*  NB_A   (global) DESCA( NB_ )   The blocking factor used to distribute
*                                 the columns of the array.
*  RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
*                                 row of the array A is distributed.
*  CSRC_A (global) DESCA( CSRC_ ) The process column over which the
*                                 first column of the array A is
*                                 distributed.
*  LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local
*                                 array.  LLD_A >= MAX(1,LOCr(M_A)).
*
*  Let K be the number of rows or columns of a distributed matrix,
*  and assume that its process grid has dimension p x q.
*  LOCr( K ) denotes the number of elements of K that a process
*  would receive if K were distributed over the p processes of its
*  process column.
*  Similarly, LOCc( K ) denotes the number of elements of K that a
*  process would receive if K were distributed over the q processes of
*  its process row.
*  The values of LOCr() and LOCc() may be determined via a call to the
*  ScaLAPACK tool function, NUMROC:
*          LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
*          LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
*  An upper bound for these quantities may be computed by:
*          LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
*          LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
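*
*  As an illustrative evaluation (assuming RSRC_A = 0): for M = 10
*  rows in blocks of MB_A = 3 over NPROW = 2 process rows, NUMROC
*  gives LOCr( 10 ) = 6 on process row 0 (blocks 1 and 3) and
*  LOCr( 10 ) = 4 on process row 1 (blocks 2 and 4, the last block
*  holding a single row), consistent with the upper bound
*  ceil( ceil(10/3)/2 )*3 = 6.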
*
*
*  One-dimensional descriptors:
*
*  One-dimensional descriptors are a new addition to ScaLAPACK since
*    version 1.0. They simplify and shorten the descriptor for 1D
*    arrays.
*
*  Since ScaLAPACK supports two-dimensional arrays as the fundamental
*    object, we allow 1D arrays to be distributed either over the
*    first dimension of the array (as if the grid were P-by-1) or the
*    2nd dimension (as if the grid were 1-by-P). This choice is
*    indicated by the descriptor type (501 or 502)
*    as described below.
*
*    IMPORTANT NOTE: the actual BLACS grid represented by the
*    CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
*    irrespective of which one-dimensional descriptor type
*    (501 or 502) is input.
*    This routine will interpret the grid properly either way.
*    ScaLAPACK routines *do not support intercontext operations* so that
*    the grid passed to a single ScaLAPACK routine *must be the same*
*    for all array descriptors passed to that routine.
*
*    NOTE: In all cases where 1D descriptors are used, 2D descriptors
*    may also be used, since a one-dimensional array is a special case
*    of a two-dimensional array with one dimension of size unity.
*    The two-dimensional array used in this case *must* be of the
*    proper orientation:
*      If the appropriate one-dimensional descriptor is DTYPE_A=501
*      (1 by P type), then the two dimensional descriptor must
*      have a CTXT value that refers to a 1 by P BLACS grid;
*      If the appropriate one-dimensional descriptor is DTYPE_A=502
*      (P by 1 type), then the two dimensional descriptor must
*      have a CTXT value that refers to a P by 1 BLACS grid.
*
*
*  Summary of allowed descriptors, types, and BLACS grids:
*  DTYPE           501         502         1         1
*  BLACS grid      1xP or Px1  1xP or Px1  1xP       Px1
*  -----------------------------------------------------
*  A               OK          NO          OK        NO
*  B               NO          OK          NO        OK
*
*  Note that a consequence of this chart is that it is not possible
*    for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead
*    to opposite requirements for the orientation of the BLACS grid,
*    and as noted before, the *same* BLACS context must be used in
*    all descriptors in a single ScaLAPACK subroutine call.
*
*  Let A be a generic term for any 1D block cyclically distributed
*  array. Such a global array has an associated description vector
*  DESCA. In the following comments, the character _ should be read as
*  "of the global array".
*
*  NOTATION        STORED IN  EXPLANATION
*  --------------- ---------- ------------------------------------------
*  DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
*                                TYPE_A = 501: 1-by-P grid.
*                                TYPE_A = 502: P-by-1 grid.
*  CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
*                                the BLACS process grid A is distribu-
*                                ted over. The context itself is glo-
*                                bal, but the handle (the integer
*                                value) may vary.
*  N_A    (global) DESCA( 3 ) The size of the array dimension being
*                                distributed.
*  NB_A   (global) DESCA( 4 ) The blocking factor used to distribute
*                                the distributed dimension of the array.
*  SRC_A  (global) DESCA( 5 ) The process row or column over which the
*                                first row or column of the array
*                                is distributed.
*  LLD_A  (local)  DESCA( 6 ) The leading dimension of the local array
*                                storing the local blocks of the distri-
*                                buted array A. Minimum value of LLD_A
*                                depends on TYPE_A.
*                                TYPE_A = 501: LLD_A >=
*                                   size of undistributed dimension, 1.
*                                TYPE_A = 502: LLD_A >= NB_A, 1.
*  Reserved        DESCA( 7 ) Reserved for future use.
*
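*  As an illustration (a minimal sketch, assuming JA = IB = 1,
*  source process 0, and local leading dimensions LLDA >= BW+1 and
*  LLDB >= NB already chosen), matching type-501 and type-502
*  descriptors for A and B could be filled as:
*
*     DESCA( 1 ) = 501
*     DESCA( 2 ) = ICTXT
*     DESCA( 3 ) = N
*     DESCA( 4 ) = NB
*     DESCA( 5 ) = 0
*     DESCA( 6 ) = LLDA
*     DESCA( 7 ) = 0
*
*     DESCB( 1 ) = 502
*     DESCB( 2 ) = ICTXT
*     DESCB( 3 ) = N
*     DESCB( 4 ) = NB
*     DESCB( 5 ) = 0
*     DESCB( 6 ) = LLDB
*     DESCB( 7 ) = 0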
*
*
*  =====================================================================
*
*  Code Developer: Andrew J. Cleary, University of Tennessee.
*    Current address: Lawrence Livermore National Labs.
*
*  =====================================================================
*
*     .. Parameters ..
      REAL               ONE
      PARAMETER          ( ONE = 1.0E+0 )
      REAL               ZERO
      PARAMETER          ( ZERO = 0.0E+0 )
      INTEGER            INT_ONE
      PARAMETER          ( INT_ONE = 1 )
      INTEGER            DESCMULT, BIGNUM
      PARAMETER          ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT )
      INTEGER            BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
     $                   LLD_, MB_, M_, NB_, N_, RSRC_
      PARAMETER          ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1,
     $                     CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6,
     $                     RSRC_ = 7, CSRC_ = 8, LLD_ = 9 )
*     ..
*     .. Local Scalars ..
      INTEGER            CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE,
     $                   IDUM1, IDUM2, IDUM3, JA_NEW, LEVEL_DIST, LLDA,
     $                   LLDB, MBW2, MYCOL, MYROW, MY_NUM_COLS, NB, NP,
     $                   NPCOL, NPROW, NP_SAVE, ODD_SIZE, OFST,
     $                   PART_OFFSET, PART_SIZE, RETURN_CODE, STORE_M_B,
     $                   STORE_N_A, WORK_SIZE_MIN
*     ..
*     .. Local Arrays ..
      INTEGER            DESCA_1XP( 7 ), DESCB_PX1( 7 ),
     $                   PARAM_CHECK( 17, 3 )
*     ..
*     .. External Subroutines ..
      EXTERNAL           BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT,
     $                   GLOBCHK, PXERBLA, RESHAPE, SGEMM, SGERV2D,
     $                   SGESD2D, SLAMOV, SMATADD, STBTRS, STRMM, STRTRS
*     ..
*     .. External Functions ..
      LOGICAL            LSAME
      INTEGER            NUMROC
      EXTERNAL           LSAME, NUMROC
*     ..
*     .. Intrinsic Functions ..
      INTRINSIC          ICHAR, MOD
*     ..
*     .. Executable Statements ..
*
*     Test the input parameters
*
      INFO = 0
*
*     Convert descriptor into standard form for easy access to
*        parameters, check that grid is of right shape.
*
      DESCA_1XP( 1 ) = 501
      DESCB_PX1( 1 ) = 502
*
      CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE )
*
      IF( RETURN_CODE.NE.0 ) THEN
         INFO = -( 8*100+2 )
      END IF
*
      CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE )
*
      IF( RETURN_CODE.NE.0 ) THEN
         INFO = -( 11*100+2 )
      END IF
*
*     Consistency checks for DESCA and DESCB.
*
*     Context must be the same
      IF( DESCA_1XP( 2 ).NE.DESCB_PX1( 2 ) ) THEN
         INFO = -( 11*100+2 )
      END IF
*
*        These are alignment restrictions that may or may not be removed
*        in future releases. -Andy Cleary, April 14, 1996.
*
*     Block sizes must be the same
      IF( DESCA_1XP( 4 ).NE.DESCB_PX1( 4 ) ) THEN
         INFO = -( 11*100+4 )
      END IF
*
*     Source processor must be the same
*
      IF( DESCA_1XP( 5 ).NE.DESCB_PX1( 5 ) ) THEN
         INFO = -( 11*100+5 )
      END IF
*
*     Get values out of descriptor for use in code.
*
      ICTXT = DESCA_1XP( 2 )
      CSRC = DESCA_1XP( 5 )
      NB = DESCA_1XP( 4 )
      LLDA = DESCA_1XP( 6 )
      STORE_N_A = DESCA_1XP( 3 )
      LLDB = DESCB_PX1( 6 )
      STORE_M_B = DESCB_PX1( 3 )
*
*     Get grid parameters
*
*
*     Pre-calculate bw^2
*
      MBW2 = BW*BW
*
      CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
      NP = NPROW*NPCOL
*
*
*
      IF( LSAME( UPLO, 'U' ) ) THEN
         IDUM1 = ICHAR( 'U' )
      ELSE IF( LSAME( UPLO, 'L' ) ) THEN
         IDUM1 = ICHAR( 'L' )
      ELSE
         INFO = -1
      END IF
*
      IF( LSAME( TRANS, 'N' ) ) THEN
         IDUM2 = ICHAR( 'N' )
      ELSE IF( LSAME( TRANS, 'T' ) ) THEN
         IDUM2 = ICHAR( 'T' )
      ELSE IF( LSAME( TRANS, 'C' ) ) THEN
         IDUM2 = ICHAR( 'T' )
      ELSE
         INFO = -2
      END IF
*
      IF( LWORK.LT.-1 ) THEN
         INFO = -14
      ELSE IF( LWORK.EQ.-1 ) THEN
         IDUM3 = -1
      ELSE
         IDUM3 = 1
      END IF
*
      IF( N.LT.0 ) THEN
         INFO = -3
      END IF
*
      IF( N+JA-1.GT.STORE_N_A ) THEN
         INFO = -( 8*100+6 )
      END IF
*
      IF( ( BW.GT.N-1 ) .OR. ( BW.LT.0 ) ) THEN
         INFO = -4
      END IF
*
      IF( LLDA.LT.( BW+1 ) ) THEN
         INFO = -( 8*100+6 )
      END IF
*
      IF( NB.LE.0 ) THEN
         INFO = -( 8*100+4 )
      END IF
*
      IF( N+IB-1.GT.STORE_M_B ) THEN
         INFO = -( 11*100+3 )
      END IF
*
      IF( LLDB.LT.NB ) THEN
         INFO = -( 11*100+6 )
      END IF
*
      IF( NRHS.LT.0 ) THEN
         INFO = -5
      END IF
*
*     Current alignment restriction
*
      IF( JA.NE.IB ) THEN
         INFO = -7
      END IF
*
*     Argument checking that is specific to Divide & Conquer routine
*
      IF( NPROW.NE.1 ) THEN
         INFO = -( 8*100+2 )
      END IF
*
      IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN
         INFO = -( 3 )
         CALL PXERBLA( ICTXT,
     $                 'PSPBTRSV, D&C alg.: only 1 block per proc',
     $                 -INFO )
         RETURN
      END IF
*
      IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*BW ) ) THEN
         INFO = -( 8*100+4 )
         CALL PXERBLA( ICTXT, 'PSPBTRSV, D&C alg.: NB too small',
     $                 -INFO )
         RETURN
      END IF
*
*
      WORK_SIZE_MIN = BW*NRHS
*
      WORK( 1 ) = WORK_SIZE_MIN
*
      IF( LWORK.LT.WORK_SIZE_MIN ) THEN
         IF( LWORK.NE.-1 ) THEN
            INFO = -14
            CALL PXERBLA( ICTXT, 'PSPBTRSV: worksize error', -INFO )
         END IF
         RETURN
      END IF
*
*     Pack params and positions into arrays for global consistency check
*
      PARAM_CHECK( 17, 1 ) = DESCB( 5 )
      PARAM_CHECK( 16, 1 ) = DESCB( 4 )
      PARAM_CHECK( 15, 1 ) = DESCB( 3 )
      PARAM_CHECK( 14, 1 ) = DESCB( 2 )
      PARAM_CHECK( 13, 1 ) = DESCB( 1 )
      PARAM_CHECK( 12, 1 ) = IB
      PARAM_CHECK( 11, 1 ) = DESCA( 5 )
      PARAM_CHECK( 10, 1 ) = DESCA( 4 )
      PARAM_CHECK( 9, 1 ) = DESCA( 3 )
      PARAM_CHECK( 8, 1 ) = DESCA( 1 )
      PARAM_CHECK( 7, 1 ) = JA
      PARAM_CHECK( 6, 1 ) = NRHS
      PARAM_CHECK( 5, 1 ) = BW
      PARAM_CHECK( 4, 1 ) = N
      PARAM_CHECK( 3, 1 ) = IDUM3
      PARAM_CHECK( 2, 1 ) = IDUM2
      PARAM_CHECK( 1, 1 ) = IDUM1
*
      PARAM_CHECK( 17, 2 ) = 1105
      PARAM_CHECK( 16, 2 ) = 1104
      PARAM_CHECK( 15, 2 ) = 1103
      PARAM_CHECK( 14, 2 ) = 1102
      PARAM_CHECK( 13, 2 ) = 1101
      PARAM_CHECK( 12, 2 ) = 10
      PARAM_CHECK( 11, 2 ) = 805
      PARAM_CHECK( 10, 2 ) = 804
      PARAM_CHECK( 9, 2 ) = 803
      PARAM_CHECK( 8, 2 ) = 801
      PARAM_CHECK( 7, 2 ) = 7
      PARAM_CHECK( 6, 2 ) = 5
      PARAM_CHECK( 5, 2 ) = 4
      PARAM_CHECK( 4, 2 ) = 3
      PARAM_CHECK( 3, 2 ) = 14
      PARAM_CHECK( 2, 2 ) = 2
      PARAM_CHECK( 1, 2 ) = 1
*
*     Want to find errors with MIN( ), so if no error, set it to a big
*     number. If there already is an error, multiply by the
*     descriptor multiplier.
*
      IF( INFO.GE.0 ) THEN
         INFO = BIGNUM
      ELSE IF( INFO.LT.-DESCMULT ) THEN
         INFO = -INFO
      ELSE
         INFO = -INFO*DESCMULT
      END IF
*
*     Check consistency across processors
*
      CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17, PARAM_CHECK( 1, 3 ),
     $              INFO )
*
*     Prepare output: set info = 0 if no error, and divide by DESCMULT
*     if error is not in a descriptor entry.
*
      IF( INFO.EQ.BIGNUM ) THEN
         INFO = 0
      ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN
         INFO = -INFO / DESCMULT
      ELSE
         INFO = -INFO
      END IF
*
      IF( INFO.LT.0 ) THEN
         CALL PXERBLA( ICTXT, 'PSPBTRSV', -INFO )
         RETURN
      END IF
*
*     Quick return if possible
*
      IF( N.EQ.0 )
     $   RETURN
*
      IF( NRHS.EQ.0 )
     $   RETURN
*
*
*     Adjust addressing into matrix space to properly get into
*        the beginning part of the relevant data
*
      PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) )
*
      IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN
         PART_OFFSET = PART_OFFSET + NB
      END IF
*
      IF( MYCOL.LT.CSRC ) THEN
         PART_OFFSET = PART_OFFSET - NB
      END IF
*
*     Form a new BLACS grid (the "standard form" grid) with only procs
*        holding part of the matrix, of size 1xNP where NP is adjusted,
*        starting at csrc=0, with JA modified to reflect dropped procs.
*
*     First processor to hold part of the matrix:
*
      FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL )
*
*     Calculate new JA while dropping off unused processors.
*
      JA_NEW = MOD( JA-1, NB ) + 1
*
*     Save and compute new value of NP
*
      NP_SAVE = NP
      NP = ( JA_NEW+N-2 ) / NB + 1
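*
*     (Illustrative evaluation: with NB = 8 and JA = 19, the first
*     two blocks hold no part of the matrix, so JA_NEW = 3; if
*     N = 14, the matrix then occupies NP = (3+14-2)/8 + 1 = 2
*     processors of the standard-form grid.)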
*
*     Call utility routine that forms "standard-form" grid
*
      CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC,
     $              INT_ONE, NP )
*
*     Use new context from standard grid as context.
*
      ICTXT_SAVE = ICTXT
      ICTXT = ICTXT_NEW
      DESCA_1XP( 2 ) = ICTXT_NEW
      DESCB_PX1( 2 ) = ICTXT_NEW
*
*     Get information about new grid.
*
      CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
*
*     Drop out processors that do not have part of the matrix.
*
      IF( MYROW.LT.0 ) THEN
         GO TO 180
      END IF
*
*     ********************************
*     Values reused throughout routine
*
*     User-input value of partition size
*
      PART_SIZE = NB
*
*     Number of columns in each processor
*
      MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL )
*
*     Offset in columns to beginning of main partition in each proc
*
      IF( MYCOL.EQ.0 ) THEN
         PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE )
         MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE )
      END IF
*
*     Offset in elements
*
      OFST = PART_OFFSET*LLDA
*
*     Size of main (or odd) partition in each processor
*
      ODD_SIZE = MY_NUM_COLS
      IF( MYCOL.LT.NP-1 ) THEN
         ODD_SIZE = ODD_SIZE - BW
      END IF
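*
*     (Illustrative evaluation: with NB = 8 and BW = 2, a processor
*     other than the last holds 8 columns, of which ODD_SIZE = 6
*     form its main partition and the trailing BW = 2 columns are
*     the odd-even connection block shared with the next processor.)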
*
*
*
*     Begin main code
*
      IF( LSAME( UPLO, 'L' ) ) THEN
*
         IF( LSAME( TRANS, 'N' ) ) THEN
*
*        Frontsolve
*
*
******************************************
*       Local computation phase
******************************************
*
*       Use main partition in each processor to solve locally
*
            CALL STBTRS( UPLO, 'N', 'N', ODD_SIZE, BW, NRHS,
     $                   A( OFST+1 ), LLDA, B( PART_OFFSET+1 ), LLDB,
     $                   INFO )
*
*
            IF( MYCOL.LT.NP-1 ) THEN
*         Use factorization of odd-even connection block to modify
*           locally stored portion of right hand side(s)
*
*
*           First copy and multiply it into temporary storage,
*             then use it on RHS
*
               CALL SLAMOV( 'N', BW, NRHS,
     $                      B( PART_OFFSET+ODD_SIZE-BW+1 ), LLDB,
     $                      WORK( 1 ), BW )
*
               CALL STRMM( 'L', 'U', 'N', 'N', BW, NRHS, -ONE,
     $                     A( ( OFST+( BW+1 )+( ODD_SIZE-BW )*LLDA ) ),
     $                     LLDA-1, WORK( 1 ), BW )
*
               CALL SMATADD( BW, NRHS, ONE, WORK( 1 ), BW, ONE,
     $                       B( PART_OFFSET+ODD_SIZE+1 ), LLDB )
*
            END IF
*
*
            IF( MYCOL.NE.0 ) THEN
*         Use the "spike" fillin to calculate contribution to previous
*           processor's right-hand side.
*
               CALL SGEMM( 'T', 'N', BW, NRHS, ODD_SIZE, -ONE, AF( 1 ),
     $                     ODD_SIZE, B( PART_OFFSET+1 ), LLDB, ZERO,
     $                     WORK( 1+BW-BW ), BW )
            END IF
*
*
************************************************
*       Formation and solution of reduced system
************************************************
*
*
*       Send modifications to prior processor's right hand sides
*
            IF( MYCOL.GT.0 ) THEN
*
               CALL SGESD2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0,
     $                       MYCOL-1 )
*
            END IF
*
*       Receive modifications to processor's right hand sides
*
            IF( MYCOL.LT.NPCOL-1 ) THEN
*
               CALL SGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0,
     $                       MYCOL+1 )
*
*         Combine contribution to locally stored right hand sides
*
               CALL SMATADD( BW, NRHS, ONE, WORK( 1 ), BW, ONE,
     $                       B( PART_OFFSET+ODD_SIZE+1 ), LLDB )
*
            END IF
*
*
*       The last processor does not participate in the solution of the
*       reduced system, having sent its contribution already.
            IF( MYCOL.EQ.NPCOL-1 ) THEN
               GO TO 30
            END IF
*
*
*       *************************************
*       Modification Loop
*
*       The distance for sending and receiving for each level starts
*         at 1 for the first level.
            LEVEL_DIST = 1
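*
*       (Illustration of the resulting pattern with NPCOL = 8:
*       processors 0, 2, 4 and 6 leave this loop at LEVEL_DIST = 1,
*       processors 1 and 5 at LEVEL_DIST = 2, and processor 3 at
*       LEVEL_DIST = 4; processor 7, the last, already branched to
*       statement 30 above.)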
*
*       Do until this proc is needed to modify other procs' equations
*
   10       CONTINUE
            IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 )
     $         GO TO 20
*
*         Receive and add contribution to right-hand sides from left
*
            IF( MYCOL-LEVEL_DIST.GE.0 ) THEN
*
               CALL SGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0,
     $                       MYCOL-LEVEL_DIST )
*
               CALL SMATADD( BW, NRHS, ONE, WORK( 1 ), BW, ONE,
     $                       B( PART_OFFSET+ODD_SIZE+1 ), LLDB )
*
            END IF
*
*         Receive and add contribution to right-hand sides from right
*
            IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN
*
               CALL SGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0,
     $                       MYCOL+LEVEL_DIST )
*
               CALL SMATADD( BW, NRHS, ONE, WORK( 1 ), BW, ONE,
     $                       B( PART_OFFSET+ODD_SIZE+1 ), LLDB )
*
            END IF
*
            LEVEL_DIST = LEVEL_DIST*2
*
            GO TO 10
   20       CONTINUE
*       [End of GOTO Loop]
*
*
*
*       *********************************
*       Calculate and use this proc's blocks to modify other procs
*
*       Solve with diagonal block
*
            CALL STRTRS( 'L', 'N', 'N', BW, NRHS,
     $                   AF( ODD_SIZE*BW+MBW2+1 ), BW,
     $                   B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO )
*
            IF( INFO.NE.0 ) THEN
               GO TO 170
            END IF
*
*
*
*       *********
            IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN
*
*         Calculate contribution from this block to next diagonal block
*
               CALL SGEMM( 'T', 'N', BW, NRHS, BW, -ONE,
     $                     AF( ( ODD_SIZE )*BW+1 ), BW,
     $                     B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO,
     $                     WORK( 1 ), BW )
*
*         Send contribution to diagonal block's owning processor.
*
               CALL SGESD2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0,
     $                       MYCOL+LEVEL_DIST )
*
            END IF
*       End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
*
*       ************
            IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND.
     $          ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) )
     $           THEN
*
*
*         Use offdiagonal block to calculate modification to diag block
*           of processor to the left
*
               CALL SGEMM( 'N', 'N', BW, NRHS, BW, -ONE,
     $                     AF( ODD_SIZE*BW+2*MBW2+1 ), BW,
     $                     B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO,
     $                     WORK( 1 ), BW )
*
*         Send contribution to diagonal block's owning processor.
*
               CALL SGESD2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0,
     $                       MYCOL-LEVEL_DIST )
*
            END IF
*       End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
*
   30       CONTINUE
*
         ELSE
*
******************** BACKSOLVE *************************************
*
********************************************************************
*     .. Begin reduced system phase of algorithm ..
********************************************************************
*
*
*
*       The last processor does not participate in the solution of the
*       reduced system and just waits to receive its solution.
            IF( MYCOL.EQ.NPCOL-1 ) THEN
               GO TO 80
            END IF
*
*       Determine number of steps in tree loop
*
            LEVEL_DIST = 1
   40       CONTINUE
            IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 )
     $         GO TO 50
*
            LEVEL_DIST = LEVEL_DIST*2
*
            GO TO 40
   50       CONTINUE
*
*
            IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND.
     $          ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) )
     $           THEN
*
*         Receive solution from processor to left
*
               CALL SGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0,
     $                       MYCOL-LEVEL_DIST )
*
*
*         Use offdiagonal block to calculate modification to RHS stored
*           on this processor
*
               CALL SGEMM( 'T', 'N', BW, NRHS, BW, -ONE,
     $                     AF( ODD_SIZE*BW+2*MBW2+1 ), BW, WORK( 1 ),
     $                     BW, ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB )
            END IF
*       End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
*
*
            IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN
*
*         Receive solution from processor to right
*
               CALL SGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0,
     $                       MYCOL+LEVEL_DIST )
*
*         Calculate contribution from this block to next diagonal block
*
               CALL SGEMM( 'N', 'N', BW, NRHS, BW, -ONE,
     $                     AF( ( ODD_SIZE )*BW+1 ), BW, WORK( 1 ), BW,
     $                     ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB )
*
            END IF
*       End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
*
*
*       Solve with diagonal block
*
            CALL STRTRS( 'L', 'T', 'N', BW, NRHS,
     $                   AF( ODD_SIZE*BW+MBW2+1 ), BW,
     $                   B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO )
*
            IF( INFO.NE.0 ) THEN
               GO TO 170
            END IF
*
*
*
***Modification Loop *******
*
   60       CONTINUE
            IF( LEVEL_DIST.EQ.1 )
     $         GO TO 70
*
            LEVEL_DIST = LEVEL_DIST / 2
*
*         Send solution to the right
*
            IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN
*
               CALL SGESD2D( ICTXT, BW, NRHS,
     $                       B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0,
     $                       MYCOL+LEVEL_DIST )
*
            END IF
*
*         Send solution to left
*
            IF( MYCOL-LEVEL_DIST.GE.0 ) THEN
*
               CALL SGESD2D( ICTXT, BW, NRHS,
     $                       B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0,
     $                       MYCOL-LEVEL_DIST )
*
            END IF
*
            GO TO 60
   70       CONTINUE
*       [End of GOTO Loop]
*
   80       CONTINUE
*          [Processor npcol - 1 jumped to here to await next stage]
*
*******************************
*       Reduced system has been solved, communicate solutions to nearest
*         neighbors in preparation for local computation phase.
*
*
*       Send elements of solution to next proc
*
            IF( MYCOL.LT.NPCOL-1 ) THEN
*
               CALL SGESD2D( ICTXT, BW, NRHS,
     $                       B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0,
     $                       MYCOL+1 )
*
            END IF
*
*       Receive modifications to processor's right hand sides
*
            IF( MYCOL.GT.0 ) THEN
*
               CALL SGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0,
     $                       MYCOL-1 )
*
            END IF
*
*
*
**********************************************
*       Local computation phase
**********************************************
*
            IF( MYCOL.NE.0 ) THEN
*         Use the "spike" fillin to calculate contribution from previous
*           processor's solution.
*
               CALL SGEMM( 'N', 'N', ODD_SIZE, NRHS, BW, -ONE, AF( 1 ),
     $                     ODD_SIZE, WORK( 1+BW-BW ), BW, ONE,
     $                     B( PART_OFFSET+1 ), LLDB )
*
            END IF
*
*
            IF( MYCOL.LT.NP-1 ) THEN
*         Use factorization of odd-even connection block to modify
*           locally stored portion of right hand side(s)
*
*
*         First copy and multiply it into temporary storage,
*           then use it on RHS
*
               CALL SLAMOV( 'N', BW, NRHS, B( PART_OFFSET+ODD_SIZE+1 ),
     $                      LLDB, WORK( 1+BW-BW ), BW )
*
               CALL STRMM( 'L', 'U', 'T', 'N', BW, NRHS, -ONE,
     $                     A( ( OFST+( BW+1 )+( ODD_SIZE-BW )*LLDA ) ),
     $                     LLDA-1, WORK( 1+BW-BW ), BW )
*
               CALL SMATADD( BW, NRHS, ONE, WORK( 1+BW-BW ), BW, ONE,
     $                       B( PART_OFFSET+ODD_SIZE-BW+1 ), LLDB )
*
            END IF
*
*       Use main partition in each processor to solve locally
*
            CALL STBTRS( UPLO, 'T', 'N', ODD_SIZE, BW, NRHS,
     $                   A( OFST+1 ), LLDA, B( PART_OFFSET+1 ), LLDB,
     $                   INFO )
*
         END IF
*     End of "IF( LSAME( TRANS, 'N' ) )"...
*
*
      ELSE
***************************************************************
*     CASE UPLO = 'U'                                         *
***************************************************************
         IF( LSAME( TRANS, 'T' ) ) THEN
*
*        Frontsolve
*
*
******************************************
*       Local computation phase
******************************************
*
*       Use main partition in each processor to solve locally
*
            CALL STBTRS( UPLO, 'T', 'N', ODD_SIZE, BW, NRHS,
     $                   A( OFST+1 ), LLDA, B( PART_OFFSET+1 ), LLDB,
     $                   INFO )
*
*
            IF( MYCOL.LT.NP-1 ) THEN
*         Use factorization of odd-even connection block to modify
*           locally stored portion of right hand side(s)
*
*
*           First copy and multiply it into temporary storage,
*             then use it on RHS
*
               CALL SLAMOV( 'N', BW, NRHS,
     $                      B( PART_OFFSET+ODD_SIZE-BW+1 ), LLDB,
     $                      WORK( 1 ), BW )
*
               CALL STRMM( 'L', 'L', 'T', 'N', BW, NRHS, -ONE,
     $                     A( ( OFST+1+ODD_SIZE*LLDA ) ), LLDA-1,
     $                     WORK( 1 ), BW )
*
               CALL SMATADD( BW, NRHS, ONE, WORK( 1 ), BW, ONE,
     $                       B( PART_OFFSET+ODD_SIZE+1 ), LLDB )
*
            END IF
*
*
            IF( MYCOL.NE.0 ) THEN
*         Use the "spike" fillin to calculate contribution to previous
*           processor's right-hand side.
*
               CALL SGEMM( 'T', 'N', BW, NRHS, ODD_SIZE, -ONE, AF( 1 ),
     $                     ODD_SIZE, B( PART_OFFSET+1 ), LLDB, ZERO,
     $                     WORK( 1+BW-BW ), BW )
            END IF
*
*
************************************************
*       Formation and solution of reduced system
************************************************
*
*
*       Send modifications to prior processor's right hand sides
*
            IF( MYCOL.GT.0 ) THEN
*
               CALL SGESD2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0,
     $                       MYCOL-1 )
*
            END IF
*
*       Receive modifications to processor's right hand sides
*
            IF( MYCOL.LT.NPCOL-1 ) THEN
*
               CALL SGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0,
     $                       MYCOL+1 )
*
*         Combine contribution to locally stored right hand sides
*
               CALL SMATADD( BW, NRHS, ONE, WORK( 1 ), BW, ONE,
     $                       B( PART_OFFSET+ODD_SIZE+1 ), LLDB )
*
            END IF
*
*
*       The last processor does not participate in the solution of the
*       reduced system, having sent its contribution already.
            IF( MYCOL.EQ.NPCOL-1 ) THEN
               GO TO 110
            END IF
*
*
*       *************************************
*       Modification Loop
*
*       The distance for sending and receiving for each level starts
*         at 1 for the first level.
            LEVEL_DIST = 1
*
*       Do until this proc is needed to modify other procs' equations
*
   90       CONTINUE
            IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 )
     $         GO TO 100
*
*         Receive and add contribution to right-hand sides from left
*
            IF( MYCOL-LEVEL_DIST.GE.0 ) THEN
*
               CALL SGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0,
     $                       MYCOL-LEVEL_DIST )
*
               CALL SMATADD( BW, NRHS, ONE, WORK( 1 ), BW, ONE,
     $                       B( PART_OFFSET+ODD_SIZE+1 ), LLDB )
*
            END IF
*
*         Receive and add contribution to right-hand sides from right
*
            IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN
*
               CALL SGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0,
     $                       MYCOL+LEVEL_DIST )
*
               CALL SMATADD( BW, NRHS, ONE, WORK( 1 ), BW, ONE,
     $                       B( PART_OFFSET+ODD_SIZE+1 ), LLDB )
*
            END IF
*
            LEVEL_DIST = LEVEL_DIST*2
*
            GO TO 90
  100       CONTINUE
*       [End of GOTO Loop]
*
*
*
*       *********************************
*       Calculate and use this proc's blocks to modify other procs
*
*       Solve with diagonal block
*
            CALL STRTRS( 'L', 'N', 'N', BW, NRHS,
     $                   AF( ODD_SIZE*BW+MBW2+1 ), BW,
     $                   B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO )
*
            IF( INFO.NE.0 ) THEN
               GO TO 170
            END IF
*
*
*
*       *********
            IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN
*
*         Calculate contribution from this block to next diagonal block
*
               CALL SGEMM( 'T', 'N', BW, NRHS, BW, -ONE,
     $                     AF( ( ODD_SIZE )*BW+1 ), BW,
     $                     B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO,
     $                     WORK( 1 ), BW )
*
*         Send contribution to diagonal block's owning processor.
*
               CALL SGESD2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0,
     $                       MYCOL+LEVEL_DIST )
*
            END IF
*       End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
*
*       ************
            IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND.
     $          ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) )
     $           THEN
*
*
*         Use offdiagonal block to calculate modification to diag block
*           of processor to the left
*
               CALL SGEMM( 'N', 'N', BW, NRHS, BW, -ONE,
     $                     AF( ODD_SIZE*BW+2*MBW2+1 ), BW,
     $                     B( PART_OFFSET+ODD_SIZE+1 ), LLDB, ZERO,
     $                     WORK( 1 ), BW )
*
*         Send contribution to diagonal block's owning processor.
*
               CALL SGESD2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0,
     $                       MYCOL-LEVEL_DIST )
*
            END IF
*       End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
*
  110       CONTINUE
*
         ELSE
*
******************** BACKSOLVE *************************************
*
********************************************************************
*     .. Begin reduced system phase of algorithm ..
********************************************************************
*
*
*
*       The last processor does not participate in the solution of the
*       reduced system and just waits to receive its solution.
            IF( MYCOL.EQ.NPCOL-1 ) THEN
               GO TO 160
            END IF
*
*       Determine number of steps in tree loop
*
            LEVEL_DIST = 1
  120       CONTINUE
            IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 )
     $         GO TO 130
*
            LEVEL_DIST = LEVEL_DIST*2
*
            GO TO 120
  130       CONTINUE
*
*
            IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND.
     $          ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) )
     $           THEN
*
*         Receive solution from processor to left
*
               CALL SGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0,
     $                       MYCOL-LEVEL_DIST )
*
*
*         Use offdiagonal block to calculate modification to RHS stored
*           on this processor
*
               CALL SGEMM( 'T', 'N', BW, NRHS, BW, -ONE,
     $                     AF( ODD_SIZE*BW+2*MBW2+1 ), BW, WORK( 1 ),
     $                     BW, ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB )
            END IF
*       End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
*
*
            IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN
*
*         Receive solution from processor to right
*
               CALL SGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0,
     $                       MYCOL+LEVEL_DIST )
*
*         Calculate contribution from this block to next diagonal block
*
               CALL SGEMM( 'N', 'N', BW, NRHS, BW, -ONE,
     $                     AF( ( ODD_SIZE )*BW+1 ), BW, WORK( 1 ), BW,
     $                     ONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB )
*
            END IF
*       End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
*
*
*       Solve with diagonal block
*
            CALL STRTRS( 'L', 'T', 'N', BW, NRHS,
     $                   AF( ODD_SIZE*BW+MBW2+1 ), BW,
     $                   B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO )
*
            IF( INFO.NE.0 ) THEN
               GO TO 170
            END IF
*
*
*
***Modification Loop *******
*
  140       CONTINUE
            IF( LEVEL_DIST.EQ.1 )
     $         GO TO 150
*
            LEVEL_DIST = LEVEL_DIST / 2
*
*         Send solution to the right
*
            IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN
*
               CALL SGESD2D( ICTXT, BW, NRHS,
     $                       B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0,
     $                       MYCOL+LEVEL_DIST )
*
            END IF
*
*         Send solution to left
*
            IF( MYCOL-LEVEL_DIST.GE.0 ) THEN
*
               CALL SGESD2D( ICTXT, BW, NRHS,
     $                       B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0,
     $                       MYCOL-LEVEL_DIST )
*
            END IF
*
            GO TO 140
  150       CONTINUE
*       [End of GOTO Loop]
*
  160       CONTINUE
*          [Processor npcol - 1 jumped to here to await next stage]
*
*******************************
*       Reduced system has been solved, communicate solutions to nearest
*         neighbors in preparation for local computation phase.
*
*
*       Send elements of solution to next proc
*
            IF( MYCOL.LT.NPCOL-1 ) THEN
*
               CALL SGESD2D( ICTXT, BW, NRHS,
     $                       B( PART_OFFSET+ODD_SIZE+1 ), LLDB, 0,
     $                       MYCOL+1 )
*
            END IF
*
*       Receive modifications to processor's right hand sides
*
            IF( MYCOL.GT.0 ) THEN
*
               CALL SGERV2D( ICTXT, BW, NRHS, WORK( 1 ), BW, 0,
     $                       MYCOL-1 )
*
            END IF
*
*
*
**********************************************
*       Local computation phase
**********************************************
*
            IF( MYCOL.NE.0 ) THEN
*         Use the "spike" fillin to calculate contribution from previous
*           processor's solution.
*
               CALL SGEMM( 'N', 'N', ODD_SIZE, NRHS, BW, -ONE, AF( 1 ),
     $                     ODD_SIZE, WORK( 1+BW-BW ), BW, ONE,
     $                     B( PART_OFFSET+1 ), LLDB )
*
            END IF
*
*
            IF( MYCOL.LT.NP-1 ) THEN
*         Use factorization of odd-even connection block to modify
*           locally stored portion of right hand side(s)
*
*
*         First copy and multiply it into temporary storage,
*           then use it on RHS
*
               CALL SLAMOV( 'N', BW, NRHS, B( PART_OFFSET+ODD_SIZE+1 ),
     $                      LLDB, WORK( 1+BW-BW ), BW )
*
               CALL STRMM( 'L', 'L', 'N', 'N', BW, NRHS, -ONE,
     $                     A( ( OFST+1+ODD_SIZE*LLDA ) ), LLDA-1,
     $                     WORK( 1+BW-BW ), BW )
*
               CALL SMATADD( BW, NRHS, ONE, WORK( 1+BW-BW ), BW, ONE,
     $                       B( PART_OFFSET+ODD_SIZE-BW+1 ), LLDB )
*
            END IF
*
*       Use main partition in each processor to solve locally
*
            CALL STBTRS( UPLO, 'N', 'N', ODD_SIZE, BW, NRHS,
     $                   A( OFST+1 ), LLDA, B( PART_OFFSET+1 ), LLDB,
     $                   INFO )
*
         END IF
*     End of "IF( LSAME( TRANS, 'T' ) )"...
*
*
      END IF
*     End of "IF( LSAME( UPLO, 'L' ) )"...
  170 CONTINUE
*
*
*     Free BLACS space used to hold standard-form grid.
*
      IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN
         CALL BLACS_GRIDEXIT( ICTXT_NEW )
      END IF
*
  180 CONTINUE
*
*     Restore saved input parameters
*
      ICTXT = ICTXT_SAVE
      NP = NP_SAVE
*
*     Output minimum worksize
*
      WORK( 1 ) = WORK_SIZE_MIN
*
*
      RETURN
*
*     End of PSPBTRSV
*
      END