1      SUBROUTINE PSDTTRF( N, DL, D, DU, JA, DESCA, AF, LAF, WORK, LWORK,
2     $                    INFO )
3*
4*  -- ScaLAPACK routine (version 1.7) --
5*     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
6*     and University of California, Berkeley.
7*     April 3, 2000
8*
9*     .. Scalar Arguments ..
10      INTEGER            INFO, JA, LAF, LWORK, N
11*     ..
12*     .. Array Arguments ..
13      INTEGER            DESCA( * )
14      REAL               AF( * ), D( * ), DL( * ), DU( * ), WORK( * )
15*     ..
16*
17*
18*  Purpose
19*  =======
20*
21*  PSDTTRF computes an LU factorization
22*  of an N-by-N real tridiagonal
23*  diagonally dominant-like distributed matrix
24*  A(1:N, JA:JA+N-1).
25*  Reordering is used to increase parallelism in the factorization.
26*  This reordering results in factors that are DIFFERENT from those
27*  produced by equivalent sequential codes. These factors cannot
28*  be used directly by users; however, they can be used in
29*  subsequent calls to PSDTTRS to solve linear systems.
30*
31*  The factorization has the form
32*
33*          P A(1:N, JA:JA+N-1) P^T = L U
34*
35*  where U is a tridiagonal upper triangular matrix, L is a tridiagonal
36*  lower triangular matrix, and P is a permutation matrix.
37*
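*  As an illustration (a sketch only; variable names are hypothetical),
*  a typical use is to factor once with PSDTTRF and then solve one or
*  more systems with the companion routine PSDTTRS, passing it the
*  same DL, D, DU, JA, DESCA and the unmodified AF:
*
*        CALL PSDTTRF( N, DL, D, DU, JA, DESCA, AF, LAF, WORK,
*     $                LWORK, INFO )
*        IF( INFO.EQ.0 ) THEN
*           ...   solve with PSDTTRS using DL, D, DU and AF   ...
*        END IF
*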
38*  =====================================================================
39*
40*  Arguments
41*  =========
42*
43*
44*  N       (global input) INTEGER
45*          The number of rows and columns to be operated on, i.e. the
46*          order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
47*
48*  DL      (local input/local output) REAL pointer to local
49*          part of global vector storing the lower diagonal of the
50*          matrix. Globally, DL(1) is not referenced, and DL must be
51*          aligned with D.
52*          Must be of size >= DESCA( NB_ ).
53*          On exit, this array contains information on the
54*            factors of the matrix.
55*
56*  D       (local input/local output) REAL pointer to local
57*          part of global vector storing the main diagonal of the
58*          matrix.
59*          On exit, this array contains information on the
60*            factors of the matrix.
61*          Must be of size >= DESCA( NB_ ).
62*
63*  DU      (local input/local output) REAL pointer to local
64*          part of global vector storing the upper diagonal of the
65*          matrix. Globally, DU(N) is not referenced, and DU must be
66*          aligned with D.
67*          On exit, this array contains information on the
68*            factors of the matrix.
69*          Must be of size >= DESCA( NB_ ).
70*
71*  JA      (global input) INTEGER
72*          The index in the global array A that points to the start of
73*          the matrix to be operated on (which may be either all of A
74*          or a submatrix of A).
75*
76*  DESCA   (global and local input) INTEGER array of dimension DLEN.
77*          if 1D type (DTYPE_A=501 or 502), DLEN >= 7;
78*          if 2D type (DTYPE_A=1), DLEN >= 9.
79*          The array descriptor for the distributed matrix A.
80*          Contains information of mapping of A to memory. Please
81*          see NOTES below for full description and options.
82*
83*  AF      (local output) REAL array, dimension LAF.
84*          Auxiliary Fillin Space.
85*          Fillin is created during the factorization routine
86*          PSDTTRF and is stored in AF. If a linear system
87*          is to be solved using PSDTTRS after the factorization
88*          routine, AF *must not be altered* after the factorization.
89*
90*  LAF     (local input) INTEGER
91*          Size of user-input Auxiliary Fillin space AF. Must be >=
92*          12*NPCOL+3*NB (the minimum checked by this routine).
93*          If LAF is not large enough, an error code will be returned
94*          and the minimum acceptable size will be returned in AF( 1 ).
95*
96*  WORK    (local workspace/local output)
97*          REAL temporary workspace. This space may
98*          be overwritten between calls to routines. WORK must be
99*          of the size given in LWORK.
100*          On exit, WORK( 1 ) contains the minimal LWORK.
101*
102*  LWORK   (local input or global input) INTEGER
103*          Size of user-input workspace WORK.
104*          If LWORK is too small, the minimal acceptable size will be
105*          returned in WORK( 1 ) and an error code is returned.
106*          LWORK >= 8*NPCOL.
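*
*          A minimal workspace-query sketch, based on the LWORK = -1
*          handling in this routine (assumption: LAF already meets its
*          minimum, since the AF size check is performed before the
*          workspace check; MINWRK is a hypothetical caller variable):
*
*            CALL PSDTTRF( N, DL, D, DU, JA, DESCA, AF, LAF, WORK,
*     $                    -1, INFO )
*            MINWRK = INT( WORK( 1 ) )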
107*
108*  INFO    (local output) INTEGER
109*          = 0:  successful exit
110*          < 0:  If the i-th argument is an array and the j-th entry
111*                had an illegal value, then INFO = -(i*100+j); if the i-th
112*                argument is a scalar and had an illegal value, then
113*                INFO = -i.
114*          > 0:  If INFO = K<=NPROCS, the submatrix stored on processor
115*                INFO and factored locally was not
116*                diagonally dominant-like,  and
117*                the factorization was not completed.
118*                If INFO = K>NPROCS, the submatrix stored on processor
119*                INFO-NPROCS representing interactions with other
120*                processors was not
121*                stably factorable without interchanges,
122*                and the factorization was not completed.
123*
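*  For negative INFO, the encoding above can be decoded as in the
*  following sketch (IARG and IENTRY are hypothetical names):
*
*        IF( INFO.LT.-100 ) THEN
*           IARG   = -INFO / 100
*           IENTRY = MOD( -INFO, 100 )
*        ELSE IF( INFO.LT.0 ) THEN
*           IARG   = -INFO
*        END IF
*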
124*  =====================================================================
125*
126*
127*  Restrictions
128*  ============
129*
130*  The following are restrictions on the input parameters. Some of these
131*    are temporary and will be removed in future releases, while others
132*    may reflect fundamental technical limitations.
133*
134*    Non-cyclic restriction: VERY IMPORTANT!
135*      P*NB>= mod(JA-1,NB)+N.
136*      The mapping for matrices must be blocked, reflecting the nature
137*      of the divide and conquer algorithm as a task-parallel algorithm.
138*      In words: no processor may have more than one
139*      chunk of the matrix (a check is sketched after this list).
140*
141*    Blocksize cannot be too small:
142*      If the matrix spans more than one processor, the following
143*      restriction on NB, the size of each block on each processor,
144*      must hold:
145*      NB >= 2
146*      The bulk of parallel computation is done on the matrix of size
147*      O(NB) on each processor. If this is too small, divide and conquer
148*      is a poor choice of algorithm.
149*
150*    Submatrix reference:
151*      JA = IB
152*      Alignment restriction that prevents unnecessary communication.
153*
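*  The non-cyclic restriction above can be verified by the caller
*  before the call; a minimal sketch (P and NB are the process count
*  and block size, hypothetical caller variables):
*
*        IF( P*NB.LT.MOD( JA-1, NB )+N ) THEN
*           WRITE( *, * ) 'PSDTTRF restriction: only 1 block per proc'
*        END IF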
154*
155*  =====================================================================
156*
157*
158*  Notes
159*  =====
160*
161*  If the factorization routine and the solve routine are to be called
162*    separately (to solve various sets of righthand sides using the same
163*    coefficient matrix), the auxiliary space AF *must not be altered*
164*    between calls to the factorization routine and the solve routine.
165*
166*  The best algorithm for solving banded and tridiagonal linear systems
167*    depends on a variety of parameters, especially the bandwidth.
168*    Currently, only algorithms designed for the case N/P >> bw are
169*    implemented. These go by many names, including Divide and Conquer,
170*    Partitioning, domain decomposition-type, etc.
171*    For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C
172*    algorithms are the appropriate choice.
173*
174*  Algorithm description: Divide and Conquer
175*
176*    The Divide and Conquer algorithm assumes the matrix is narrowly
177*      banded compared with the number of equations. In this situation,
178*      it is best to distribute the input matrix A one-dimensionally,
179*      with columns atomic and rows divided amongst the processes.
180*      The basic algorithm divides the tridiagonal matrix up into
181*      P pieces with one stored on each processor,
182*      and then proceeds in 2 phases for the factorization or 3 for the
183*      solution of a linear system.
184*      1) Local Phase:
185*         The individual pieces are factored independently and in
186*         parallel. These factors are applied to the matrix creating
187*         fillin, which is stored in a non-inspectable way in auxiliary
188*         space AF. Mathematically, this is equivalent to reordering
189*         the matrix A as P A P^T and then factoring the principal
190*         leading submatrix of size equal to the sum of the sizes of
191*         the matrices factored on each processor. The factors of
192*         these submatrices overwrite the corresponding parts of A
193*         in memory.
194*      2) Reduced System Phase:
195*         A small system of order (P-1) is formed, representing the
196*         interaction of the larger blocks, and is stored (as are its
197*         factors) in the space AF. A parallel Block Cyclic Reduction
198*         algorithm is used. For a linear system, a parallel front solve
199*         followed by an analogous backsolve, both using the structure
200*         of the factored matrix, are performed.
201*      3) Backsubstitution Phase:
202*         For a linear system, a local backsubstitution is performed on
203*         each processor in parallel.
204*
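*    Worked size example (numbers for illustration only): with N = 1000
*      equations, P = 4 processes and NB = 250, each process factors a
*      local block of roughly NB equations in the Local Phase, and the
*      Reduced System Phase then factors a system of order P-1 = 3 by
*      block cyclic reduction.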
205*
206*  Descriptors
207*  ===========
208*
209*  Descriptors now have *types* and differ from ScaLAPACK 1.0.
210*
211*  Note: tridiagonal codes can use either the old two dimensional
212*    or new one-dimensional descriptors, though the processor grid in
213*    both cases *must be one-dimensional*. We describe both types below.
214*
215*  Each global data object is described by an associated description
216*  vector.  This vector stores the information required to establish
217*  the mapping between an object element and its corresponding process
218*  and memory location.
219*
220*  Let A be a generic term for any 2D block cyclically distributed array.
221*  Such a global array has an associated description vector DESCA.
222*  In the following comments, the character _ should be read as
223*  "of the global array".
224*
225*  NOTATION        STORED IN      EXPLANATION
226*  --------------- -------------- --------------------------------------
227*  DTYPE_A(global) DESCA( DTYPE_ )The descriptor type.  In this case,
228*                                 DTYPE_A = 1.
229*  CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
230*                                 the BLACS process grid A is distribu-
231*                                 ted over. The context itself is glo-
232*                                 bal, but the handle (the integer
233*                                 value) may vary.
234*  M_A    (global) DESCA( M_ )    The number of rows in the global
235*                                 array A.
236*  N_A    (global) DESCA( N_ )    The number of columns in the global
237*                                 array A.
238*  MB_A   (global) DESCA( MB_ )   The blocking factor used to distribute
239*                                 the rows of the array.
240*  NB_A   (global) DESCA( NB_ )   The blocking factor used to distribute
241*                                 the columns of the array.
242*  RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
243*                                 row of the array A is distributed.
244*  CSRC_A (global) DESCA( CSRC_ ) The process column over which the
245*                                 first column of the array A is
246*                                 distributed.
247*  LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local
248*                                 array.  LLD_A >= MAX(1,LOCr(M_A)).
249*
250*  Let K be the number of rows or columns of a distributed matrix,
251*  and assume that its process grid has dimension p x q.
252*  LOCr( K ) denotes the number of elements of K that a process
253*  would receive if K were distributed over the p processes of its
254*  process column.
255*  Similarly, LOCc( K ) denotes the number of elements of K that a
256*  process would receive if K were distributed over the q processes of
257*  its process row.
258*  The values of LOCr() and LOCc() may be determined via a call to the
259*  ScaLAPACK tool function, NUMROC:
260*          LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
261*          LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
262*  An upper bound for these quantities may be computed by:
263*          LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
264*          LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
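*
*  A sketch of how a caller might set up a two-dimensional descriptor
*  and local sizes for the diagonals on a 1 x NPCOL grid, assuming the
*  standard ScaLAPACK tool routines DESCINIT and NUMROC (variable
*  names are hypothetical):
*
*        CALL DESCINIT( DESCA, 1, N, 1, NB, 0, 0, ICTXT, 1, INFO )
*        NP0 = NUMROC( N, NB, MYCOL, 0, NPCOL )
*
*  NP0 is then the number of entries of DL, D and DU stored locally.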
265*
266*
267*  One-dimensional descriptors:
268*
269*  One-dimensional descriptors are a new addition to ScaLAPACK since
270*    version 1.0. They simplify and shorten the descriptor for 1D
271*    arrays.
272*
273*  Since ScaLAPACK supports two-dimensional arrays as the fundamental
274*    object, we allow 1D arrays to be distributed either over the
275*    first dimension of the array (as if the grid were P-by-1) or the
276*    2nd dimension (as if the grid were 1-by-P). This choice is
277*    indicated by the descriptor type (501 or 502)
278*    as described below.
279*    However, for tridiagonal matrices, since the objects being
280*    distributed are the individual vectors storing the diagonals, we
281*    have adopted the convention that both the P-by-1 descriptor and
282*    the 1-by-P descriptor are allowed and are equivalent for
283*    tridiagonal matrices. Thus, for tridiagonal matrices,
284*    DTYPE_A = 501 or 502 can be used interchangeably
285*    without any other change.
286*  We require that the distributed vectors storing the diagonals of a
287*    tridiagonal matrix be aligned with each other. Because of this, a
288*    single descriptor, DESCA, serves to describe the distribution
289*    of all diagonals simultaneously.
290*
291*    IMPORTANT NOTE: the actual BLACS grid represented by the
292*    CTXT entry in the descriptor may be *either*  P-by-1 or 1-by-P
293*    irrespective of which one-dimensional descriptor type
294*    (501 or 502) is input.
295*    This routine will interpret the grid properly either way.
296*    ScaLAPACK routines *do not support intercontext operations* so that
297*    the grid passed to a single ScaLAPACK routine *must be the same*
298*    for all array descriptors passed to that routine.
299*
300*    NOTE: In all cases where 1D descriptors are used, 2D descriptors
301*    may also be used, since a one-dimensional array is a special case
302*    of a two-dimensional array with one dimension of size unity.
303*    The two-dimensional array used in this case *must* be of the
304*    proper orientation:
305*      If the appropriate one-dimensional descriptor is DTYPE_A=501
306*      (1 by P type), then the two dimensional descriptor must
307*      have a CTXT value that refers to a 1 by P BLACS grid;
308*      If the appropriate one-dimensional descriptor is DTYPE_A=502
309*      (P by 1 type), then the two dimensional descriptor must
310*      have a CTXT value that refers to a P by 1 BLACS grid.
311*
312*
313*  Summary of allowed descriptors, types, and BLACS grids:
314*  DTYPE           501         502         1         1
315*  BLACS grid      1xP or Px1  1xP or Px1  1xP       Px1
316*  -----------------------------------------------------
317*  A               OK          OK          OK        NO
318*  B               NO          OK          NO        OK
319*
320*  Let A be a generic term for any 1D block cyclically distributed array.
321*  Such a global array has an associated description vector DESCA.
322*  In the following comments, the character _ should be read as
323*  "of the global array".
324*
325*  NOTATION        STORED IN  EXPLANATION
326*  --------------- ---------- ------------------------------------------
327*  DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
328*                                DTYPE_A = 501: 1-by-P grid.
329*                                DTYPE_A = 502: P-by-1 grid.
330*  CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
331*                                the BLACS process grid A is distribu-
332*                                ted over. The context itself is glo-
333*                                bal, but the handle (the integer
334*                                value) may vary.
335*  N_A    (global) DESCA( 3 ) The size of the array dimension being
336*                                distributed.
337*  NB_A   (global) DESCA( 4 ) The blocking factor used to distribute
338*                                the distributed dimension of the array.
339*  SRC_A  (global) DESCA( 5 ) The process row or column over which the
340*                                first row or column of the array
341*                                is distributed.
342*  Ignored         DESCA( 6 ) Ignored for tridiagonal matrices.
343*  Reserved        DESCA( 7 ) Reserved for future use.
344*
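*  A sketch of filling a type 501 one-dimensional descriptor by hand,
*  following the table above (ICTXT, N and NB are hypothetical caller
*  variables; entries 6 and 7 are ignored/reserved here and are simply
*  set to zero):
*
*        DESCA( 1 ) = 501
*        DESCA( 2 ) = ICTXT
*        DESCA( 3 ) = N
*        DESCA( 4 ) = NB
*        DESCA( 5 ) = 0
*        DESCA( 6 ) = 0
*        DESCA( 7 ) = 0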
345*
346*
347*  =====================================================================
348*
349*  Code Developer: Andrew J. Cleary, University of Tennessee.
350*    Current address: Lawrence Livermore National Labs.
351*
352*  =====================================================================
353*
354*     .. Parameters ..
355      REAL               ONE
356      PARAMETER          ( ONE = 1.0E+0 )
357      REAL               ZERO
358      PARAMETER          ( ZERO = 0.0E+0 )
359      INTEGER            INT_ONE
360      PARAMETER          ( INT_ONE = 1 )
361      INTEGER            DESCMULT, BIGNUM
362      PARAMETER          ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT )
363      INTEGER            BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
364     $                   LLD_, MB_, M_, NB_, N_, RSRC_
365      PARAMETER          ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1,
366     $                     CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6,
367     $                     RSRC_ = 7, CSRC_ = 8, LLD_ = 9 )
368*     ..
369*     .. Local Scalars ..
370      INTEGER            COMM_PROC, CSRC, FIRST_PROC, I, ICTXT,
371     $                   ICTXT_NEW, ICTXT_SAVE, IDUM3, JA_NEW, LAF_MIN,
372     $                   LEVEL_DIST, LLDA, MYCOL, MYROW, MY_NUM_COLS,
373     $                   NB, NP, NPCOL, NPROW, NP_SAVE, ODD_SIZE,
374     $                   PART_OFFSET, PART_SIZE, RETURN_CODE, STORE_N_A,
375     $                   TEMP, WORK_SIZE_MIN, WORK_U
376*     ..
377*     .. Local Arrays ..
378      INTEGER            DESCA_1XP( 7 ), PARAM_CHECK( 7, 3 )
379*     ..
380*     .. External Subroutines ..
381      EXTERNAL           BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT,
382     $                   GLOBCHK, IGAMX2D, IGEBR2D, IGEBS2D, PXERBLA,
383     $                   RESHAPE, SDTTRF, SDTTRSV, SGERV2D, SGESD2D,
384     $                   STRRV2D, STRSD2D
385*     ..
386*     .. External Functions ..
387      INTEGER            NUMROC
388      REAL               SDOT
389      EXTERNAL           NUMROC, SDOT
390*     ..
391*     .. Intrinsic Functions ..
392      INTRINSIC          MOD, REAL
393*     ..
394*     .. Executable Statements ..
395*
396*     Test the input parameters
397*
398      INFO = 0
399*
400*     Convert descriptor into standard form for easy access to
401*        parameters, check that grid is of right shape.
402*
403      DESCA_1XP( 1 ) = 501
404*
405      TEMP = DESCA( DTYPE_ )
406      IF( TEMP.EQ.502 ) THEN
407*        Temporarily set the descriptor type to 1xP type
408         DESCA( DTYPE_ ) = 501
409      END IF
410*
411      CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE )
412*
413      DESCA( DTYPE_ ) = TEMP
414*
415      IF( RETURN_CODE.NE.0 ) THEN
416         INFO = -( 6*100+2 )
417      END IF
418*
419*     Get values out of descriptor for use in code.
420*
421      ICTXT = DESCA_1XP( 2 )
422      CSRC = DESCA_1XP( 5 )
423      NB = DESCA_1XP( 4 )
424      LLDA = DESCA_1XP( 6 )
425      STORE_N_A = DESCA_1XP( 3 )
426*
427*     Get grid parameters
428*
429*
430      CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
431      NP = NPROW*NPCOL
432*
433*
434*
435      IF( LWORK.LT.-1 ) THEN
436         INFO = -10
437      ELSE IF( LWORK.EQ.-1 ) THEN
438         IDUM3 = -1
439      ELSE
440         IDUM3 = 1
441      END IF
442*
443      IF( N.LT.0 ) THEN
444         INFO = -1
445      END IF
446*
447      IF( N+JA-1.GT.STORE_N_A ) THEN
448         INFO = -( 6*100+6 )
449      END IF
450*
451*     Argument checking that is specific to Divide & Conquer routine
452*
453      IF( NPROW.NE.1 ) THEN
454         INFO = -( 6*100+2 )
455      END IF
456*
457      IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN
458         INFO = -( 1 )
459         CALL PXERBLA( ICTXT, 'PSDTTRF, D&C alg.: only 1 block per proc'
460     $                 , -INFO )
461         RETURN
462      END IF
463*
464      IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*INT_ONE ) ) THEN
465         INFO = -( 6*100+4 )
466         CALL PXERBLA( ICTXT, 'PSDTTRF, D&C alg.: NB too small', -INFO )
467         RETURN
468      END IF
469*
470*
471*     Check auxiliary storage size
472*
473      LAF_MIN = ( 12*NPCOL+3*NB )
474*
475      IF( LAF.LT.LAF_MIN ) THEN
476         INFO = -8
477*        put minimum value of laf into AF( 1 )
478         AF( 1 ) = LAF_MIN
479         CALL PXERBLA( ICTXT, 'PSDTTRF: auxiliary storage error ',
480     $                 -INFO )
481         RETURN
482      END IF
483*
484*     Check worksize
485*
486      WORK_SIZE_MIN = 8*NPCOL
487*
488      WORK( 1 ) = WORK_SIZE_MIN
489*
490      IF( LWORK.LT.WORK_SIZE_MIN ) THEN
491         IF( LWORK.NE.-1 ) THEN
492            INFO = -10
493            CALL PXERBLA( ICTXT, 'PSDTTRF: worksize error ', -INFO )
494         END IF
495         RETURN
496      END IF
497*
498*     Pack params and positions into arrays for global consistency check
499*
500      PARAM_CHECK( 7, 1 ) = DESCA( 5 )
501      PARAM_CHECK( 6, 1 ) = DESCA( 4 )
502      PARAM_CHECK( 5, 1 ) = DESCA( 3 )
503      PARAM_CHECK( 4, 1 ) = DESCA( 1 )
504      PARAM_CHECK( 3, 1 ) = JA
505      PARAM_CHECK( 2, 1 ) = N
506      PARAM_CHECK( 1, 1 ) = IDUM3
507*
508      PARAM_CHECK( 7, 2 ) = 605
509      PARAM_CHECK( 6, 2 ) = 604
510      PARAM_CHECK( 5, 2 ) = 603
511      PARAM_CHECK( 4, 2 ) = 601
512      PARAM_CHECK( 3, 2 ) = 5
513      PARAM_CHECK( 2, 2 ) = 1
514      PARAM_CHECK( 1, 2 ) = 10
515*
516*     Want to find errors with MIN( ), so if no error, set it to a big
517*     number. If there already is an error, multiply by the
518*     descriptor multiplier.
519*
520      IF( INFO.GE.0 ) THEN
521         INFO = BIGNUM
522      ELSE IF( INFO.LT.-DESCMULT ) THEN
523         INFO = -INFO
524      ELSE
525         INFO = -INFO*DESCMULT
526      END IF
527*
528*     Check consistency across processors
529*
530      CALL GLOBCHK( ICTXT, 7, PARAM_CHECK, 7, PARAM_CHECK( 1, 3 ),
531     $              INFO )
532*
533*     Prepare output: set info = 0 if no error, and divide by DESCMULT
534*     if error is not in a descriptor entry.
535*
536      IF( INFO.EQ.BIGNUM ) THEN
537         INFO = 0
538      ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN
539         INFO = -INFO / DESCMULT
540      ELSE
541         INFO = -INFO
542      END IF
543*
544      IF( INFO.LT.0 ) THEN
545         CALL PXERBLA( ICTXT, 'PSDTTRF', -INFO )
546         RETURN
547      END IF
548*
549*     Quick return if possible
550*
551      IF( N.EQ.0 )
552     $   RETURN
553*
554*
555*     Adjust addressing into matrix space to reach the beginning
556*        of the relevant local data
557*
558      PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) )
559*
560      IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN
561         PART_OFFSET = PART_OFFSET + NB
562      END IF
563*
564      IF( MYCOL.LT.CSRC ) THEN
565         PART_OFFSET = PART_OFFSET - NB
566      END IF
567*
568*     Form a new BLACS grid (the "standard form" grid) with only procs
569*        holding part of the matrix, of size 1xNP where NP is adjusted,
570*        starting at csrc=0, with JA modified to reflect dropped procs.
571*
572*     First processor to hold part of the matrix:
573*
574      FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL )
575*
576*     Calculate the new JA while dropping off unused processors.
577*
578      JA_NEW = MOD( JA-1, NB ) + 1
579*
580*     Save and compute new value of NP
581*
582      NP_SAVE = NP
583      NP = ( JA_NEW+N-2 ) / NB + 1
584*
585*     Call utility routine that forms "standard-form" grid
586*
587      CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC,
588     $              INT_ONE, NP )
589*
590*     Use new context from standard grid as context.
591*
592      ICTXT_SAVE = ICTXT
593      ICTXT = ICTXT_NEW
594      DESCA_1XP( 2 ) = ICTXT_NEW
595*
596*     Get information about new grid.
597*
598      CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
599*
600*     Drop out processors that do not have part of the matrix.
601*
602      IF( MYROW.LT.0 ) THEN
603         GO TO 70
604      END IF
605*
606*     ********************************
607*     Values reused throughout routine
608*
609*     User-input value of partition size
610*
611      PART_SIZE = NB
612*
613*     Number of columns in each processor
614*
615      MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL )
616*
617*     Offset in columns to beginning of main partition in each proc
618*
619      IF( MYCOL.EQ.0 ) THEN
620         PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE )
621         MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE )
622      END IF
623*
624*     Size of main (or odd) partition in each processor
625*
626      ODD_SIZE = MY_NUM_COLS
627      IF( MYCOL.LT.NP-1 ) THEN
628         ODD_SIZE = ODD_SIZE - INT_ONE
629      END IF
630*
631*     Offset to workspace for Upper triangular factor
632*
633      WORK_U = INT_ONE*ODD_SIZE + 3
634*
635*
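*     Rough map of AF as used below on each process (inferred from the
*       code that follows):
*       AF( 1 : ODD_SIZE )                 "GL" spike fillin
*       AF( ODD_SIZE+1 )                   off-diagonal of reduced system
*       AF( ODD_SIZE+2 )                   diagonal of reduced system
*       AF( ODD_SIZE+3 )                   update E_i / receive buffer
*       AF( WORK_U+1 : WORK_U+ODD_SIZE )   "GU" spike fillin
*       AF( WORK_U+ODD_SIZE+1 ... )        upper-factor counterparts
*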
636*       Zero out space for fillin
637*
638      DO 10 I = 1, LAF_MIN
639         AF( I ) = ZERO
640   10 CONTINUE
641*
642*     Begin main code
643*
644*
645********************************************************************
646*       PHASE 1: Local computation phase.
647********************************************************************
648*
649*
650      IF( MYCOL.LT.NP-1 ) THEN
651*         Transfer last triangle D_i of local matrix to next processor
652*         which needs it to calculate fillin due to factorization of
653*         its main (odd) block A_i.
654*         Overlap the send with the factorization of A_i.
655*
656         CALL STRSD2D( ICTXT, 'U', 'N', 1, 1,
657     $                 DU( PART_OFFSET+ODD_SIZE+1 ), LLDA-1, 0,
658     $                 MYCOL+1 )
659*
660      END IF
661*
662*
663*       Factor main partition A_i = L_i {U_i} in each processor
664*
665      CALL SDTTRF( ODD_SIZE, DL( PART_OFFSET+2 ), D( PART_OFFSET+1 ),
666     $             DU( PART_OFFSET+1 ), INFO )
667*
668      IF( INFO.NE.0 ) THEN
669         INFO = MYCOL + 1
670         GO TO 20
671      END IF
672*
673*
674      IF( MYCOL.LT.NP-1 ) THEN
675*
676*         Apply factorization to lower connection block BL_i
677*         Apply factorization to upper connection block BU_i
678*
679*
680*         Perform the triangular solve {U_i}^T{BL'}_i^T = {BL_i}^T
681*
682*
683         DL( PART_OFFSET+ODD_SIZE+1 ) = ( DL( PART_OFFSET+ODD_SIZE+1 ) )
684     $                                   / ( D( PART_OFFSET+ODD_SIZE ) )
685*
686*
687*         Compute contribution to diagonal block(s) of reduced system.
688*          {C'}_i = {C_i}-{{BL'}_i}{{BU'}_i}
689*
690*
691         D( PART_OFFSET+ODD_SIZE+1 ) = D( PART_OFFSET+ODD_SIZE+1 ) -
692     $                                 DL( PART_OFFSET+ODD_SIZE+1 )*
693     $                                 DU( PART_OFFSET+ODD_SIZE )
694*
695      END IF
696*       End of "if ( MYCOL .lt. NP-1 )..." block
697*
698*
699   20 CONTINUE
700*       If the processor could not locally factor, it jumps here.
701*
702      IF( MYCOL.NE.0 ) THEN
703*
704*         Move entry that causes spike to auxiliary storage
705*
706         AF( WORK_U+1 ) = ( DL( PART_OFFSET+1 ) )
707*
708         IF( INFO.EQ.0 ) THEN
709*
710*         Calculate the "spike" fillin, ${L_i} {{GU}_i} = {DL_i}$ .
711*
712            CALL SDTTRSV( 'L', 'N', ODD_SIZE, INT_ONE,
713     $                    DL( PART_OFFSET+2 ), D( PART_OFFSET+1 ),
714     $                    DU( PART_OFFSET+1 ), AF( WORK_U+1 ), ODD_SIZE,
715     $                    INFO )
716*
717*
718*         Calculate the "spike" fillin, ${U_i}^T {{GL}_i}^T = {DU_i}^T$
719*
720            CALL STRRV2D( ICTXT, 'U', 'N', 1, 1, AF( 1 ), ODD_SIZE, 0,
721     $                    MYCOL-1 )
722*
723            CALL SDTTRSV( 'U', 'T', ODD_SIZE, INT_ONE,
724     $                    DL( PART_OFFSET+2 ), D( PART_OFFSET+1 ),
725     $                    DU( PART_OFFSET+1 ), AF( 1 ), ODD_SIZE, INFO )
726*
727*
728*         Calculate the update block for previous proc, E_i = GL_i{GU_i}
729*
730            AF( ODD_SIZE+3 ) = -ONE*SDOT( ODD_SIZE, AF( 1 ), 1,
731     $                         AF( WORK_U+1 ), 1 )
732*
733*
734*         Initiate send of E_i to previous processor to overlap
735*           with next computation.
736*
737            CALL SGESD2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+3 ),
738     $                    INT_ONE, 0, MYCOL-1 )
739*
740*
741            IF( MYCOL.LT.NP-1 ) THEN
742*
743*           Calculate off-diagonal block(s) of reduced system.
744*           Note: for ease of use in solution of reduced system, store
745*           L's off-diagonal block in transpose form.
746*
747               AF( ODD_SIZE+1 ) = -ONE*( DL( PART_OFFSET+ODD_SIZE+1 )*
748     $                            AF( WORK_U+ODD_SIZE ) )
749*
750*
751               AF( WORK_U+( ODD_SIZE )+1 ) = -ONE*
752     $            DU( PART_OFFSET+ODD_SIZE )*( AF( ODD_SIZE ) )
753*
754            END IF
755*
756         END IF
757*       End of "if ( info.eq.0 ) then"
758*
759      END IF
760*       End of "if ( MYCOL .ne. 0 )..."
761*
762*
763*       Check to make sure no processors have found errors
764*
765      CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, 0,
766     $              0 )
767*
768      IF( MYCOL.EQ.0 ) THEN
769         CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 )
770      ELSE
771         CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 )
772      END IF
773*
774      IF( INFO.NE.0 ) THEN
775         GO TO 60
776      END IF
777*       No errors found, continue
778*
779*
780********************************************************************
781*       PHASE 2: Formation and factorization of Reduced System.
782********************************************************************
783*
784*       Gather up local sections of reduced system
785*
786*
787*     The last processor does not participate in the factorization of
788*       the reduced system, having sent its E_i already.
789      IF( MYCOL.EQ.NPCOL-1 ) THEN
790         GO TO 50
791      END IF
792*
793*       Initiate send of off-diag block(s) to overlap with next part.
794*       Off-diagonal block needed on neighboring processor to start
795*       algorithm.
796*
797      IF( ( MOD( MYCOL+1, 2 ).EQ.0 ) .AND. ( MYCOL.GT.0 ) ) THEN
798*
799         CALL SGESD2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+1 ),
800     $                 INT_ONE, 0, MYCOL-1 )
801*
802         CALL SGESD2D( ICTXT, INT_ONE, INT_ONE, AF( WORK_U+ODD_SIZE+1 ),
803     $                 INT_ONE, 0, MYCOL-1 )
804*
805      END IF
806*
807*       Copy last diagonal block into AF storage for subsequent
808*         operations.
809*
810      AF( ODD_SIZE+2 ) = REAL( D( PART_OFFSET+ODD_SIZE+1 ) )
811*
812*       Receive contribution to diagonal block that is stored on this proc.
813*
814      IF( MYCOL.LT.NPCOL-1 ) THEN
815*
816         CALL SGERV2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+2+1 ),
817     $                 INT_ONE, 0, MYCOL+1 )
818*
819*          Add contribution to diagonal block
820*
821         AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 ) + AF( ODD_SIZE+3 )
822*
823      END IF
824*
825*
826*       *************************************
827*       Modification Loop
828*
829*       The distance for sending and receiving for each level starts
830*         at 1 for the first level.
831      LEVEL_DIST = 1
832*
833*       Do until this proc is needed to modify other procs' equations
834*
835   30 CONTINUE
836      IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 )
837     $   GO TO 40
838*
839*         Receive and add contribution to diagonal block from the left
840*
841      IF( MYCOL-LEVEL_DIST.GE.0 ) THEN
842         CALL SGERV2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, 0,
843     $                 MYCOL-LEVEL_DIST )
844*
845         AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 ) + WORK( 1 )
846*
847      END IF
848*
849*         Receive and add contribution to diagonal block from the right
850*
851      IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN
852         CALL SGERV2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, 0,
853     $                 MYCOL+LEVEL_DIST )
854*
855         AF( ODD_SIZE+2 ) = AF( ODD_SIZE+2 ) + WORK( 1 )
856*
857      END IF
858*
859      LEVEL_DIST = LEVEL_DIST*2
860*
861      GO TO 30
862   40 CONTINUE
863*       [End of GOTO Loop]
864*
865*
866*       *********************************
867*       Calculate and use this proc's blocks to modify other procs'...
868      IF( AF( ODD_SIZE+2 ).EQ.ZERO ) THEN
869         INFO = NPCOL + MYCOL
870      END IF
871*
872*       ****************************************************************
873*       Receive offdiagonal block from processor to right.
874*         If this is the first group of processors, the receive comes
875*         from a different processor than otherwise.
876*
877      IF( LEVEL_DIST.EQ.1 ) THEN
878         COMM_PROC = MYCOL + 1
879*
880*           Move block into the place where it will be expected
881*             for later calculations.
882*
883         AF( WORK_U+ODD_SIZE+3 ) = AF( ODD_SIZE+1 )
884*
885         AF( ODD_SIZE+3 ) = AF( WORK_U+ODD_SIZE+1 )
886*
887      ELSE
888         COMM_PROC = MYCOL + LEVEL_DIST / 2
889      END IF
890*
891      IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN
892*
893         CALL SGERV2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+1 ),
894     $                 INT_ONE, 0, COMM_PROC )
895*
896         CALL SGERV2D( ICTXT, INT_ONE, INT_ONE, AF( WORK_U+ODD_SIZE+1 ),
897     $                 INT_ONE, 0, COMM_PROC )
898*
899         IF( INFO.EQ.0 ) THEN
900*
901*
902*         Modify lower off-diagonal block with diagonal block
903*
904*
905            AF( ODD_SIZE+1 ) = AF( ODD_SIZE+1 ) / ( AF( ODD_SIZE+2 ) )
906*
907         END IF
908*         End of "if ( info.eq.0 ) then"
909*
910*         Calculate contribution from this block to next diagonal block
911*
912         WORK( 1 ) = -ONE*( AF( ODD_SIZE+1 ) )*
913     $               AF( WORK_U+( ODD_SIZE )+1 )
914*
915*         Send contribution to diagonal block's owning processor.
916*
917         CALL SGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, 0,
918     $                 MYCOL+LEVEL_DIST )
919*
920      END IF
921*       End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
922*
923*
924*       ****************************************************************
925*       Receive off-diagonal block from the left and use it to finish
926*         with this processor.
927*
928      IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND.
929     $    ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) THEN
930*
931         IF( LEVEL_DIST.GT.1 ) THEN
932*
933*           Receive offdiagonal block(s) from proc level_dist/2 to the
934*           left
935*
936            CALL SGERV2D( ICTXT, INT_ONE, INT_ONE,
937     $                    AF( WORK_U+ODD_SIZE+2+1 ), INT_ONE, 0,
938     $                    MYCOL-LEVEL_DIST / 2 )
939*
940*           Receive offdiagonal block(s) from proc level_dist/2 to the
941*           left
942*
943            CALL SGERV2D( ICTXT, INT_ONE, INT_ONE, AF( ODD_SIZE+2+1 ),
944     $                    INT_ONE, 0, MYCOL-LEVEL_DIST / 2 )
945*
946         END IF
947*
948*
949         IF( INFO.EQ.0 ) THEN
950*
951*         Use diagonal block(s) to modify this offdiagonal block
952*
953            AF( ODD_SIZE+3 ) = AF( ODD_SIZE+3 ) / ( AF( ODD_SIZE+2 ) )
954*
955*
956         END IF
957*         End of "if( info.eq.0 ) then"
958*
959*         Use offdiag block(s) to calculate modification to diag block
960*           of processor to the left
961*
962         WORK( 1 ) = -ONE*AF( ODD_SIZE+3 )*( AF( WORK_U+ODD_SIZE+3 ) )
963*
964*         Send contribution to diagonal block's owning processor.
965*
966         CALL SGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE, 0,
967     $                 MYCOL-LEVEL_DIST )
968*
969*         *******************************************************
970*
971         IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN
972*
973*           Decide which processor offdiagonal block(s) goes to
974*
975            IF( ( MOD( MYCOL / ( 2*LEVEL_DIST ), 2 ) ).EQ.0 ) THEN
976               COMM_PROC = MYCOL + LEVEL_DIST
977            ELSE
978               COMM_PROC = MYCOL - LEVEL_DIST
979            END IF
980*
981*           Use offdiagonal blocks to calculate offdiag
982*             block to send to neighboring processor. Depending
983*             on circumstances, may need to transpose the matrix.
984*
985            WORK( 1 ) = -ONE*AF( WORK_U+ODD_SIZE+3 )*AF( ODD_SIZE+1 )
986*
987*           Send contribution to offdiagonal block's owning processor.
988*
989            CALL SGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE,
990     $                    0, COMM_PROC )
991*
992            WORK( 1 ) = -ONE*AF( ODD_SIZE+3 )*AF( WORK_U+ODD_SIZE+1 )
993*
994*           Send contribution to offdiagonal block's owning processor.
995*
996            CALL SGESD2D( ICTXT, INT_ONE, INT_ONE, WORK( 1 ), INT_ONE,
997     $                    0, COMM_PROC )
998*
999         END IF
1000*
1001      END IF
1002*       End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1003*
1004   50 CONTINUE
1005*
1006*
1007   60 CONTINUE
1008*
1009*
1010*     Free BLACS space used to hold standard-form grid.
1011*
1012      IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN
1013         CALL BLACS_GRIDEXIT( ICTXT_NEW )
1014      END IF
1015*
1016   70 CONTINUE
1017*
1018*     Restore saved input parameters
1019*
1020      ICTXT = ICTXT_SAVE
1021      NP = NP_SAVE
1022*
1023*     Output minimum worksize
1024*
1025      WORK( 1 ) = WORK_SIZE_MIN
1026*
1027*         Make INFO consistent across processors
1028*
1029      CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, 0,
1030     $              0 )
1031*
1032      IF( MYCOL.EQ.0 ) THEN
1033         CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 )
1034      ELSE
1035         CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 )
1036      END IF
1037*
1038*
1039      RETURN
1040*
1041*     End of PSDTTRF
1042*
1043      END
1044