1      SUBROUTINE PSGEBAL( JOB, N, A, DESCA, ILO, IHI, SCALE, INFO )
2*
3*     Contribution from the Department of Computing Science and HPC2N,
4*     Umea University, Sweden
5*
6*  -- ScaLAPACK computational routine (version 2.0.1) --
7*     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
8*     Univ. of Colorado Denver and University of California, Berkeley.
9*     January, 2012
10*
11      IMPLICIT NONE
12*
13*     .. Scalar Arguments ..
14      CHARACTER          JOB
15      INTEGER            IHI, ILO, INFO, N
16*     ..
17*     .. Array Arguments ..
18      INTEGER            DESCA( * )
19      REAL               A( * ), SCALE( * )
20*     ..
21*
22*  Purpose
23*  =======
24*
25*  PSGEBAL balances a general real matrix A.  This involves, first,
26*  permuting A by a similarity transformation to isolate eigenvalues
27*  in the first 1 to ILO-1 and last IHI+1 to N elements on the
28*  diagonal; and second, applying a diagonal similarity transformation
29*  to rows and columns ILO to IHI to make the rows and columns as
30*  close in norm as possible.  Both steps are optional.
31*
32*  Balancing may reduce the 1-norm of the matrix, and improve the
33*  accuracy of the computed eigenvalues and/or eigenvectors.
34*
35*  Notes
36*  =====
37*
38*  Each global data object is described by an associated description
39*  vector.  This vector stores the information required to establish
40*  the mapping between an object element and its corresponding process
41*  and memory location.
42*
43*  Let A be a generic term for any 2D block cyclicly distributed array.
44*  Such a global array has an associated description vector DESCA.
45*  In the following comments, the character _ should be read as
46*  "of the global array".
47*
48*  NOTATION        STORED IN      EXPLANATION
49*  --------------- -------------- --------------------------------------
50*  DTYPE_A(global) DESCA( DTYPE_ )The descriptor type.  In this case,
51*                                 DTYPE_A = 1.
52*  CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
53*                                 the BLACS process grid A is distribu-
54*                                 ted over. The context itself is glo-
55*                                 bal, but the handle (the integer
56*                                 value) may vary.
57*  M_A    (global) DESCA( M_ )    The number of rows in the global
58*                                 array A.
59*  N_A    (global) DESCA( N_ )    The number of columns in the global
60*                                 array A.
61*  MB_A   (global) DESCA( MB_ )   The blocking factor used to distribute
62*                                 the rows of the array.
63*  NB_A   (global) DESCA( NB_ )   The blocking factor used to distribute
64*                                 the columns of the array.
65*  RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
66*                                 row of the array A is distributed.
67*  CSRC_A (global) DESCA( CSRC_ ) The process column over which the
68*                                 first column of the array A is
69*                                 distributed.
70*  LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local
71*                                 array.  LLD_A >= MAX(1,LOCr(M_A)).
72*
73*  Let K be the number of rows or columns of a distributed matrix,
74*  and assume that its process grid has dimension p x q.
75*  LOCr( K ) denotes the number of elements of K that a process
76*  would receive if K were distributed over the p processes of its
77*  process column.
78*  Similarly, LOCc( K ) denotes the number of elements of K that a
79*  process would receive if K were distributed over the q processes of
80*  its process row.
81*  The values of LOCr() and LOCc() may be determined via a call to the
82*  ScaLAPACK tool function, NUMROC:
83*          LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
84*          LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
85*  An upper bound for these quantities may be computed by:
86*          LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
87*          LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
88*
89*
90*  Arguments
91*  =========
92*
93*  JOB     (global input) CHARACTER*1
94*          Specifies the operations to be performed on A:
95*          = 'N':  none:  simply set ILO = 1, IHI = N, SCALE(I) = 1.0
96*                  for i = 1,...,N;
97*          = 'P':  permute only;
98*          = 'S':  scale only;
99*          = 'B':  both permute and scale.
100*
101*  N       (global input) INTEGER
102*          The order of the matrix A.  N >= 0.
103*
104*  A       (local input/output) REAL             array, dimension
105*          (DESCA(LLD_,LOCc(N))
106*          On entry, the input matrix A.
107*          On exit,  A is overwritten by the balanced matrix.
108*          If JOB = 'N', A is not referenced.
109*          See Further Details.
110*
111*  DESCA   (global and local input) INTEGER array of dimension DLEN_.
112*          The array descriptor for the distributed matrix A.
113*
114*  ILO     (global output) INTEGER
115*  IHI     (global output) INTEGER
116*          ILO and IHI are set to integers such that on exit
117*          A(i,j) = 0 if i > j and j = 1,...,ILO-1 or I = IHI+1,...,N.
118*          If JOB = 'N' or 'S', ILO = 1 and IHI = N.
119*
120*  SCALE   (global output) REAL             array, dimension (N)
121*          Details of the permutations and scaling factors applied to
122*          A.  If P(j) is the index of the row and column interchanged
123*          with row and column j and D(j) is the scaling factor
124*          applied to row and column j, then
125*          SCALE(j) = P(j)    for j = 1,...,ILO-1
126*                   = D(j)    for j = ILO,...,IHI
127*                   = P(j)    for j = IHI+1,...,N.
128*          The order in which the interchanges are made is N to IHI+1,
129*          then 1 to ILO-1.
130*
131*  INFO    (global output) INTEGER
132*          = 0:  successful exit.
133*          < 0:  if INFO = -i, the i-th argument had an illegal value.
134*
135*  Further Details
136*  ===============
137*
138*  The permutations consist of row and column interchanges which put
139*  the matrix in the form
140*
141*             ( T1   X   Y  )
142*     P A P = (  0   B   Z  )
143*             (  0   0   T2 )
144*
145*  where T1 and T2 are upper triangular matrices whose eigenvalues lie
146*  along the diagonal.  The column indices ILO and IHI mark the starting
147*  and ending columns of the submatrix B. Balancing consists of applying
148*  a diagonal similarity transformation inv(D) * B * D to make the
149*  1-norms of each row of B and its corresponding column nearly equal.
150*  The output matrix is
151*
152*     ( T1     X*D          Y    )
153*     (  0  inv(D)*B*D  inv(D)*Z ).
154*     (  0      0           T2   )
155*
156*  Information about the permutations P and the diagonal matrix D is
157*  returned in the vector SCALE.
158*
159*  This subroutine is based on the EISPACK routine BALANC. In principle,
160*  the parallelism is extracted by using PBLAS and BLACS routines for
161*  the permutation and balancing.
162*
163*  Modified by Tzu-Yi Chen, Computer Science Division, University of
164*    California at Berkeley, USA
165*
166*  Parallel version by Robert Granat and Meiyue Shao, Department of
167*    Computing Science and HPC2N, Umea University, Sweden
168*
169*  =====================================================================
170*
171*     .. Parameters ..
172      INTEGER            BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
173     $                   LLD_, MB_, M_, NB_, N_, RSRC_
174      PARAMETER          ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1,
175     $                     CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6,
176     $                     RSRC_ = 7, CSRC_ = 8, LLD_ = 9 )
177      REAL               ZERO, ONE
178      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
179      REAL               SCLFAC
180      PARAMETER          ( SCLFAC = 2.0E+0 )
181      REAL               FACTOR
182      PARAMETER          ( FACTOR = 0.95E+0 )
183*     ..
184*     .. Local Scalars ..
185      LOGICAL            NOCONV
186      INTEGER            I, ICA, IEXC, IRA, J, K, L, M, LLDA,
187     $                   ICTXT, NPROW, NPCOL, MYROW, MYCOL, II, JJ,
188     $                   ARSRC, ACSRC
189      REAL               C, CA, F, G, R, RA, S, SFMAX1, SFMAX2, SFMIN1,
190     $                   SFMIN2, ELEM
191*     ..
192*     .. Local Arrays ..
193      REAL               CR( 2 )
194*     ..
195*     .. External Functions ..
196      LOGICAL            SISNAN, LSAME
197      INTEGER            IDAMAX
198      REAL               SLAMCH
199      EXTERNAL           SISNAN, LSAME, SLAMCH
200*     ..
201*     .. External Subroutines ..
202      EXTERNAL           PSSCAL, PSSWAP, PSAMAX, PXERBLA,
203     $                   BLACS_GRIDINFO, CHK1MAT, SGSUM2D,
204     $                   INFOG2L, PSELGET
205*     ..
206*     .. Intrinsic Functions ..
207      INTRINSIC          ABS, MAX, MIN
208*     ..
209*     .. Executable Statements ..
210      INFO = 0
211      ICTXT = DESCA( CTXT_ )
212      CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
213*
214*     Test the input parameters.
215*
216      IF( .NOT.LSAME( JOB, 'N' ) .AND. .NOT.LSAME( JOB, 'P' ) .AND.
217     $    .NOT.LSAME( JOB, 'S' ) .AND. .NOT.LSAME( JOB, 'B' ) ) THEN
218         INFO = -1
219      ELSE IF( N.LT.0 ) THEN
220         INFO = -2
221      ELSE
222         CALL CHK1MAT( N, 2, N, 2, 1, 1, DESCA, 4, INFO )
223      END IF
224      IF( INFO.NE.0 ) THEN
225         CALL PXERBLA( ICTXT, 'PSGEBAL', -INFO )
226         RETURN
227      END IF
228*
229*     Extract local leading dimension of A.
230*
231      LLDA = DESCA( LLD_ )
232*
233      K = 1
234      L = N
235*
236      IF( N.EQ.0 )
237     $   GO TO 210
238*
239      IF( LSAME( JOB, 'N' ) ) THEN
240         DO 10 I = 1, N
241            SCALE( I ) = ONE
242   10    CONTINUE
243         GO TO 210
244      END IF
245*
246      IF( LSAME( JOB, 'S' ) )
247     $   GO TO 120
248*
249*     Permutation to isolate eigenvalues if possible.
250*
251      GO TO 50
252*
253*     Row and column exchange.
254*
255   20 CONTINUE
256      SCALE( M ) = J
257      IF( J.EQ.M )
258     $   GO TO 30
259*
260      CALL PSSWAP( L, A, 1, J, DESCA, 1, A, 1, M, DESCA, 1 )
261      CALL PSSWAP( N-K+1, A, J, K, DESCA, DESCA(M_), A, M, K, DESCA,
262     $             DESCA(M_) )
263*
264   30 CONTINUE
265      GO TO ( 40, 80 )IEXC
266*
267*     Search for rows isolating an eigenvalue and push them down.
268*
269   40 CONTINUE
270      IF( L.EQ.1 )
271     $   GO TO 210
272      L = L - 1
273*
274   50 CONTINUE
275      DO 70 J = L, 1, -1
276*
277         DO 60 I = 1, L
278            IF( I.EQ.J )
279     $         GO TO 60
280*
281*           All processors need the information to make correct decisions.
282*
283            CALL PSELGET( 'All', '1-Tree', ELEM, A, J, I, DESCA )
284            IF( ELEM.NE.ZERO )
285     $         GO TO 70
286   60    CONTINUE
287*
288         M = L
289         IEXC = 1
290         GO TO 20
291   70 CONTINUE
292*
293      GO TO 90
294*
295*     Search for columns isolating an eigenvalue and push them left.
296*
297   80 CONTINUE
298      K = K + 1
299*
300   90 CONTINUE
301      DO 110 J = K, L
302*
303         DO 100 I = K, L
304            IF( I.EQ.J )
305     $         GO TO 100
306*
307*           All processors need the information to make correct decisions.
308*
309            CALL PSELGET( 'All', '1-Tree', ELEM, A, I, J, DESCA )
310            IF( ELEM.NE.ZERO )
311     $         GO TO 110
312  100    CONTINUE
313*
314         M = K
315         IEXC = 2
316         GO TO 20
317  110 CONTINUE
318*
319  120 CONTINUE
320      DO 130 I = K, L
321         SCALE( I ) = ONE
322  130 CONTINUE
323*
324      IF( LSAME( JOB, 'P' ) )
325     $   GO TO 210
326*
327*     Balance the submatrix in rows K to L.
328*
329*     Iterative loop for norm reduction.
330*
331      SFMIN1 = SLAMCH( 'S' ) / SLAMCH( 'P' )
332      SFMAX1 = ONE / SFMIN1
333      SFMIN2 = SFMIN1*SCLFAC
334      SFMAX2 = ONE / SFMIN2
335  140 CONTINUE
336      NOCONV = .FALSE.
337*
338      DO 200 I = K, L
339         C = ZERO
340         R = ZERO
341*
342*        Compute local partial values of R and C in parallel and combine
343*        with a call to the BLACS global summation routine distributing
344*        information to all processors.
345*
346         DO 150 J = K, L
347            IF( J.EQ.I )
348     $         GO TO 150
349            CALL INFOG2L( J, I, DESCA, NPROW, NPCOL, MYROW,
350     $                    MYCOL, II, JJ, ARSRC, ACSRC )
351            IF( MYROW.EQ.ARSRC .AND. MYCOL.EQ.ACSRC ) THEN
352               C = C + ABS( A( II + (JJ-1)*LLDA ) )
353            END IF
354            CALL INFOG2L( I, J, DESCA, NPROW, NPCOL, MYROW,
355     $                    MYCOL, II, JJ, ARSRC, ACSRC )
356            IF( MYROW.EQ.ARSRC .AND. MYCOL.EQ.ACSRC ) THEN
357               R = R + ABS( A( II + (JJ-1)*LLDA ) )
358            END IF
359  150    CONTINUE
360         CR( 1 ) = C
361         CR( 2 ) = R
362         CALL SGSUM2D( ICTXT, 'All', '1-Tree', 2, 1, CR, 2, -1, -1 )
363         C = CR( 1 )
364         R = CR( 2 )
365*
366*        Find global maximum absolute values and indices in parallel.
367*
368         CALL PSAMAX( L, CA, ICA, A, 1, I, DESCA, 1 )
369         CALL PSAMAX( N-K+1, RA, IRA, A, I, K, DESCA, DESCA(M_) )
370*
371*        Guard against zero C or R due to underflow.
372*
373         IF( C.EQ.ZERO .OR. R.EQ.ZERO )
374     $      GO TO 200
375         G = R / SCLFAC
376         F = ONE
377         S = C + R
378  160    CONTINUE
379         IF( C.GE.G .OR. MAX( F, C, CA ).GE.SFMAX2 .OR.
380     $       MIN( R, G, RA ).LE.SFMIN2 )GO TO 170
381         IF( SISNAN( C+F+CA+R+G+RA ) ) THEN
382*
383*           Exit if NaN to avoid infinite loop
384*
385            INFO = -3
386            CALL PXERBLA( ICTXT, 'PDGEBAL', -INFO )
387            RETURN
388         END IF
389         F = F*SCLFAC
390         C = C*SCLFAC
391         CA = CA*SCLFAC
392         R = R / SCLFAC
393         G = G / SCLFAC
394         RA = RA / SCLFAC
395         GO TO 160
396*
397  170    CONTINUE
398         G = C / SCLFAC
399  180    CONTINUE
400         IF( G.LT.R .OR. MAX( R, RA ).GE.SFMAX2 .OR.
401     $       MIN( F, C, G, CA ).LE.SFMIN2 )GO TO 190
402         F = F / SCLFAC
403         C = C / SCLFAC
404         G = G / SCLFAC
405         CA = CA / SCLFAC
406         R = R*SCLFAC
407         RA = RA*SCLFAC
408         GO TO 180
409*
410*        Now balance.
411*
412  190    CONTINUE
413         IF( ( C+R ).GE.FACTOR*S )
414     $      GO TO 200
415         IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN
416            IF( F*SCALE( I ).LE.SFMIN1 )
417     $         GO TO 200
418         END IF
419         IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN
420            IF( SCALE( I ).GE.SFMAX1 / F )
421     $         GO TO 200
422         END IF
423         G = ONE / F
424         SCALE( I ) = SCALE( I )*F
425         NOCONV = .TRUE.
426*
427         CALL PSSCAL( N-K+1, G, A, I, K, DESCA, DESCA(M_) )
428         CALL PSSCAL( L, F, A, 1, I, DESCA, 1 )
429*
430  200 CONTINUE
431*
432      IF( NOCONV )
433     $   GO TO 140
434*
435  210 CONTINUE
436      ILO = K
437      IHI = L
438*
439      RETURN
440*
441*     End of PSGEBAL
442*
443      END
444