1 /**********************************************************************
2   Cluster_DFT_ON2.c:
3 
4      Cluster_DFT_ON2.c is a subroutine to perform cluster calculations
5      with an O(N^2~) method.
6 
7   Log of Cluster_DFT_ON2.c:
8 
9      11/Sep./2009  Released by T.Ozaki
10 
11 ***********************************************************************/
12 
13 #include <complex.h>
14 #include <stdio.h>
15 #include <stdlib.h>
16 #include <string.h>
17 #include <math.h>
18 #include <time.h>
19 #include <complex.h>
20 #include "openmx_common.h"
21 #include "lapack_prototypes.h"
22 #include "mpi.h"
23 #include <omp.h>
24 
25 #define  measure_time       0
26 
27 
28 typedef struct {
29    long int a;
30    double complex b;
31 } clists;
32 
33 
34 
35 static double Find_Trial_ChemP_by_Muller(
36         int SCF_iter, int num_loop,
37         double pTrial_ChemP[4], double pDiff_Num[5],
38         double DeltaMu[7], double DeltaN[3], double DeltaNp[3]);
39 
40 static void qsort_complex(long n, long *a, double complex *b);
41 static int clists_cmp(const clists *x, const clists *y);
42 int Lapack_LU_Zinverse(int , dcomplex *);
43 
44 static void Nested_Dissection( int free_flag, int Nmat, int *MP,
45                                int *Depth_Level, int *Number_Division);
46 
47 static double Cluster_collinear_ON2_Force(
48 				    int SCF_iter,
49 				    int SpinP_switch,
50 				    double ***C,
51 				    double **ko,
52 				    double *****nh, double ****CntOLP,
53 				    double *****CDM,
54 				    double *****EDM,
55 				    double Eele0[2], double Eele1[2]);
56 
57 static double Cluster_collinear_ON2_Iter(
58 				    int SCF_iter,
59 				    int SpinP_switch,
60 				    double ***C,
61 				    double **ko,
62 				    double *****nh, double ****CntOLP,
63 				    double *****CDM,
64 				    double *****EDM,
65 				    double Eele0[2], double Eele1[2]);
66 
67 
68 
69 
70 static void OND_Solver(
71   char *mode,
72   int myid,
73   int numprocs,
74   int myid2,
75   int numprocs2,
76   int Nloop1,
77   MPI_Comm MPI_Comm,
78   int spin,
79   int Epoint,
80   double Real_Ene,
81   int Nmat,
82   int *NZE,
83   int *PZE,
84   int nl,
85   int mpmax,
86   double Trial_ChemP,
87   double **Hall,
88   double *Sall,
89   double **DMall,
90   double **WRG,
91   int *conv_index_row,
92   int *conv_index_col,
93   int **conv_ind2,
94   int TNZE);
95 
96 static void Bisection_System(
97     int Latomnum,
98     double **Cell_Gxyz2,
99     int **OG2G,
100     int **G2OG,
101     int *NN_FNAN,
102     int **NN_natn,
103     int **NN_ncn,
104     int *OGmin,
105     int *OGmax,
106     int *num0,
107     int *numb,
108     int *num1);
109 
110 
111 
112 
113 int **Index_BC;
114 int **SindB,**EindB;
115 int **SindC,**EindC;
116 int *invp;
117 int ***Bnum[2],***Cnum,**Anum;
118 int ****Bi[2],****Ci,***Ai;
119 double complex ****Bx[2];
120 double complex ****Cx;
121 double complex ****IBx[2];
122 double complex ****ICx;
123 double complex ***Ax;
124 double complex ***IAx;
125 dcomplex *IA0,***IS;
126 double **IA;
127 dcomplex ***Lvec[2];
128 double complex **Vvec,**Vvec2;
129 double complex **Q0,**Q1;
130 double *VvecTmp,*ISTmp;
131 
132 
Cluster_DFT_ON2(char * mode,int SCF_iter,int SpinP_switch,double *** Cluster_ReCoes,double ** Cluster_ko,double ***** nh,double ***** ImNL,double **** CntOLP,double ***** CDM,double ***** EDM,double Eele0[2],double Eele1[2])133 double Cluster_DFT_ON2(char *mode,
134 		       int SCF_iter,
135 		       int SpinP_switch,
136 		       double ***Cluster_ReCoes,
137 		       double **Cluster_ko,
138 		       double *****nh,
139 		       double *****ImNL,
140 		       double ****CntOLP,
141 		       double *****CDM,
142 		       double *****EDM,
143 		       double Eele0[2], double Eele1[2])
144 {
145   static double time0;
146 
147   /****************************************************
148          collinear without spin-orbit coupling
149   ****************************************************/
150 
151   if ( (SpinP_switch==0 || SpinP_switch==1) && SO_switch==0 ){
152 
153     if (strcasecmp(mode,"scf")==0){
154 
155       time0 = Cluster_collinear_ON2_Iter( SCF_iter,SpinP_switch,Cluster_ReCoes,Cluster_ko,
156                                           nh,CntOLP,CDM,EDM,Eele0,Eele1 );
157 
158     }
159 
160     else if (strcasecmp(mode,"force")==0){
161 
162       time0 = Cluster_collinear_ON2_Force( SCF_iter,SpinP_switch,Cluster_ReCoes,Cluster_ko,
163                                            nh,CntOLP,CDM,EDM,Eele0,Eele1 );
164 
165     }
166 
167   }
168 
169   /****************************************************
170            collinear with spin-orbit coupling
171   ****************************************************/
172 
173   else if ( (SpinP_switch==0 || SpinP_switch==1) && SO_switch==1 ){
174     printf("Spin-orbit coupling is not supported for collinear DFT calculations.\n");
175     MPI_Finalize();
176     exit(1);
177   }
178 
179   /****************************************************
180    non-collinear with and without spin-orbit coupling
181   ****************************************************/
182 
183   else if (SpinP_switch==3){
184     /*
185     time0 = Cluster_non_collinear(mode,SCF_iter,SpinP_switch,nh,ImNL,CntOLP,CDM,EDM,Eele0,Eele1);
186     */
187   }
188 
189   return time0;
190 }
191 
192 
193 
194 
195 
196 
197 
Nested_Dissection(int free_flag,int Nmat,int * MP,int * Depth_Level,int * Number_Division)198 static void Nested_Dissection(int free_flag, int Nmat, int *MP, int *Depth_Level, int *Number_Division)
199 {
200   int i,j,k,l,n,nmin,nmax,ct_AN;
201   int FNAN_min,po,n0,n1,nb,k0,i0,GA,p,k1;
202   int Gc_AN,AN0,Mc_AN,Gh_AN,h_AN,OGh_AN,Rn;
203   int OG_min,OG_max,OG_max0,diff_n,nmin0,nmax0;
204   int Anum,wanA;
205   int abc_n0[4],abc_nb[4],abc_n1[4];
206   int abc_OG_max[4],abc_OG_min[4];
207   int abc_nmax[4],abc_nmin[4];
208   int *OGmin,*OGmax,**G2OG,**OG2G;
209   double **Cell_Gxyz2;
210   int avsize_cluster;
211   int m,mp,mpmax,nl,np;
212   int num0,numb,num1;
213   int **Atom_Index_BC;
214   int **Atom_SindB;
215   int **Atom_EindB;
216   int **Atom_SindC;
217   int **Atom_EindC;
218   int **NN_natn,**NN_ncn,*NN_FNAN,*MP2;
219 
220   /**********************************************
221    calulate Depth_Level and Number_Division
222    Depth_Level = nl
223    Number_Division = mpmax
224   **********************************************/
225 
226   avsize_cluster = 20;
227 
228   if (avsize_cluster<atomnum){
229     np = atomnum/avsize_cluster;
230     nl = log((double)np)/log(2.0);
231     if (nl<0) nl = 0;
232   }
233 
234   else {
235     nl = 0;
236   }
237 
238   mpmax = 1;
239   for (i=0; i<nl; i++) mpmax = 2*mpmax;
240 
241   /************************************************
242    if (free_flag==1)
243      free Index_BC, SindB, EindB, SindC, EindC
244    else
245      allocate them
246   ************************************************/
247 
248   if (free_flag==1){
249 
250     for (n=0; n<(nl+1); n++){
251       free(Index_BC[n]);
252     }
253     free(Index_BC);
254 
255     for (n=0; n<(nl+1); n++){
256       free(SindB[n]);
257     }
258     free(SindB);
259 
260     for (n=0; n<(nl+1); n++){
261       free(EindB[n]);
262     }
263     free(EindB);
264 
265     for (n=0; n<nl; n++){
266       free(SindC[n]);
267     }
268     free(SindC);
269 
270     for (n=0; n<nl; n++){
271       free(EindC[n]);
272     }
273     free(EindC);
274 
275   }
276 
277   else {
278 
279     Index_BC = (int**)malloc(sizeof(int*)*(nl+1));
280     for (n=0; n<(nl+1); n++){
281       Index_BC[n] = (int*)malloc(sizeof(int)*Nmat);
282       for (i=0; i<Nmat; i++){
283 	Index_BC[n][i] = i;
284       }
285     }
286 
287     SindB = (int**)malloc(sizeof(int*)*(nl+1));
288     mp = mpmax;
289     for (n=0; n<(nl+1); n++){
290       SindB[n] = (int*)malloc(sizeof(int)*mp);
291       for (m=0; m<mp; m++) SindB[n][m] = 0;
292       mp /= 2;
293     }
294 
295     EindB = (int**)malloc(sizeof(int*)*(nl+1));
296     mp = mpmax;
297     for (n=0; n<(nl+1); n++){
298       EindB[n] = (int*)malloc(sizeof(int)*mp);
299       for (m=0; m<mp; m++) EindB[n][m] = 0;
300       mp /= 2;
301     }
302 
303     SindC = (int**)malloc(sizeof(int*)*nl);
304     mp = mpmax/2;
305     for (n=0; n<nl; n++){
306       SindC[n] = (int*)malloc(sizeof(int)*mp);
307       for (m=0; m<mp; m++) SindC[n][m] = 0;
308       mp /= 2;
309     }
310 
311     EindC = (int**)malloc(sizeof(int*)*nl);
312     mp = mpmax/2;
313     for (n=0; n<nl; n++){
314       EindC[n] = (int*)malloc(sizeof(int)*mp);
315       for (m=0; m<mp; m++) EindC[n][m] = 0;
316       mp /= 2;
317     }
318   }
319 
320   if (free_flag==1) return ;
321 
322   /* allocation of arrays */
323 
324   Atom_Index_BC = (int**)malloc(sizeof(int*)*(nl+1));
325   for (n=0; n<(nl+1); n++){
326     Atom_Index_BC[n] = (int*)malloc(sizeof(int)*(atomnum+1));
327     for (i=0; i<(atomnum+1); i++){
328       Atom_Index_BC[n][i] = i;
329     }
330   }
331 
332   Atom_SindB = (int**)malloc(sizeof(int*)*(nl+1));
333   mp = mpmax;
334   for (n=0; n<(nl+1); n++){
335     Atom_SindB[n] = (int*)malloc(sizeof(int)*mp);
336     for (m=0; m<mp; m++) Atom_SindB[n][m] = 0;
337     mp /= 2;
338   }
339 
340   Atom_EindB = (int**)malloc(sizeof(int*)*(nl+1));
341   mp = mpmax;
342   for (n=0; n<(nl+1); n++){
343     Atom_EindB[n] = (int*)malloc(sizeof(int)*mp);
344     for (m=0; m<mp; m++) Atom_EindB[n][m] = 0;
345     mp /= 2;
346   }
347 
348   Atom_SindC = (int**)malloc(sizeof(int*)*nl);
349   mp = mpmax/2;
350   for (n=0; n<nl; n++){
351     Atom_SindC[n] = (int*)malloc(sizeof(int)*mp);
352     for (m=0; m<mp; m++) Atom_SindC[n][m] = 0;
353     mp /= 2;
354   }
355 
356   Atom_EindC = (int**)malloc(sizeof(int*)*nl);
357   mp = mpmax/2;
358   for (n=0; n<nl; n++){
359     Atom_EindC[n] = (int*)malloc(sizeof(int)*mp);
360     for (m=0; m<mp; m++) Atom_EindC[n][m] = 0;
361     mp /= 2;
362   }
363 
364   Cell_Gxyz2 = (double**)malloc(sizeof(double*)*4);
365   for (k=0; k<4; k++){
366     Cell_Gxyz2[k] = (double*)malloc(sizeof(double)*(atomnum+1));
367   }
368 
369   OG2G = (int**)malloc(sizeof(int*)*4);
370   for (k=0; k<4; k++){
371     OG2G[k] = (int*)malloc(sizeof(int)*(atomnum+1));
372   }
373 
374   G2OG = (int**)malloc(sizeof(int*)*4);
375   for (k=0; k<4; k++){
376     G2OG[k] = (int*)malloc(sizeof(int)*(atomnum+1));
377   }
378 
379   OGmin = (int*)malloc(sizeof(int)*(atomnum+1));
380   OGmax = (int*)malloc(sizeof(int)*(atomnum+1));
381 
382   NN_ncn = (int**)malloc(sizeof(int*)*(atomnum+1));
383   for (ct_AN=0; ct_AN<=atomnum; ct_AN++){
384     NN_ncn[ct_AN]  = (int*)malloc(sizeof(int)*((int)(Max_FSNAN*ScaleSize)+1));
385   }
386 
387   NN_natn = (int**)malloc(sizeof(int*)*(atomnum+1));
388   for (ct_AN=0; ct_AN<=atomnum; ct_AN++){
389     NN_natn[ct_AN] = (int*)malloc(sizeof(int)*((int)(Max_FSNAN*ScaleSize)+1));
390   }
391 
392   NN_FNAN = (int*)malloc(sizeof(int)*(atomnum+1));
393   MP2 = (int*)malloc(sizeof(int)*(atomnum+1));
394 
395   /**********************************************
396       perform a nested dissection by bisection
397   **********************************************/
398 
399   Atom_SindB[nl][0] = 1;
400   Atom_EindB[nl][0] = atomnum;
401 
402   mp = 1;
403 
404   /* n: level of hierarchy */
405 
406   for (n=(nl-1); 0<=n; n--){
407 
408     /* m: child number */
409 
410     for (m=0; m<mp; m++){
411 
412       /* construct the adjacency information */
413 
414       for (i=Atom_SindB[n+1][m]; i<=Atom_EindB[n+1][m]; i++){
415 
416         GA = Atom_Index_BC[n+1][i];
417         i0 = 0;
418 
419         for (j=0; j<=FNAN[GA]; j++){
420 
421           po = 0;
422           for (p=Atom_SindB[n+1][m]; p<=Atom_EindB[n+1][m]; p++){
423 
424             if (natn[GA][j]==Atom_Index_BC[n+1][p]){
425 
426               k1 = p - Atom_SindB[n+1][m] + 1;
427               po = 1;
428 
429               break;
430             }
431 	  } /* p */
432 
433           if (po==1){
434 
435             NN_natn[i-Atom_SindB[n+1][m]+1][i0] = k1;
436             NN_ncn[i-Atom_SindB[n+1][m]+1][i0] = ncn[GA][j];
437             i0++;
438 	  }
439 
440         } /* j */
441 
442         NN_FNAN[i-Atom_SindB[n+1][m]+1] = i0 - 1;
443 
444         for (k=1; k<=3; k++){
445           Cell_Gxyz2[k][i-Atom_SindB[n+1][m]+1] = Cell_Gxyz[GA][k];
446 	  OG2G[k][i-Atom_SindB[n+1][m]+1] = GA;
447 	  G2OG[k][i-Atom_SindB[n+1][m]+1] = i-Atom_SindB[n+1][m]+1;
448 	}
449 
450       } /* i */
451 
452       Latomnum = Atom_EindB[n+1][m] - Atom_SindB[n+1][m] + 1;
453 
454       if (0<Latomnum){
455         Bisection_System( Latomnum,Cell_Gxyz2,OG2G,G2OG,
456                           NN_FNAN,NN_natn,NN_ncn,
457                           OGmin,OGmax,&num0,&numb,&num1);
458       }
459       else {
460         num0 = 0;
461         numb = 0;
462         num1 = 0;
463       }
464 
465       /* construct Atom_Index_BC for the C part */
466 
467       Atom_EindC[n][m] = Atom_EindB[n+1][m];
468       Atom_SindC[n][m] = Atom_EindC[n][m] + 1 - numb;
469 
470       for (i=Atom_SindC[n][m]; i<=Atom_EindC[n][m]; i++){
471         Atom_Index_BC[n][i] = OG2G[0][num0+i-Atom_SindC[n][m]+1];
472       }
473 
474       /* construct Index_BC for the B parts */
475 
476       Atom_SindB[n][2*m] = Atom_SindB[n+1][m];
477       Atom_EindB[n][2*m] = Atom_SindB[n][2*m] + num0 - 1;
478 
479       Atom_SindB[n][2*m+1] = Atom_EindB[n][2*m] + 1;
480       Atom_EindB[n][2*m+1] = Atom_SindB[n][2*m+1] + num1 - 1;
481 
482       for (i=Atom_SindB[n][2*m]; i<=Atom_EindB[n][2*m]; i++){
483         Atom_Index_BC[n][i] = OG2G[0][i-Atom_SindB[n][2*m]+1];
484       }
485 
486       for (i=Atom_SindB[n][2*m+1]; i<=Atom_EindB[n][2*m+1]; i++){
487         Atom_Index_BC[n][i] = OG2G[0][num0+numb+i-Atom_SindB[n][2*m+1]+1];
488       }
489 
490     } /* m */
491 
492     mp *= 2;
493 
494   } /* n */
495 
496   /*************************************************************
497    reorder Atom_Index_BC so that the order of the global index
498    can be equivalent in all the Atom_Index_BCs
499   *************************************************************/
500 
501   mp = mpmax;
502 
503   for (n=0; n<nl; n++){
504 
505     for (m=0; m<mp; m++){
506       for (i=Atom_SindB[n][m]; i<=Atom_EindB[n][m]; i++){
507 	Atom_Index_BC[n+1][i] = Atom_Index_BC[n][i];
508       }
509     }
510 
511     for (m=0; m<(mp/2); m++){
512       for (i=Atom_SindC[n][m]; i<=Atom_EindC[n][m]; i++){
513 	Atom_Index_BC[n+1][i] = Atom_Index_BC[n][i];
514       }
515     }
516 
517     mp /= 2;
518   }
519 
520   /* printing Atom_Index_BC for debugging */
521 
522   /*
523   printf("\n\n");
524   printf("nl=%2d mpmax=%2d\n",nl,mpmax);
525 
526   mp = mpmax;
527   for (n=0; n<(nl+1); n++){
528 
529     for (m=0; m<mp; m++){
530       for (i=Atom_SindB[n][m]; i<=Atom_EindB[n][m]; i++){
531         printf("B n=%2d m=%2d i=%2d Atom_Index_BC=%2d\n",n,m,i,Atom_Index_BC[n][i]);
532       }
533     }
534 
535     mp /= 2;
536   }
537 
538   mp = mpmax/2;
539   for (n=0; n<nl; n++){
540 
541     for (m=0; m<mp; m++){
542       for (i=Atom_SindC[n][m]; i<=Atom_EindC[n][m]; i++){
543         printf("C n=%2d m=%2d i=%2d Atom_Index_BC=%2d\n",n,m,i,Atom_Index_BC[n][i]);
544       }
545     }
546 
547     mp /= 2;
548   }
549   */
550 
551   /*************************************************************
552              construct Index_BC using Atom_Index_BC
553   *************************************************************/
554 
555   /* set up Depth_Level and Number_Division */
556 
557   *Depth_Level = nl;
558   *Number_Division = mpmax;
559 
560   /* set up MP2 where MP2[i=ordered index for atom]
561      gives an ordered index for orbital. */
562 
563   Anum = 1;
564   for (i=1; i<=atomnum; i++){
565     Gc_AN = Atom_Index_BC[nl][i];
566     MP2[i] = Anum;
567     wanA = WhatSpecies[Gc_AN];
568     Anum += Spe_Total_CNO[wanA];
569   }
570 
571   /* Index_BC for B */
572 
573   mp = mpmax;
574   for (n=0; n<(nl+1); n++){
575     for (m=0; m<mp; m++){
576 
577       i = Atom_SindB[n][m];
578       SindB[n][m] = MP2[i]-1;
579 
580       i = Atom_EindB[n][m];
581       Gc_AN = Atom_Index_BC[n][i];
582       wanA = WhatSpecies[Gc_AN];
583       EindB[n][m] = MP2[i] + Spe_Total_CNO[wanA]-2;
584 
585       for (i=Atom_SindB[n][m]; i<=Atom_EindB[n][m]; i++){
586 
587         Gc_AN = Atom_Index_BC[n][i];
588         wanA = WhatSpecies[Gc_AN];
589         i0 = MP2[i];
590 
591         for (j=MP[Gc_AN]; j<(MP[Gc_AN]+Spe_Total_CNO[wanA]); j++){
592           Index_BC[n][i0-1] = j-1;
593           i0++;
594 	}
595       }
596     }
597 
598     mp /= 2;
599   }
600 
601   /* Index_BC for C */
602 
603   mp = mpmax/2;
604   for (n=0; n<nl; n++){
605     for (m=0; m<mp; m++){
606 
607       i = Atom_SindC[n][m];
608       SindC[n][m] = MP2[i]-1;
609 
610       i = Atom_EindC[n][m];
611       Gc_AN = Atom_Index_BC[n][i];
612       wanA = WhatSpecies[Gc_AN];
613       EindC[n][m] = MP2[i] + Spe_Total_CNO[wanA]-2;
614 
615       for (i=Atom_SindC[n][m]; i<=Atom_EindC[n][m]; i++){
616 
617         Gc_AN = Atom_Index_BC[n][i];
618         wanA = WhatSpecies[Gc_AN];
619         i0 = MP2[i];
620 
621         for (j=MP[Gc_AN]; j<(MP[Gc_AN]+Spe_Total_CNO[wanA]); j++){
622           Index_BC[n][i0-1] = j-1;
623           i0++;
624 	}
625       }
626     }
627 
628     mp /= 2;
629   }
630 
631 
632   /* printing Index_BC for debugging */
633 
634   /*
635   printf("\n\n");
636   printf("nl=%2d mpmax=%2d\n",nl,mpmax);
637 
638   mp = mpmax;
639   for (n=0; n<(nl+1); n++){
640 
641     for (m=0; m<mp; m++){
642       for (i=SindB[n][m]; i<=EindB[n][m]; i++){
643         printf("B n=%2d m=%2d i=%2d Index_BC=%2d\n",n,m,i,Index_BC[n][i]);
644       }
645     }
646 
647     mp /= 2;
648   }
649 
650   mp = mpmax/2;
651   for (n=0; n<nl; n++){
652 
653     for (m=0; m<mp; m++){
654       for (i=SindC[n][m]; i<=EindC[n][m]; i++){
655         printf("C n=%2d m=%2d i=%2d Index_BC=%2d\n",n,m,i,Index_BC[n][i]);
656       }
657     }
658 
659     mp /= 2;
660   }
661   */
662 
663 
664   /* printing structures */
665 
666   /*
667   {
668 
669     int n3;
670     char NameC[100][100];
671 
672     n3 = nl-6;
673 
674   sprintf(NameC[0],"He");
675   sprintf(NameC[1],"Li");
676   sprintf(NameC[2],"Be");
677   sprintf(NameC[3],"B");
678   sprintf(NameC[4],"C");
679   sprintf(NameC[5],"N");
680   sprintf(NameC[6],"F");
681   sprintf(NameC[7],"Ne");
682   sprintf(NameC[8],"Na");
683   sprintf(NameC[9],"Mg");
684   sprintf(NameC[10],"Al");
685   sprintf(NameC[11],"Si");
686   sprintf(NameC[12],"P");
687   sprintf(NameC[13],"S");
688   sprintf(NameC[14],"Cl");
689   sprintf(NameC[15],"Ar");
690   sprintf(NameC[16],"K");
691 
692   printf("%2d\n\n",atomnum);
693 
694   mp = mpmax;
695   for (n=0; n<(nl+1); n++){
696 
697     for (m=0; m<mp; m++){
698 
699       if (n==n3){
700 	for (i=SindB[n][m]; i<=EindB[n][m]; i++){
701 
702           j = Index_BC[n][i] + 1;
703           printf("H  %12.7f %12.7f %12.7f   0.0 0.0 0.0\n",
704                   Gxyz[j][1]*BohrR,Gxyz[j][2]*BohrR,Gxyz[j][3]*BohrR);
705 	}
706       }
707 
708     }
709 
710     mp /= 2;
711   }
712 
713   mp = mpmax/2;
714   for (n=0; n<nl; n++){
715 
716     for (m=0; m<mp; m++){
717 
718       if (n3<=n){
719 	for (i=SindC[n][m]; i<=EindC[n][m]; i++){
720 
721           j = Index_BC[n][i] + 1;
722 
723           printf("%s %12.7f %12.7f %12.7f   0.0 0.0 0.0\n",
724                   NameC[n],Gxyz[j][1]*BohrR,Gxyz[j][2]*BohrR,Gxyz[j][3]*BohrR);
725 
726 
727 
728 	}
729       }
730 
731     }
732 
733     mp /= 2;
734   }
735 
736   }
737   MPI_Finalize();
738   exit(0);
739   */
740 
741   /* freeing of arrays */
742 
743   free(MP2);
744   free(NN_FNAN);
745 
746   for (ct_AN=0; ct_AN<=atomnum; ct_AN++){
747     free(NN_natn[ct_AN]);
748   }
749   free(NN_natn);
750 
751   for (ct_AN=0; ct_AN<=atomnum; ct_AN++){
752     free(NN_ncn[ct_AN]);
753   }
754   free(NN_ncn);
755 
756   free(OGmax);
757   free(OGmin);
758 
759   for (k=0; k<4; k++){
760     free(G2OG[k]);
761   }
762   free(G2OG);
763 
764   for (k=0; k<4; k++){
765     free(OG2G[k]);
766   }
767   free(OG2G);
768 
769   for (k=0; k<4; k++){
770     free(Cell_Gxyz2[k]);
771   }
772   free(Cell_Gxyz2);
773 
774   for (n=0; n<nl; n++){
775     free(Atom_EindC[n]);
776   }
777   free(Atom_EindC);
778 
779   for (n=0; n<nl; n++){
780     free(Atom_SindC[n]);
781   }
782   free(Atom_SindC);
783 
784   for (n=0; n<(nl+1); n++){
785     free(Atom_EindB[n]);
786   }
787   free(Atom_EindB);
788 
789   for (n=0; n<(nl+1); n++){
790     free(Atom_SindB[n]);
791   }
792   free(Atom_SindB);
793 
794   for (n=0; n<(nl+1); n++){
795     free(Atom_Index_BC[n]);
796   }
797   free(Atom_Index_BC);
798 }
799 
800 
801 
802 
803 
804 
805 
Bisection_System(int Latomnum,double ** Cell_Gxyz2,int ** OG2G,int ** G2OG,int * NN_FNAN,int ** NN_natn,int ** NN_ncn,int * OGmin,int * OGmax,int * num0,int * numb,int * num1)806 static void Bisection_System(
807     int Latomnum,
808     double **Cell_Gxyz2,
809     int **OG2G,
810     int **G2OG,
811     int *NN_FNAN,
812     int **NN_natn,
813     int **NN_ncn,
814     int *OGmin,
815     int *OGmax,
816     int *num0,
817     int *numb,
818     int *num1)
819 {
820   int i,k,AN0,AN1,AN2,Gc_AN,h_AN,Gh_AN;
821   int nmin,nmax,l,n,Rn,OGh_AN;
822   int FNAN_min,OG_max,OG_min,po;
823   int OG_min0,OG_max0,diff_n,nmin0,nmax0;
824   int abc_n0[4],abc_nb[4],abc_n1[4];
825   int abc_OG_max[4],abc_OG_min[4];
826   int abc_nmax[4],abc_nmin[4],abc_mag[4];
827   int n0,n1,nb,n00,nb0,n10,k0,wanA;
828   int *longtail_flag;
829   double rcut,max_rcut,min_rcut;
830 
831   /***************************************************************
832        eliminate atoms which have longer cutoff of basis set,
833        where we let the eliminated atoms be in the C region.
834   ***************************************************************/
835 
836   longtail_flag = (int*)malloc(sizeof(int)*(Latomnum+1));
837   for (i=0; i<(Latomnum+1); i++) longtail_flag[i] = 0;
838 
839   max_rcut = -100000.0;
840   min_rcut =  100000.0;
841 
842   for (AN0=1; AN0<=Latomnum; AN0++){
843 
844     AN1 = OG2G[1][AN0];
845     wanA = WhatSpecies[AN1];
846     rcut = Spe_Atom_Cut1[wanA];
847 
848     if (max_rcut<rcut) max_rcut = rcut;
849     if (rcut<min_rcut) min_rcut = rcut;
850   }
851 
852   if (  1.5<=(max_rcut/min_rcut) ){
853 
854     for (AN0=1; AN0<=Latomnum; AN0++){
855 
856       AN1 = OG2G[1][AN0];
857       wanA = WhatSpecies[AN1];
858       rcut = Spe_Atom_Cut1[wanA];
859 
860       if ( 1.5<=(rcut/min_rcut) ){
861         longtail_flag[AN0] = 1;
862       }
863     }
864   }
865 
866   /***************************************************************
867                       bisection of the system
868   ***************************************************************/
869 
870   for (k=1; k<=3; k++){
871 
872     /*
873     for (i=1; i<=Latomnum; i++){
874       printf("A1 k=%2d i=%2d Cell_Gxyz2=%15.12f OG2G=%2d G2OG=%2d\n",
875               k,i,Cell_Gxyz2[k][i],OG2G[k][i],G2OG[k][i]);
876     }
877     */
878 
879     /* order atoms based on Cell_Gxyz2 */
880 
881     qsort_double3((long)Latomnum,Cell_Gxyz2[k],OG2G[k],G2OG[k]);
882 
883     /*
884     for (i=1; i<=Latomnum; i++){
885       printf("A2 k=%2d i=%2d Cell_Gxyz2=%15.12f OG2G=%2d G2OG=%2d\n",
886               k,i,Cell_Gxyz2[k][i],OG2G[k][i],G2OG[k][i]);
887     }
888     */
889 
890     for (i=1; i<=Latomnum; i++){
891       AN0 = G2OG[k][i];
892       G2OG[0][AN0] = i;
893     }
894 
895     /* find the reachable range of each atom */
896 
897     for (AN0=1; AN0<=Latomnum; AN0++){
898 
899       AN1 = G2OG[k][AN0];
900 
901       if (longtail_flag[AN1]==0){  /* eliminate atoms with "longtail_flag[AN1]==0" */
902 
903 	nmin = 1000000;
904 	nmax =-1000000;
905 
906 	for (h_AN=0; h_AN<=NN_FNAN[AN1]; h_AN++){
907 
908 	  AN2 = NN_natn[AN1][h_AN];
909 	  Rn = NN_ncn[AN1][h_AN];
910 	  l = atv_ijk[Rn][k];
911 
912 	  n = (G2OG[0][AN2]-1) + Latomnum*l;
913 
914 	  if (longtail_flag[AN2]==0){ /* eliminate atoms with "longtail_flag[AN2]==0" */
915 	    if (nmax<n) nmax = n;
916 	    if (n<nmin) nmin = n;
917 	  }
918 	}
919       }
920 
921       else {
922         nmin = AN0-1;
923         nmax = AN0-1;
924       }
925 
926       OGmin[AN0] = nmin;
927       OGmax[AN0] = nmax;
928 
929     } /* AN0 */
930 
931     /* find the starting atom which has the lowest NN_FNAN. */
932 
933     FNAN_min = 1000000;
934     for (AN0=1; AN0<=Latomnum; AN0++){
935 
936       AN1 = G2OG[k][AN0];
937 
938       if (longtail_flag[AN1]==0){  /* eliminate atoms with "longtail_flag[AN1]==0" */
939 	if (NN_FNAN[AN1]<FNAN_min){
940 
941 	  FNAN_min = NN_FNAN[AN1];
942 	  OG_min = AN0;
943 	}
944       }
945     }
946 
947     /**********************************************************
948           extend the size of the part n0 step by step
949     **********************************************************/
950 
951     po = 0;
952     OG_max = OG_min;
953     nmin = OGmin[OG_min];
954     nmax = OGmax[OG_min];
955     diff_n = 1000000;
956     OG_min0 = OG_min;
957     OG_max0 = OG_max;
958     nmin0 = nmin;
959     nmax0 = nmax;
960 
961     do {
962 
963       if (OGmin[OG_max]<nmin) nmin = OGmin[OG_max];
964       if (nmax<OGmax[OG_max]) nmax = OGmax[OG_max];
965 
966       if (OGmin[OG_min]<nmin) nmin = OGmin[OG_min];
967       if (nmax<OGmax[OG_min]) nmax = OGmax[OG_min];
968 
969       n0 = OG_max - OG_min + 1;
970       nb = (nmax - nmin + 1) - n0;
971       n1 = Latomnum - (n0 + nb);
972 
973       /**********************************************************
974            set up flag into OG2G[0][AN],
975            where AN is the local index when the routine was called.
976       **********************************************************/
977 
978       for (AN0=0; AN0<=Latomnum; AN0++) OG2G[0][AN0] = 0;
979 
980       for (AN0=OG_min; AN0<=OG_max; AN0++){
981 	AN1 = G2OG[k][AN0];
982 	OG2G[0][AN1] = 10;  /* flag in the n0 part */
983       }
984 
985       for (AN0=nmin; AN0<=nmax; AN0++){
986 
987 	if (0<=AN0){
988 	  AN1 = AN0%Latomnum + 1;
989 	}
990 	else {
991 
992 	  AN1 = AN0;
993 	  do {
994 	    AN1 += Latomnum;
995 	  } while (AN1<0);
996 	  AN1 = AN1 + 1;
997 	}
998 
999 	AN2 = G2OG[k][AN1];
1000 
1001 	if (OG2G[0][AN2]==0)  OG2G[0][AN2] = 20;  /* flag in the nb part */
1002       }
1003 
1004       for (AN0=1; AN0<=Latomnum; AN0++){
1005 	if (OG2G[0][AN0]==0) OG2G[0][AN0] = 30; /* flag in the n1 part */
1006       }
1007 
1008       /**********************************************************
1009              update n0, nb, n1 using OG2G[0] and longtail_flag
1010       **********************************************************/
1011 
1012       for (AN0=1; AN0<=Latomnum; AN0++){
1013 
1014 	if (longtail_flag[AN0]==1 && OG2G[0][AN0]==10){
1015 	  n0--;
1016 	  nb++;
1017 	}
1018 
1019 	if (longtail_flag[AN0]==1 && OG2G[0][AN0]==30){
1020 	  n1--;
1021 	  nb++;
1022 	}
1023       }
1024 
1025       /**********************************************************
1026               if (fabs(n1-n0)<=diff_n) then, update parameters
1027       **********************************************************/
1028 
1029       if (fabs(n1-n0)<=diff_n){
1030 
1031 	diff_n = fabs(n1-n0);
1032 	OG_min0 = OG_min;
1033 	OG_max0 = OG_max;
1034 	nmin0 = nmin;
1035 	nmax0 = nmax;
1036 	n00 = n0;
1037 	nb0 = nb;
1038 	n10 = n1;
1039       }
1040       else {
1041 	po = 1;
1042       }
1043 
1044       if ( Latomnum<=(nmax-nmin+1) ) {
1045 	po = 2; /* partitioning is impossible along the direction. */
1046       }
1047 
1048       /*
1049       printf("FFF po=%2d k=%2d OG_min=%2d OG_max=%2d nmin=%2d nmax=%2d n0=%2d nb=%2d n1=%2d\n",
1050               po,k,OG_min,OG_max,nmin,nmax,n0,nb,n1);
1051       */
1052 
1053 
1054       if (po==0){
1055         if (OG_max<Latomnum) OG_max++;
1056         else if (1<OG_min)   OG_min--;
1057       }
1058 
1059     } while (po==0);
1060 
1061     abc_OG_max[k] = OG_max0;
1062     abc_OG_min[k] = OG_min0;
1063     abc_nmax[k] = nmax0;
1064     abc_nmin[k] = nmin0;
1065     abc_n0[k] = n00;
1066     abc_nb[k] = nb0;
1067     abc_n1[k] = n10;
1068 
1069     /*
1070     printf("k=%2d n0=%2d nb=%2d n1=%2d\n",k,abc_n0[k],abc_nb[k],abc_n1[k]);
1071     */
1072 
1073   } /* k */
1074 
1075   /**********************************************************
1076      set up OG2G
1077   **********************************************************/
1078 
1079   abc_mag[1] = fabs(abc_n0[1]-abc_n1[1])+abc_nb[1];
1080   abc_mag[2] = fabs(abc_n0[2]-abc_n1[2])+abc_nb[2];
1081   abc_mag[3] = fabs(abc_n0[3]-abc_n1[3])+abc_nb[3];
1082 
1083   /*
1084   printf("abc_mag[1]=%2d\n",abc_mag[1]);
1085   printf("abc_mag[2]=%2d\n",abc_mag[2]);
1086   printf("abc_mag[3]=%2d\n",abc_mag[3]);
1087   */
1088 
1089   if (abc_mag[1]<=abc_mag[2])  k0 = 1;
1090   else                         k0 = 2;
1091   if (abc_mag[3]<abc_mag[k0])  k0 = 3;
1092 
1093   /*
1094   printf("k0=%2d abc_n0[k0]=%2d\n",k0,abc_n0[k0]);
1095   */
1096 
1097   /* The n0 part */
1098 
1099   n = 1;
1100   for (AN0=abc_OG_min[k0]; AN0<(abc_OG_min[k0]+abc_n0[k0]); AN0++){
1101 
1102     AN1 = G2OG[k0][AN0];
1103 
1104     if (longtail_flag[AN1]!=1){
1105       G2OG[0][n] = OG2G[k0][AN0];
1106       OG2G[k0][AN0] = -1;
1107       n++;
1108     }
1109   }
1110 
1111   /* The nb part from the n0 part */
1112 
1113   for (AN0=abc_OG_min[k0]; AN0<(abc_OG_min[k0]+abc_n0[k0]); AN0++){
1114 
1115     AN1 = G2OG[k0][AN0];
1116 
1117     if (longtail_flag[AN1]==1){
1118       G2OG[0][n] = OG2G[k0][AN0];
1119       OG2G[k0][AN0] = -1;
1120       n++;
1121     }
1122   }
1123 
1124   /* The nb part from the original nb part */
1125 
1126   for (AN0=abc_nmin[k0]; AN0<=abc_nmax[k0]; AN0++){
1127 
1128     if (0<=AN0){
1129       AN1 = AN0%Latomnum + 1;
1130     }
1131     else {
1132 
1133       AN1 = AN0;
1134       do {
1135         AN1 += Latomnum;
1136       } while (AN1<0);
1137       AN1 = AN1 + 1;
1138     }
1139 
1140     if (OG2G[k0][AN1]!=-1) {
1141       G2OG[0][n] = OG2G[k0][AN1];
1142       OG2G[k0][AN1] = -1;
1143       n++;
1144     }
1145   }
1146 
1147   /* The nb part from the n1 part */
1148 
1149   for (AN0=1; AN0<=Latomnum; AN0++){
1150 
1151     AN1 = G2OG[k0][AN0];
1152 
1153     if (OG2G[k0][AN0]!=-1 && longtail_flag[AN1]==1) {
1154       G2OG[0][n] = OG2G[k0][AN0];
1155       OG2G[k0][AN0] = -1;
1156       n++;
1157     }
1158   }
1159 
1160   /* The n1 part */
1161 
1162   for (AN0=1; AN0<=Latomnum; AN0++){
1163 
1164     AN1 = G2OG[k0][AN0];
1165 
1166     if (OG2G[k0][AN0]!=-1 && longtail_flag[AN1]!=1) {
1167       G2OG[0][n] = OG2G[k0][AN0];
1168       OG2G[k0][AN0] = -1;
1169       n++;
1170     }
1171   }
1172 
1173   if ( (n-1)!=Latomnum ){
1174     printf("Could not bisect\n");
1175 
1176     MPI_Finalize();
1177     exit(1);
1178   }
1179 
1180   /* set n0, nb, n1, and OG2G[0] */
1181 
1182   n0 = abc_n0[k0];
1183   nb = abc_nb[k0];
1184   n1 = abc_n1[k0];
1185 
1186   for (n=1; n<=Latomnum; n++){
1187     OG2G[0][n] = G2OG[0][n];
1188   }
1189 
1190   /* print OG2G for debugging */
1191 
1192   /*
1193   for (n=1; n<=n0; n++){
1194     printf("n0: n=%2d OG2G=%2d\n",n,OG2G[0][n]);
1195   }
1196 
1197   for (n=n0+1; n<=n0+nb; n++){
1198     printf("nb: n=%2d OG2G=%2d\n",n,OG2G[0][n]);
1199   }
1200 
1201   for (n=n0+nb+1; n<=n0+nb+n1; n++){
1202     printf("n1: n=%2d OG2G=%2d\n",n,OG2G[0][n]);
1203   }
1204   */
1205 
1206   *num0 = abc_n0[k0];
1207   *numb = abc_nb[k0];
1208   *num1 = abc_n1[k0];
1209 
1210   free(longtail_flag);
1211 }
1212 
1213 
1214 
1215 
1216 
1217 
OND_Solver(char * mode,int myid,int numprocs,int myid2,int numprocs2,int Nloop1,MPI_Comm MPI_Comm,int spin,int Epoint,double Real_Ene,int Nmat,int * NZE,int * PZE,int nl,int mpmax,double Trial_ChemP,double ** Hall,double * Sall,double ** DMall,double ** WRG,int * conv_index_row,int * conv_index_col,int ** conv_ind2,int TNZE)1218 static void OND_Solver(
1219   char *mode,
1220   int myid,
1221   int numprocs,
1222   int myid2,
1223   int numprocs2,
1224   int Nloop1,
1225   MPI_Comm MPI_Comm,
1226   int spin,
1227   int Epoint,
1228   double Real_Ene,
1229   int Nmat,
1230   int *NZE,
1231   int *PZE,
1232   int nl,
1233   int mpmax,
1234   double Trial_ChemP,
1235   double **Hall,
1236   double *Sall,
1237   double **DMall,
1238   double **WRG,
1239   int *conv_index_row,
1240   int *conv_index_col,
1241   int **conv_ind2,
1242   int TNZE)
1243 {
1244   int po,TNZE2,TNZE3,col,row;
1245   int i,j,k,j1,k1,l,p,q;
1246   double complex *Tx;
1247   int *Ti,*Tp,*Tp0;
1248   double tol,sum;
1249   double stime,etime;
1250   double stime1,etime1;
1251   double stime2,etime2;
1252   double complex alpha,weight;
1253   double complex ctmp,ctmp3;
1254   double av_num;
1255   int io,jo,jo_min,n;
1256   int m0,m1,m2,m3,m4,m5;
1257   int n0,n1,n2,mp0,nsr;
1258   int nb0,nb1,nc,k2,kk;
1259   int mp,mm,numa;
1260   int m,i1,i2,j2,i3,j3,nsa,nsc,nsc2,nsb;
1261   int max_nsc,max_nsb,max_nsa;
1262   dcomplex dcsum0,dcsum1,dcsum2,dcsum3,dcsum4;
1263   double complex csum2,csum3,csum4;
1264   double time1a,time1a2,time1a1,time1a0,time1b,time1c,time1d;
1265   double time1,time2,time3,time4,time5,time6,time41,time42;
1266   double time31,time32,time33,time34;
1267   double time51,time52,time53,time54,time55;
1268   double numop1,numop2,numop0;
1269   dcomplex al,be;
1270   /* for OpenMP */
1271   int OMPID,Nthrds,Nprocs;
1272   /* for MPI */
1273   int ID,*is1,*ie1,*Row2ID,tag=999;
1274   int size0,size1,k0;
1275   MPI_Status *stat_send;
1276   MPI_Request *request_send;
1277   MPI_Request *request_recv;
1278 
1279   stat_send = (MPI_Status *)malloc(sizeof(MPI_Status)*numprocs2);
1280   request_send = (MPI_Request *)malloc(sizeof(MPI_Request)*numprocs2);
1281   request_recv = (MPI_Request *)malloc(sizeof(MPI_Request)*numprocs2);
1282 
1283   al.r = 1.0;
1284   al.i = 0.0;
1285   be.r = 0.0;
1286   be.i = 0.0;
1287 
1288   numop1 = 0.0;
1289   numop2 = 0.0;
1290 
1291   time1 = 0.0;
1292   time1a= 0.0;
1293   time1a0= 0.0;
1294   time1a1= 0.0;
1295   time1a2= 0.0;
1296   time1b= 0.0;
1297   time1c= 0.0;
1298   time1d= 0.0;
1299   time2 = 0.0;
1300   time3 = 0.0;
1301   time31= 0.0;
1302   time32= 0.0;
1303   time33= 0.0;
1304   time34= 0.0;
1305   time4 = 0.0;
1306   time41= 0.0;
1307   time42= 0.0;
1308   time5 = 0.0;
1309   time51= 0.0;
1310   time52= 0.0;
1311   time53= 0.0;
1312   time54= 0.0;
1313   time55= 0.0;
1314   time6 = 0.0;
1315 
1316   dtime(&stime);
1317   dtime(&stime1);
1318 
1319   /* set alpha and weight */
1320 
1321   if (strcasecmp(mode,"contour")==0){
1322 
1323     if (ON2_method[Epoint]==1){ /* poles */
1324       alpha  = Trial_ChemP + I*(ON2_zp[Epoint].i/Beta);
1325       weight = -2.0*ON2_Rp[Epoint].r/Beta;
1326     }
1327     else if (ON2_method[Epoint]==2){ /* zeroth moment */
1328       alpha  = ON2_zp[Epoint].r + I*ON2_zp[Epoint].i;
1329       weight = I*ON2_Rp[Epoint].i;
1330     }
1331   }
1332 
1333   else if (strcasecmp(mode,"retarded")==0){
1334     alpha  = Real_Ene + 0.01/eV2Hartree*I;
1335     weight = I;
1336   }
1337 
1338   else if (strcasecmp(mode,"force")==0){
1339 
1340     if (ON2_method_f[Epoint]==1){ /* poles */
1341       alpha  = Trial_ChemP + I*(ON2_zp_f[Epoint].i/Beta);
1342       weight = -2.0*ON2_Rp_f[Epoint].r/Beta*alpha;
1343     }
1344     else if (ON2_method_f[Epoint]==2){ /* zeroth and 1st order moments */
1345       alpha  = ON2_zp_f[Epoint].r + I*ON2_zp_f[Epoint].i;
1346       weight = ON2_Rp_f[Epoint].r + I*ON2_Rp_f[Epoint].i;
1347     }
1348   }
1349 
1350   /***********************************************************
1351       construct the A-matrix of which form is defined in
1352       CSparse as "compressed-column form".
1353   ***********************************************************/
1354 
1355   Tp  = (int*)malloc(sizeof(int)*(Nmat+1));
1356   Tp0 = (int*)malloc(sizeof(int)*(Nmat+1));
1357 
1358   if ( 1<measure_time ){
1359     printf("myid=%2d Epoint =%2d %10.5f %10.5f\n",myid,Epoint,Trial_ChemP,cimag(alpha) );fflush(stdout);
1360   }
1361 
1362   dtime(&stime2);
1363 
1364   for (i=0; i<TNZE; i++){
1365     row = conv_index_row[i];                /* row index    */
1366     col = conv_index_col[i];                /* column index */
1367     conv_ind2[row][col] = 1;
1368   }
1369 
1370   dtime(&etime2);
1371   time1a0 = etime2 - stime2;
1372 
1373   dtime(&stime2);
1374 
1375   for (j=0; j<Nmat; j++) Tp0[j] = 0;
1376 
1377   for (i=0; i<TNZE; i++){
1378 
1379     row = conv_index_row[i];                /* row index    */
1380     col = conv_index_col[i];                /* column index */
1381 
1382     if (conv_ind2[row][col]==1){
1383       conv_ind2[row][col] = -(Tp0[col]+1);
1384       Tp0[col]++;
1385     }
1386   }
1387 
1388   Tp[0] = 0;
1389   for (j=1; j<=Nmat; j++){    /* j: col */
1390     Tp[j] = Tp[j-1] + Tp0[j-1];
1391   }
1392   TNZE2 = Tp[Nmat];
1393 
1394   for (i=0; i<TNZE; i++){
1395 
1396     row = conv_index_row[i];                /* row index    */
1397     col = conv_index_col[i];                /* column index */
1398 
1399     if (conv_ind2[row][col]<0){
1400       conv_ind2[row][col] = -conv_ind2[row][col] - 1 + Tp[col];
1401     }
1402   }
1403 
1404   Tx = (double complex*)malloc(sizeof(double complex)*TNZE2);
1405   Ti = (int*)malloc(sizeof(int)*TNZE2);
1406   for (i=0; i<TNZE2; i++) Tx[i] = 0.0;
1407 
1408   dtime(&etime2);
1409   time1a1 = etime2 - stime2;
1410 
1411   dtime(&stime2);
1412 
1413   for (i=0; i<TNZE; i++){
1414 
1415     row = conv_index_row[i];                /* row index    */
1416     col = conv_index_col[i];                /* column index */
1417 
1418     k = conv_ind2[row][col];
1419 
1420     Ti[k] = row;                                 /* row index    */
1421     Tx[k] += (alpha*Sall[i] - Hall[spin][i]);    /* value        */
1422   }
1423 
1424   dtime(&etime2);
1425   time1a2 = etime2 - stime2;
1426 
1427   /* make invp */
1428 
1429   invp = (int*)malloc(sizeof(int)*Nmat);
1430 
1431   for (i=0; i<Nmat; i++){
1432     j = Index_BC[nl][i];
1433     invp[j] = i;
1434   }
1435 
1436   dtime(&etime1);
1437   time1a = etime1 - stime1;
1438 
1439   /***********************************************************
1440              set up block matrices, B0, B1, and C
1441   ***********************************************************/
1442 
1443   dtime(&stime1);
1444 
1445   Bnum[0] = (int***)malloc(sizeof(int**)*nl);
1446   Bnum[1] = (int***)malloc(sizeof(int**)*nl);
1447   Cnum  = (int***)malloc(sizeof(int**)*nl);
1448   Bi[0] = (int****)malloc(sizeof(int***)*nl);
1449   Bi[1] = (int****)malloc(sizeof(int***)*nl);
1450   Ci  = (int****)malloc(sizeof(int***)*nl);
1451 
1452   Bx[0] = (double complex****)malloc(sizeof(double complex***)*nl);
1453   Bx[1] = (double complex****)malloc(sizeof(double complex***)*nl);
1454   Cx  = (double complex****)malloc(sizeof(double complex***)*nl);
1455 
1456   IBx[0] = (double complex****)malloc(sizeof(double complex***)*nl);
1457   IBx[1] = (double complex****)malloc(sizeof(double complex***)*nl);
1458   ICx  = (double complex****)malloc(sizeof(double complex***)*nl);
1459 
1460   mp = mpmax/2;
1461 
1462   for (n=0; n<nl; n++){
1463 
1464     Bnum[0][n] = (int**)malloc(sizeof(int*)*mp);
1465     Bnum[1][n] = (int**)malloc(sizeof(int*)*mp);
1466     Cnum[n]  = (int**)malloc(sizeof(int*)*mp);
1467 
1468     Bi[0][n] = (int***)malloc(sizeof(int**)*mp);
1469     Bi[1][n] = (int***)malloc(sizeof(int**)*mp);
1470     Ci[n]  = (int***)malloc(sizeof(int**)*mp);
1471 
1472     Bx[0][n] = (double complex***)malloc(sizeof(double complex**)*mp);
1473     Bx[1][n] = (double complex***)malloc(sizeof(double complex**)*mp);
1474     Cx[n]  = (double complex***)malloc(sizeof(double complex**)*mp);
1475 
1476     IBx[0][n] = (double complex***)malloc(sizeof(double complex**)*mp);
1477     IBx[1][n] = (double complex***)malloc(sizeof(double complex**)*mp);
1478     ICx[n]  = (double complex***)malloc(sizeof(double complex**)*mp);
1479 
1480     for (m=0; m<mp; m++){
1481 
1482       Bnum[0][n][m] = (int*)malloc(sizeof(int)*(EindC[n][m]-SindC[n][m]+1));
1483       Bnum[1][n][m] = (int*)malloc(sizeof(int)*(EindC[n][m]-SindC[n][m]+1));
1484       Cnum[n][m]  = (int*)malloc(sizeof(int)*(EindC[n][m]-SindC[n][m]+1));
1485 
1486       Bi[0][n][m] = (int**)malloc(sizeof(int*)*(EindC[n][m]-SindC[n][m]+1));
1487       Bi[1][n][m] = (int**)malloc(sizeof(int*)*(EindC[n][m]-SindC[n][m]+1));
1488       Ci[n][m]  = (int**)malloc(sizeof(int*)*(EindC[n][m]-SindC[n][m]+1));
1489 
1490       Bx[0][n][m] = (double complex**)malloc(sizeof(double complex*)*(EindC[n][m]-SindC[n][m]+1));
1491       Bx[1][n][m] = (double complex**)malloc(sizeof(double complex*)*(EindC[n][m]-SindC[n][m]+1));
1492       Cx[n][m]  = (double complex**)malloc(sizeof(double complex*)*(EindC[n][m]-SindC[n][m]+1));
1493 
1494       IBx[0][n][m] = (double complex**)malloc(sizeof(double complex*)*(EindC[n][m]-SindC[n][m]+1));
1495       IBx[1][n][m] = (double complex**)malloc(sizeof(double complex*)*(EindC[n][m]-SindC[n][m]+1));
1496       ICx[n][m]  = (double complex**)malloc(sizeof(double complex*)*(EindC[n][m]-SindC[n][m]+1));
1497 
1498       for (i=SindC[n][m]; i<=EindC[n][m]; i++){
1499 
1500         i1 = Index_BC[n][i];
1501 
1502         nb0 = 0;
1503         nb1 = 0;
1504         nc  = 0;
1505 
1506         for (j=Tp[i1]; j<Tp[i1+1]; j++){
1507 
1508           j1 = invp[Ti[j]];
1509 
1510           if      ( SindB[n][2*m  ]<=j1 && j1<=EindB[n][2*m  ] ) nb0++;
1511           else if ( SindB[n][2*m+1]<=j1 && j1<=EindB[n][2*m+1] ) nb1++;
1512           else if ( SindC[n][m    ]<=j1 && j1<=EindC[n][m    ] ) nc++;
1513         }
1514 
1515         Bnum[0][n][m][i-SindC[n][m]] = nb0;
1516         Bnum[1][n][m][i-SindC[n][m]] = nb1;
1517         Cnum[n][m][i-SindC[n][m]] = nc;
1518 
1519         Bi[0][n][m][i-SindC[n][m]] = (int*)malloc(sizeof(int)*nb0);
1520         Bi[1][n][m][i-SindC[n][m]] = (int*)malloc(sizeof(int)*nb1);
1521         Ci[n][m][i-SindC[n][m]] = (int*)malloc(sizeof(int)*nc);
1522 
1523         Bx[0][n][m][i-SindC[n][m]] = (double complex*)malloc(sizeof(double complex)*nb0);
1524         Bx[1][n][m][i-SindC[n][m]] = (double complex*)malloc(sizeof(double complex)*nb1);
1525         Cx[n][m][i-SindC[n][m]] = (double complex*)malloc(sizeof(double complex)*nc);
1526 
1527         IBx[0][n][m][i-SindC[n][m]] = (double complex*)malloc(sizeof(double complex)*nb0);
1528         IBx[1][n][m][i-SindC[n][m]] = (double complex*)malloc(sizeof(double complex)*nb1);
1529         ICx[n][m][i-SindC[n][m]] = (double complex*)malloc(sizeof(double complex)*nc);
1530 
1531         for (k=0; k<nb0; k++) IBx[0][n][m][i-SindC[n][m]][k] = 0.0;
1532         for (k=0; k<nb1; k++) IBx[1][n][m][i-SindC[n][m]][k] = 0.0;
1533         for (k=0; k<nc; k++)  ICx[n][m][i-SindC[n][m]][k] = 0.0;
1534 
1535         nb0 = 0;
1536         nb1 = 0;
1537         nc  = 0;
1538 
1539         for (j=Tp[i1]; j<Tp[i1+1]; j++){
1540 
1541           j1 = invp[Ti[j]];
1542           i2 = i - SindC[n][m];
1543 
1544           if      ( SindB[n][2*m]<=j1 && j1<=EindB[n][2*m] ){
1545 
1546             Bi[0][n][m][i2][nb0] = j1 - SindB[n][2*m];
1547             Bx[0][n][m][i2][nb0] = Tx[j];
1548 
1549             nb0++;
1550 	  }
1551 
1552           else if ( SindB[n][2*m+1]<=j1 && j1<=EindB[n][2*m+1] ){
1553 
1554             Bi[1][n][m][i2][nb1] = j1 - SindB[n][2*m+1];
1555             Bx[1][n][m][i2][nb1] = Tx[j];
1556 
1557             nb1++;
1558 	  }
1559           else if ( SindC[n][m]<=j1 && j1<=EindC[n][m] ){
1560 
1561             Ci[n][m][i2][nc] = j1 - SindC[n][m];
1562             Cx[n][m][i2][nc] = Tx[j];
1563 
1564             nc++;
1565 	  }
1566 
1567         } /* j */
1568       } /* i */
1569     } /* m */
1570 
1571     mp /= 2;
1572 
1573   } /* n */
1574 
1575   dtime(&etime1);
1576   time1b = etime1 - stime1;
1577 
1578   /***********************************************************
1579              calculate A^-1 for the smallest blocks
1580   ***********************************************************/
1581 
1582   Anum = (int**)malloc(sizeof(int*)*mpmax);
1583   Ai = (int***)malloc(sizeof(int**)*mpmax);
1584   Ax = (double complex***)malloc(sizeof(double complex**)*mpmax);
1585   IAx = (double complex***)malloc(sizeof(double complex**)*mpmax);
1586 
1587   for (m=0; m<mpmax; m++){
1588 
1589     nsa = EindB[0][m] - SindB[0][m] + 1;
1590 
1591     Anum[m] = (int*)malloc(sizeof(int)*nsa);
1592     Ai[m] = (int**)malloc(sizeof(int*)*nsa);
1593     Ax[m] = (double complex**)malloc(sizeof(double complex*)*nsa);
1594     IAx[m] = (double complex**)malloc(sizeof(double complex*)*nsa);
1595 
1596     for (i=SindB[0][m]; i<=EindB[0][m]; i++){
1597 
1598       i1 = Index_BC[0][i];
1599       i2 = i - SindB[0][m];
1600       numa = 0;
1601 
1602       for (j=Tp[i1]; j<Tp[i1+1]; j++){
1603         j1 = invp[Ti[j]];
1604 	if ( SindB[0][m]<=j1 && j1<=EindB[0][m] ) numa++;
1605       }
1606 
1607       Anum[m][i2] = numa;
1608 
1609       Ai[m][i2] = (int*)malloc(sizeof(int)*numa);
1610       Ax[m][i2] = (double complex*)malloc(sizeof(double complex)*numa);
1611       IAx[m][i2] = (double complex*)malloc(sizeof(double complex)*numa);
1612       for (k=0; k<numa; k++) IAx[m][i2][k] = 0.0;
1613 
1614       numa = 0;
1615 
1616       for (j=Tp[i1]; j<Tp[i1+1]; j++){
1617 
1618         j1 = invp[Ti[j]];
1619 
1620 	if ( SindB[0][m]<=j1 && j1<=EindB[0][m] ){
1621 
1622           Ai[m][i2][numa] = j1 - SindB[0][m];
1623           Ax[m][i2][numa] = Tx[j];
1624 
1625           numa++;
1626 	}
1627       }
1628     }
1629   }
1630 
1631   /* calculate IA */
1632 
1633   mp = mpmax;
1634   n = 0;
1635 
1636   max_nsa = 0;
1637   for (m=0; m<mp; m++){
1638     nsa = EindB[n][m] - SindB[n][m] + 1;
1639     if (max_nsa<nsa) max_nsa = nsa;
1640   }
1641 
1642   IA0 = (dcomplex*)malloc(sizeof(dcomplex)*max_nsa*max_nsa);
1643 
1644   IA = (double**)malloc(sizeof(double*)*mp);
1645   for (m=0; m<mp; m++){
1646     nsa = EindB[n][m] - SindB[n][m] + 1;
1647     IA[m] = (double*)malloc(sizeof(double)*2*nsa*nsa);
1648     for (i=0; i<2*nsa*nsa; i++) IA[m][i] = 0.0;
1649   }
1650 
1651   mp = mpmax;
1652   n = 0;
1653 
1654   for ( m=myid2; m<mp; m+=numprocs2 ){
1655 
1656     nsa = EindB[n][m] - SindB[n][m] + 1;
1657 
1658     for (i=0; i<nsa; i++){
1659       for (j=0; j<nsa; j++){
1660         IA0[nsa*j+i].r = 0.0;
1661         IA0[nsa*j+i].i = 0.0;
1662       }
1663     }
1664 
1665     for (i=0; i<nsa; i++){
1666       for (j=0; j<Anum[m][i]; j++){
1667         j2 = Ai[m][i][j];
1668         IA0[nsa*j2+i].r = creal(Ax[m][i][j]);
1669         IA0[nsa*j2+i].i = cimag(Ax[m][i][j]);
1670       }
1671     }
1672 
1673     dtime(&stime1);
1674 
1675     if (0<nsa) Lapack_LU_Zinverse(nsa,IA0);
1676 
1677     dtime(&etime1);
1678     time1c += etime1 - stime1;
1679 
1680     for (i=0; i<nsa; i++){
1681       for (j=0; j<nsa; j++){
1682         IA[m][2*nsa*j+2*i  ] = IA0[nsa*j+i].r;
1683         IA[m][2*nsa*j+2*i+1] = IA0[nsa*j+i].i;
1684       }
1685     }
1686 
1687     /* store IA into IAx */
1688 
1689     for (i=0; i<(EindB[n][m]-SindB[n][m]+1); i++){
1690       for (j=0; j<Anum[m][i]; j++){
1691         j2 = Ai[m][i][j];
1692         IAx[m][i][j] = IA0[nsa*j2+i].r + I*IA0[nsa*j2+i].i;
1693       }
1694     }
1695 
1696   } /* m */
1697 
1698 
1699   dtime(&stime1);
1700 
1701   /* MPI: IA */
1702 
1703   if (1<numprocs2){
1704     for ( m=0; m<mp; m++ ){
1705       nsa = EindB[n][m] - SindB[n][m] + 1;
1706       ID = m % numprocs2;
1707       MPI_Bcast(&IA[m][0], nsa*nsa*2, MPI_DOUBLE, ID, MPI_Comm);
1708     }
1709   }
1710 
1711   /***********************************************************
1712        allocation of IS, VvecTmp, Vvec, and Lvec
1713   ***********************************************************/
1714 
1715   mp = mpmax/2;
1716   IS = (dcomplex***)malloc(sizeof(dcomplex**)*nl);
1717   for (n=0; n<nl; n++){
1718     IS[n] = (dcomplex**)malloc(sizeof(dcomplex*)*mp);
1719     for (m=0; m<mp; m++){
1720       nsa = EindC[n][m] - SindC[n][m] + 1;
1721 
1722       IS[n][m] = (dcomplex*)malloc(sizeof(dcomplex)*nsa*nsa);
1723       for (i=0; i<nsa*nsa; i++) IS[n][m][i] = Complex(0.0,0.0);
1724     }
1725     mp /= 2;
1726   }
1727 
1728   mp = mpmax/2;
1729   max_nsc = 0;
1730   for (n=0; n<nl; n++){
1731     for (m=0; m<mp; m++){
1732       nsc = EindC[n][m] - SindC[n][m] + 1;
1733       if (max_nsc<nsc) max_nsc = nsc;
1734     }
1735     mp /= 2;
1736   }
1737 
1738   mp = mpmax/2;
1739   max_nsb = 0;
1740   for (n=0; n<nl; n++){
1741     for (m=0; m<mp; m++){
1742       nsb = EindB[n][2*m]-SindB[n][2*m] + 1;
1743       if (max_nsb<nsb) max_nsb = nsb;
1744 
1745       nsb = EindB[n][2*m+1]-SindB[n][2*m+1] + 1;
1746       if (max_nsb<nsb) max_nsb = nsb;
1747     }
1748     mp /= 2;
1749   }
1750 
1751   ISTmp = (double*)malloc(sizeof(double)*max_nsc*max_nsc*2);
1752   VvecTmp = (double*)malloc(sizeof(double)*max_nsc*Nmat*2);
1753 
1754   Vvec = (double complex**)malloc(sizeof(double complex*)*max_nsc);
1755   for (i=0; i<max_nsc; i++){
1756     Vvec[i] = (double complex*)malloc(sizeof(double complex)*Nmat);
1757   }
1758 
1759   Vvec2 = (double complex**)malloc(sizeof(double complex*)*Nmat);
1760   for (i=0; i<Nmat; i++){
1761     Vvec2[i] = (double complex*)malloc(sizeof(double complex)*max_nsc);
1762   }
1763 
1764   mp = mpmax/2;
1765   Lvec[0] = (dcomplex***)malloc(sizeof(dcomplex**)*nl);
1766   for (n=0; n<nl; n++){
1767     Lvec[0][n] = (dcomplex**)malloc(sizeof(dcomplex*)*mp);
1768     for (m=0; m<mp; m++){
1769       kk = (EindB[n][2*m]-SindB[n][2*m]+1)*(EindC[n][m]-SindC[n][m]+1);
1770       Lvec[0][n][m] = (dcomplex*)malloc(sizeof(dcomplex)*kk);
1771     }
1772     mp /= 2;
1773   }
1774 
1775   mp = mpmax/2;
1776   Lvec[1] = (dcomplex***)malloc(sizeof(dcomplex**)*nl);
1777   for (n=0; n<nl; n++){
1778     Lvec[1][n] = (dcomplex**)malloc(sizeof(dcomplex*)*mp);
1779     for (m=0; m<mp; m++){
1780       kk = (EindB[n][2*m+1]-SindB[n][2*m+1]+1)*(EindC[n][m]-SindC[n][m]+1);
1781       Lvec[1][n][m] = (dcomplex*)malloc(sizeof(dcomplex)*kk);
1782     }
1783     mp /= 2;
1784   }
1785 
1786   Q0 = (double complex**)malloc(sizeof(double complex*)*max_nsc);
1787   for (i=0; i<max_nsc; i++){
1788     Q0[i] = (double complex*)malloc(sizeof(double complex)*max_nsc);
1789   }
1790 
1791   Q1 = (double complex**)malloc(sizeof(double complex*)*max_nsc);
1792   for (i=0; i<max_nsc; i++){
1793     Q1[i] = (double complex*)malloc(sizeof(double complex)*max_nsc);
1794   }
1795 
1796   is1 = (int*)malloc(sizeof(int)*max_nsc);
1797   ie1 = (int*)malloc(sizeof(int)*max_nsc);
1798 
1799   Row2ID = (int*)malloc(sizeof(int)*Nmat);
1800 
1801   dtime(&etime1);
1802   time1d = etime1 - stime1;
1803 
1804   dtime(&etime);
1805 
1806   time1 = etime - stime;
1807 
1808   /***********************************************************
1809          main calculations by the recurrence relations
1810   ***********************************************************/
1811 
1812   mp = mpmax/2;
1813   m2 = 1;
1814 
1815   for (n1=0; n1<nl; n1++){
1816 
1817     /*******************************
1818         calculate the initial V0
1819     *******************************/
1820 
1821     dtime(&stime);
1822 
1823     for (m=0; m<mpmax/2; m++){
1824       for (m0=0; m0<2; m0++){
1825 
1826 	nsa = EindB[0][2*m+m0] - SindB[0][2*m+m0] + 1;
1827 
1828 	m3 = (2*m+m0)/(m2*2);
1829 	m4 = ((2*m+m0)/m2)%2;
1830 
1831         nsc = EindC[n1][m3] - SindC[n1][m3] + 1;
1832 
1833 	/***********************************
1834         division of nsc for
1835         MPI parallelization in MPI_Comm
1836 	***********************************/
1837 
1838 	if ( numprocs2<=nsc ){
1839 
1840 	  av_num = (double)nsc/(double)numprocs2;
1841 
1842 	  for (ID=0; ID<numprocs2; ID++){
1843 	    is1[ID] = (int)( (av_num+1.0e-12)*(double)ID);
1844 	    ie1[ID] = (int)( (av_num+1.0e-12)*(double)(ID+1)) - 1;
1845 	  }
1846 
1847 	  is1[0] = 0;
1848 	  ie1[numprocs2-1] = nsc-1;
1849 	}
1850 
1851 	else{
1852 
1853 	  for (ID=0; ID<nsc; ID++){
1854 	    is1[ID] = ID;
1855 	    ie1[ID] = ID;
1856 	  }
1857 	  for (ID=nsc; ID<numprocs2; ID++){
1858 	    is1[ID] =  0;
1859 	    ie1[ID] = -1;
1860 	  }
1861 	}
1862 
1863 #pragma omp parallel shared(myid2,is1,ie1,nsc,nsa,m4,n1,m3,Bnum,Bi,SindB,EindB,m,m0,IA,Bx,Vvec) private(OMPID,Nthrds,Nprocs,j,i,k,k1,k2)
1864 	{
1865 
1866 	  double complex ctmp1,csum1;
1867 
1868 	  /* get info. on OpenMP */
1869 
1870 	  OMPID = omp_get_thread_num();
1871 	  Nthrds = omp_get_num_threads();
1872 	  Nprocs = omp_get_num_procs();
1873 
1874 	  for ( j=is1[myid2]+OMPID; j<=ie1[myid2]; j+=Nthrds ){
1875 
1876 	    for (i=0; i<nsa; i++){
1877 
1878 	      csum1 = 0.0;
1879 
1880 	      for (k=0; k<Bnum[m4][n1][m3][j]; k++){
1881 
1882 		k1 = Bi[m4][n1][m3][j][k] + SindB[n1][2*m3+m4];
1883 		k2 = k1 - SindB[0][2*m+m0];
1884 
1885 		if (SindB[0][2*m+m0]<=k1 && k1<=EindB[0][2*m+m0]){
1886 
1887 		  ctmp1 = IA[2*m+m0][2*nsa*k2+2*i] + I*IA[2*m+m0][2*nsa*k2+2*i+1];
1888 		  csum1 += ctmp1*Bx[m4][n1][m3][j][k];
1889 		}
1890 	      }
1891 
1892 	      Vvec[j][SindB[0][2*m+m0]+i] = csum1;
1893 
1894 	    }
1895 	  } /* j */
1896 
1897 	} /* #pragma omp parallel */
1898 
1899       } /* m0 */
1900     } /* m */
1901 
1902     dtime(&etime);
1903     time2 += etime - stime;
1904 
1905     /*******************************
1906           recurrence relations
1907     *******************************/
1908 
1909     for (m=0; m<mp; m++){
1910 
1911       dtime(&stime);
1912       dtime(&stime1);
1913 
1914       nsc = EindC[n1][m] - SindC[n1][m] + 1;
1915       mp0 = m2;
1916 
1917       /***********************************
1918         division of nsc for
1919         MPI parallelization in MPI_Comm
1920       ***********************************/
1921 
1922       if ( numprocs2<=nsc ){
1923 
1924 	av_num = (double)nsc/(double)numprocs2;
1925 
1926 	for (ID=0; ID<numprocs2; ID++){
1927 	  is1[ID] = (int)( (av_num+1.0e-12)*(double)ID);
1928 	  ie1[ID] = (int)( (av_num+1.0e-12)*(double)(ID+1)) - 1;
1929 	}
1930 
1931 	is1[0] = 0;
1932 	ie1[numprocs2-1] = nsc-1;
1933       }
1934 
1935       else{
1936 
1937 	for (ID=0; ID<nsc; ID++){
1938 	  is1[ID] = ID;
1939 	  ie1[ID] = ID;
1940 	}
1941 	for (ID=nsc; ID<numprocs2; ID++){
1942 	  is1[ID] =  0;
1943 	  ie1[ID] = -1;
1944 	}
1945       }
1946 
1947       dtime(&etime1);
1948       time31 += etime1 - stime1;
1949 
1950       dtime(&stime2);
1951 
1952       /***********************************
1953        loop for n2: which means
1954        climbing the ladder at the level n1
1955       ***********************************/
1956 
1957       for (n2=0; n2<n1; n2++){
1958         for (m1=0; m1<mp0; m1++){
1959 
1960           mm = mp0*m + m1;
1961 
1962           dtime(&stime1);
1963 
1964           nsr = EindC[n2][mm] - SindC[n2][mm] + 1;
1965 
1966           /***********************************
1967                      calculation of Q
1968           ***********************************/
1969 
1970 #pragma omp parallel shared(is1,ie1,myid2,nsr,Bnum,n1,n2,m1,mm,SindB,Bi,Bx,Vvec,IS,Q1,Q0,SindC,numop1) private(OMPID,Nthrds,Nprocs,j,i,k,k1,m5,l,m3,m4,numop0)
1971 
1972           {
1973 
1974 	    double complex csum1;
1975 
1976             numop0 = 0.0;
1977 
1978     	    /* get info. on OpenMP */
1979 
1980   	    OMPID = omp_get_thread_num();
1981 	    Nthrds = omp_get_num_threads();
1982 	    Nprocs = omp_get_num_procs();
1983 
1984 	    for ( j=is1[myid2]+OMPID; j<=ie1[myid2]; j+=Nthrds ){
1985 
1986 	      for (i=0; i<nsr; i++){
1987 
1988 		csum1 = 0.0;
1989 
1990 		/* B_{even} V_{even} */
1991 
1992 		for (k=0; k<Bnum[0][n2][mm][i]; k++){
1993 		  k1 = SindB[n2][2*mm] + Bi[0][n2][mm][i][k];
1994 		  csum1 += Bx[0][n2][mm][i][k]*Vvec[j][k1];
1995 		}
1996 
1997 		numop0 += (double)Bnum[0][n2][mm][i];
1998 
1999 		/* B_{odd} V_{odd} */
2000 
2001 		for (k=0; k<Bnum[1][n2][mm][i]; k++){
2002 		  k1 = SindB[n2][2*mm+1] + Bi[1][n2][mm][i][k];
2003 		  csum1 += Bx[1][n2][mm][i][k]*Vvec[j][k1];
2004 		}
2005 
2006 		numop0 += (double)Bnum[1][n2][mm][i];
2007 
2008 		/* B[C] */
2009 
2010 		m5 = 1;
2011 		for (l=0; l<(n1-n2); l++) m5 *= 2;
2012 
2013 		m3 = mm/m5;
2014 		m4 = (mm/(m5/2))%2;
2015 
2016 		for (k=0; k<Bnum[m4][n1][m3][j]; k++){
2017 		  k1 = Bi[m4][n1][m3][j][k] + SindB[n1][2*m3+m4] - SindC[n2][mm];
2018 		  if (k1==i) csum1 -= Bx[m4][n1][m3][j][k];
2019 		}
2020 
2021 		/* store the temporal result into Q0 */
2022 
2023 		Q0[j][i] = csum1;
2024 
2025 	      } /* i */
2026 
2027 	      /* calculate Q */
2028 
2029 	      for (i=0; i<nsr; i++){
2030 
2031 		csum1 = 0.0;
2032 		for (k=0; k<nsr; k++){
2033 		  csum1 += (IS[n2][mm][i*nsr+k].r+I*IS[n2][mm][i*nsr+k].i)*Q0[j][k];
2034 		}
2035 
2036 		Q1[j][i] = csum1;
2037 	      }
2038 
2039 	    } /* j */
2040 
2041             numop1 += numop0;
2042 
2043 	  } /* #pragma omp parallel */
2044 
2045           dtime(&etime1);
2046           time32 += etime1 - stime1;
2047 
2048           /**********************************
2049                         update V
2050           **********************************/
2051 
2052           dtime(&stime1);
2053 
2054 #pragma omp parallel shared(is1,ie1,myid2,EindB,SindB,n2,mm,nsr,Lvec,Q1,Vvec,SindC,numop1) \
2055  private(OMPID,Nthrds,Nprocs,j,i,i2,k,i1,numop0)
2056 
2057           {
2058 
2059 	    double complex csum0;
2060 
2061             numop0 = 0.0;
2062 
2063     	    /* get info. on OpenMP */
2064 
2065   	    OMPID = omp_get_thread_num();
2066 	    Nthrds = omp_get_num_threads();
2067 	    Nprocs = omp_get_num_procs();
2068 
2069 	    for ( j=is1[myid2]+OMPID; j<=ie1[myid2]; j+=Nthrds ){
2070 
2071   	      /* V_{even} */
2072 
2073 	      for (i=0; i<(EindB[n2][2*mm]-SindB[n2][2*mm]+1); i++){
2074 
2075 		i2 = i*nsr;
2076 		csum0 = 0.0;
2077 
2078 		for (k=0; k<nsr; k++){
2079 		  csum0 += (Lvec[0][n2][mm][i2+k].r+I*Lvec[0][n2][mm][i2+k].i)*Q1[j][k];
2080 		}
2081 
2082 		Vvec[j][SindB[n2][2*mm]+i] += csum0;
2083 	      }
2084 
2085               numop0 += (double)(EindB[n2][2*mm]-SindB[n2][2*mm]+1)*nsr;
2086 
2087 	      /* V_{odd} */
2088 
2089 	      for (i=0; i<(EindB[n2][2*mm+1]-SindB[n2][2*mm+1]+1); i++){
2090 
2091 		i2 = i*nsr;
2092 		csum0 = 0.0;
2093 
2094 		for (k=0; k<nsr; k++){
2095 		  csum0 += (Lvec[1][n2][mm][i2+k].r+I*Lvec[1][n2][mm][i2+k].i)*Q1[j][k];
2096 		}
2097 
2098 		Vvec[j][SindB[n2][2*mm+1]+i] += csum0;
2099 	      }
2100 
2101               numop0 += (double)(EindB[n2][2*mm+1]-SindB[n2][2*mm+1]+1)*nsr;
2102 
2103 	      /* add the contribution of -Q */
2104 
2105 	      for (i=0; i<nsr; i++){
2106 		i1 = SindC[n2][mm] + i;
2107 		Vvec[j][i1] = -Q1[j][i];
2108 	      }
2109 
2110 	    } /* j */
2111 
2112             numop1 += numop0;
2113 
2114 	  } /* #pragma omp parallel */
2115 
2116           dtime(&etime1);
2117           time33 += etime1 - stime1;
2118 
2119 	} /* m1 */
2120 
2121         /* update mp0 */
2122 
2123         mp0 /= 2;
2124 
2125       } /* n2 */
2126 
2127       dtime(&etime2);
2128       time34 += etime2 - stime2;
2129 
2130       dtime(&etime);
2131       time3 += etime - stime;
2132 
2133       /*******************************
2134            calculate the S-matrix
2135       *******************************/
2136 
2137       dtime(&stime);
2138 
2139       /* initialize IS */
2140 
2141       for (j=0; j<nsc; j++){
2142         for (i=0; i<nsc; i++){
2143 
2144 	  ISTmp[2*j*nsc+2*i  ] = 0.0;
2145 	  ISTmp[2*j*nsc+2*i+1] = 0.0;
2146 	}
2147       }
2148 
2149       /* put C to IS */
2150 
2151       for (i=0; i<nsc; i++){
2152 	for (j=0; j<Cnum[n1][m][i]; j++){
2153 
2154 	  j1 = Ci[n1][m][i][j];
2155 
2156           if (is1[myid2]<=j1 && j1<=ie1[myid2]){
2157 
2158   	    ISTmp[2*j1*nsc+2*i  ] = creal(Cx[n1][m][i][j]);
2159 	    ISTmp[2*j1*nsc+2*i+1] = cimag(Cx[n1][m][i][j]);
2160 	  }
2161 	}
2162       }
2163 
2164       /* calculate the inner products */
2165 
2166       dtime(&stime1);
2167 
2168 #pragma omp parallel shared(is1,ie1,myid2,nsc,Bnum,n1,m,SindB,Bi,Bx,Vvec,ISTmp) private(OMPID,Nthrds,Nprocs,j,i,k,k1)
2169 
2170       {
2171 
2172 	double complex csum1;
2173 
2174 
2175 	/* get info. on OpenMP */
2176 
2177 	OMPID = omp_get_thread_num();
2178 	Nthrds = omp_get_num_threads();
2179 	Nprocs = omp_get_num_procs();
2180 
2181 	for ( j=is1[myid2]+OMPID; j<=ie1[myid2]; j+=Nthrds ){
2182 
2183 	  for (i=0; i<nsc; i++){
2184 
2185 	    csum1 = 0.0;
2186 
2187 	    /* contribution of even m */
2188 
2189 	    for (k=0; k<Bnum[0][n1][m][i]; k++){
2190 
2191 	      k1 = SindB[n1][2*m] + Bi[0][n1][m][i][k];
2192 	      csum1 += Bx[0][n1][m][i][k]*Vvec[j][k1];
2193 	    }
2194 
2195 	    /* contribution of odd m */
2196 
2197 	    for (k=0; k<Bnum[1][n1][m][i]; k++){
2198 
2199 	      k1 = SindB[n1][2*m+1] + Bi[1][n1][m][i][k];
2200 	      csum1 += Bx[1][n1][m][i][k]*Vvec[j][k1];
2201 	    }
2202 
2203 	    ISTmp[2*j*nsc+2*i  ] -= creal(csum1);
2204 	    ISTmp[2*j*nsc+2*i+1] -= cimag(csum1);
2205 	  }
2206 	}
2207 
2208       } /* #pragma omp parallel */
2209 
2210       /* MPI: IS */
2211 
2212       if (1<numprocs2){
2213 
2214 	/* sending */
2215 
2216 	size0 = (ie1[myid2]-is1[myid2]+1)*nsc*2;
2217 	if (size0<0) size0 = 0;
2218 	k0 = 2*is1[myid2]*nsc;
2219 	if (k0<0) k0 = 0;
2220 
2221 	for (ID=0; ID<numprocs2; ID++){
2222 	  MPI_Isend(&ISTmp[k0], size0, MPI_DOUBLE, ID, tag, MPI_Comm, &request_send[ID]);
2223 	}
2224 
2225 	/* receiving */
2226 
2227 	for (ID=0; ID<numprocs2; ID++){
2228 
2229 	  size1 = (ie1[ID]-is1[ID]+1)*nsc*2;
2230 	  if (size1<0) size1 = 0;
2231 	  k1 = 2*is1[ID]*nsc;
2232 	  if (k1<0) k1 = 0;
2233 
2234 	  MPI_Irecv(&ISTmp[k1], size1, MPI_DOUBLE, ID, tag, MPI_Comm, &request_recv[ID]);
2235 	}
2236 
2237         /* Waitall */
2238 
2239 	MPI_Waitall(numprocs2,request_recv,stat_send);
2240 	MPI_Waitall(numprocs2,request_send,stat_send);
2241       }
2242 
2243       /* set up IS */
2244 
2245       for (j=0; j<nsc; j++){
2246         for (i=0; i<nsc; i++){
2247 	  IS[n1][m][j*nsc+i].r = ISTmp[2*j*nsc+2*i  ];
2248 	  IS[n1][m][j*nsc+i].i = ISTmp[2*j*nsc+2*i+1];
2249 	}
2250       }
2251 
2252       dtime(&etime1);
2253       time41 += etime1 - stime1;
2254 
2255       /*************************************f******************
2256        MPI communication of Vvec using VvecTmp:
2257        In order to hide the communication, the MPI_Waitall are
2258        called after the calculation of the inverse of S
2259       ********************************************************/
2260 
2261       for (j=is1[myid2]; j<=ie1[myid2]; j++){
2262         for (i=0; i<Nmat; i++){
2263           VvecTmp[2*j*Nmat+2*i  ] = creal(Vvec[j][i]);
2264           VvecTmp[2*j*Nmat+2*i+1] = cimag(Vvec[j][i]);
2265 	}
2266       }
2267 
2268       if (1<numprocs2){
2269 
2270 	/* sending */
2271 
2272 	size0 = (ie1[myid2]-is1[myid2]+1)*Nmat*2;
2273 	if (size0<0) size0 = 0;
2274 	k0 = 2*is1[myid2]*Nmat;
2275 	if (k0<0) k0 = 0;
2276 
2277 	for (ID=0; ID<numprocs2; ID++){
2278 	  MPI_Isend(&VvecTmp[k0], size0, MPI_DOUBLE, ID, tag, MPI_Comm, &request_send[ID]);
2279 	}
2280 
2281 	/* receiving */
2282 
2283 	for (ID=0; ID<numprocs2; ID++){
2284 
2285 	  size1 = (ie1[ID]-is1[ID]+1)*Nmat*2;
2286 	  if (size1<0) size1 = 0;
2287 	  k1 = 2*is1[ID]*Nmat;
2288 	  if (k1<0) k1 = 0;
2289 
2290 	  MPI_Irecv(&VvecTmp[k1], size1, MPI_DOUBLE, ID, tag, MPI_Comm, &request_recv[ID]);
2291 	}
2292       }
2293 
2294       /*******************************
2295         call Lapack_LU_Zinverse
2296         to calculate the inverse of S
2297       *******************************/
2298 
2299       dtime(&stime1);
2300 
2301       if (0<nsc) Lapack_LU_Zinverse(nsc,IS[n1][m]);
2302 
2303       dtime(&etime1);
2304       time42 += etime1 - stime1;
2305 
2306       dtime(&etime);
2307       time4 += etime - stime;
2308 
2309       /****************************************************
2310        To end MPI communication of VvecTmp, call waitall,
2311       ****************************************************/
2312 
2313       if (1<numprocs2){
2314 	MPI_Waitall(numprocs2,request_recv,stat_send);
2315 	MPI_Waitall(numprocs2,request_send,stat_send);
2316       }
2317 
2318       /*******************************
2319             store Vvec as Lvec
2320       *******************************/
2321 
2322       for (j=0; j<nsc; j++){
2323         for (i=SindB[n1][2*m]; i<=EindB[n1][2*m]; i++){
2324 	  Lvec[0][n1][m][ (i-SindB[n1][2*m])*nsc+j ].r = VvecTmp[2*j*Nmat+2*i  ];
2325 	  Lvec[0][n1][m][ (i-SindB[n1][2*m])*nsc+j ].i = VvecTmp[2*j*Nmat+2*i+1];
2326 	}
2327       }
2328 
2329       for (j=0; j<nsc; j++){
2330         for (i=SindB[n1][2*m+1]; i<=EindB[n1][2*m+1]; i++){
2331 	  Lvec[1][n1][m][ (i-SindB[n1][2*m+1])*nsc+j ].r = VvecTmp[2*j*Nmat+2*i  ];
2332 	  Lvec[1][n1][m][ (i-SindB[n1][2*m+1])*nsc+j ].i = VvecTmp[2*j*Nmat+2*i+1];
2333 	}
2334       }
2335 
2336       /*****************************************
2337                      set up Row2ID
2338       *****************************************/
2339 
2340       ID = 0;
2341       for (i=0; i<(EindB[n1][2*m+1]-SindB[n1][2*m]+1); i++){
2342 
2343 	i2 = i + SindB[n1][2*m];
2344         Row2ID[i2] = ID;
2345         ID++;
2346         ID = ID % numprocs2;
2347       }
2348 
2349       /*****************************************
2350           update entries in the inverse of A
2351       *****************************************/
2352 
2353       dtime(&stime);
2354 
2355       /*
2356       printf("n1=%2d m=%2d nsc=%2d\n",n1,m,nsc);
2357       */
2358 
2359       if (0<nsc){
2360 
2361 	/* calculate L_{even}^t S^{-1} */
2362 
2363         dtime(&stime1);
2364 
2365 #pragma omp parallel shared(n1,m,EindB,SindB,Row2ID,myid2,nsc,IS,Lvec,Vvec2,numop2) private(OMPID,Nthrds,Nprocs,i,i2,j,j2,k,i3,j3,dcsum0,numop0)
2366         {
2367 
2368           numop0 = 0.0;
2369 
2370   	  /* get info. on OpenMP */
2371 
2372   	  OMPID = omp_get_thread_num();
2373 	  Nthrds = omp_get_num_threads();
2374 	  Nprocs = omp_get_num_procs();
2375 
2376 	  for ( i=OMPID; i<(EindB[n1][2*m]-SindB[n1][2*m]+1); i+=Nthrds ){
2377 
2378 	    if ( Row2ID[i+SindB[n1][2*m]]==myid2 ){
2379 
2380 	      i2 = i*nsc;
2381 
2382 	      for (j=0; j<nsc; j++){
2383 
2384 		j2 = j*nsc;
2385 
2386 		dcsum0.r = 0.0; dcsum0.i = 0.0;
2387 
2388 		for (k=0; k<nsc; k++){
2389 
2390 		  i3 = i2 + k;
2391 		  j3 = j2 + k;
2392 
2393 		  dcsum0.r += IS[n1][m][j3].r*Lvec[0][n1][m][i3].r - IS[n1][m][j3].i*Lvec[0][n1][m][i3].i;
2394 		  dcsum0.i += IS[n1][m][j3].i*Lvec[0][n1][m][i3].r + IS[n1][m][j3].r*Lvec[0][n1][m][i3].i;
2395 		}
2396 
2397 		Vvec2[i+SindB[n1][2*m]][j] = dcsum0.r + I*dcsum0.i;
2398 	      }
2399 
2400   	      numop0 += (double)nsc*nsc;
2401 	    }
2402 	  }
2403 
2404 	  /* calculate L_{odd}^t S^{-1} */
2405 
2406 	  for ( i=OMPID; i<(EindB[n1][2*m+1]-SindB[n1][2*m+1]+1); i+=Nthrds ){
2407 
2408 	    if ( Row2ID[i+SindB[n1][2*m+1]]==myid2 ){
2409 
2410 	      i2 = i*nsc;
2411 
2412 	      for (j=0; j<nsc; j++){
2413 
2414 		j2 = j*nsc;
2415 
2416 		dcsum0.r = 0.0; dcsum0.i = 0.0;
2417 
2418 		for (k=0; k<nsc; k++){
2419 
2420 		  j3 = j2 + k;
2421 		  i3 = i2 + k;
2422 
2423 		  dcsum0.r += IS[n1][m][j3].r*Lvec[1][n1][m][i3].r - IS[n1][m][j3].i*Lvec[1][n1][m][i3].i;
2424 		  dcsum0.i += IS[n1][m][j3].i*Lvec[1][n1][m][i3].r + IS[n1][m][j3].r*Lvec[1][n1][m][i3].i;
2425 		}
2426 
2427 		Vvec2[i+SindB[n1][2*m+1]][j] = dcsum0.r + I*dcsum0.i;
2428 	      }
2429 
2430  	      numop0 += (double)nsc*nsc;
2431 
2432 	    }
2433 	  }
2434 
2435           numop2 += numop0;
2436 
2437 	} /* #pragma omp parallel */
2438 
2439         dtime(&etime1);
2440         time51 += etime1 - stime1;
2441 
2442 	/* update IAx */
2443 
2444         dtime(&stime1);
2445 
2446 	n2 = 0;
2447 
2448 	for (m1=0; m1<m2; m1++){
2449 
2450 	  m3 = m2*m + m1;
2451 
2452 #pragma omp parallel shared(m2,m1,SindB,EindB,n2,m3,Row2ID,myid2,Anum,Ai,n1,m,EindC,SindC,Lvec,Vvec2,IAx) private(OMPID,Nthrds,Nprocs,k2,i,i2,j,j2,nsc2,k)
2453 	  {
2454 
2455 	    double complex csum1,ctmp2;
2456 
2457   	    /* get info. on OpenMP */
2458 
2459   	    OMPID = omp_get_thread_num();
2460 	    Nthrds = omp_get_num_threads();
2461 	    Nprocs = omp_get_num_procs();
2462 
2463 	    /* for even */
2464 
2465 	    if (m2==1) k2 = 0;
2466 	    else       k2 = ((m2/2)<=m1);
2467 
2468 	    for ( i=SindB[n2][2*m3]+OMPID; i<=EindB[n2][2*m3]; i+=Nthrds ){
2469 
2470 	      if (Row2ID[i]==myid2){
2471 
2472 		i2 = i - SindB[n2][2*m3];
2473 
2474 		for (j=0; j<Anum[2*m3][i2]; j++){
2475 
2476 		  j2 = Ai[2*m3][i2][j] + SindB[0][2*m3] - SindB[n1][2*m+k2];
2477 		  csum1 = 0.0;
2478 
2479 		  nsc2 = EindC[n1][m]-SindC[n1][m]+1;
2480 
2481 		  for (k=0; k<nsc2; k++){
2482 		    ctmp2 = Lvec[k2][n1][m][j2*nsc2+k].r + I*Lvec[k2][n1][m][j2*nsc2+k].i;
2483 		    csum1 += Vvec2[i][k]*ctmp2;
2484 		  }
2485 
2486 		  IAx[2*m3][i2][j] += csum1;
2487 		}
2488 	      }
2489 	    }
2490 
2491 	    /* for odd */
2492 
2493 	    if (m2==1) k2 = 1;
2494 	    else       k2 = ((m2/2)<=m1);
2495 
2496 	    for ( i=SindB[n2][2*m3+1]+OMPID; i<=EindB[n2][2*m3+1]; i+=Nthrds ){
2497 
2498 	      if (Row2ID[i]==myid2){
2499 
2500 		i2 = i - SindB[n2][2*m3+1];
2501 
2502 		for (j=0; j<Anum[2*m3+1][i2]; j++){
2503 
2504 		  j2 = Ai[2*m3+1][i2][j] + SindB[0][2*m3+1] - SindB[n1][2*m+k2];
2505 		  csum1 = 0.0;
2506 
2507 		  nsc2 = EindC[n1][m] - SindC[n1][m] + 1;
2508 
2509 		  for (k=0; k<nsc2; k++){
2510 		    ctmp2 = Lvec[k2][n1][m][j2*nsc2+k].r + I*Lvec[k2][n1][m][j2*nsc2+k].i;
2511 		    csum1 += Vvec2[i][k]*ctmp2;
2512 		  }
2513 
2514 		  IAx[2*m3+1][i2][j] += csum1;
2515 		}
2516 	      }
2517 	    }
2518 
2519 	  } /* #pragma omp parallel */
2520 
2521 	} /* m1 */
2522 
2523         dtime(&etime1);
2524         time52 += etime1 - stime1;
2525 
2526 	/* update the outer IBx */
2527 
2528         dtime(&stime1);
2529 
2530 #pragma omp parallel shared(EindC,SindC,n1,m,Bnum,SindB,Bi,Row2ID,myid2,IBx,Vvec2) \
2531                      private(OMPID,Nthrds,Nprocs,i,j,j2)
2532 	{
2533 
2534 	  /* get info. on OpenMP */
2535 
2536 	  OMPID = omp_get_thread_num();
2537 	  Nthrds = omp_get_num_threads();
2538 	  Nprocs = omp_get_num_procs();
2539 
2540 	  for ( i=OMPID; i<(EindC[n1][m]-SindC[n1][m]+1); i+=Nthrds ){
2541 
2542 	    /* for even */
2543 
2544 	    for (j=0; j<Bnum[0][n1][m][i]; j++){
2545 
2546 	      j2 = SindB[n1][2*m] + Bi[0][n1][m][i][j];
2547 
2548 	      if (Row2ID[j2]==myid2){
2549 		IBx[0][n1][m][i][j] = -Vvec2[j2][i];
2550 	      }
2551 	    }
2552 
2553 	    /* for odd */
2554 
2555 	    for (j=0; j<Bnum[1][n1][m][i]; j++){
2556 
2557 	      j2 = SindB[n1][2*m+1] + Bi[1][n1][m][i][j];
2558 
2559 	      if (Row2ID[j2]==myid2){
2560 		IBx[1][n1][m][i][j] = -Vvec2[j2][i];
2561 	      }
2562 	    }
2563 
2564 	  } /* i */
2565 
2566         } /* #pragma omp parallel */
2567 
2568         dtime(&etime1);
2569         time53 += etime1 - stime1;
2570 
2571 	/* update the inner IBx */
2572 
2573         dtime(&stime1);
2574 
2575 #pragma omp parallel shared(m2,n1,m,SindC,EindC,Row2ID,myid2,Bnum,Bi,SindB,Lvec,Vvec2,IBx) \
2576                      private(OMPID,Nthrds,Nprocs,mp0,n2,m1,m3,k2,i,i2,j,j2,nsc2,k)
2577 	{
2578 
2579 	  double complex ctmp1,csum1;
2580 
2581 	  /* get info. on OpenMP */
2582 
2583 	  OMPID = omp_get_thread_num();
2584 	  Nthrds = omp_get_num_threads();
2585 	  Nprocs = omp_get_num_procs();
2586 
2587 	  mp0 = m2;
2588 
2589 	  for (n2=0; n2<n1; n2++){
2590 	    for (m1=0; m1<mp0; m1++){
2591 
2592 	      m3 = mp0*m + m1;
2593 	      k2 = ((mp0/2)<=m1);
2594 
2595 	      for ( i=SindC[n2][m3]+OMPID; i<=EindC[n2][m3]; i+=Nthrds ){
2596 
2597 		if (Row2ID[i]==myid2){
2598 
2599 		  i2 = i - SindC[n2][m3];
2600 
2601 		  /* for even */
2602 
2603 		  for (j=0; j<Bnum[0][n2][m3][i2]; j++){
2604 
2605 		    j2 = Bi[0][n2][m3][i2][j] + SindB[n2][2*m3] - SindB[n1][2*m+k2];
2606 		    csum1 = 0.0;
2607 
2608 		    nsc2 = EindC[n1][m]-SindC[n1][m]+1;
2609 
2610 		    for (k=0; k<nsc2; k++){
2611 
2612 		      ctmp1 = Lvec[k2][n1][m][j2*nsc2+k].r + I*Lvec[k2][n1][m][j2*nsc2+k].i;
2613 		      csum1 += Vvec2[i][k]*ctmp1;
2614 		    }
2615 
2616 		    IBx[0][n2][m3][i2][j] += csum1;
2617 		  }
2618 
2619 		  /* for odd */
2620 
2621 		  for (j=0; j<Bnum[1][n2][m3][i2]; j++){
2622 
2623 		    j2 = Bi[1][n2][m3][i2][j] + SindB[n2][2*m3+1] - SindB[n1][2*m+k2];
2624 		    csum1 = 0.0;
2625 
2626 		    nsc2 = EindC[n1][m]-SindC[n1][m]+1;
2627 
2628 		    for (k=0; k<nsc2; k++){
2629 		      ctmp1 = Lvec[k2][n1][m][j2*nsc2+k].r + I*Lvec[k2][n1][m][j2*nsc2+k].i;
2630 		      csum1 += Vvec2[i][k]*ctmp1;
2631 		    }
2632 
2633 		    IBx[1][n2][m3][i2][j] += csum1;
2634 		  }
2635 
2636 		} /* if (Row2ID[i]==myid2) */
2637 	      }
2638 
2639 	    } /* m1 */
2640 
2641 	    mp0 /= 2;
2642 
2643 	  } /* n2 */
2644 
2645         } /* #pragma omp parallel */
2646 
2647         dtime(&etime1);
2648         time54 += etime1 - stime1;
2649 
2650 	/* update the inner ICx */
2651 
2652         dtime(&stime1);
2653 
2654 #pragma omp parallel shared(m2,n1,m,SindC,EindC,Row2ID,myid2,Cnum,Ci,SindB,Lvec,Vvec2,ICx) \
2655                      private(OMPID,Nthrds,Nprocs,mp0,n2,m1,m3,k2,i,i2,j,j2,nsc2,k)
2656 	{
2657 
2658 	  double complex ctmp1,csum1;
2659 
2660 	  /* get info. on OpenMP */
2661 
2662 	  OMPID = omp_get_thread_num();
2663 	  Nthrds = omp_get_num_threads();
2664 	  Nprocs = omp_get_num_procs();
2665 
2666 	  mp0 = m2;
2667 
2668 	  for (n2=0; n2<n1; n2++){
2669 	    for (m1=0; m1<mp0; m1++){
2670 
2671 	      m3 = mp0*m + m1;
2672 	      k2 = ((mp0/2)<=m1);
2673 
2674 	      for ( i=SindC[n2][m3]+OMPID; i<=EindC[n2][m3]; i+=Nthrds ){
2675 
2676 		if (Row2ID[i]==myid2){
2677 
2678 		  i2 = i - SindC[n2][m3];
2679 
2680 		  for (j=0; j<Cnum[n2][m3][i2]; j++){
2681 
2682 		    j2 = Ci[n2][m3][i2][j] + SindC[n2][m3] - SindB[n1][2*m+k2];
2683 		    csum1 = 0.0;
2684 
2685 		    nsc2 = EindC[n1][m]-SindC[n1][m]+1;
2686 
2687 		    for (k=0; k<nsc2; k++){
2688 		      ctmp1 = Lvec[k2][n1][m][j2*nsc2+k].r + I*Lvec[k2][n1][m][j2*nsc2+k].i;
2689 		      csum1 += Vvec2[i][k]*ctmp1;
2690 		    }
2691 
2692 		    ICx[n2][m3][i2][j] += csum1;
2693 
2694 		  } /* j */
2695 		} /* if (Row2ID[i]==myid2) */
2696 	      } /* i */
2697 	    } /* m1 */
2698 
2699 	    mp0 /= 2;
2700 
2701 	  } /* n2 */
2702 
2703         } /* #pragma omp parallel */
2704 
2705         dtime(&etime1);
2706         time55 += etime1 - stime1;
2707 
2708 	/* update the outer ICx */
2709 
2710         if (myid2==0){
2711 
2712 	  for (i=0; i<nsc; i++){
2713 	    for (j=0; j<Cnum[n1][m][i]; j++){
2714 
2715 	      j2 = Ci[n1][m][i][j];
2716 	      ICx[n1][m][i][j] = IS[n1][m][j2*nsc+i].r + I*IS[n1][m][j2*nsc+i].i;
2717 	    }
2718 	  }
2719 	}
2720 
2721       } /* if (0<nsc), end of "update entries in the inverse of A" */
2722 
2723       dtime(&etime);
2724       time5 += etime - stime;
2725 
2726     } /* m */
2727 
2728     /* update m2 and mp */
2729 
2730     m2 *= 2;
2731     mp /= 2;
2732 
2733   } /* n1 */
2734 
2735   /***********************************************************
2736               add the part of inverse to DMall
2737   ***********************************************************/
2738 
2739   dtime(&stime);
2740 
2741   mp = mpmax/2;
2742   for (n=0; n<nl; n++){
2743     for (m=0; m<mp; m++){
2744       for (i=SindC[n][m]; i<=EindC[n][m]; i++){
2745         for (j=0; j<Bnum[0][n][m][i-SindC[n][m]]; j++){
2746 
2747           j1 = Bi[0][n][m][i-SindC[n][m]][j];
2748           j2 = j1 + SindB[n][2*m];
2749           i2 = i - SindC[n][m];
2750 
2751           i3 = Index_BC[nl][i];
2752           j3 = Index_BC[nl][j2];
2753           k = conv_ind2[i3][j3];
2754   	  Tx[k] = creal(weight*IBx[0][n][m][i2][j]);
2755 
2756           k = conv_ind2[j3][i3];
2757   	  Tx[k] = creal(weight*IBx[0][n][m][i2][j]);
2758 	}
2759       }
2760     }
2761 
2762     mp /= 2;
2763   }
2764 
2765   mp = mpmax/2;
2766   for (n=0; n<nl; n++){
2767     for (m=0; m<mp; m++){
2768       for (i=SindC[n][m]; i<=EindC[n][m]; i++){
2769         for (j=0; j<Bnum[1][n][m][i-SindC[n][m]]; j++){
2770 
2771           j1 = Bi[1][n][m][i-SindC[n][m]][j];
2772           j2 = j1 + SindB[n][2*m+1];
2773           i2 = i - SindC[n][m];
2774 
2775           i3 = Index_BC[nl][i];
2776           j3 = Index_BC[nl][j2];
2777           k = conv_ind2[i3][j3];
2778   	  Tx[k] = creal(weight*IBx[1][n][m][i2][j]);
2779 
2780           k = conv_ind2[j3][i3];
2781   	  Tx[k] = creal(weight*IBx[1][n][m][i2][j]);
2782 	}
2783       }
2784     }
2785 
2786     mp /= 2;
2787   }
2788 
2789   mp = mpmax/2;
2790   for (n=0; n<nl; n++){
2791     for (m=0; m<mp; m++){
2792       for (i=SindC[n][m]; i<=EindC[n][m]; i++){
2793         for (j=0; j<Cnum[n][m][i-SindC[n][m]]; j++){
2794 
2795           j1 = Ci[n][m][i-SindC[n][m]][j];
2796           j2 = j1 + SindC[n][m];
2797           i2 = i - SindC[n][m];
2798 
2799           i3 = Index_BC[nl][i];
2800           j3 = Index_BC[nl][j2];
2801           k = conv_ind2[i3][j3];
2802 
2803   	  Tx[k] = creal(weight*ICx[n][m][i2][j]);
2804 	}
2805       }
2806     }
2807 
2808     mp /= 2;
2809   }
2810 
2811   mp = mpmax;
2812   for (m=0; m<mp; m++){
2813     for (i=SindB[0][m]; i<=EindB[0][m]; i++){
2814       for (j=0; j<Anum[m][i-SindB[0][m]]; j++){
2815 
2816 	j1 = Ai[m][i-SindB[0][m]][j];
2817 	j2 = j1 + SindB[0][m];
2818 	i2 = i - SindB[0][m];
2819 
2820 	i3 = Index_BC[nl][i];
2821 	j3 = Index_BC[nl][j2];
2822 
2823 	k = conv_ind2[i3][j3];
2824 
2825 	Tx[k] = creal(weight*IAx[m][i2][j]);
2826       }
2827     }
2828   }
2829 
2830   if (strcasecmp(mode,"contour")==0){
2831 
2832     for (i=0; i<TNZE; i++){
2833 
2834       row = conv_index_row[i];                /* row index    */
2835       col = conv_index_col[i];                /* column index */
2836       k = conv_ind2[row][col];
2837 
2838       DMall[spin][i] += Tx[k];
2839     }
2840   }
2841   else if (strcasecmp(mode,"retarded")==0){
2842 
2843     sum = 0.0;
2844     for (i=0; i<TNZE; i++){
2845 
2846       row = conv_index_row[i];                /* row index    */
2847       col = conv_index_col[i];                /* column index */
2848       k = conv_ind2[row][col];
2849       sum += Tx[k]*Sall[i];
2850     }
2851 
2852     WRG[spin][Epoint] = sum;
2853   }
2854 
2855   else if (strcasecmp(mode,"force")==0){
2856 
2857     for (i=0; i<TNZE; i++){
2858 
2859       row = conv_index_row[i];                /* row index    */
2860       col = conv_index_col[i];                /* column index */
2861       k = conv_ind2[row][col];
2862 
2863       DMall[spin][i] += Tx[k];
2864     }
2865   }
2866 
2867   dtime(&etime);
2868   time6 = etime - stime;
2869 
2870   if (0<measure_time){
2871 
2872     printf("OND myid=%2d numop1=%15.12f\n",myid,numop1); fflush(stdout);
2873     printf("OND myid=%2d numop2=%15.12f\n",myid,numop2); fflush(stdout);
2874     printf("OND time1 =%15.12f\n",time1); fflush(stdout);
2875     printf("OND time1a=%15.12f\n",time1a); fflush(stdout);
2876     printf("ONDtime1a0=%15.12f\n",time1a0); fflush(stdout);
2877     printf("ONDtime1a1=%15.12f\n",time1a1); fflush(stdout);
2878     printf("ONDtime1a2=%15.12f\n",time1a2); fflush(stdout);
2879     printf("OND time1b=%15.12f\n",time1b); fflush(stdout);
2880     printf("OND time1c=%15.12f\n",time1c); fflush(stdout);
2881     printf("OND time1d=%15.12f\n",time1d); fflush(stdout);
2882     printf("OND time2 =%15.12f\n",time2); fflush(stdout);
2883     printf("OND time3 =%15.12f\n",time3); fflush(stdout);
2884     printf("OND time31=%15.12f\n",time31); fflush(stdout);
2885     printf("OND time32=%15.12f\n",time32); fflush(stdout);
2886     printf("OND time33=%15.12f\n",time33); fflush(stdout);
2887     printf("OND time34=%15.12f\n",time34); fflush(stdout);
2888     printf("OND time4 =%15.12f\n",time4); fflush(stdout);
2889     printf("OND time41=%15.12f\n",time41); fflush(stdout);
2890     printf("OND time42=%15.12f\n",time42); fflush(stdout);
2891     printf("OND time5 =%15.12f\n",time5); fflush(stdout);
2892     printf("OND time51=%15.12f\n",time51); fflush(stdout);
2893     printf("OND time52=%15.12f\n",time52); fflush(stdout);
2894     printf("OND time53=%15.12f\n",time53); fflush(stdout);
2895     printf("OND time54=%15.12f\n",time54); fflush(stdout);
2896     printf("OND time55=%15.12f\n",time55); fflush(stdout);
2897     printf("OND myid=%2d time6 =%15.12f\n",myid,time6); fflush(stdout);
2898   }
2899 
2900   /* free arrays */
2901 
2902   free(Row2ID);
2903 
2904   free(ie1);
2905   free(is1);
2906 
2907   for (i=0; i<max_nsc; i++){
2908     free(Q0[i]);
2909   }
2910   free(Q0);
2911 
2912   for (i=0; i<max_nsc; i++){
2913     free(Q1[i]);
2914   }
2915   free(Q1);
2916 
2917   free(VvecTmp);
2918   free(ISTmp);
2919 
2920   for (i=0; i<max_nsc; i++){
2921     free(Vvec[i]);
2922   }
2923   free(Vvec);
2924 
2925   for (i=0; i<Nmat; i++){
2926     free(Vvec2[i]);
2927   }
2928   free(Vvec2);
2929 
2930   mp = mpmax/2;
2931   for (n=0; n<nl; n++){
2932     for (m=0; m<mp; m++){
2933       free(Lvec[0][n][m]);
2934     }
2935     free(Lvec[0][n]);
2936     mp /= 2;
2937   }
2938   free(Lvec[0]);
2939 
2940   mp = mpmax/2;
2941   for (n=0; n<nl; n++){
2942     for (m=0; m<mp; m++){
2943       free(Lvec[1][n][m]);
2944     }
2945     free(Lvec[1][n]);
2946     mp /= 2;
2947   }
2948   free(Lvec[1]);
2949 
2950   for (m=0; m<mpmax; m++){
2951 
2952     nsa = EindB[0][m] - SindB[0][m] + 1;
2953 
2954     Anum[m] = (int*)malloc(sizeof(int)*nsa);
2955 
2956     for (i=SindB[0][m]; i<=EindB[0][m]; i++){
2957 
2958       i2 = i - SindB[0][m];
2959 
2960       free(Ai[m][i2]);
2961       free(Ax[m][i2]);
2962       free(IAx[m][i2]);
2963     }
2964 
2965     free(Ai[m]);
2966     free(Ax[m]);
2967     free(IAx[m]);
2968     free(Anum[m]);
2969   }
2970   free(Ai);
2971   free(Ax);
2972   free(IAx);
2973   free(Anum);
2974 
2975   for (m=0; m<mp; m++){
2976     free(IA[m]);
2977   }
2978   free(IA);
2979 
2980   free(IA0);
2981 
2982   mp = mpmax/2;
2983   for (n=0; n<nl; n++){
2984     for (m=0; m<mp; m++){
2985       free(IS[n][m]);
2986     }
2987     free(IS[n]);
2988     mp /= 2;
2989   }
2990   free(IS);
2991 
2992   mp = mpmax/2;
2993 
2994   for (n=0; n<nl; n++){
2995     for (m=0; m<mp; m++){
2996       for (i=SindC[n][m]; i<=EindC[n][m]; i++){
2997 
2998         free(IBx[0][n][m][i-SindC[n][m]]);
2999         free(IBx[1][n][m][i-SindC[n][m]]);
3000         free(ICx[n][m][i-SindC[n][m]]);
3001 
3002         free(Bx[0][n][m][i-SindC[n][m]]);
3003         free(Bx[1][n][m][i-SindC[n][m]]);
3004         free(Cx[n][m][i-SindC[n][m]]);
3005 
3006         free(Bi[0][n][m][i-SindC[n][m]]);
3007         free(Bi[1][n][m][i-SindC[n][m]]);
3008         free(Ci[n][m][i-SindC[n][m]]);
3009 
3010       } /* i */
3011 
3012       free(IBx[0][n][m]);
3013       free(IBx[1][n][m]);
3014       free(ICx[n][m]);
3015 
3016       free(Bx[0][n][m]);
3017       free(Bx[1][n][m]);
3018       free(Cx[n][m]);
3019 
3020       free(Bi[0][n][m]);
3021       free(Bi[1][n][m]);
3022       free(Ci[n][m]);
3023 
3024       free(Bnum[0][n][m]);
3025       free(Bnum[1][n][m]);
3026       free(Cnum[n][m]);
3027 
3028     } /* m */
3029 
3030     free(IBx[0][n]);
3031     free(IBx[1][n]);
3032     free(ICx[n]);
3033 
3034     free(Bx[0][n]);
3035     free(Bx[1][n]);
3036     free(Cx[n]);
3037 
3038     free(Bi[0][n]);
3039     free(Bi[1][n]);
3040     free(Ci[n]);
3041 
3042     free(Bnum[0][n]);
3043     free(Bnum[1][n]);
3044     free(Cnum[n]);
3045 
3046     mp /= 2;
3047 
3048   } /* n */
3049 
3050   free(IBx[0]);
3051   free(IBx[1]);
3052   free(ICx);
3053 
3054   free(Bx[0]);
3055   free(Bx[1]);
3056   free(Cx);
3057 
3058   free(Bi[0]);
3059   free(Bi[1]);
3060   free(Ci);
3061 
3062   free(Bnum[0]);
3063   free(Bnum[1]);
3064   free(Cnum);
3065 
3066   free(invp);
3067 
3068   free(Ti);
3069   free(Tx);
3070   free(Tp);
3071   free(Tp0);
3072 
3073   free(request_recv);
3074   free(request_send);
3075   free(stat_send);
3076 }
3077 
3078 
3079 
3080 
3081 
3082 
3083 
qsort_complex(long n,long * a,double complex * b)3084 static void qsort_complex(long n, long *a, double complex *b)
3085 {
3086   long int i;
3087   clists *AB;
3088 
3089   AB = (clists*)malloc(sizeof(clists)*n);
3090 
3091   for (i=0; i<n; i++){
3092     AB[i].a = a[i];
3093     AB[i].b = b[i];
3094   }
3095 
3096   qsort(AB, n, sizeof(clists), (int(*)(const void*, const void*))clists_cmp);
3097 
3098   for (i=0; i<n; i++){
3099     a[i] = AB[i].a;
3100     b[i] = AB[i].b;
3101   }
3102 
3103   free(AB);
3104 }
3105 
3106 
clists_cmp(const clists * x,const clists * y)3107 static int clists_cmp(const clists *x, const clists *y)
3108 {
3109   return (x->a < y->a ? -1 :
3110           y->a < x->a ?  1 : 0);
3111 }
3112 
3113 
3114 
3115 
Cluster_collinear_ON2_Force(int SCF_iter,int SpinP_switch,double *** C,double ** ko,double ***** nh,double **** CntOLP,double ***** CDM,double ***** EDM,double Eele0[2],double Eele1[2])3116 static double Cluster_collinear_ON2_Force(
3117 				    int SCF_iter,
3118 				    int SpinP_switch,
3119 				    double ***C,
3120 				    double **ko,
3121 				    double *****nh, double ****CntOLP,
3122 				    double *****CDM,
3123 				    double *****EDM,
3124 				    double Eele0[2], double Eele1[2])
3125 {
3126   static int firsttime=1;
3127   double TStime,TEtime;
3128   double stime,etime;
3129   int numprocs,myid,ID,tag=999;
3130   int i,j,k,k2,wan,wanA,wanB,Epoint,column;
3131   int tnoA,tnoB,LB_AN,i1,j1,l;
3132   int MA_AN,GA_AN,GB_AN,Gc_AN,h_AN,ni,nj,TNZE;
3133   int Cwan,Hwan,Gh_AN,Anum,tnum,num,num4;
3134   int loop1,Nloop1,Nmat,spin,NEV,imax;
3135   int *NZE,*PZE,*MP;
3136   double **Hall,*Sall;
3137   double **EDMall;
3138   double **eval,**evec;
3139   double **WRG;
3140   double eachsum,totalsum,pChemP;
3141   double TZ,ChemP_MIN,ChemP_MAX,ymax;
3142   double Dnum,spin_degeneracy;
3143   double totalsum1,f0,f1,x1,y1,x0,y0;
3144   double x,Trial_ChemP,dE_step;
3145   double cTrial_ChemP,ChemP0;
3146   double My_Eele1[2];
3147   int po,po3,po4,po6,loopN,free_flag;
3148   int num_loop,end_flag;
3149   int Depth_Level,Number_Division;
3150   int *conv_index_row;
3151   int *conv_index_col;
3152   int **conv_ind2;
3153   int *My_NZeros;
3154   int *is1,*ie1;
3155   /* for MPI */
3156   int myworld1;
3157   int numprocs1,myid1;
3158   int numprocs2,myid2;
3159   int Num_Comm_World1;
3160   int *NPROCS_ID1,*NPROCS_WD1;
3161   int *Comm_World1;
3162   int *Comm_World_StartID1;
3163   MPI_Comm *MPI_CommWD1;
3164   int myworld2B;
3165   int numprocs2B,myid2B;
3166   int Num_Comm_World2B;
3167   int *NPROCS_ID2B,*NPROCS_WD2B;
3168   int *Comm_World2B;
3169   int *Comm_World_StartID2B;
3170   MPI_Comm *MPI_CommWD2B;
3171 
3172   /* MPI */
3173   MPI_Comm_size(mpi_comm_level1,&numprocs);
3174   MPI_Comm_rank(mpi_comm_level1,&myid);
3175 
3176   /* for elapsed time */
3177   MPI_Barrier(mpi_comm_level1);
3178   dtime(&TStime);
3179 
3180   if (0<measure_time) dtime(&stime);
3181 
3182   /****************************************************
3183          set up the first communication world
3184   ****************************************************/
3185 
3186   Num_Comm_World1 = SpinP_switch + 1;
3187 
3188   NPROCS_ID1 = (int*)malloc(sizeof(int)*numprocs);
3189   Comm_World1 = (int*)malloc(sizeof(int)*numprocs);
3190   NPROCS_WD1 = (int*)malloc(sizeof(int)*Num_Comm_World1);
3191   Comm_World_StartID1 = (int*)malloc(sizeof(int)*Num_Comm_World1);
3192   MPI_CommWD1 = (MPI_Comm*)malloc(sizeof(MPI_Comm)*Num_Comm_World1);
3193 
3194   Make_Comm_Worlds(mpi_comm_level1, myid, numprocs, Num_Comm_World1, &myworld1, MPI_CommWD1,
3195                    NPROCS_ID1, Comm_World1, NPROCS_WD1, Comm_World_StartID1);
3196 
3197   /****************************************************
3198            calculation of dimension of matrix
3199   ****************************************************/
3200 
3201   Nmat = 0;
3202   for (i=1; i<=atomnum; i++){
3203     wanA  = WhatSpecies[i];
3204     Nmat += Spe_Total_CNO[wanA];
3205   }
3206 
3207   /****************************************************
3208                   total core charge
3209   ****************************************************/
3210 
3211   TZ = 0.0;
3212   for (i=1; i<=atomnum; i++){
3213     wan = WhatSpecies[i];
3214     TZ = TZ + Spe_Core_Charge[wan];
3215   }
3216 
3217   /****************************************************
3218                    calculate TNZE
3219   ****************************************************/
3220 
3221   TNZE = 0;
3222   for (Gc_AN=1; Gc_AN<=atomnum; Gc_AN++){
3223     Cwan = WhatSpecies[Gc_AN];
3224 
3225     nj = 0;
3226     for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
3227       Gh_AN = natn[Gc_AN][h_AN];
3228       Hwan = WhatSpecies[Gh_AN];
3229       nj += Spe_Total_NO[Hwan];
3230     }
3231 
3232     TNZE += Spe_Total_NO[Cwan]*nj;
3233   }
3234 
3235   /****************************************************
3236                    allocate arrays
3237   ****************************************************/
3238 
3239   WRG = (double**)malloc(sizeof(double*)*(SpinP_switch+1));
3240   for (spin=0; spin<(SpinP_switch+1); spin++){
3241     WRG[spin] = (double*)malloc(sizeof(double)*ON2_Npoles_f);
3242     for (i=0; i<ON2_Npoles_f; i++) WRG[spin][i] = 0.0;
3243   }
3244 
3245   conv_ind2 = (int**)malloc(sizeof(int*)*Nmat);
3246   for (i=0; i<Nmat; i++){
3247     conv_ind2[i] = (int*)malloc(sizeof(int)*Nmat);
3248   }
3249 
3250   EDMall = (double**)malloc(sizeof(double*)*(SpinP_switch+1));
3251   for (spin=0; spin<(SpinP_switch+1); spin++){
3252     EDMall[spin] = (double*)malloc(sizeof(double)*TNZE);
3253     for (i=0; i<TNZE; i++) EDMall[spin][i] = 0.0;
3254   }
3255 
3256   Hall = (double**)malloc(sizeof(double*)*(SpinP_switch+1));
3257   for (spin=0; spin<(SpinP_switch+1); spin++){
3258     Hall[spin] = (double*)malloc(sizeof(double)*TNZE);
3259   }
3260 
3261   Sall = (double*)malloc(sizeof(double)*TNZE);
3262 
3263   MP = (int*)malloc(sizeof(int)*(atomnum+1));
3264 
3265   /* set MP */
3266 
3267   Anum = 1;
3268   for (i=1; i<=atomnum; i++){
3269     MP[i] = Anum;
3270     wanA = WhatSpecies[i];
3271     Anum += Spe_Total_CNO[wanA];
3272   }
3273 
3274   conv_index_row = (int*)malloc(sizeof(int)*TNZE);
3275   conv_index_col = (int*)malloc(sizeof(int)*TNZE);
3276 
3277   /******************************************
3278       MPI communication of nh and CntOLP
3279   ******************************************/
3280 
3281   /* allocation of arrays */
3282 
3283   My_NZeros = (int*)malloc(sizeof(int)*numprocs);
3284   is1 = (int*)malloc(sizeof(int)*numprocs);
3285   ie1 = (int*)malloc(sizeof(int)*numprocs);
3286 
3287   /* find my total number of non-zero elements in myid */
3288 
3289   My_NZeros[myid] = 0;
3290   for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
3291     GA_AN = M2G[MA_AN];
3292     wanA = WhatSpecies[GA_AN];
3293     tnoA = Spe_Total_CNO[wanA];
3294 
3295     num = 0;
3296     for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
3297       GB_AN = natn[GA_AN][LB_AN];
3298       wanB = WhatSpecies[GB_AN];
3299       tnoB = Spe_Total_CNO[wanB];
3300       num += tnoB;
3301     }
3302 
3303     My_NZeros[myid] += tnoA*num;
3304   }
3305 
3306   for (ID=0; ID<numprocs; ID++){
3307     MPI_Bcast(&My_NZeros[ID],1,MPI_INT,ID,mpi_comm_level1);
3308   }
3309 
3310   tnum = 0;
3311   for (ID=0; ID<numprocs; ID++){
3312     tnum += My_NZeros[ID];
3313   }
3314 
3315   is1[0] = 0;
3316   ie1[0] = My_NZeros[0] - 1;
3317 
3318   for (ID=1; ID<numprocs; ID++){
3319     is1[ID] = ie1[ID-1] + 1;
3320     ie1[ID] = is1[ID] + My_NZeros[ID] - 1;
3321   }
3322 
3323   /******************
3324        for nh
3325   ******************/
3326 
3327   for (spin=0; spin<=SpinP_switch; spin++){
3328 
3329     /* set nh */
3330 
3331     k = is1[myid];
3332     for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
3333       GA_AN = M2G[MA_AN];
3334       wanA = WhatSpecies[GA_AN];
3335       tnoA = Spe_Total_CNO[wanA];
3336       for (i=0; i<tnoA; i++){
3337 	for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
3338 	  GB_AN = natn[GA_AN][LB_AN];
3339 	  wanB = WhatSpecies[GB_AN];
3340 	  tnoB = Spe_Total_CNO[wanB];
3341 	  for (j=0; j<tnoB; j++){
3342 	    Hall[spin][k] = nh[spin][MA_AN][LB_AN][i][j];
3343 	    k++;
3344 	  }
3345 	}
3346       }
3347     }
3348 
3349     /* MPI Hall */
3350 
3351     MPI_Barrier(mpi_comm_level1);
3352 
3353     for (ID=0; ID<numprocs; ID++){
3354       k = is1[ID];
3355       MPI_Bcast(&Hall[spin][k], My_NZeros[ID], MPI_DOUBLE, ID, mpi_comm_level1);
3356     }
3357 
3358   }
3359 
3360   /******************
3361       for CntOLP
3362   ******************/
3363 
3364   k = is1[myid];
3365   for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
3366     GA_AN = M2G[MA_AN];
3367     wanA = WhatSpecies[GA_AN];
3368     tnoA = Spe_Total_CNO[wanA];
3369     for (i=0; i<tnoA; i++){
3370       for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
3371         GB_AN = natn[GA_AN][LB_AN];
3372         wanB = WhatSpecies[GB_AN];
3373         tnoB = Spe_Total_CNO[wanB];
3374         for (j=0; j<tnoB; j++){
3375           Sall[k] = CntOLP[MA_AN][LB_AN][i][j];
3376           k++;
3377 	}
3378       }
3379     }
3380   }
3381 
3382   /* MPI S1 */
3383 
3384   MPI_Barrier(mpi_comm_level1);
3385 
3386   for (ID=0; ID<numprocs; ID++){
3387     k = is1[ID];
3388     MPI_Bcast(&Sall[k], My_NZeros[ID], MPI_DOUBLE, ID, mpi_comm_level1);
3389   }
3390 
3391   /******************************************
3392    set up conv_index_row and conv_index_col
3393   ******************************************/
3394 
3395   k = is1[myid];
3396 
3397   for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
3398     GA_AN = M2G[MA_AN];
3399     wanA = WhatSpecies[GA_AN];
3400     tnoA = Spe_Total_CNO[wanA];
3401     for (i=0; i<tnoA; i++){
3402       for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
3403         GB_AN = natn[GA_AN][LB_AN];
3404         wanB = WhatSpecies[GB_AN];
3405         tnoB = Spe_Total_CNO[wanB];
3406         for (j=0; j<tnoB; j++){
3407 
3408           conv_index_row[k] = MP[GA_AN] + i - 1;
3409           conv_index_col[k] = MP[GB_AN] + j - 1;
3410           k++;
3411 	}
3412       }
3413     }
3414   }
3415 
3416   for (ID=0; ID<numprocs; ID++){
3417     k = is1[ID];
3418     MPI_Bcast(&conv_index_row[k], My_NZeros[ID], MPI_INT, ID, mpi_comm_level1);
3419   }
3420 
3421   for (ID=0; ID<numprocs; ID++){
3422     k = is1[ID];
3423     MPI_Bcast(&conv_index_col[k], My_NZeros[ID], MPI_INT, ID, mpi_comm_level1);
3424   }
3425 
3426   /****************************************************
3427                  set up NZE and PZE
3428   ****************************************************/
3429 
3430   NZE = (int*)malloc(sizeof(int)*Nmat);
3431   PZE = (int*)malloc(sizeof(int)*Nmat);
3432 
3433   for (i=0; i<Nmat; i++){
3434     NZE[i] = 0;
3435     PZE[i] = -1;
3436   }
3437 
3438   for (k=0; k<TNZE; k++){
3439 
3440     i = conv_index_row[k];
3441 
3442     NZE[i]++;
3443     if (PZE[i]==-1) PZE[i] = k;
3444   }
3445 
3446   /* freeing of arrays */
3447 
3448   free(My_NZeros);
3449   free(ie1);
3450 
3451   if (0<measure_time){
3452     dtime(&etime);
3453     printf("myid=%2d OND_Solver time1=%15.12f\n",myid,etime-stime);fflush(stdout);
3454   }
3455 
3456   /*************************************************************************
3457    *************************************************************************
3458 
3459      Part 0: nested dissection of matrix
3460 
3461    *************************************************************************
3462   *************************************************************************/
3463 
3464   free_flag = 0;
3465   Nested_Dissection(free_flag,Nmat,MP,&Depth_Level,&Number_Division);
3466 
3467   /*************************************************************************
3468    *************************************************************************
3469 
3470      Part 1: contour integration with a chemical potential \mu_0
3471 
3472      main loop which is decomposed to
3473      spin and energy point
3474 
3475    *************************************************************************
3476   *************************************************************************/
3477 
3478   MPI_Barrier(mpi_comm_level1);
3479 
3480   Nloop1 = (SpinP_switch+1)*ON2_Npoles_f;
3481 
3482   if (Nloop1<=numprocs){
3483 
3484     Num_Comm_World2B = Nloop1;
3485 
3486     NPROCS_ID2B = (int*)malloc(sizeof(int)*numprocs);
3487     Comm_World2B = (int*)malloc(sizeof(int)*numprocs);
3488     NPROCS_WD2B = (int*)malloc(sizeof(int)*Num_Comm_World2B);
3489     Comm_World_StartID2B = (int*)malloc(sizeof(int)*Num_Comm_World2B);
3490     MPI_CommWD2B = (MPI_Comm*)malloc(sizeof(MPI_Comm)*Num_Comm_World2B);
3491 
3492     Make_Comm_Worlds(mpi_comm_level1, myid, numprocs, Num_Comm_World2B, &myworld2B, MPI_CommWD2B,
3493 		     NPROCS_ID2B, Comm_World2B, NPROCS_WD2B, Comm_World_StartID2B);
3494 
3495     MPI_Comm_size(MPI_CommWD2B[myworld2B],&numprocs2);
3496     MPI_Comm_rank(MPI_CommWD2B[myworld2B],&myid2);
3497   }
3498   else {
3499     numprocs2 = 1;
3500     myid2 = 0;
3501   }
3502 
3503   for (spin=0; spin<(SpinP_switch+1); spin++){
3504     for (i=0; i<TNZE; i++) EDMall[spin][i] = 0.0;
3505   }
3506 
3507   if (Nloop1<=numprocs){
3508 
3509     loop1 = myworld2B;
3510 
3511     /* get spin, Epoint, and column */
3512     spin = loop1/ON2_Npoles_f;
3513     Epoint = loop1 - spin*ON2_Npoles_f;
3514 
3515     OND_Solver("force",
3516 	       myid,numprocs,
3517 	       myid2,numprocs2,
3518 	       Nloop1,
3519 	       MPI_CommWD2B[myworld2B],
3520 	       spin,Epoint,0.0,
3521 	       Nmat,NZE,PZE,
3522 	       Depth_Level,
3523 	       Number_Division,
3524 	       ChemP,
3525 	       Hall,Sall,EDMall,
3526 	       WRG,
3527 	       conv_index_row,
3528 	       conv_index_col,
3529 	       conv_ind2,
3530 	       TNZE);
3531   }
3532 
3533   else {
3534 
3535     for ( loop1=myid; loop1<Nloop1; loop1+=numprocs ){
3536 
3537       /* get spin, Epoint, and column */
3538       spin = loop1/ON2_Npoles_f;
3539       Epoint = loop1 - spin*ON2_Npoles_f;
3540 
3541       /* call OND_Solver */
3542 
3543       OND_Solver("force",
3544 		 myid,numprocs,
3545 		 myid2,numprocs2,
3546 		 Nloop1,
3547 		 mpi_comm_level1,
3548 		 spin,Epoint,0.0,
3549 		 Nmat,NZE,PZE,
3550 		 Depth_Level,
3551 		 Number_Division,
3552 		 ChemP,
3553 		 Hall,Sall,EDMall,
3554 		 WRG,
3555 		 conv_index_row,
3556 		 conv_index_col,
3557 		 conv_ind2,
3558 		 TNZE);
3559 
3560     }
3561   } /* else */
3562 
3563   /*************************************************************************
3564         store the energy density matrix into an array EDM
3565   **************************************************************************/
3566 
3567   MPI_Barrier(mpi_comm_level1);
3568 
3569   for (spin=0; spin<=SpinP_switch; spin++){
3570 
3571     MPI_Allreduce(&EDMall[spin][0], &Sall[0], TNZE, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
3572 
3573     k = is1[myid];
3574     for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
3575       GA_AN = M2G[MA_AN];
3576       wanA = WhatSpecies[GA_AN];
3577       tnoA = Spe_Total_CNO[wanA];
3578       for (i=0; i<tnoA; i++){
3579 	for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
3580 	  GB_AN = natn[GA_AN][LB_AN];
3581 	  wanB = WhatSpecies[GB_AN];
3582 	  tnoB = Spe_Total_CNO[wanB];
3583 	  for (j=0; j<tnoB; j++){
3584 	    EDM[spin][MA_AN][LB_AN][i][j] = Sall[k];
3585 	    k++;
3586 	  }
3587 	}
3588       }
3589     }
3590   }
3591 
3592   /****************************************************
3593      freeing Index_BC, SindB, EindB, SindC, EindE
3594   ****************************************************/
3595 
3596   free_flag = 1;
3597   Nested_Dissection(free_flag,Nmat,MP,&Depth_Level,&Number_Division);
3598 
3599   /****************************************************
3600                    freeing of arrays
3601   ****************************************************/
3602 
3603   free(is1);
3604 
3605   free(NZE);
3606   free(PZE);
3607 
3608   for (spin=0; spin<(SpinP_switch+1); spin++){
3609     free(WRG[spin]);
3610   }
3611   free(WRG);
3612 
3613   for (i=0; i<Nmat; i++){
3614     free(conv_ind2[i]);
3615   }
3616   free(conv_ind2);
3617 
3618   for (spin=0; spin<(SpinP_switch+1); spin++){
3619     free(EDMall[spin]);
3620   }
3621   free(EDMall);
3622 
3623   for (spin=0; spin<(SpinP_switch+1); spin++){
3624     free(Hall[spin]);
3625   }
3626   free(Hall);
3627 
3628   free(Sall);
3629 
3630   free(MP);
3631   free(conv_index_row);
3632   free(conv_index_col);
3633 
3634   /* freeing of arrays for the first world */
3635 
3636   if (Num_Comm_World1<=numprocs){
3637     MPI_Comm_free(&MPI_CommWD1[myworld1]);
3638   }
3639 
3640   free(NPROCS_ID1);
3641   free(Comm_World1);
3642   free(NPROCS_WD1);
3643   free(Comm_World_StartID1);
3644   free(MPI_CommWD1);
3645 
3646   /* freeing of arrays for the second B world */
3647 
3648   if (Nloop1<=numprocs){
3649 
3650     MPI_Comm_free(&MPI_CommWD2B[myworld2B]);
3651     free(NPROCS_ID2B);
3652     free(Comm_World2B);
3653     free(NPROCS_WD2B);
3654     free(Comm_World_StartID2B);
3655     free(MPI_CommWD2B);
3656   }
3657 
3658   /* for elapsed time */
3659   MPI_Barrier(mpi_comm_level1);
3660   dtime(&TEtime);
3661   return (TEtime-TStime);
3662 }
3663 
3664 
3665 
3666 
3667 
3668 
3669 
3670 
3671 
3672 
3673 
3674 
Cluster_collinear_ON2_Iter(int SCF_iter,int SpinP_switch,double *** C,double ** ko,double ***** nh,double **** CntOLP,double ***** CDM,double ***** EDM,double Eele0[2],double Eele1[2])3675 static double Cluster_collinear_ON2_Iter(
3676 				    int SCF_iter,
3677 				    int SpinP_switch,
3678 				    double ***C,
3679 				    double **ko,
3680 				    double *****nh, double ****CntOLP,
3681 				    double *****CDM,
3682 				    double *****EDM,
3683 				    double Eele0[2], double Eele1[2])
3684 {
3685   static int firsttime=1;
3686   double TStime,TEtime;
3687   double stime,etime;
3688   int numprocs,myid,ID,tag=999;
3689   int i,j,k,k2,wan,wanA,wanB,Epoint,column;
3690   int tnoA,tnoB,LB_AN,i1,j1,l;
3691   int MA_AN,GA_AN,GB_AN,Gc_AN,h_AN,ni,nj,TNZE;
3692   int Cwan,Hwan,Gh_AN,Anum,tnum,num,num4;
3693   int loop1,Nloop1,Nmat,spin,NEV,imax;
3694   int *NZE,*PZE,*MP;
3695   double **Hall,*Sall;
3696   double **DMall;
3697   double **eval,**evec;
3698   double **WRG;
3699   double eachsum,totalsum,pChemP;
3700   double TZ,ChemP_MIN,ChemP_MAX,ymax;
3701   double Dnum,spin_degeneracy;
3702   double totalsum1,f0,f1,x1,y1,x0,y0;
3703   double x,Trial_ChemP,dE_step;
3704   double cTrial_ChemP,ChemP0;
3705   double x3,y3,startE,Real_Ene;
3706   double My_Eele1[2];
3707   static double pTrial_ChemP[4],pDiff_Num[5];
3708   static double DeltaMu[7],DeltaN[3],DeltaNp[3];
3709   int po,po3,po4,po6,loopN,free_flag;
3710   int num_loop,end_flag;
3711   int Depth_Level,Number_Division;
3712   int *conv_index_row;
3713   int *conv_index_col;
3714   int **conv_ind2;
3715   int *My_NZeros;
3716   int *is1,*ie1;
3717   /* for MPI */
3718   int myworld1;
3719   int numprocs1,myid1;
3720   int numprocs2,myid2;
3721   int Num_Comm_World1;
3722   int *NPROCS_ID1,*NPROCS_WD1;
3723   int *Comm_World1;
3724   int *Comm_World_StartID1;
3725   MPI_Comm *MPI_CommWD1;
3726   int myworld2B;
3727   int numprocs2B,myid2B;
3728   int Num_Comm_World2B;
3729   int *NPROCS_ID2B,*NPROCS_WD2B;
3730   int *Comm_World2B;
3731   int *Comm_World_StartID2B;
3732   MPI_Comm *MPI_CommWD2B;
3733 
3734   /* MPI */
3735   MPI_Comm_size(mpi_comm_level1,&numprocs);
3736   MPI_Comm_rank(mpi_comm_level1,&myid);
3737 
3738   /* for elapsed time */
3739   MPI_Barrier(mpi_comm_level1);
3740   dtime(&TStime);
3741 
3742   if (0<measure_time) dtime(&stime);
3743 
3744   /****************************************************
3745          set up the first communication world
3746   ****************************************************/
3747 
3748   Num_Comm_World1 = SpinP_switch + 1;
3749 
3750   NPROCS_ID1 = (int*)malloc(sizeof(int)*numprocs);
3751   Comm_World1 = (int*)malloc(sizeof(int)*numprocs);
3752   NPROCS_WD1 = (int*)malloc(sizeof(int)*Num_Comm_World1);
3753   Comm_World_StartID1 = (int*)malloc(sizeof(int)*Num_Comm_World1);
3754   MPI_CommWD1 = (MPI_Comm*)malloc(sizeof(MPI_Comm)*Num_Comm_World1);
3755 
3756   Make_Comm_Worlds(mpi_comm_level1, myid, numprocs, Num_Comm_World1, &myworld1, MPI_CommWD1,
3757                    NPROCS_ID1, Comm_World1, NPROCS_WD1, Comm_World_StartID1);
3758 
3759   /****************************************************
3760            calculation of dimension of matrix
3761   ****************************************************/
3762 
3763   Nmat = 0;
3764   for (i=1; i<=atomnum; i++){
3765     wanA  = WhatSpecies[i];
3766     Nmat += Spe_Total_CNO[wanA];
3767   }
3768 
3769   /****************************************************
3770                   total core charge
3771   ****************************************************/
3772 
3773   TZ = 0.0;
3774   for (i=1; i<=atomnum; i++){
3775     wan = WhatSpecies[i];
3776     TZ = TZ + Spe_Core_Charge[wan];
3777   }
3778 
3779   /****************************************************
3780                    calculate TNZE
3781   ****************************************************/
3782 
3783   TNZE = 0;
3784   for (Gc_AN=1; Gc_AN<=atomnum; Gc_AN++){
3785     Cwan = WhatSpecies[Gc_AN];
3786 
3787     nj = 0;
3788     for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
3789       Gh_AN = natn[Gc_AN][h_AN];
3790       Hwan = WhatSpecies[Gh_AN];
3791       nj += Spe_Total_NO[Hwan];
3792     }
3793 
3794     TNZE += Spe_Total_NO[Cwan]*nj;
3795   }
3796 
3797   /****************************************************
3798                    allocate arrays
3799   ****************************************************/
3800 
3801   WRG = (double**)malloc(sizeof(double*)*(SpinP_switch+1));
3802   for (spin=0; spin<(SpinP_switch+1); spin++){
3803     WRG[spin] = (double*)malloc(sizeof(double)*ON2_Npoles);
3804     for (i=0; i<ON2_Npoles; i++) WRG[spin][i] = 0.0;
3805   }
3806 
3807   conv_ind2 = (int**)malloc(sizeof(int*)*Nmat);
3808   for (i=0; i<Nmat; i++){
3809     conv_ind2[i] = (int*)malloc(sizeof(int)*Nmat);
3810   }
3811 
3812   DMall = (double**)malloc(sizeof(double*)*(SpinP_switch+1));
3813   for (spin=0; spin<(SpinP_switch+1); spin++){
3814     DMall[spin] = (double*)malloc(sizeof(double)*TNZE);
3815     for (i=0; i<TNZE; i++) DMall[spin][i] = 0.0;
3816   }
3817 
3818   Hall = (double**)malloc(sizeof(double*)*(SpinP_switch+1));
3819   for (spin=0; spin<(SpinP_switch+1); spin++){
3820     Hall[spin] = (double*)malloc(sizeof(double)*TNZE);
3821   }
3822 
3823   Sall = (double*)malloc(sizeof(double)*TNZE);
3824 
3825   MP = (int*)malloc(sizeof(int)*(atomnum+1));
3826 
3827   /* set MP */
3828 
3829   Anum = 1;
3830   for (i=1; i<=atomnum; i++){
3831     MP[i] = Anum;
3832     wanA = WhatSpecies[i];
3833     Anum += Spe_Total_CNO[wanA];
3834   }
3835 
3836   conv_index_row = (int*)malloc(sizeof(int)*TNZE);
3837   conv_index_col = (int*)malloc(sizeof(int)*TNZE);
3838 
3839   /******************************************
3840       MPI communication of nh and CntOLP
3841   ******************************************/
3842 
3843   /* allocation of arrays */
3844 
3845   My_NZeros = (int*)malloc(sizeof(int)*numprocs);
3846   is1 = (int*)malloc(sizeof(int)*numprocs);
3847   ie1 = (int*)malloc(sizeof(int)*numprocs);
3848 
3849   /* find my total number of non-zero elements in myid */
3850 
3851   My_NZeros[myid] = 0;
3852   for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
3853     GA_AN = M2G[MA_AN];
3854     wanA = WhatSpecies[GA_AN];
3855     tnoA = Spe_Total_CNO[wanA];
3856 
3857     num = 0;
3858     for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
3859       GB_AN = natn[GA_AN][LB_AN];
3860       wanB = WhatSpecies[GB_AN];
3861       tnoB = Spe_Total_CNO[wanB];
3862       num += tnoB;
3863     }
3864 
3865     My_NZeros[myid] += tnoA*num;
3866   }
3867 
3868   for (ID=0; ID<numprocs; ID++){
3869     MPI_Bcast(&My_NZeros[ID],1,MPI_INT,ID,mpi_comm_level1);
3870   }
3871 
3872   tnum = 0;
3873   for (ID=0; ID<numprocs; ID++){
3874     tnum += My_NZeros[ID];
3875   }
3876 
3877   is1[0] = 0;
3878   ie1[0] = My_NZeros[0] - 1;
3879 
3880   for (ID=1; ID<numprocs; ID++){
3881     is1[ID] = ie1[ID-1] + 1;
3882     ie1[ID] = is1[ID] + My_NZeros[ID] - 1;
3883   }
3884 
3885   /******************
3886        for nh
3887   ******************/
3888 
3889   for (spin=0; spin<=SpinP_switch; spin++){
3890 
3891     /* set nh */
3892 
3893     k = is1[myid];
3894     for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
3895       GA_AN = M2G[MA_AN];
3896       wanA = WhatSpecies[GA_AN];
3897       tnoA = Spe_Total_CNO[wanA];
3898       for (i=0; i<tnoA; i++){
3899 	for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
3900 	  GB_AN = natn[GA_AN][LB_AN];
3901 	  wanB = WhatSpecies[GB_AN];
3902 	  tnoB = Spe_Total_CNO[wanB];
3903 	  for (j=0; j<tnoB; j++){
3904 	    Hall[spin][k] = nh[spin][MA_AN][LB_AN][i][j];
3905 	    k++;
3906 	  }
3907 	}
3908       }
3909     }
3910 
3911     /* MPI Hall */
3912 
3913     MPI_Barrier(mpi_comm_level1);
3914 
3915     for (ID=0; ID<numprocs; ID++){
3916       k = is1[ID];
3917       MPI_Bcast(&Hall[spin][k], My_NZeros[ID], MPI_DOUBLE, ID, mpi_comm_level1);
3918     }
3919 
3920   }
3921 
3922   /******************
3923       for CntOLP
3924   ******************/
3925 
3926   k = is1[myid];
3927   for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
3928     GA_AN = M2G[MA_AN];
3929     wanA = WhatSpecies[GA_AN];
3930     tnoA = Spe_Total_CNO[wanA];
3931     for (i=0; i<tnoA; i++){
3932       for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
3933         GB_AN = natn[GA_AN][LB_AN];
3934         wanB = WhatSpecies[GB_AN];
3935         tnoB = Spe_Total_CNO[wanB];
3936         for (j=0; j<tnoB; j++){
3937           Sall[k] = CntOLP[MA_AN][LB_AN][i][j];
3938           k++;
3939 	}
3940       }
3941     }
3942   }
3943 
3944   /* MPI S1 */
3945 
3946   MPI_Barrier(mpi_comm_level1);
3947 
3948   for (ID=0; ID<numprocs; ID++){
3949     k = is1[ID];
3950     MPI_Bcast(&Sall[k], My_NZeros[ID], MPI_DOUBLE, ID, mpi_comm_level1);
3951   }
3952 
3953   /******************************************
3954    set up conv_index_row and conv_index_col
3955   ******************************************/
3956 
3957   k = is1[myid];
3958 
3959   for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
3960     GA_AN = M2G[MA_AN];
3961     wanA = WhatSpecies[GA_AN];
3962     tnoA = Spe_Total_CNO[wanA];
3963     for (i=0; i<tnoA; i++){
3964       for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
3965         GB_AN = natn[GA_AN][LB_AN];
3966         wanB = WhatSpecies[GB_AN];
3967         tnoB = Spe_Total_CNO[wanB];
3968         for (j=0; j<tnoB; j++){
3969 
3970           conv_index_row[k] = MP[GA_AN] + i - 1;
3971           conv_index_col[k] = MP[GB_AN] + j - 1;
3972           k++;
3973 	}
3974       }
3975     }
3976   }
3977 
3978   for (ID=0; ID<numprocs; ID++){
3979     k = is1[ID];
3980     MPI_Bcast(&conv_index_row[k], My_NZeros[ID], MPI_INT, ID, mpi_comm_level1);
3981   }
3982 
3983   for (ID=0; ID<numprocs; ID++){
3984     k = is1[ID];
3985     MPI_Bcast(&conv_index_col[k], My_NZeros[ID], MPI_INT, ID, mpi_comm_level1);
3986   }
3987 
3988   /****************************************************
3989                  set up NZE and PZE
3990   ****************************************************/
3991 
3992   NZE = (int*)malloc(sizeof(int)*Nmat);
3993   PZE = (int*)malloc(sizeof(int)*Nmat);
3994 
3995   for (i=0; i<Nmat; i++){
3996     NZE[i] = 0;
3997     PZE[i] = -1;
3998   }
3999 
4000   for (k=0; k<TNZE; k++){
4001 
4002     i = conv_index_row[k];
4003 
4004     NZE[i]++;
4005     if (PZE[i]==-1) PZE[i] = k;
4006   }
4007 
4008   /* freeing of arrays */
4009 
4010   free(My_NZeros);
4011   free(ie1);
4012 
4013   if (0<measure_time){
4014     dtime(&etime);
4015     printf("myid=%2d OND_Solver time1=%15.12f\n",myid,etime-stime);fflush(stdout);
4016   }
4017 
4018   /*************************************************************************
4019    *************************************************************************
4020 
4021      Part 0: nested dissection of matrix
4022 
4023    *************************************************************************
4024   *************************************************************************/
4025 
4026   free_flag = 0;
4027   Nested_Dissection(free_flag,Nmat,MP,&Depth_Level,&Number_Division);
4028 
4029   /*************************************************************************
4030    *************************************************************************
4031 
4032      Part 1: contour integration with a chemical potential \mu_0
4033 
4034      main loop which is decomposed to
4035      spin and energy point
4036 
4037    *************************************************************************
4038   *************************************************************************/
4039 
4040   MPI_Barrier(mpi_comm_level1);
4041 
4042   Nloop1 = (SpinP_switch+1)*ON2_Npoles;
4043 
4044   if (Nloop1<=numprocs){
4045 
4046     Num_Comm_World2B = Nloop1;
4047 
4048     NPROCS_ID2B = (int*)malloc(sizeof(int)*numprocs);
4049     Comm_World2B = (int*)malloc(sizeof(int)*numprocs);
4050     NPROCS_WD2B = (int*)malloc(sizeof(int)*Num_Comm_World2B);
4051     Comm_World_StartID2B = (int*)malloc(sizeof(int)*Num_Comm_World2B);
4052     MPI_CommWD2B = (MPI_Comm*)malloc(sizeof(MPI_Comm)*Num_Comm_World2B);
4053 
4054     Make_Comm_Worlds(mpi_comm_level1, myid, numprocs, Num_Comm_World2B, &myworld2B, MPI_CommWD2B,
4055 		     NPROCS_ID2B, Comm_World2B, NPROCS_WD2B, Comm_World_StartID2B);
4056 
4057     MPI_Comm_size(MPI_CommWD2B[myworld2B],&numprocs2);
4058     MPI_Comm_rank(MPI_CommWD2B[myworld2B],&myid2);
4059   }
4060   else {
4061     numprocs2 = 1;
4062     myid2 = 0;
4063   }
4064 
4065   end_flag = 0;
4066   num_loop = 1;
4067   Trial_ChemP = ChemP;
4068 
4069   do {
4070 
4071     for (spin=0; spin<(SpinP_switch+1); spin++){
4072       for (i=0; i<TNZE; i++) DMall[spin][i] = 0.0;
4073     }
4074 
4075     if (0<measure_time) dtime(&stime);
4076 
4077     if (Nloop1<=numprocs){
4078 
4079       loop1 = myworld2B;
4080 
4081       /* get spin, Epoint, and column */
4082       spin = loop1/ON2_Npoles;
4083       Epoint = loop1 - spin*ON2_Npoles;
4084 
4085       OND_Solver("contour",
4086                  myid,numprocs,
4087                  myid2,numprocs2,
4088 		 Nloop1,
4089 		 MPI_CommWD2B[myworld2B],
4090 		 spin,Epoint,Real_Ene,
4091 		 Nmat,NZE,PZE,
4092 		 Depth_Level,
4093 		 Number_Division,
4094 		 Trial_ChemP,
4095 		 Hall,Sall,DMall,
4096                  WRG,
4097 		 conv_index_row,
4098 		 conv_index_col,
4099 		 conv_ind2,
4100 		 TNZE);
4101     }
4102 
4103     else {
4104 
4105       for ( loop1=myid; loop1<Nloop1; loop1+=numprocs ){
4106 
4107 	/* get spin, Epoint, and column */
4108 	spin = loop1/ON2_Npoles;
4109 	Epoint = loop1 - spin*ON2_Npoles;
4110 
4111 	/* call OND_Solver */
4112 
4113 	OND_Solver("contour",
4114                    myid,numprocs,
4115                    myid2,numprocs2,
4116 		   Nloop1,
4117 		   mpi_comm_level1,
4118 		   spin,Epoint,Real_Ene,
4119 		   Nmat,NZE,PZE,
4120 		   Depth_Level,
4121 		   Number_Division,
4122 		   Trial_ChemP,
4123 		   Hall,Sall,DMall,
4124                    WRG,
4125 		   conv_index_row,
4126 		   conv_index_col,
4127 		   conv_ind2,
4128 		   TNZE);
4129 
4130       }
4131     } /* else */
4132 
4133     if (0<measure_time){
4134       dtime(&etime);
4135       printf("myid=%2d OND_Solver time2 =%15.12f\n",myid,etime-stime);fflush(stdout);
4136     }
4137 
4138     /*******************************************************
4139      calculate "each" number of electrons on each processor,
4140      and MPI communicate to get the total sum.
4141     ********************************************************/
4142 
4143     MPI_Barrier(mpi_comm_level1);
4144 
4145     if (0<measure_time) dtime(&stime);
4146 
4147     eachsum = 0.0;
4148     for (spin=0; spin<=SpinP_switch; spin++){
4149       for (i=0; i<TNZE; i++){
4150 	eachsum += DMall[spin][i]*Sall[i];
4151       }
4152     }
4153 
4154     MPI_Allreduce(&eachsum, &totalsum, 1, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
4155 
4156     if (SpinP_switch==0){
4157       eachsum  *= 2.0;
4158       totalsum *= 2.0;
4159     }
4160 
4161     if (0<measure_time){
4162       dtime(&etime);
4163       printf("myid=%2d OND_Solver time3=%15.12f\n",myid,etime-stime);fflush(stdout);
4164     }
4165 
4166     /* set x3 and y3 */
4167 
4168     x3 = Trial_ChemP;
4169     y3 = -TZ + totalsum + system_charge;
4170     pTrial_ChemP[3] = x3;
4171     pDiff_Num[3] = y3;
4172 
4173     if (0<measure_time){
4174       printf("myid=%2d eachsum=%15.12f totalsum=%15.12f\n",myid,eachsum,totalsum);
4175       if (myid==0)
4176       printf("SSS SCF_iter=%2d num_loop=%2d x3=%18.15f y3=%18.15f\n",SCF_iter,num_loop,x3,y3);
4177     }
4178 
4179     /*******************************************************
4180       find the next Trial_ChemP.
4181       updating Trial_ChemP is performed by two ways:
4182 
4183       (1) Step 1:
4184       if fabs(y3) is large enough so that finding a proper
4185       Trial_ChemP can be difficult, then we try to find
4186       Trial_ChemP using the retarded Green function, G(E+i0^+).
4187 
4188       (2) Step 2:
4189       if fabs(y3) is small, then a linear interpolation or
4190       the Muller method is used to estimate Trial_ChemP.
4191     ********************************************************/
4192 
4193     /**********************************************
4194       Step 1:
4195     ***********************************************/
4196 
4197     if (0.01<fabs(y3) && num_loop==1){
4198 
4199       cTrial_ChemP = Trial_ChemP;
4200       dE_step = 0.02/eV2Hartree;
4201 
4202       if (0.0<y3){
4203 	startE = x3 - dE_step*(double)ON2_Npoles*0.8;
4204       }
4205       else {
4206 	startE = x3 - dE_step*(double)ON2_Npoles*0.2;
4207       }
4208 
4209       if (Nloop1<=numprocs){
4210 
4211 	loop1 = myworld2B;
4212 
4213 	/* get spin, Epoint, and column */
4214 	spin = loop1/ON2_Npoles;
4215 	Epoint = loop1 - spin*ON2_Npoles;
4216 	Real_Ene = startE + dE_step*(double)Epoint;
4217 
4218 	OND_Solver("retarded",
4219 		   myid,numprocs,
4220 		   myid2,numprocs2,
4221 		   Nloop1,
4222 		   MPI_CommWD2B[myworld2B],
4223 		   spin,Epoint,Real_Ene,
4224 		   Nmat,NZE,PZE,
4225 		   Depth_Level,
4226 		   Number_Division,
4227 		   Trial_ChemP,
4228 		   Hall,Sall,DMall,
4229 		   WRG,
4230 		   conv_index_row,
4231 		   conv_index_col,
4232 		   conv_ind2,
4233 		   TNZE);
4234       }
4235 
4236       else {
4237 
4238 	for ( loop1=myid; loop1<Nloop1; loop1+=numprocs ){
4239 
4240 	  /* get spin, Epoint, and column */
4241 	  spin = loop1/ON2_Npoles;
4242 	  Epoint = loop1 - spin*ON2_Npoles;
4243 	  Real_Ene = startE + dE_step*(double)Epoint;
4244 
4245 	  /* call OND_Solver */
4246 
4247 	  OND_Solver("retarded",
4248 		     myid,numprocs,
4249 		     myid2,numprocs2,
4250 		     Nloop1,
4251 		     mpi_comm_level1,
4252 		     spin,Epoint,Real_Ene,
4253 		     Nmat,NZE,PZE,
4254 		     Depth_Level,
4255 		     Number_Division,
4256 		     Trial_ChemP,
4257 		     Hall,Sall,DMall,
4258 		     WRG,
4259 		     conv_index_row,
4260 		     conv_index_col,
4261 		     conv_ind2,
4262 		     TNZE);
4263 
4264 	}
4265       } /* else */
4266 
4267       /*******************************
4268         search the next Trial_ChemP
4269       *******************************/
4270 
4271       if (0.0<y3)
4272 	ChemP0 = x3 - dE_step*(double)ON2_Npoles*0.8;
4273       else
4274 	ChemP0 = x3 + dE_step*(double)ON2_Npoles*0.8;
4275 
4276       eachsum = 0.0;
4277 
4278       for (spin=0; spin<=SpinP_switch; spin++){
4279 	for (i=0; i<ON2_Npoles; i++){
4280 
4281 	  Real_Ene = startE + dE_step*(double)i;
4282 	  x = (Real_Ene - Trial_ChemP)*Beta;
4283 	  f0 = 1.0/(1.0 + exp(x));
4284 	  x = (Real_Ene - ChemP0)*Beta;
4285 	  f1 = 1.0/(1.0 + exp(x));
4286 	  eachsum += WRG[spin][i]*(f1-f0);
4287 	}
4288       }
4289 
4290       eachsum = eachsum/PI*dE_step;
4291       if (SpinP_switch==0) eachsum *= 2.0;
4292 
4293       MPI_Allreduce(&eachsum, &totalsum, 1, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
4294 
4295      if ( 1<measure_time ){
4296         printf("GGG1 SCF_iter=%2d Trial_ChemP=%15.12f ChemP0=%15.12f y3=%15.12f totalsum=%15.12f\n",
4297                SCF_iter,Trial_ChemP,ChemP0,y3,totalsum);
4298      }
4299 
4300       if ( 0.0<(y3*(y3+totalsum)) ){
4301 
4302         /* just use ChemP0 */
4303 
4304 	Trial_ChemP = ChemP0;
4305       }
4306       else {
4307 
4308 	/* refinement of Tria_ChemP by a bisection method */
4309 
4310 	/* set up the range of Trial_ChemP */
4311 
4312 	if (0.0<y3){
4313 	  ChemP_MAX = Trial_ChemP;
4314 	  ChemP_MIN = ChemP0;
4315 	}
4316 	else {
4317 	  ChemP_MAX = ChemP0;
4318 	  ChemP_MIN = Trial_ChemP;
4319 	}
4320 
4321 	/* do refinement */
4322 
4323 	do {
4324 
4325 	  ChemP0 = 0.50*(ChemP_MAX + ChemP_MIN);
4326 
4327 	  eachsum = 0.0;
4328 
4329 	  for (spin=0; spin<=SpinP_switch; spin++){
4330 	    for (i=0; i<ON2_Npoles; i++){
4331 
4332 	      Real_Ene = startE + dE_step*(double)i;
4333 	      x = (Real_Ene - Trial_ChemP)*Beta;
4334 	      f0 = 1.0/(1.0 + exp(x));
4335 	      x = (Real_Ene - ChemP0)*Beta;
4336 	      f1 = 1.0/(1.0 + exp(x));
4337 	      eachsum += WRG[spin][i]*(f1-f0);
4338 	    }
4339 	  }
4340 
4341           eachsum = eachsum/PI*dE_step;
4342           if (SpinP_switch==0) eachsum *= 2.0;
4343 
4344 	  MPI_Allreduce(&eachsum, &totalsum, 1, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
4345 
4346           if ( 1<measure_time ){
4347             printf("EEE myid=%2d SCF_iter=%2d ChemP=%15.12f diff=%15.12f\n",myid,SCF_iter,ChemP0,y3+totalsum);
4348 	  }
4349 
4350 	  if ( 0.0<=(y3+totalsum) ){
4351 	    ChemP_MAX = ChemP0;
4352 	  }
4353 	  else {
4354 	    ChemP_MIN = ChemP0;
4355 	  }
4356 
4357 	  if (fabs(y3+totalsum)<1.0e-8) po6 = 1;
4358 
4359 	} while (po6==0);
4360 
4361 	/* update Trial_ChemP */
4362 
4363 	Trial_ChemP = cTrial_ChemP + 2.0*(ChemP0-cTrial_ChemP);
4364       }
4365 
4366       pTrial_ChemP[0] = x3;
4367       pDiff_Num[0] = y3;
4368 
4369       DeltaMu[6] = Trial_ChemP - cTrial_ChemP;
4370       pDiff_Num[4] = y3;
4371 
4372     }
4373 
4374     /**********************************************
4375       Step 2:
4376     ***********************************************/
4377 
4378     else {
4379 
4380       Trial_ChemP = Find_Trial_ChemP_by_Muller( SCF_iter, num_loop,
4381                                                 pTrial_ChemP,  pDiff_Num,
4382                                                 DeltaMu, DeltaN, DeltaNp );
4383     }
4384 
4385     /* check its convergence */
4386 
4387     if (     fabs(y3)<1.0e-8
4388          || (fabs(y3)<1.0e-7 && SCF_iter<5) ){
4389 
4390       end_flag = 1;
4391     }
4392     else {
4393       num_loop++;
4394     }
4395 
4396   } while (end_flag==0 && num_loop<13);
4397 
4398   /* update the new ChemP */
4399 
4400   ChemP = Trial_ChemP;
4401 
4402   /* show num_loop */
4403 
4404   if (myid==Host_ID){
4405     printf("<Cluster2>  SCF_iter=%2d ChemP_Iter=%2d\n",SCF_iter,num_loop);
4406   }
4407 
4408   /*************************************************************************
4409            store the density matrix by the part 1 into an array DM
4410   **************************************************************************/
4411 
4412   MPI_Barrier(mpi_comm_level1);
4413 
4414   for (spin=0; spin<=SpinP_switch; spin++){
4415 
4416     MPI_Allreduce(&DMall[spin][0], &Sall[0], TNZE, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
4417 
4418     k = is1[myid];
4419     for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
4420       GA_AN = M2G[MA_AN];
4421       wanA = WhatSpecies[GA_AN];
4422       tnoA = Spe_Total_CNO[wanA];
4423       for (i=0; i<tnoA; i++){
4424 	for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
4425 	  GB_AN = natn[GA_AN][LB_AN];
4426 	  wanB = WhatSpecies[GB_AN];
4427 	  tnoB = Spe_Total_CNO[wanB];
4428 	  for (j=0; j<tnoB; j++){
4429 	    CDM[spin][MA_AN][LB_AN][i][j] = Sall[k];
4430 	    k++;
4431 	  }
4432 	}
4433       }
4434     }
4435   }
4436 
4437   /****************************************************
4438                calculation of band energy
4439   ****************************************************/
4440 
4441   My_Eele1[0] = 0.0;
4442   My_Eele1[1] = 0.0;
4443 
4444   for (spin=0; spin<=SpinP_switch; spin++){
4445     for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
4446       GA_AN = M2G[MA_AN];
4447       wanA = WhatSpecies[GA_AN];
4448       tnoA = Spe_Total_CNO[wanA];
4449       for (j=0; j<=FNAN[GA_AN]; j++){
4450 	wanB = WhatSpecies[natn[GA_AN][j]];
4451 	tnoB = Spe_Total_CNO[wanB];
4452 	for (k=0; k<tnoA; k++){
4453 	  for (l=0; l<tnoB; l++){
4454 	    My_Eele1[spin] += CDM[spin][MA_AN][j][k][l]*nh[spin][MA_AN][j][k][l];
4455 	  }
4456 	}
4457       }
4458     }
4459   }
4460 
4461   MPI_Barrier(mpi_comm_level1);
4462 
4463   /* MPI, My_Eele1 */
4464   for (spin=0; spin<=SpinP_switch; spin++){
4465     MPI_Allreduce(&My_Eele1[spin], &Eele1[spin], 1, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
4466   }
4467 
4468   if (SpinP_switch==0) Eele1[1] = Eele1[0];
4469 
4470   /****************************************************
4471      freeing Index_BC, SindB, EindB, SindC, EindE
4472   ****************************************************/
4473 
4474   free_flag = 1;
4475   Nested_Dissection(free_flag,Nmat,MP,&Depth_Level,&Number_Division);
4476 
4477   /****************************************************
4478                    freeing of arrays
4479   ****************************************************/
4480 
4481   free(is1);
4482 
4483   free(NZE);
4484   free(PZE);
4485 
4486   for (spin=0; spin<(SpinP_switch+1); spin++){
4487     free(WRG[spin]);
4488   }
4489   free(WRG);
4490 
4491   for (i=0; i<Nmat; i++){
4492     free(conv_ind2[i]);
4493   }
4494   free(conv_ind2);
4495 
4496   for (spin=0; spin<(SpinP_switch+1); spin++){
4497     free(DMall[spin]);
4498   }
4499   free(DMall);
4500 
4501   for (spin=0; spin<(SpinP_switch+1); spin++){
4502     free(Hall[spin]);
4503   }
4504   free(Hall);
4505 
4506   free(Sall);
4507 
4508   free(MP);
4509   free(conv_index_row);
4510   free(conv_index_col);
4511 
4512   /* freeing of arrays for the first world */
4513 
4514   if (Num_Comm_World1<=numprocs){
4515     MPI_Comm_free(&MPI_CommWD1[myworld1]);
4516   }
4517 
4518   free(NPROCS_ID1);
4519   free(Comm_World1);
4520   free(NPROCS_WD1);
4521   free(Comm_World_StartID1);
4522   free(MPI_CommWD1);
4523 
4524   /* freeing of arrays for the second B world */
4525 
4526   if (Nloop1<=numprocs){
4527 
4528     MPI_Comm_free(&MPI_CommWD2B[myworld2B]);
4529     free(NPROCS_ID2B);
4530     free(Comm_World2B);
4531     free(NPROCS_WD2B);
4532     free(Comm_World_StartID2B);
4533     free(MPI_CommWD2B);
4534   }
4535 
4536   /* for elapsed time */
4537   MPI_Barrier(mpi_comm_level1);
4538   dtime(&TEtime);
4539   return (TEtime-TStime);
4540 }
4541 
4542 
4543 
4544 
Find_Trial_ChemP_by_Muller(int SCF_iter,int num_loop,double pTrial_ChemP[4],double pDiff_Num[5],double DeltaMu[7],double DeltaN[3],double DeltaNp[3])4545 static double Find_Trial_ChemP_by_Muller(
4546         int SCF_iter, int num_loop,
4547         double pTrial_ChemP[4], double pDiff_Num[5],
4548         double DeltaMu[7], double DeltaN[3], double DeltaNp[3])
4549 {
4550   static double stepw;
4551   double y31,x0,x1,x2,x3,y0,y1,y2,y3;
4552   int i,i0,i1,i2,num4,po3,po4;
4553   double a,b,c,d,g0,g1,g2,dFa,dFb,dFc,dF,dF0,F;
4554   double xx[5],yy[5],zz[5],yy0[5];
4555   double z0,z1,z2,a1,a2,a3,det;
4556   double a11,a12,a13,a21,a22,a23,a31,a32,a33;
4557   double scaling4;
4558   double Trial_ChemP,Trial_ChemP0;
4559   int flag_nan;
4560   char nanchar[300];
4561 
4562   Trial_ChemP = pTrial_ChemP[3];
4563 
4564   x0 = pTrial_ChemP[0];
4565   x1 = pTrial_ChemP[1];
4566   x2 = pTrial_ChemP[2];
4567   x3 = pTrial_ChemP[3];
4568 
4569   y0  = pDiff_Num[0];
4570   y1  = pDiff_Num[1];
4571   y2  = pDiff_Num[2];
4572   y3  = pDiff_Num[3];
4573   y31 = pDiff_Num[4];
4574 
4575   /*******************************************************
4576                store history of stepw and y3
4577   *******************************************************/
4578 
4579   if (num_loop==2){
4580 
4581     DeltaMu[2] = DeltaMu[1];
4582     DeltaN[2]  = DeltaN[1];
4583     DeltaNp[2] = DeltaNp[1];
4584 
4585     DeltaMu[1] = DeltaMu[0];
4586     DeltaN[1]  = DeltaN[0];
4587     DeltaNp[1] = DeltaNp[0];
4588 
4589     DeltaMu[0] = DeltaMu[6];
4590     DeltaN[0]  = y31;
4591     DeltaNp[0] = y3;
4592   }
4593 
4594   /*******************************************************
4595                 estimate a new chemical potential
4596   *******************************************************/
4597 
4598   /* num_loop==1 */
4599 
4600   if (num_loop==1){
4601 
4602     y31 = y3;
4603 
4604     if (SCF_iter<4){
4605 
4606       stepw = fabs(y3);
4607       if (0.02<stepw) stepw = -sgn(y3)*0.02;
4608       else            stepw = -(y3 + 10.0*y3*y3);
4609       Trial_ChemP = Trial_ChemP + stepw;
4610     }
4611 
4612     else {
4613 
4614       /* if (4<=SCF_iter), estimate Trial_ChemP using an extrapolation. */
4615 
4616       z0 = DeltaNp[0];
4617       z1 = DeltaNp[1];
4618       z2 = DeltaNp[2];
4619 
4620       a11 = DeltaN[0]; a12 = DeltaMu[0]; a13 = DeltaN[0]*DeltaMu[0];
4621       a21 = DeltaN[1]; a22 = DeltaMu[1]; a23 = DeltaN[1]*DeltaMu[1];
4622       a31 = DeltaN[2]; a32 = DeltaMu[2]; a33 = DeltaN[2]*DeltaMu[2];
4623 
4624       det = a11*a22*a33+a21*a32*a13+a31*a12*a23-a11*a32*a23-a31*a22*a13-a21*a12*a33;
4625 
4626       a1 = ((a22*a33-a23*a32)*z0 + (a13*a32-a12*a33)*z1 + (a12*a23-a13*a22)*z2)/det;
4627       a2 = ((a23*a31-a21*a33)*z0 + (a11*a33-a13*a31)*z1 + (a13*a21-a11*a23)*z2)/det;
4628       a3 = ((a21*a32-a22*a31)*z0 + (a12*a31-a11*a32)*z1 + (a11*a22-a12*a21)*z2)/det;
4629 
4630       stepw = -a1*y3/(a2+a3*y3);
4631 
4632       flag_nan = 0;
4633       sprintf(nanchar,"%8.4f",stepw);
4634 
4635       if (strstr(nanchar,"nan")!=NULL || strstr(nanchar,"NaN")!=NULL
4636 	  || strstr(nanchar,"inf")!=NULL || strstr(nanchar,"Inf")!=NULL){
4637 
4638 	flag_nan = 1;
4639 
4640 	stepw = fabs(y3);
4641 	if (0.02<stepw) stepw = -sgn(y3)*0.02;
4642 	else            stepw = -(y3 + y3*y3);
4643 	Trial_ChemP = Trial_ChemP + stepw;
4644       }
4645       else {
4646 	Trial_ChemP = Trial_ChemP + stepw;
4647       }
4648 
4649     }
4650 
4651     x0 = x3;
4652     y0 = y3;
4653   }
4654 
4655   /* num_loop==2 */
4656 
4657   else if (num_loop==2){
4658 
4659     x1 = x3;
4660     y1 = y3;
4661 
4662     Trial_ChemP0 = (x1*y0-x0*y1)/(y0-y1);
4663 
4664     if ( 0.1<fabs(Trial_ChemP-Trial_ChemP0) ){
4665       Trial_ChemP = Trial_ChemP + 0.1*sgn(Trial_ChemP0 - Trial_ChemP);
4666     }
4667     else {
4668 
4669       Trial_ChemP = Trial_ChemP0;
4670     }
4671   }
4672 
4673   /* num_loop==3 */
4674 
4675   else if (num_loop==3){
4676 
4677     x2 = x3;
4678     y2 = y3;
4679 
4680     a = y0/(x0-x2)/(x0-x1) - y1/(x1-x2)/(x0-x1) + y2/(x0-x2)/(x1-x2);
4681     b = -(x2+x1)*y0/(x0-x2)/(x0-x1) + (x0+x2)*y1/(x1-x2)/(x0-x1) - (x1+x0)*y2/(x0-x2)/(x1-x2);
4682     c = x1*x2*y0/(x0-x2)/(x0-x1) - x2*x0*y1/(x1-x2)/(x0-x1) + x0*x1*y2/(x0-x2)/(x1-x2);
4683 
4684     /* refinement of a, b, and c by the iterative method */
4685 
4686     po4 = 0;
4687     num4 = 0;
4688     scaling4 = 0.3;
4689     dF = 1000.0;
4690 
4691     do {
4692 
4693       g0 = a*x0*x0 + b*x0 + c - y0;
4694       g1 = a*x1*x1 + b*x1 + c - y1;
4695       g2 = a*x2*x2 + b*x2 + c - y2;
4696 
4697       dFa = 2.0*(g0*x0*x0 + g1*x1*x1 + g2*x2*x2);
4698       dFb = 2.0*(g0*x0 + g1*x1 + g2*x2);
4699       dFc = 2.0*(g0 + g1 + g2);
4700 
4701       F = g0*g0 + g1*g1 + g2*g2;
4702       dF0 = dF;
4703       dF = sqrt(fabs(dFa*dFa + dFb*dFb + dFc*dFc));
4704 
4705       if (dF0<dF) scaling4 /= 1.5;
4706 
4707       /*
4708         printf("3 F=%18.15f dF=%18.14f scaling4=%15.12f\n",F,dF,scaling4);
4709       */
4710 
4711       a -= scaling4*dFa;
4712       b -= scaling4*dFb;
4713       c -= scaling4*dFc;
4714 
4715       if (dF<1.0e-11) po4 = 1;
4716       num4++;
4717 
4718     } while (po4==0 && num4<100);
4719 
4720     /* calculte Trial_ChemP */
4721 
4722     /*
4723       printf("3 a=%18.15f\n",a);
4724       printf("3 b=%18.15f\n",b);
4725       printf("3 c=%18.15f\n",c);
4726     */
4727 
4728     if (0.0<=b)
4729       Trial_ChemP0 = -2.0*c/(b+sqrt(fabs(b*b-4*a*c)));
4730     else
4731       Trial_ChemP0 = (-b+sqrt(fabs(b*b-4*a*c)))/(2.0*a);
4732 
4733     flag_nan = 0;
4734     sprintf(nanchar,"%8.4f",Trial_ChemP0);
4735 
4736     if (   strstr(nanchar,"nan")!=NULL || strstr(nanchar,"NaN")!=NULL
4737 	   || strstr(nanchar,"inf")!=NULL || strstr(nanchar,"Inf")!=NULL){
4738 
4739       flag_nan = 1;
4740 
4741       stepw = fabs(y3);
4742       if (0.02<stepw) stepw = -sgn(y3)*0.02;
4743       else            stepw = -(y3 + y3*y3);
4744       Trial_ChemP = Trial_ChemP + stepw;
4745     }
4746     else {
4747 
4748       if ( 0.02<fabs(Trial_ChemP-Trial_ChemP0) ){
4749 	Trial_ChemP = Trial_ChemP + 0.02*sgn(Trial_ChemP0 - Trial_ChemP);
4750       }
4751       else {
4752 	Trial_ChemP = Trial_ChemP0;
4753       }
4754     }
4755 
4756     /*
4757       printf("3 Trial_ChemP0=%18.15f\n",Trial_ChemP0);
4758       printf("3 Trial_ChemP =%18.15f\n",Trial_ChemP);
4759     */
4760 
4761   }
4762 
4763   /* if (4<=num_loop) */
4764 
4765   else {
4766 
4767     Trial_ChemP0 = Trial_ChemP;
4768 
4769     xx[1] = x0;
4770     xx[2] = x1;
4771     xx[3] = x2;
4772     xx[4] = x3;
4773 
4774     yy[1] = y0;
4775     yy[2] = y1;
4776     yy[3] = y2;
4777     yy[4] = y3;
4778 
4779     qsort_double(4, xx, yy);
4780 
4781     /* check whether the zero point is passed or not. */
4782 
4783     po3 = 0;
4784     for (i=1; i<4; i++){
4785       if (yy[i]*yy[i+1]<0.0){
4786 	i0 = i;
4787 	po3 = 1;
4788       }
4789     }
4790 
4791     /*
4792       for (i=1; i<=4; i++){
4793       printf("num_loop=%2d i0=%2d i=%2d xx=%18.15f yy=%18.15f\n",num_loop,i0,i,xx[i],yy[i]);
4794       }
4795     */
4796 
4797     if (po3==1){
4798 
4799       if (i0==1){
4800 	x0 = xx[i0+0];
4801 	x1 = xx[i0+1];
4802 	x2 = xx[i0+2];
4803 
4804 	y0 = yy[i0+0];
4805 	y1 = yy[i0+1];
4806 	y2 = yy[i0+2];
4807       }
4808 
4809       else if (i0==2) {
4810 
4811 	if (fabs(yy[i0-1])<fabs(yy[i0+2])){
4812 
4813 	  x0 = xx[i0-1];
4814 	  x1 = xx[i0+0];
4815 	  x2 = xx[i0+1];
4816 
4817 	  y0 = yy[i0-1];
4818 	  y1 = yy[i0+0];
4819 	  y2 = yy[i0+1];
4820 	}
4821 	else {
4822 	  x0 = xx[i0+0];
4823 	  x1 = xx[i0+1];
4824 	  x2 = xx[i0+2];
4825 
4826 	  y0 = yy[i0+0];
4827 	  y1 = yy[i0+1];
4828 	  y2 = yy[i0+2];
4829 	}
4830       }
4831 
4832       else if (i0==3) {
4833 	x0 = xx[i0-1];
4834 	x1 = xx[i0+0];
4835 	x2 = xx[i0+1];
4836 
4837 	y0 = yy[i0-1];
4838 	y1 = yy[i0+0];
4839 	y2 = yy[i0+1];
4840       }
4841 
4842       a = y0/(x0-x2)/(x0-x1) - y1/(x1-x2)/(x0-x1) + y2/(x0-x2)/(x1-x2);
4843       b = -(x2+x1)*y0/(x0-x2)/(x0-x1) + (x0+x2)*y1/(x1-x2)/(x0-x1) - (x1+x0)*y2/(x0-x2)/(x1-x2);
4844       c = x1*x2*y0/(x0-x2)/(x0-x1) - x2*x0*y1/(x1-x2)/(x0-x1) + x0*x1*y2/(x0-x2)/(x1-x2);
4845 
4846       /*
4847 	printf("x0=%18.15f y0=%18.15f\n",x0,y0);
4848 	printf("x1=%18.15f y1=%18.15f\n",x1,y1);
4849 	printf("x2=%18.15f y2=%18.15f\n",x2,y2);
4850 
4851 	printf("a=%18.15f\n",a);
4852 	printf("b=%18.15f\n",b);
4853 	printf("c=%18.15f\n",c);
4854       */
4855 
4856       /* refinement of a, b, and c by the iterative method */
4857 
4858       po4 = 0;
4859       num4 = 0;
4860       scaling4 = 0.3;
4861       dF = 1000.0;
4862 
4863       do {
4864 
4865 	g0 = a*x0*x0 + b*x0 + c - y0;
4866 	g1 = a*x1*x1 + b*x1 + c - y1;
4867 	g2 = a*x2*x2 + b*x2 + c - y2;
4868 
4869 	dFa = 2.0*(g0*x0*x0 + g1*x1*x1 + g2*x2*x2);
4870 	dFb = 2.0*(g0*x0 + g1*x1 + g2*x2);
4871 	dFc = 2.0*(g0 + g1 + g2);
4872 
4873 	F = g0*g0 + g1*g1 + g2*g2;
4874 	dF0 = dF;
4875 	dF = sqrt(fabs(dFa*dFa + dFb*dFb + dFc*dFc));
4876 
4877 	if (dF0<dF) scaling4 /= 1.5;
4878 
4879 	/*
4880           printf("F=%18.15f dF=%18.14f scaling4=%15.12f\n",F,dF,scaling4);
4881 	*/
4882 
4883 	a -= scaling4*dFa;
4884 	b -= scaling4*dFb;
4885 	c -= scaling4*dFc;
4886 
4887 	if (dF<1.0e-11) po4 = 1;
4888 	num4++;
4889 
4890       } while (po4==0 && num4<100);
4891 
4892       /* calculte Trial_ChemP */
4893 
4894       if (0.0<=b)
4895 	Trial_ChemP0 = -2.0*c/(b+sqrt(fabs(b*b-4*a*c)));
4896       else
4897 	Trial_ChemP0 = (-b+sqrt(fabs(b*b-4*a*c)))/(2.0*a);
4898 
4899       flag_nan = 0;
4900       sprintf(nanchar,"%8.4f",Trial_ChemP0);
4901 
4902       if (   strstr(nanchar,"nan")!=NULL || strstr(nanchar,"NaN")!=NULL
4903 	     || strstr(nanchar,"inf")!=NULL || strstr(nanchar,"Inf")!=NULL){
4904 
4905 	flag_nan = 1;
4906 
4907 	stepw = fabs(y3);
4908 	if (0.02<stepw) stepw = -sgn(y3)*0.02;
4909 	else            stepw = -(y3 + y3*y3);
4910 	Trial_ChemP = Trial_ChemP + stepw;
4911       }
4912       else {
4913 
4914 	if ( 0.02<fabs(Trial_ChemP-Trial_ChemP0) ){
4915 	  Trial_ChemP = Trial_ChemP + 0.02*sgn(Trial_ChemP0 - Trial_ChemP);
4916 	}
4917 	else {
4918 	  Trial_ChemP = Trial_ChemP0;
4919 	}
4920       }
4921 
4922       /*
4923         printf("F=%18.15f dF=%18.14f Trial_ChemP=%18.15f\n",F,dF,Trial_ChemP);
4924       */
4925     }
4926 
4927     else {
4928 
4929       yy[1] = fabs(y0);
4930       yy[2] = fabs(y1);
4931       yy[3] = fabs(y2);
4932       yy[4] = fabs(y3);
4933 
4934       zz[1] = 1;
4935       zz[2] = 2;
4936       zz[3] = 3;
4937       zz[4] = 4;
4938 
4939       xx[1] = x0;
4940       xx[2] = x1;
4941       xx[3] = x2;
4942       xx[4] = x3;
4943 
4944       yy0[1] = y0;
4945       yy0[2] = y1;
4946       yy0[3] = y2;
4947       yy0[4] = y3;
4948 
4949       qsort_double(4, yy, zz);
4950 
4951       i0 = (int)zz[1];
4952       i1 = (int)zz[2];
4953       i2 = (int)zz[3];
4954 
4955       x0 = xx[i0];
4956       x1 = xx[i1];
4957       x2 = xx[i2];
4958 
4959       y0 = yy0[i0];
4960       y1 = yy0[i1];
4961       y2 = yy0[i2];
4962 
4963       Trial_ChemP0 = (x1*y0-x0*y1)/(y0-y1);
4964 
4965       flag_nan = 0;
4966       sprintf(nanchar,"%8.4f",Trial_ChemP0);
4967 
4968       if (   strstr(nanchar,"nan")!=NULL || strstr(nanchar,"NaN")!=NULL
4969 	     || strstr(nanchar,"inf")!=NULL || strstr(nanchar,"Inf")!=NULL){
4970 
4971 	flag_nan = 1;
4972 
4973 	stepw = fabs(y3);
4974 	if (0.02<stepw) stepw = -sgn(y3)*0.02;
4975 	else            stepw = -(y3 + y3*y3);
4976 	Trial_ChemP = Trial_ChemP + stepw;
4977       }
4978       else {
4979 
4980 	if ( 0.02<fabs(Trial_ChemP-Trial_ChemP0) ){
4981 	  Trial_ChemP = Trial_ChemP + 0.02*sgn(Trial_ChemP0 - Trial_ChemP);
4982 	}
4983 	else {
4984 	  Trial_ChemP = Trial_ChemP0;
4985 	}
4986       }
4987 
4988     } /* else */
4989   } /* else */
4990 
4991   /* store pTrial_ChemP and pDiff_Num */
4992 
4993   pTrial_ChemP[0] = x0;
4994   pTrial_ChemP[1] = x1;
4995   pTrial_ChemP[2] = x2;
4996   pTrial_ChemP[3] = x3;
4997 
4998   pDiff_Num[0] = y0;
4999   pDiff_Num[1] = y1;
5000   pDiff_Num[2] = y2;
5001   pDiff_Num[3] = y3;
5002   pDiff_Num[4] = y31;
5003 
5004   DeltaMu[6] = stepw;
5005 
5006   /* return Traial_ChemP */
5007   return Trial_ChemP;
5008 }
5009