1 /**********************************************************************
2   Band_DFT_Col.c:
3 
4      Band_DFT_Col.c is a subroutine to perform band calculations
5      based on a collinear DFT
6 
7   Log of Band_DFT_Col.c:
8 
9      22/Nov/2001  Released by T.Ozaki
10 
11 ***********************************************************************/
12 
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #include <math.h>
17 #include <time.h>
18 #include "openmx_common.h"
19 #include "lapack_prototypes.h"
20 #include "mpi.h"
21 #include <omp.h>
22 
23 #define  measure_time  0
24 
25 
26 /*---------- added by TOYODA 17/FEB/2010 */
27 #include "exx_debug.h"
28 /*---------- until here */
29 
30 
31 
Band_DFT_Col(int SCF_iter,int knum_i,int knum_j,int knum_k,int SpinP_switch,double ***** nh,double ***** ImNL,double **** CntOLP,double ***** CDM,double ***** EDM,double Eele0[2],double Eele1[2],int * MP,int * order_GA,double * ko,double * koS,double *** EIGEN,double * H1,double * S1,double * CDM1,double * EDM1,dcomplex ** H,dcomplex ** S,dcomplex ** C,dcomplex * BLAS_S,int *** k_op,int * T_k_op,int ** T_k_ID,double * T_KGrids1,double * T_KGrids2,double * T_KGrids3,int myworld1,int * NPROCS_ID1,int * Comm_World1,int * NPROCS_WD1,int * Comm_World_StartID1,MPI_Comm * MPI_CommWD1,int myworld2,int * NPROCS_ID2,int * NPROCS_WD2,int * Comm_World2,int * Comm_World_StartID2,MPI_Comm * MPI_CommWD2,EXX_t * exx,dcomplex **** exx_CDM,double * Uexx)32 double Band_DFT_Col(int SCF_iter,
33                     int knum_i, int knum_j, int knum_k,
34 		    int SpinP_switch,
35 		    double *****nh,
36 		    double *****ImNL,
37 		    double ****CntOLP,
38 		    double *****CDM,
39 		    double *****EDM,
40 		    double Eele0[2], double Eele1[2],
41 		    int *MP,
42 		    int *order_GA,
43 		    double *ko,
44 		    double *koS,
45 		    double ***EIGEN,
46 		    double *H1,
47 		    double *S1,
48 		    double *CDM1,
49 		    double *EDM1,
50 		    dcomplex **H,
51 		    dcomplex **S,
52 		    dcomplex **C,
53                     dcomplex *BLAS_S,
54 		    int ***k_op,
55 		    int *T_k_op,
56 		    int **T_k_ID,
57 		    double *T_KGrids1,
58 		    double *T_KGrids2,
59 		    double *T_KGrids3,
60                     int myworld1,
61 		    int *NPROCS_ID1,
62 		    int *Comm_World1,
63 		    int *NPROCS_WD1,
64 		    int *Comm_World_StartID1,
65 		    MPI_Comm *MPI_CommWD1,
66                     int myworld2,
67 		    int *NPROCS_ID2,
68 		    int *NPROCS_WD2,
69 		    int *Comm_World2,
70 		    int *Comm_World_StartID2,
71 		    MPI_Comm *MPI_CommWD2,
72 		    EXX_t *exx,
73 		    dcomplex ****exx_CDM,
74 		    double *Uexx)
75 {
76   static int firsttime=1;
77   int i,j,k,l,m,n,wan,MaxN,i0,ks;
78   int i1,i1s,j1,ia,jb,lmax,po,po1,spin,s1,e1;
79   int num2,RnB,l1,l2,l3,loop_num,ns,ne;
80   int ct_AN,h_AN,wanA,tnoA,wanB,tnoB;
81   int MA_AN,GA_AN,Anum,num_kloop0;
82   int T_knum,S_knum,E_knum,kloop,kloop0;
83   double av_num,lumos;
84   double time0;
85   int LB_AN,GB_AN,Bnum;
86   double k1,k2,k3,Fkw;
87   double sum,sumi,sum_weights;
88   double Num_State;
89   double My_Num_State;
90   double FermiF;
91   double tmp,eig,kw,EV_cut0;
92   double x,Dnum,Dnum2,AcP,ChemP_MAX,ChemP_MIN;
93   double *VecFkw;
94   double *VecFkwE;
95   int *is1,*ie1;
96   int *is2,*ie2;
97   MPI_Status *stat_send;
98   MPI_Request *request_send;
99   MPI_Request *request_recv;
100   int *My_NZeros;
101   int *SP_NZeros;
102   int *SP_Atoms;
103   MPI_Comm *MPI_CommWD_CDM1;
104   int *MPI_CDM1_flag;
105 
106   int all_knum;
107   dcomplex Ctmp1,Ctmp2;
108   int ii,ij,ik;
109   int BM,BN,BK;
110   double u2,v2,uv,vu;
111   double d1,d2;
112   double My_Eele1[2];
113   double TZ,dum,sumE,kRn,si,co;
114   double Resum,ResumE,Redum,Redum2;
115   double Imsum,ImsumE,Imdum,Imdum2;
116   double TStime,TEtime,SiloopTime,EiloopTime;
117   double Stime,Etime,Stime0,Etime0;
118   double FermiEps=1.0e-14;
119   double x_cut=30.0;
120   double My_Eele0[2];
121 
122   char file_EV[YOUSO10];
123   FILE *fp_EV;
124 #ifdef xt3
125   char buf[fp_bsize];          /* setvbuf */
126 #endif
127   int AN,Rn,size_H1;
128   int parallel_mode;
129   int numprocs0,myid0;
130   int ID,ID0,ID1;
131   int numprocs1,myid1;
132   int numprocs2,myid2;
133   int Num_Comm_World1;
134   int Num_Comm_World2;
135 
136   int tag=999,IDS,IDR;
137   MPI_Status stat;
138   MPI_Request request;
139 
140   double time1,time2,time3;
141   double time4,time5,time6;
142   double time7,time8,time9;
143   double time10,time11,time12;
144   double time81,time82,time83;
145   double time84,time85;
146   double time51;
147 
148   dcomplex *BLAS_H;
149   dcomplex *BLAS_C;
150 
151   /* for OpenMP */
152   int OMPID,Nthrds,Nprocs;
153 
154   /*---------- added by TOYODA 16/FEB/2010 */
155   const int *exx_ep_atom1;
156   const int *exx_ep_atom2;
157   const int *exx_ep_cell;
158   int iRn_x, iRn_y, iRn_z, iRn;
159   int ncd, nshell_ep, iep, nep;
160   /*---------- until here */
161 
162   /* for time */
163   dtime(&TStime);
164 
165   time1 = 0.0;
166   time2 = 0.0;
167   time3 = 0.0;
168   time4 = 0.0;
169   time5 = 0.0;
170   time6 = 0.0;
171   time7 = 0.0;
172   time8 = 0.0;
173   time9 = 0.0;
174   time10 = 0.0;
175   time11 = 0.0;
176   time12 = 0.0;
177   time81 = 0.0;
178   time82 = 0.0;
179   time83 = 0.0;
180   time84 = 0.0;
181   time85 = 0.0;
182   time51 = 0.0;
183 
184   /*---------- added by TOYODA 16/FEB/2010 */
185   if (g_exx_switch) {
186     nep          = EXX_Number_of_EP(exx);
187     nshell_ep    = EXX_Number_of_EP_Shells(exx);
188     exx_ep_atom1 = EXX_Array_EP_Atom1(exx);
189     exx_ep_atom2 = EXX_Array_EP_Atom2(exx);
190     exx_ep_cell  = EXX_Array_EP_Cell(exx);
191     ncd = 2*nshell_ep + 1;
192     Uexx[0] = 0.0;
193     Uexx[1] = 0.0;
194   }
195   /*---------- until here */
196 
197   /* MPI */
198   MPI_Comm_size(mpi_comm_level1,&numprocs0);
199   MPI_Comm_rank(mpi_comm_level1,&myid0);
200   MPI_Barrier(mpi_comm_level1);
201 
202   Num_Comm_World1 = SpinP_switch + 1;
203 
204   /***********************************************
205        for pallalel calculations in myworld1
206   ***********************************************/
207 
208   MPI_Comm_size(MPI_CommWD1[myworld1],&numprocs1);
209   MPI_Comm_rank(MPI_CommWD1[myworld1],&myid1);
210 
211   /****************************************************
212    find the number of basis functions, n
213   ****************************************************/
214 
215   n = 0;
216   for (i=1; i<=atomnum; i++){
217     wanA  = WhatSpecies[i];
218     n += Spe_Total_CNO[wanA];
219   }
220 
221   /****************************************************
222    find TZ
223   ****************************************************/
224 
225   TZ = 0.0;
226   for (i=1; i<=atomnum; i++){
227     wan = WhatSpecies[i];
228     TZ += Spe_Core_Charge[wan];
229   }
230 
231   /***********************************************
232      find the number of states to be solved
233   ***********************************************/
234 
235   lumos = (double)n*0.300;
236   if (lumos<60.0) lumos = 400.0;
237   MaxN = (TZ-system_charge)/2 + (int)lumos;
238   if (n<MaxN) MaxN = n;
239 
240   /***********************************************
241      allocation of arrays
242   ***********************************************/
243 
244   VecFkw  = (double*)malloc(sizeof(double)*(n+2));
245   VecFkwE = (double*)malloc(sizeof(double)*(n+2));
246 
247   BLAS_H = (dcomplex*)malloc(sizeof(dcomplex)*n*n);
248   BLAS_C = (dcomplex*)malloc(sizeof(dcomplex)*n*n);
249 
250   My_NZeros = (int*)malloc(sizeof(int)*numprocs0);
251   SP_NZeros = (int*)malloc(sizeof(int)*numprocs0);
252   SP_Atoms = (int*)malloc(sizeof(int)*numprocs0);
253 
254   MPI_CommWD_CDM1 = (MPI_Comm*)malloc(sizeof(MPI_Comm)*numprocs0);
255   MPI_CDM1_flag = (int*)malloc(sizeof(int)*numprocs0);
256   for (i=0; i<numprocs0; i++) MPI_CDM1_flag[i] = 0;
257 
258   if (firsttime) {
259   PrintMemory("Band_DFT: ko", sizeof(double)*(n+1),NULL);
260   PrintMemory("Band_DFT: koS",sizeof(double)*(n+1),NULL);
261   PrintMemory("Band_DFT: H",  sizeof(dcomplex)*(n+1)*(n+1),NULL);
262   PrintMemory("Band_DFT: S",  sizeof(dcomplex)*(n+1)*(n+1),NULL);
263   PrintMemory("Band_DFT: C",  sizeof(dcomplex)*(n+1)*(n+1),NULL);
264   }
265 
266   /***********************************************
267               k-points by regular mesh
268   ***********************************************/
269 
270   if (way_of_kpoint==1){
271 
272     /**************************************************************
273      k_op[i][j][k]: weight of DOS
274                  =0   no calc.
275                  =1   G-point
276                  =2   which has k<->-k point
277         Now , only the relation, E(k)=E(-k), is used.
278 
279     Future release: k_op will be used for symmetry operation
280     *************************************************************/
281 
282     for (i=0;i<=knum_i-1;i++) {
283       for (j=0;j<=knum_j-1;j++) {
284 	for (k=0;k<=knum_k-1;k++) {
285 	  k_op[i][j][k]=-999;
286 	}
287       }
288     }
289 
290     for (i=0;i<=knum_i-1;i++) {
291       for (j=0;j<=knum_j-1;j++) {
292 	for (k=0;k<=knum_k-1;k++) {
293 	  if ( k_op[i][j][k]==-999 ) {
294 	    k_inversion(i,j,k,knum_i,knum_j,knum_k,&ii,&ij,&ik);
295 	    if ( i==ii && j==ij && k==ik ) {
296 	      k_op[i][j][k]    = 1;
297 	    }
298 
299 	    else {
300 	      k_op[i][j][k]    = 2;
301 	      k_op[ii][ij][ik] = 0;
302 	    }
303 	  }
304 	} /* k */
305       } /* j */
306     } /* i */
307 
308     /***********************************
309        one-dimentionalize for MPI
310     ************************************/
311 
312     T_knum = 0;
313     for (i=0; i<=(knum_i-1); i++){
314       for (j=0; j<=(knum_j-1); j++){
315 	for (k=0; k<=(knum_k-1); k++){
316 	  if (0<k_op[i][j][k]){
317 	    T_knum++;
318 	  }
319 	}
320       }
321     }
322 
323     /* set T_KGrids1,2,3 and T_k_op */
324 
325     T_knum = 0;
326     for (i=0; i<knum_i; i++){
327 
328       if (knum_i==1)  k1 = 0.0;
329       else            k1 = -0.5 + (2.0*(double)i+1.0)/(2.0*(double)knum_i) + Shift_K_Point;
330 
331       for (j=0; j<knum_j; j++){
332 
333 	if (knum_j==1)  k2 = 0.0;
334 	else            k2 = -0.5 + (2.0*(double)j+1.0)/(2.0*(double)knum_j) - Shift_K_Point;
335 
336 	for (k=0; k<knum_k; k++){
337 
338 	  if (knum_k==1)  k3 = 0.0;
339 	  else            k3 = -0.5 + (2.0*(double)k+1.0)/(2.0*(double)knum_k) + 2.0*Shift_K_Point;
340 
341 	  if (0<k_op[i][j][k]){
342 
343 	    T_KGrids1[T_knum] = k1;
344 	    T_KGrids2[T_knum] = k2;
345 	    T_KGrids3[T_knum] = k3;
346 	    T_k_op[T_knum]    = k_op[i][j][k];
347 
348 	    T_knum++;
349 	  }
350 	}
351       }
352     }
353 
354     if (myid0==Host_ID && 0<level_stdout){
355 
356       printf(" KGrids1: ");fflush(stdout);
357       for (i=0;i<=knum_i-1;i++){
358 	if (knum_i==1)  k1 = 0.0;
359 	else            k1 = -0.5 + (2.0*(double)i+1.0)/(2.0*(double)knum_i) + Shift_K_Point;
360 	printf("%9.5f ",k1);fflush(stdout);
361       }
362       printf("\n");fflush(stdout);
363 
364       printf(" KGrids2: ");fflush(stdout);
365 
366       for (i=0;i<=knum_j-1;i++){
367 	if (knum_j==1)  k2 = 0.0;
368 	else            k2 = -0.5 + (2.0*(double)i+1.0)/(2.0*(double)knum_j) - Shift_K_Point;
369 	printf("%9.5f ",k2);fflush(stdout);
370       }
371       printf("\n");fflush(stdout);
372 
373       printf(" KGrids3: ");fflush(stdout);
374       for (i=0;i<=knum_k-1;i++){
375 	if (knum_k==1)  k3 = 0.0;
376 	else            k3 = -0.5 + (2.0*(double)i+1.0)/(2.0*(double)knum_k) + 2.0*Shift_K_Point;
377 	printf("%9.5f ",k3);fflush(stdout);
378       }
379       printf("\n");fflush(stdout);
380     }
381 
382   }
383 
384   /***********************************************
385                 Monkhorst-Pack k-points
386   ***********************************************/
387 
388   else if (way_of_kpoint==2){
389 
390     T_knum = num_non_eq_kpt;
391 
392     for (k=0; k<num_non_eq_kpt; k++){
393       T_KGrids1[k] = NE_KGrids1[k];
394       T_KGrids2[k] = NE_KGrids2[k];
395       T_KGrids3[k] = NE_KGrids3[k];
396       T_k_op[k]    = NE_T_k_op[k];
397     }
398   }
399 
400   /***********************************************
401             calculate the sum of weights
402   ***********************************************/
403 
404   sum_weights = 0.0;
405   for (k=0; k<T_knum; k++){
406     sum_weights += (double)T_k_op[k];
407   }
408 
409   /***********************************************
410            allocate k-points into processors
411   ***********************************************/
412 
413   if (numprocs1<T_knum){
414 
415     /* set parallel_mode */
416     parallel_mode = 0;
417 
418     /* allocation of kloop to ID */
419 
420     for (ID=0; ID<numprocs1; ID++){
421       tmp = (double)T_knum/(double)numprocs1;
422       S_knum = (int)((double)ID*(tmp+1.0e-12));
423       E_knum = (int)((double)(ID+1)*(tmp+1.0e-12)) - 1;
424       if (ID==(numprocs1-1)) E_knum = T_knum - 1;
425       if (E_knum<0)          E_knum = 0;
426 
427       for (k=S_knum; k<=E_knum; k++){
428         /* ID in the first level world */
429         T_k_ID[myworld1][k] = ID;
430       }
431     }
432 
433     /* find own informations */
434 
435     tmp = (double)T_knum/(double)numprocs1;
436     S_knum = (int)((double)myid1*(tmp+1.0e-12));
437     E_knum = (int)((double)(myid1+1)*(tmp+1.0e-12)) - 1;
438     if (myid1==(numprocs1-1)) E_knum = T_knum - 1;
439     if (E_knum<0)             E_knum = 0;
440 
441     num_kloop0 = E_knum - S_knum + 1;
442   }
443 
444   else {
445 
446     /* set parallel_mode */
447     parallel_mode = 1;
448     num_kloop0 = 1;
449 
450     Num_Comm_World2 = T_knum;
451     MPI_Comm_size(MPI_CommWD2[myworld2],&numprocs2);
452     MPI_Comm_rank(MPI_CommWD2[myworld2],&myid2);
453 
454     S_knum = myworld2;
455 
456     /* allocate k-points into processors */
457 
458     for (k=0; k<T_knum; k++){
459       /* ID in the first level world */
460       T_k_ID[myworld1][k] = Comm_World_StartID2[k];
461     }
462   }
463 
464   /****************************************************
465    find all_knum
466    if (all_knum==1), all the calculation will be made
467    by the first diagonalization loop, and the second
468    diagonalization will be skipped.
469   ****************************************************/
470 
471   MPI_Allreduce(&num_kloop0, &all_knum, 1, MPI_INT, MPI_PROD, mpi_comm_level1);
472 
473   if (SpinP_switch==1 && numprocs0==1 && all_knum==1){
474     all_knum = 0;
475   }
476 
477   /****************************************************
478     if (parallel_mode==1 && all_knum==1)
479      make is1, ie1, is2, ie2
480   ****************************************************/
481 
482   if (all_knum==1){
483 
484     /* allocation */
485 
486     stat_send = (MPI_Status *)malloc(sizeof(MPI_Status)*numprocs2);
487     request_send = (MPI_Request *)malloc(sizeof(MPI_Request)*numprocs2);
488     request_recv = (MPI_Request *)malloc(sizeof(MPI_Request)*numprocs2);
489 
490     is1 = (int*)malloc(sizeof(int)*numprocs2);
491     ie1 = (int*)malloc(sizeof(int)*numprocs2);
492 
493     is2 = (int*)malloc(sizeof(int)*numprocs2);
494     ie2 = (int*)malloc(sizeof(int)*numprocs2);
495 
496     /* make is1 and ie1 */
497 
498     if ( numprocs2<=n ){
499 
500       av_num = (double)n/(double)numprocs2;
501 
502       for (ID=0; ID<numprocs2; ID++){
503 	is1[ID] = (int)(av_num*(double)ID) + 1;
504 	ie1[ID] = (int)(av_num*(double)(ID+1));
505       }
506 
507       is1[0] = 1;
508       ie1[numprocs2-1] = n;
509 
510     }
511 
512     else{
513 
514       for (ID=0; ID<n; ID++){
515 	is1[ID] = ID + 1;
516 	ie1[ID] = ID + 1;
517       }
518       for (ID=n; ID<numprocs2; ID++){
519 	is1[ID] =  1;
520 	ie1[ID] = -2;
521       }
522     }
523 
524     /* make is2 and ie2 */
525 
526     if ( numprocs2<=MaxN ){
527 
528       av_num = (double)MaxN/(double)numprocs2;
529 
530       for (ID=0; ID<numprocs2; ID++){
531 	is2[ID] = (int)(av_num*(double)ID) + 1;
532 	ie2[ID] = (int)(av_num*(double)(ID+1));
533       }
534 
535       is2[0] = 1;
536       ie2[numprocs2-1] = MaxN;
537     }
538 
539     else{
540       for (ID=0; ID<MaxN; ID++){
541 	is2[ID] = ID + 1;
542 	ie2[ID] = ID + 1;
543       }
544       for (ID=MaxN; ID<numprocs2; ID++){
545 	is2[ID] =  1;
546 	ie2[ID] = -2;
547       }
548     }
549 
550     /**********************************************
551        for MPI communication of CDM1 and EDM1
552     **********************************************/
553 
554     {
555     MPI_Group new_group,old_group;
556     int *new_ranks;
557 
558     new_ranks = (int*)malloc(sizeof(int)*T_knum);
559 
560     /* ID: zeroth world */
561 
562     for (ID=Comm_World_StartID1[myworld1]; ID<(numprocs0+Comm_World_StartID1[myworld1]); ID++){
563 
564       /* ID0: zeroth world */
565 
566       ID0 = ID % numprocs0;
567 
568       /* ID1: first world */
569 
570       ID1 = (ID-Comm_World_StartID1[myworld1]) % numprocs1;
571 
572       for (i=0; i<T_knum; i++){
573         if (Comm_World_StartID2[i]<=ID1){
574           ks = i;
575         }
576       }
577 
578       new_ranks[0] = ID1;
579       if (myid1==ID1) MPI_CDM1_flag[ID0] = 1;
580 
581       for (i=(ks+1); i<(T_knum+ks); i++){
582         i0 = i % T_knum;
583         /* id in the first world */
584         ID1 = Comm_World_StartID2[i0] + (ID0 % NPROCS_WD2[i0]);
585         new_ranks[i-ks] = ID1;
586         if (myid1==ID1) MPI_CDM1_flag[ID0] = 1;
587       }
588 
589       MPI_Comm_group(MPI_CommWD1[myworld1], &old_group);
590 
591       /* define a new group */
592       MPI_Group_incl(old_group,T_knum,new_ranks,&new_group);
593       MPI_Comm_create(MPI_CommWD1[myworld1], new_group, &MPI_CommWD_CDM1[ID0]);
594       MPI_Group_free(&new_group);
595     }
596 
597     free(new_ranks); /* never forget cleaning! */
598     }
599 
600   } /* if (all_knum==1) */
601 
602   /****************************************************
603      communicate T_k_ID
604   ****************************************************/
605 
606   if (numprocs0==1 && SpinP_switch==1){
607     for (k=0; k<T_knum; k++){
608       T_k_ID[1][k] = T_k_ID[0][k];
609     }
610   }
611   else{
612     for (spin=0; spin<=SpinP_switch; spin++){
613       ID = Comm_World_StartID1[spin];
614       MPI_Bcast(&T_k_ID[spin][0], T_knum, MPI_INT, ID, mpi_comm_level1);
615     }
616   }
617 
618   /****************************************************
619      store in each processor all the matrix elements
620         for overlap and Hamiltonian matrices
621   ****************************************************/
622 
623   dtime(&Stime);
624 
625   /* spin=myworld1 */
626 
627   spin = myworld1;
628 
629   /* set S1 */
630 
631   if (SCF_iter==1 || rediagonalize_flag_overlap_matrix==1 || all_knum!=1){
632     size_H1 = Get_OneD_HS_Col(1, CntOLP,   S1, MP, order_GA, My_NZeros, SP_NZeros, SP_Atoms);
633   }
634 
635 diagonalize1:
636 
637   /* set H1 */
638 
639   if (SpinP_switch==0){
640     size_H1 = Get_OneD_HS_Col(1, nh[0], H1,   MP, order_GA, My_NZeros, SP_NZeros, SP_Atoms);
641   }
642   else if (1<numprocs0){
643     size_H1 = Get_OneD_HS_Col(1, nh[0], H1,   MP, order_GA, My_NZeros, SP_NZeros, SP_Atoms);
644     size_H1 = Get_OneD_HS_Col(1, nh[1], CDM1, MP, order_GA, My_NZeros, SP_NZeros, SP_Atoms);
645 
646     if (myworld1){
647       for (i=0; i<size_H1; i++){
648         H1[i] = CDM1[i];
649       }
650     }
651   }
652   else{
653     size_H1 = Get_OneD_HS_Col(1, nh[spin], H1, MP, order_GA, My_NZeros, SP_NZeros, SP_Atoms);
654   }
655 
656   dtime(&Etime);
657   time1 += Etime - Stime;
658 
659   /****************************************************
660                       start kloop
661   ****************************************************/
662 
663   dtime(&SiloopTime);
664 
665   for (kloop0=0; kloop0<num_kloop0; kloop0++){
666 
667     kloop = S_knum + kloop0;
668 
669     k1 = T_KGrids1[kloop];
670     k2 = T_KGrids2[kloop];
671     k3 = T_KGrids3[kloop];
672 
673     /* make S and H */
674 
675     if (SCF_iter==1 || rediagonalize_flag_overlap_matrix==1 || all_knum!=1){
676 
677       for (i1=0; i1<n*n; i1++) BLAS_S[i1] = Complex(0.0,0.0);
678 
679       for (i1=1; i1<=n; i1++){
680 	for (j1=1; j1<=n; j1++){
681 	  S[i1][j1] = Complex(0.0,0.0);
682 	}
683       }
684     }
685 
686     for (i1=1; i1<=n; i1++){
687       for (j1=1; j1<=n; j1++){
688 	H[i1][j1] = Complex(0.0,0.0);
689       }
690     }
691 
692     k = 0;
693     for (AN=1; AN<=atomnum; AN++){
694       GA_AN = order_GA[AN];
695       wanA = WhatSpecies[GA_AN];
696       tnoA = Spe_Total_CNO[wanA];
697       Anum = MP[GA_AN];
698 
699       for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
700 	GB_AN = natn[GA_AN][LB_AN];
701 	Rn = ncn[GA_AN][LB_AN];
702 	wanB = WhatSpecies[GB_AN];
703 	tnoB = Spe_Total_CNO[wanB];
704 	Bnum = MP[GB_AN];
705 
706 	l1 = atv_ijk[Rn][1];
707 	l2 = atv_ijk[Rn][2];
708 	l3 = atv_ijk[Rn][3];
709 	kRn = k1*(double)l1 + k2*(double)l2 + k3*(double)l3;
710 
711 	si = sin(2.0*PI*kRn);
712 	co = cos(2.0*PI*kRn);
713 
714 	for (i=0; i<tnoA; i++){
715 
716 	  for (j=0; j<tnoB; j++){
717 
718 	    H[Anum+i][Bnum+j].r += H1[k]*co;
719 	    H[Anum+i][Bnum+j].i += H1[k]*si;
720 
721 	    k++;
722 
723 	  }
724 
725           if (SCF_iter==1 || rediagonalize_flag_overlap_matrix==1 || all_knum!=1){
726 
727             k -= tnoB;
728 
729   	    for (j=0; j<tnoB; j++){
730 
731 	      S[Anum+i][Bnum+j].r += S1[k]*co;
732 	      S[Anum+i][Bnum+j].i += S1[k]*si;
733 
734   	      k++;
735 	    }
736 	  }
737 
738 	}
739       }
740     }
741 
742     /*---------- added by TOYODA 15/FEB/2010 */
743     if (5==XC_switch) {
744 
745 #if 0
746       if (1<SCF_iter) { EXX_Debug_Check_DM(exx, exx_CDM, CDM); }
747 #endif
748 
749       EXX_Fock_Band(H, exx, exx_CDM, MP, k1, k2, k3, spin);
750     }
751     /*---------- until here */
752 
753     /* for blas */
754 
755     for (i1=1; i1<=n; i1++){
756       for (j1=1; j1<=n; j1++){
757 	BLAS_H[(j1-1)*n+i1-1] = H[i1][j1];
758       }
759     }
760 
761     /* diagonalize S */
762 
763     dtime(&Stime);
764 
765     if (parallel_mode==0){
766       EigenBand_lapack(S,ko,n,n,1);
767     }
768     else if (SCF_iter==1 || rediagonalize_flag_overlap_matrix==1 || all_knum!=1){
769       Eigen_PHH(MPI_CommWD2[myworld2],S,ko,n,n,1);
770     }
771 
772     dtime(&Etime);
773     time2 += Etime - Stime;
774 
775     if (SCF_iter==1 || rediagonalize_flag_overlap_matrix==1 || all_knum!=1){
776 
777       if (3<=level_stdout){
778 	printf(" myid0=%2d spin=%2d kloop %2d  k1 k2 k3 %10.6f %10.6f %10.6f\n",
779 	       myid0,spin,kloop,T_KGrids1[kloop],T_KGrids2[kloop],T_KGrids3[kloop]);
780 	for (i1=1; i1<=n; i1++){
781 	  printf("  Eigenvalues of OLP  %2d  %15.12f\n",i1,ko[i1]);
782 	}
783       }
784 
785       /* minus eigenvalues to 1.0e-14 */
786 
787       for (l=1; l<=n; l++){
788 	if (ko[l]<0.0) ko[l] = 1.0e-14;
789 	koS[l] = ko[l];
790       }
791 
792       /* calculate S*1/sqrt(ko) */
793 
794       for (l=1; l<=n; l++) ko[l] = 1.0/sqrt(ko[l]);
795 
796       /* S * 1.0/sqrt(ko[l]) */
797 
798 #pragma omp parallel shared(BLAS_S,ko,S,n) private(OMPID,Nthrds,Nprocs,i1,j1)
799       {
800 
801 	/* get info. on OpenMP */
802 
803 	OMPID = omp_get_thread_num();
804 	Nthrds = omp_get_num_threads();
805 	Nprocs = omp_get_num_procs();
806 
807 	for (i1=1+OMPID; i1<=n; i1+=Nthrds){
808 	  for (j1=1; j1<=n; j1++){
809 
810 	    S[i1][j1].r = S[i1][j1].r*ko[j1];
811 	    S[i1][j1].i = S[i1][j1].i*ko[j1];
812 
813 	    BLAS_S[(j1-1)*n+i1-1] = S[i1][j1];
814 	  }
815 	}
816       } /* #pragma omp parallel */
817     }
818 
819     /****************************************************
820      1.0/sqrt(ko[l]) * U^t * H * U * 1.0/sqrt(ko[l])
821     ****************************************************/
822 
823     dtime(&Stime);
824 
825     /* first transposition of S */
826 
827     /*
828     for (i1=1; i1<=n; i1++){
829       for (j1=i1+1; j1<=n; j1++){
830 	Ctmp1 = S[i1][j1];
831 	Ctmp2 = S[j1][i1];
832 	S[i1][j1] = Ctmp2;
833 	S[j1][i1] = Ctmp1;
834       }
835     }
836     */
837 
838     /**********************************
839       for parallel in the second world
840     **********************************/
841 
842     if (all_knum==1){
843 
844       /* H * U * 1.0/sqrt(ko[l]) */
845       /* C is distributed by row in each processor */
846 
847       /*
848       for (j1=is1[myid2]; j1<=ie1[myid2]; j1++){
849 	for (i1=1; i1<=n; i1++){
850 
851 	  sum  = 0.0;
852 	  sumi = 0.0;
853 
854 	  for (l=1; l<=n; l++){
855 	    sum  += H[i1][l].r*S[j1][l].r - H[i1][l].i*S[j1][l].i;
856 	    sumi += H[i1][l].r*S[j1][l].i + H[i1][l].i*S[j1][l].r;
857 	  }
858 
859 	  C[j1][i1].r = sum;
860 	  C[j1][i1].i = sumi;
861 
862 	}
863       }
864       */
865 
866       /* note for BLAS, A[M*K] * B[K*N] = C[M*N] */
867 
868 
869     /*
870     printf("ABC3 myid0=%2d myid1=%2d myid2=%2d numprocs2=%2d\n",myid0,myid1,myid2,numprocs2);
871     MPI_Finalize();
872     exit(0);
873     */
874 
875 #pragma omp parallel shared(myid2,ie1,is1,BLAS_S,BLAS_H,BLAS_C,n) private(OMPID,Nthrds,Nprocs,Ctmp1,Ctmp2,BM,BN,BK,ns,ne)
876       {
877 
878 	/* get info. on OpenMP */
879 
880 	OMPID = omp_get_thread_num();
881 	Nthrds = omp_get_num_threads();
882 	Nprocs = omp_get_num_procs();
883 
884         ns = is1[myid2] + OMPID*(ie1[myid2]-is1[myid2]+1)/Nthrds;
885         ne = is1[myid2] + (OMPID+1)*(ie1[myid2]-is1[myid2]+1)/Nthrds - 1;
886 
887 	BM = n;
888 	BN = ne - ns + 1;
889 	BK = n;
890 
891 	Ctmp1.r = 1.0;
892 	Ctmp1.i = 0.0;
893 	Ctmp2.r = 0.0;
894 	Ctmp2.i = 0.0;
895 
896         if (0<BN){
897 
898 	  F77_NAME(zgemm,ZGEMM)("N","N", &BM,&BN,&BK,
899 		  	        &Ctmp1,
900 			        BLAS_H, &BM,
901 			        &BLAS_S[(ns-1)*n], &BK,
902 			        &Ctmp2,
903 			        &BLAS_C[(ns-1)*n], &BM);
904 	}
905 
906       } /* #pragma omp parallel */
907 
908       /*
909       BM = n;
910       BN = ie1[myid2] - is1[myid2] + 1;
911       BK = n;
912 
913       Ctmp1.r = 1.0;
914       Ctmp1.i = 0.0;
915       Ctmp2.r = 0.0;
916       Ctmp2.i = 0.0;
917 
918       F77_NAME(zgemm,ZGEMM)("N","N", &BM,&BN,&BK,
919                             &Ctmp1,
920                             BLAS_H, &BM,
921 			    &BLAS_S[(is1[myid2]-1)*n], &BK,
922                             &Ctmp2,
923 			    &BLAS_C[(is1[myid2]-1)*n], &BM);
924       */
925 
926 
927       /* 1.0/sqrt(ko[l]) * U^+ H * U * 1.0/sqrt(ko[l]) */
928       /* H is distributed by row in each processor */
929 
930       /*
931       for (j1=is1[myid2]; j1<=ie1[myid2]; j1++){
932         for (i1=1; i1<=n; i1++){
933 
934 	  sum  = 0.0;
935 	  sumi = 0.0;
936 
937 	  for (l=1; l<=n; l++){
938 	    sum  +=  S[i1][l].r*C[j1][l].r + S[i1][l].i*C[j1][l].i;
939 	    sumi +=  S[i1][l].r*C[j1][l].i - S[i1][l].i*C[j1][l].r;
940 	  }
941 
942 	  H[j1][i1].r = sum;
943 	  H[j1][i1].i = sumi;
944 
945 	}
946       }
947       */
948 
949       /* note for BLAS, A[M*K] * B[K*N] = C[M*N] */
950 
951 #pragma omp parallel shared(H,myid2,ie1,is1,BLAS_S,BLAS_H,BLAS_C,n) private(OMPID,Nthrds,Nprocs,Ctmp1,Ctmp2,BM,BN,BK,ns,ne,i1,j1)
952       {
953 
954 	/* get info. on OpenMP */
955 
956 	OMPID = omp_get_thread_num();
957 	Nthrds = omp_get_num_threads();
958 	Nprocs = omp_get_num_procs();
959 
960         ns = is1[myid2] + OMPID*(ie1[myid2]-is1[myid2]+1)/Nthrds;
961         ne = is1[myid2] + (OMPID+1)*(ie1[myid2]-is1[myid2]+1)/Nthrds - 1;
962 
963 	BM = n;
964 	BN = ne - ns + 1;
965 	BK = n;
966 
967 	Ctmp1.r = 1.0;
968 	Ctmp1.i = 0.0;
969 	Ctmp2.r = 0.0;
970 	Ctmp2.i = 0.0;
971 
972         if (0<BN){
973  	  F77_NAME(zgemm,ZGEMM)("C","N", &BM,&BN,&BK,
974 		  	        &Ctmp1,
975 			        BLAS_S, &BM,
976 			        &BLAS_C[(ns-1)*n], &BK,
977 			        &Ctmp2,
978 			        &BLAS_H[(ns-1)*n], &BM);
979 	}
980 
981 	for (j1=ns; j1<=ne; j1++){
982 	  for (i1=1; i1<=n; i1++){
983 	    H[j1][i1] = BLAS_H[(j1-1)*n+i1-1];
984 	  }
985 	}
986 
987       } /* #pragma omp parallel */
988 
989       /* broadcast H */
990 
991       BroadCast_ComplexMatrix(MPI_CommWD2[myworld2],H,n,is1,ie1,myid2,numprocs2,
992                               stat_send,request_send,request_recv);
993     }
994 
995     else{
996 
997       /* H * U * 1.0/sqrt(ko[l]) */
998 
999       /*
1000       for (j1=1; j1<=n; j1++){
1001 	for (i1=1; i1<=n; i1++){
1002 
1003 	  sum  = 0.0;
1004 	  sumi = 0.0;
1005 
1006 	  for (l=1; l<=n; l++){
1007 	    sum  += H[i1][l].r*S[j1][l].r - H[i1][l].i*S[j1][l].i;
1008 	    sumi += H[i1][l].r*S[j1][l].i + H[i1][l].i*S[j1][l].r;
1009 	  }
1010 
1011 	  C[j1][i1].r = sum;
1012 	  C[j1][i1].i = sumi;
1013 
1014 	}
1015       }
1016       */
1017 
1018       /* note for BLAS, A[M*K] * B[K*N] = C[M*N] */
1019 
1020 #pragma omp parallel shared(BLAS_S,BLAS_H,BLAS_C,n) private(OMPID,Nthrds,Nprocs,Ctmp1,Ctmp2,BM,BN,BK)
1021       {
1022 
1023 	/* get info. on OpenMP */
1024 
1025 	OMPID = omp_get_thread_num();
1026 	Nthrds = omp_get_num_threads();
1027 	Nprocs = omp_get_num_procs();
1028 
1029 	BM = n;
1030 	BN = (OMPID+1)*n/Nthrds - (OMPID*n/Nthrds+1) + 1;
1031 	BK = n;
1032 
1033 	Ctmp1.r = 1.0;
1034 	Ctmp1.i = 0.0;
1035 	Ctmp2.r = 0.0;
1036 	Ctmp2.i = 0.0;
1037 
1038         if (0<BN){
1039 	  F77_NAME(zgemm,ZGEMM)("N","N", &BM,&BN,&BK,
1040 		 	        &Ctmp1,
1041 			        BLAS_H, &BM,
1042 			        &BLAS_S[(OMPID*n/Nthrds)*n], &BK,
1043 			        &Ctmp2,
1044 			        &BLAS_C[(OMPID*n/Nthrds)*n], &BM);
1045 	}
1046 
1047       } /* #pragma omp parallel */
1048 
1049       /* 1.0/sqrt(ko[l]) * U^+ H * U * 1.0/sqrt(ko[l]) */
1050 
1051       /*
1052       for (j1=1; j1<=n; j1++){
1053 	for (i1=1; i1<=n; i1++){
1054 
1055 	  sum  = 0.0;
1056 	  sumi = 0.0;
1057 
1058 	  for (l=1; l<=n; l++){
1059 	    sum  += S[i1][l].r*C[j1][l].r + S[i1][l].i*C[j1][l].i;
1060 	    sumi += S[i1][l].r*C[j1][l].i - S[i1][l].i*C[j1][l].r;
1061 	  }
1062 
1063 	  H[j1][i1].r = sum;
1064 	  H[j1][i1].i = sumi;
1065 	}
1066       }
1067       */
1068 
1069       /* note for BLAS, A[M*K] * B[K*N] = C[M*N] */
1070 
1071 #pragma omp parallel shared(H,BLAS_S,BLAS_H,BLAS_C,n) private(OMPID,Nthrds,Nprocs,Ctmp1,Ctmp2,BM,BN,BK,i1,j1)
1072       {
1073 
1074 	/* get info. on OpenMP */
1075 
1076 	OMPID = omp_get_thread_num();
1077 	Nthrds = omp_get_num_threads();
1078 	Nprocs = omp_get_num_procs();
1079 
1080 	BM = n;
1081 	BN = (OMPID+1)*n/Nthrds - (OMPID*n/Nthrds+1) + 1;
1082 	BK = n;
1083 
1084 	Ctmp1.r = 1.0;
1085 	Ctmp1.i = 0.0;
1086 	Ctmp2.r = 0.0;
1087 	Ctmp2.i = 0.0;
1088 
1089         if (0<BN){
1090 	  F77_NAME(zgemm,ZGEMM)("C","N", &BM,&BN,&BK,
1091 			        &Ctmp1,
1092 			        BLAS_S, &BM,
1093 			        &BLAS_C[(OMPID*n/Nthrds)*n], &BK,
1094 			        &Ctmp2,
1095 			        &BLAS_H[(OMPID*n/Nthrds)*n], &BM);
1096 	}
1097 
1098 	for (j1=(OMPID*n/Nthrds+1); j1<=(OMPID+1)*n/Nthrds; j1++){
1099 	  for (i1=1; i1<=n; i1++){
1100 	    H[j1][i1] = BLAS_H[(j1-1)*n+i1-1];
1101 	  }
1102 	}
1103 
1104       } /* #pragma omp parallel */
1105 
1106     } /* else */
1107 
1108     /* H to C (transposition) */
1109 
1110     for (i1=1; i1<=n; i1++){
1111       for (j1=1; j1<=n; j1++){
1112 	C[j1][i1] = H[i1][j1];
1113       }
1114     }
1115 
1116     /* penalty for ill-conditioning states */
1117 
1118     EV_cut0 = Threshold_OLP_Eigen;
1119 
1120     for (i1=1; i1<=n; i1++){
1121 
1122       if (koS[i1]<EV_cut0){
1123 	C[i1][i1].r += pow((koS[i1]/EV_cut0),-2.0) - 1.0;
1124       }
1125 
1126       /* cutoff the interaction between the ill-conditioned state */
1127 
1128       if (1.0e+3<C[i1][i1].r){
1129 	for (j1=1; j1<=n; j1++){
1130 	  C[i1][j1] = Complex(0.0,0.0);
1131 	  C[j1][i1] = Complex(0.0,0.0);
1132 	}
1133 	C[i1][i1].r = 1.0e+4;
1134       }
1135     }
1136 
1137     dtime(&Etime);
1138     time3 += Etime - Stime;
1139 
1140     /* diagonalize H' */
1141 
1142     dtime(&Stime);
1143 
1144     if (parallel_mode==0){
1145       EigenBand_lapack(C,ko,n,MaxN,all_knum);
1146     }
1147     else{
1148       /*  The output C matrix is distributed by column. */
1149       Eigen_PHH(MPI_CommWD2[myworld2],C,ko,n,MaxN,0);
1150     }
1151 
1152     dtime(&Etime);
1153     time4 += Etime - Stime;
1154 
1155     for (l=1; l<=MaxN; l++){
1156       EIGEN[spin][kloop][l] = ko[l];
1157     }
1158 
1159     if (3<=level_stdout && 0<=kloop){
1160       printf(" myid0=%2d spin=%2d kloop %i, k1 k2 k3 %10.6f %10.6f %10.6f\n",
1161 	     myid0,spin,kloop,T_KGrids1[kloop],T_KGrids2[kloop],T_KGrids3[kloop]);
1162       for (i1=1; i1<=n; i1++){
1163 	if (SpinP_switch==0)
1164 	  printf("  Eigenvalues of Kohn-Sham %2d %15.12f %15.12f\n",
1165 		 i1,EIGEN[0][kloop][i1],EIGEN[0][kloop][i1]);
1166 	else
1167 	  printf("  Eigenvalues of Kohn-Sham %2d %15.12f %15.12f\n",
1168 		 i1,EIGEN[0][kloop][i1],EIGEN[1][kloop][i1]);
1169       }
1170     }
1171 
1172     /* calculation of wave functions */
1173 
1174     dtime(&Stime);
1175 
1176     if (all_knum==1){
1177 
1178       /*  The H matrix is distributed by row */
1179 
1180       /*
1181       for (i1=1; i1<=n; i1++){
1182 	for (j1=is2[myid2]; j1<=ie2[myid2]; j1++){
1183 	  H[j1][i1] = C[i1][j1];
1184 	}
1185       }
1186       */
1187 
1188       for (i1=1; i1<=n; i1++){
1189 	for (j1=is2[myid2]; j1<=ie2[myid2]; j1++){
1190           BLAS_H[(j1-1)*n+i1-1] = C[i1][j1];
1191 	}
1192       }
1193 
1194       /* the second transposition of S */
1195 
1196       /*
1197       for (i1=1; i1<=n; i1++){
1198 	for (j1=i1+1; j1<=n; j1++){
1199 	  Ctmp1 = S[i1][j1];
1200 	  Ctmp2 = S[j1][i1];
1201 	  S[i1][j1] = Ctmp2;
1202 	  S[j1][i1] = Ctmp1;
1203 	}
1204       }
1205       */
1206 
1207       /* C is distributed by row in each processor */
1208 
1209       /*
1210       for (j1=is2[myid2]; j1<=ie2[myid2]; j1++){
1211         for (i1=1; i1<=n; i1++){
1212 
1213 	  sum  = 0.0;
1214 	  sumi = 0.0;
1215 	  for (l=1; l<=n; l++){
1216 	    sum  += S[i1][l].r*H[j1][l].r - S[i1][l].i*H[j1][l].i;
1217 	    sumi += S[i1][l].r*H[j1][l].i + S[i1][l].i*H[j1][l].r;
1218 	  }
1219 
1220 	  C[j1][i1].r = sum;
1221 	  C[j1][i1].i = sumi;
1222 	}
1223       }
1224       */
1225 
1226       /* note for BLAS, A[M*K] * B[K*N] = C[M*N] */
1227 
1228 #pragma omp parallel shared(C,myid2,ie2,is2,BLAS_S,BLAS_H,BLAS_C,n) private(OMPID,Nthrds,Nprocs,Ctmp1,Ctmp2,BM,BN,BK,ns,ne,i1,j1)
1229       {
1230 
1231 	/* get info. on OpenMP */
1232 
1233 	OMPID = omp_get_thread_num();
1234 	Nthrds = omp_get_num_threads();
1235 	Nprocs = omp_get_num_procs();
1236 
1237         ns = is2[myid2] + OMPID*(ie2[myid2]-is2[myid2]+1)/Nthrds;
1238         ne = is2[myid2] + (OMPID+1)*(ie2[myid2]-is2[myid2]+1)/Nthrds - 1;
1239 
1240 	BM = n;
1241 	BN = ne - ns + 1;
1242 	BK = n;
1243 
1244 	Ctmp1.r = 1.0;
1245 	Ctmp1.i = 0.0;
1246 	Ctmp2.r = 0.0;
1247 	Ctmp2.i = 0.0;
1248 
1249         if (0<BN){
1250 	  F77_NAME(zgemm,ZGEMM)("N","N", &BM,&BN,&BK,
1251 			        &Ctmp1,
1252 			        BLAS_S, &BM,
1253 			        &BLAS_H[(ns-1)*n], &BK,
1254 			        &Ctmp2,
1255 			        &BLAS_C[(ns-1)*n], &BM);
1256 	}
1257 
1258 	for (j1=ns; j1<=ne; j1++){
1259 	  for (i1=1; i1<=n; i1++){
1260 	    C[j1][i1] = BLAS_C[(j1-1)*n+i1-1];
1261 	  }
1262 	}
1263 
1264       } /* #pragma omp parallel */
1265 
1266       /* broadcast C:
1267        C is distributed by row in each processor
1268       */
1269 
1270       dtime(&Stime0);
1271 
1272       BroadCast_ComplexMatrix(MPI_CommWD2[myworld2],C,n,is2,ie2,myid2,numprocs2,
1273 			      stat_send,request_send,request_recv);
1274 
1275       /* C to H (transposition)
1276          H consists of column vectors
1277       */
1278 
1279       for (i1=1; i1<=MaxN; i1++){
1280         for (j1=1; j1<=n; j1++){
1281 	  H[j1][i1] = C[i1][j1];
1282 	}
1283       }
1284 
1285       dtime(&Etime0);
1286       time51 += Etime0 - Stime0;
1287 
1288     } /* if (all_knum==1) */
1289 
1290     dtime(&Etime);
1291     time5 += Etime - Stime;
1292 
1293   } /* kloop0 */
1294 
1295   dtime(&EiloopTime);
1296 
1297   if (SpinP_switch==1 && numprocs0==1 && spin==0){
1298     spin++;
1299     goto diagonalize1;
1300   }
1301 
1302   /****************************************************
1303      MPI:
1304 
1305      EIGEN
1306   ****************************************************/
1307 
1308   MPI_Barrier(mpi_comm_level1);
1309   dtime(&Stime);
1310 
1311   for (spin=0; spin<=SpinP_switch; spin++){
1312     for (kloop=0; kloop<T_knum; kloop++){
1313 
1314       /* get ID in the zeroth world */
1315       ID = Comm_World_StartID1[spin] + T_k_ID[spin][kloop];
1316       MPI_Bcast(&EIGEN[spin][kloop][0], MaxN+1, MPI_DOUBLE, ID, mpi_comm_level1);
1317     }
1318   }
1319 
1320   dtime(&Etime);
1321   time6 += Etime - Stime;
1322 
1323   /**************************************
1324          find chemical potential
1325   **************************************/
1326 
1327   dtime(&Stime);
1328 
1329   po = 0;
1330   loop_num = 0;
1331   ChemP_MAX = 10.0;
1332   ChemP_MIN =-10.0;
1333 
1334   do {
1335 
1336     loop_num++;
1337 
1338     ChemP = 0.50*(ChemP_MAX + ChemP_MIN);
1339     Num_State = 0.0;
1340 
1341     for (kloop=0; kloop<T_knum; kloop++){
1342       for (spin=0; spin<=SpinP_switch; spin++){
1343 	for (l=1; l<=MaxN; l++){
1344 
1345 	  x = (EIGEN[spin][kloop][l] - ChemP)*Beta;
1346 
1347 	  if (x<=-x_cut)      FermiF = 1.0;
1348 	  else if (x>=x_cut)  FermiF = 0.0;
1349 	  else                FermiF = 1.0/(1.0 + exp(x));
1350 
1351 	  Num_State += FermiF*(double)T_k_op[kloop];
1352 
1353 	}
1354       }
1355     }
1356 
1357     if (SpinP_switch==0)
1358       Num_State = 2.0*Num_State/sum_weights;
1359     else
1360       Num_State = Num_State/sum_weights;
1361 
1362     Dnum = TZ - Num_State - system_charge;
1363 
1364     if (0.0<=Dnum) ChemP_MIN = ChemP;
1365     else           ChemP_MAX = ChemP;
1366     if (fabs(Dnum)<10e-14) po = 1;
1367   }
1368   while (po==0 && loop_num<2000);
1369 
1370   /****************************************************
1371            band energy in a finite temperature
1372   ****************************************************/
1373 
1374   Eele0[0] = 0.0;
1375   Eele0[1] = 0.0;
1376 
1377   for (kloop=0; kloop<T_knum; kloop++){
1378     for (spin=0; spin<=SpinP_switch; spin++){
1379       for (l=1; l<=MaxN; l++){
1380 
1381 	x = (EIGEN[spin][kloop][l] - ChemP)*Beta;
1382 
1383 	if (x<=-x_cut)      FermiF = 1.0;
1384 	else if (x_cut<=x)  FermiF = 0.0;
1385 	else                FermiF = 1.0/(1.0 + exp(x));
1386 
1387 	Eele0[spin] += FermiF*EIGEN[spin][kloop][l]*(double)T_k_op[kloop];
1388 
1389       }
1390     }
1391   }
1392 
1393   if (SpinP_switch==0){
1394     Eele0[0] = Eele0[0]/sum_weights;
1395     Eele0[1] = Eele0[0];
1396   }
1397   else {
1398     Eele0[0] = Eele0[0]/sum_weights;
1399     Eele0[1] = Eele0[1]/sum_weights;
1400   }
1401 
1402   Uele = Eele0[0] + Eele0[1];
1403 
1404   if (2<=level_stdout){
1405     printf("myid0=%2d ChemP=%lf, Eele0[0]=%lf, Eele0[1]=%lf\n",myid0,ChemP,Eele0[0],Eele0[1]);
1406   }
1407 
1408   dtime(&Etime);
1409   time7 += Etime - Stime;
1410 
1411   /*---------- added by TOYODA 16/FEB/2010 */
1412   if (g_exx_switch) {
1413     EXX_Energy_Band(Uexx, exx, exx_CDM, MP);
1414     if (0==SpinP_switch){
1415       Uexx[1] = Uexx[0];
1416       EXX_Log_Print("EXX: U= %20.12f\n", Uexx[0]);
1417     } else {
1418       EXX_Log_Print("EXX: U= %20.12f %20.2f\n", Uexx[0], Uexx[1]);
1419     }
1420   }
1421   /*---------- until here */
1422 
1423   /****************************************************
1424          if all_knum==1, calculate CDM and EDM
1425   ****************************************************/
1426 
1427   dtime(&Stime);
1428 
1429   if (all_knum==1){
1430 
1431     dtime(&Stime0);
1432 
1433     /* initialize CDM1 and EDM1 */
1434 
1435     for (i=0; i<size_H1; i++){
1436       CDM1[i] = 0.0;
1437       EDM1[i] = 0.0;
1438     }
1439 
1440     /* calculate CDM and EDM */
1441 
1442     spin = myworld1;
1443     kloop = S_knum;
1444 
1445     k1 = T_KGrids1[kloop];
1446     k2 = T_KGrids2[kloop];
1447     k3 = T_KGrids3[kloop];
1448 
1449     /* weight of k-point */
1450 
1451     kw = (double)T_k_op[kloop];
1452 
1453     /* store Fermi function */
1454 
1455     po = 0;
1456     l = 1;
1457     do{
1458 
1459       eig = EIGEN[spin][kloop][l];
1460       x = (eig - ChemP)*Beta;
1461 
1462       if      (x<-x_cut)  FermiF = 1.0;
1463       else if (x>x_cut)   FermiF = 0.0;
1464       else                FermiF = 1.0/(1.0 + exp(x));
1465 
1466       VecFkw[l]  = FermiF*kw;
1467       VecFkwE[l] = VecFkw[l]*eig;
1468 
1469       if ( FermiF<=FermiEps && po==0 ) {
1470 	lmax = l;
1471 	po = 1;
1472       }
1473 
1474       l++;
1475 
1476     } while(po==0 && l<=MaxN);
1477 
1478     if (po==0) lmax = MaxN;
1479 
1480     dtime(&Etime0);
1481     time81 += Etime0 - Stime0;
1482 
1483     dtime(&Stime0);
1484 
1485     /* predetermination of k
1486        H is used as temporal array
1487     */
1488 
1489     k = 0;
1490     for (AN=1; AN<=atomnum; AN++){
1491 
1492       GA_AN = order_GA[AN];
1493       wanA = WhatSpecies[GA_AN];
1494       tnoA = Spe_Total_CNO[wanA];
1495 
1496       H[0][AN].r = (double)k;
1497 
1498       for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
1499 	GB_AN = natn[GA_AN][LB_AN];
1500 	Rn = ncn[GA_AN][LB_AN];
1501 	wanB = WhatSpecies[GB_AN];
1502 	tnoB = Spe_Total_CNO[wanB];
1503 	k += tnoA*tnoB;
1504       }
1505     }
1506 
1507     /* DM and EDM */
1508 
1509 #pragma omp parallel shared(numprocs0,My_NZeros,SP_NZeros,MPI_CDM1_flag,EDM1,CDM1,VecFkwE,VecFkw,H,lmax,k1,k2,k3,atv_ijk,ncn,natn,FNAN,MP,Spe_Total_CNO,WhatSpecies,order_GA,atomnum) private(OMPID,Nthrds,Nprocs,AN,GA_AN,wanA,tnoA,Anum,k,LB_AN,GB_AN,Rn,wanB,tnoB,Bnum,l1,l2,l3,kRn,si,co,i,ia,j,jb,d1,d2,l,tmp,po,ID)
1510     {
1511 
1512       /* get info. on OpenMP */
1513 
1514       OMPID = omp_get_thread_num();
1515       Nthrds = omp_get_num_threads();
1516       Nprocs = omp_get_num_procs();
1517 
1518       for (AN=1+OMPID; AN<=atomnum; AN+=Nthrds){
1519 
1520 	GA_AN = order_GA[AN];
1521 	wanA = WhatSpecies[GA_AN];
1522 	tnoA = Spe_Total_CNO[wanA];
1523 	Anum = MP[GA_AN];
1524 
1525 	k = (int)H[0][AN].r;
1526 
1527 	for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
1528 	  GB_AN = natn[GA_AN][LB_AN];
1529 	  Rn = ncn[GA_AN][LB_AN];
1530 	  wanB = WhatSpecies[GB_AN];
1531 	  tnoB = Spe_Total_CNO[wanB];
1532 	  Bnum = MP[GB_AN];
1533 
1534 	  l1 = atv_ijk[Rn][1];
1535 	  l2 = atv_ijk[Rn][2];
1536 	  l3 = atv_ijk[Rn][3];
1537 	  kRn = k1*(double)l1 + k2*(double)l2 + k3*(double)l3;
1538 
1539 	  si = sin(2.0*PI*kRn);
1540 	  co = cos(2.0*PI*kRn);
1541 
1542 	  for (i=0; i<tnoA; i++){
1543 
1544 	    ia = Anum + i;
1545 
1546 	    for (j=0; j<tnoB; j++){
1547 
1548 	      jb = Bnum + j;
1549 
1550 	      /***************************************************************
1551                Note that the imagiary part is zero,
1552                since
1553 
1554                at k
1555                A = (co + i si)(Re + i Im) = (co*Re - si*Im) + i (co*Im + si*Re)
1556                at -k
1557                B = (co - i si)(Re - i Im) = (co*Re - si*Im) - i (co*Im + si*Re)
1558                Thus, Re(A+B) = 2*(co*Re - si*Im)
1559                      Im(A+B) = 0
1560 	      ***************************************************************/
1561 
1562 	      po = 0;
1563 	      ID = 0;
1564 	      do {
1565 		if (MPI_CDM1_flag[ID] && SP_NZeros[ID]<=k && k<(SP_NZeros[ID]+My_NZeros[ID])) po = 1;
1566 		ID++;
1567 	      } while (po==0 && ID<numprocs0);
1568 
1569 	      if (po==1){
1570 
1571 		d1 = 0.0;
1572 		d2 = 0.0;
1573 
1574 		for (l=1; l<=lmax; l++){
1575 
1576 		  tmp = co*(H[ia][l].r*H[jb][l].r + H[ia][l].i*H[jb][l].i)
1577 		       -si*(H[ia][l].r*H[jb][l].i - H[ia][l].i*H[jb][l].r);
1578 
1579 		  d1 += VecFkw[l]*tmp;
1580 		  d2 += VecFkwE[l]*tmp;;
1581 		}
1582 
1583 		CDM1[k] += d1;
1584 		EDM1[k] += d2;
1585 	      }
1586 
1587 	      /* increment of k */
1588 
1589 	      k++;
1590 
1591 	    }
1592 	  }
1593 
1594 	} /* LB_AN */
1595       } /* AN */
1596 
1597     } /* #pragma omp parallel */
1598 
1599     /* sum of CDM1 and EDM1 by Allreduce in MPI */
1600 
1601     dtime(&Etime0);
1602     time82 += Etime0 - Stime0;
1603 
1604     dtime(&Stime0);
1605 
1606     for (ID=0; ID<numprocs0; ID++){
1607       if (MPI_CDM1_flag[ID]){
1608         k = SP_NZeros[ID];
1609         MPI_Reduce(&CDM1[k], &H1[k], My_NZeros[ID], MPI_DOUBLE, MPI_SUM, 0, MPI_CommWD_CDM1[ID]);
1610         MPI_Reduce(&EDM1[k], &S1[k], My_NZeros[ID], MPI_DOUBLE, MPI_SUM, 0, MPI_CommWD_CDM1[ID]);
1611       }
1612     }
1613 
1614     dtime(&Etime0);
1615     time83 += Etime0 - Stime0;
1616 
1617     dtime(&Stime0);
1618 
1619     /* CDM and EDM */
1620 
1621     k = 0;
1622     for (AN=1; AN<=atomnum; AN++){
1623       GA_AN = order_GA[AN];
1624       MA_AN = F_G2M[GA_AN];
1625 
1626       wanA = WhatSpecies[GA_AN];
1627       tnoA = Spe_Total_CNO[wanA];
1628 
1629       for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
1630 	GB_AN = natn[GA_AN][LB_AN];
1631 	wanB = WhatSpecies[GB_AN];
1632 	tnoB = Spe_Total_CNO[wanB];
1633 
1634 	for (i=0; i<tnoA; i++){
1635 	  for (j=0; j<tnoB; j++){
1636 
1637 	    if (1<=MA_AN && MA_AN<=Matomnum){
1638 
1639 	      CDM[spin][MA_AN][LB_AN][i][j] = H1[k];
1640               EDM[spin][MA_AN][LB_AN][i][j] = S1[k];
1641 	    }
1642 
1643 	    k++;
1644 
1645 	  }
1646 	}
1647       }
1648     }
1649 
1650     dtime(&Etime0);
1651     time84 += Etime0 - Stime0;
1652 
1653     dtime(&Stime0);
1654 
1655     /* if necessary, MPI communication of CDM and EDM */
1656 
1657     if (1<numprocs0 && SpinP_switch==1){
1658 
1659       /* set spin */
1660 
1661       if (myworld1==0){
1662 	spin = 1;
1663       }
1664       else{
1665 	spin = 0;
1666       }
1667 
1668       /* communicate CDM1 and EDM1 */
1669 
1670       for (i=0; i<=1; i++){
1671 
1672         for (ID=Comm_World_StartID1[i]; ID<(numprocs0+Comm_World_StartID1[i]); ID++){
1673 
1674           /* ID0: zeroth world */
1675 
1676           ID0 = ID % numprocs0;
1677 
1678           /* ID's for sending and receiving in the zeroth world */
1679 
1680           IDS = Comm_World_StartID1[i] + (ID-Comm_World_StartID1[i]) % NPROCS_WD1[i];
1681           IDR = ID0;
1682 
1683           k = SP_NZeros[IDR];
1684 
1685           if (IDS!=IDR){
1686 
1687 	    /* CDM1 */
1688 
1689 	    if (myid0==IDS){
1690 	      MPI_Isend(&H1[k], My_NZeros[IDR], MPI_DOUBLE, IDR, tag, mpi_comm_level1, &request);
1691 	    }
1692 
1693 	    if (myid0==IDR){
1694 	      MPI_Recv(&CDM1[k], My_NZeros[IDR], MPI_DOUBLE, IDS, tag, mpi_comm_level1, &stat);
1695 	    }
1696 
1697 	    if (myid0==IDS){
1698 	      MPI_Wait(&request,&stat);
1699 	    }
1700 
1701 	    /* EDM1 */
1702 
1703 	    if (myid0==IDS){
1704 	      MPI_Isend(&S1[k], My_NZeros[IDR], MPI_DOUBLE, IDR, tag, mpi_comm_level1, &request);
1705 	    }
1706 
1707 	    if (myid0==IDR){
1708 	      MPI_Recv(&EDM1[k], My_NZeros[IDR], MPI_DOUBLE, IDS, tag, mpi_comm_level1, &stat);
1709 	    }
1710 
1711 	    if (myid0==IDS){
1712 	      MPI_Wait(&request,&stat);
1713 	    }
1714 
1715 	  }
1716         }
1717       }
1718 
1719       /* put CDM1 and EDM1 into CDM and EDM */
1720 
1721       k = 0;
1722       for (AN=1; AN<=atomnum; AN++){
1723 	GA_AN = order_GA[AN];
1724 	MA_AN = F_G2M[GA_AN];
1725 
1726 	wanA = WhatSpecies[GA_AN];
1727 	tnoA = Spe_Total_CNO[wanA];
1728 
1729 	for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
1730 	  GB_AN = natn[GA_AN][LB_AN];
1731 	  wanB = WhatSpecies[GB_AN];
1732 	  tnoB = Spe_Total_CNO[wanB];
1733 
1734 	  for (i=0; i<tnoA; i++){
1735 	    for (j=0; j<tnoB; j++){
1736 
1737 	      if (1<=MA_AN && MA_AN<=Matomnum){
1738 		CDM[spin][MA_AN][LB_AN][i][j] = CDM1[k];
1739                 EDM[spin][MA_AN][LB_AN][i][j] = EDM1[k];
1740 	      }
1741 
1742 	      k++;
1743 
1744 	    }
1745 	  }
1746 	}
1747       }
1748     }
1749 
1750     dtime(&Etime0);
1751     time85 += Etime0 - Stime0;
1752 
1753     /*---------- added by TOYODA 16/FEB/2010 */
1754     if (g_exx_switch) {
1755       /* clear buffer */
1756       for (iep=0; iep<nep; iep++) {
1757         GA_AN = exx_ep_atom1[iep]+1;
1758         GB_AN = exx_ep_atom2[iep]+1;
1759         tnoA = Spe_Total_CNO[WhatSpecies[GA_AN]];
1760 	tnoB = Spe_Total_CNO[WhatSpecies[GB_AN]];
1761         for (i=0; i<tnoA; i++){
1762           for (j=0; j<tnoB; j++){
1763             exx_CDM[0][iep][i][j].r = 0.0;
1764             exx_CDM[0][iep][i][j].i = 0.0;
1765           }
1766         }
1767       }
1768       /* calculate DM */
1769       for (iep=0; iep<nep; iep++) {
1770         GA_AN = exx_ep_atom1[iep]+1;
1771         GB_AN = exx_ep_atom2[iep]+1;
1772         iRn   = exx_ep_cell[iep];
1773         tnoA = Spe_Total_CNO[WhatSpecies[GA_AN]];
1774 	tnoB = Spe_Total_CNO[WhatSpecies[GB_AN]];
1775         Anum = MP[GA_AN];
1776 	Bnum = MP[GB_AN];
1777 
1778         /* phase */
1779         iRn_x = iRn%ncd - nshell_ep;
1780         iRn_y = (iRn/ncd)%ncd - nshell_ep;
1781         iRn_z = (iRn/ncd/ncd)%ncd - nshell_ep;
1782         kRn = k1*(double)iRn_x + k2*(double)iRn_y + k3*(double)iRn_z;
1783         si = sin(2.0*PI*kRn);
1784         co = cos(2.0*PI*kRn);
1785 
1786         for (i=0; i<tnoA; i++){
1787           ia = Anum + i;
1788           for (j=0; j<tnoB; j++){
1789             jb = Bnum + j;
1790             d1 = 0.0;
1791             d2 = 0.0;
1792             for (l=1; l<=lmax; l++) {
1793               tmp = co*(H[ia][l].r*H[jb][l].r + H[ia][l].i*H[jb][l].i)
1794                     -si*(H[ia][l].r*H[jb][l].i - H[ia][l].i*H[jb][l].r);
1795               d1 += VecFkw[l]*tmp;
1796               tmp = si*(H[ia][l].r*H[jb][l].r + H[ia][l].i*H[jb][l].i)
1797                     +co*(H[ia][l].r*H[jb][l].i - H[ia][l].i*H[jb][l].r);
1798               d2 += VecFkw[l]*tmp;
1799             }
1800             exx_CDM[0][iep][i][j].r += d1;
1801             exx_CDM[0][iep][i][j].i += d2;
1802 	  }
1803         }
1804       } /* loop of iep */
1805     } /* end if */
1806     /*---------- until here added by TOYODA 16/FEB/2010 */
1807 
1808   } /* if (all_knum==1) */
1809 
1810   dtime(&Etime);
1811   time8 += Etime - Stime;
1812 
1813   if (myid0==Host_ID && 0<level_stdout){
1814     printf("<Band_DFT>  Eigen, time=%lf\n", EiloopTime-SiloopTime);fflush(stdout);
1815   }
1816 
1817   dtime(&SiloopTime);
1818 
1819   /****************************************************
1820    ****************************************************
1821      diagonalization for calculating density matrix
1822    ****************************************************
1823   ****************************************************/
1824 
1825   if (all_knum!=1){
1826 
1827     /* spin=myworld1 */
1828 
1829     spin = myworld1;
1830 
1831   diagonalize2:
1832 
1833     /* set S1 */
1834 
1835     size_H1 = Get_OneD_HS_Col(1, CntOLP,   S1, MP, order_GA, My_NZeros, SP_NZeros, SP_Atoms);
1836 
1837     /* set H1 */
1838 
1839     if (SpinP_switch==0){
1840       size_H1 = Get_OneD_HS_Col(1, nh[0], H1,   MP, order_GA, My_NZeros, SP_NZeros, SP_Atoms);
1841     }
1842     else if (1<numprocs0){
1843       size_H1 = Get_OneD_HS_Col(1, nh[0], H1,   MP, order_GA, My_NZeros, SP_NZeros, SP_Atoms);
1844       size_H1 = Get_OneD_HS_Col(1, nh[1], CDM1, MP, order_GA, My_NZeros, SP_NZeros, SP_Atoms);
1845 
1846       if (myworld1){
1847 	for (i=0; i<size_H1; i++){
1848 	  H1[i] = CDM1[i];
1849 	}
1850       }
1851     }
1852     else{
1853       size_H1 = Get_OneD_HS_Col(1, nh[spin], H1, MP, order_GA, My_NZeros, SP_NZeros, SP_Atoms);
1854     }
1855 
1856     /* initialize CDM1 and EDM1 */
1857 
1858     for (i=0; i<size_H1; i++){
1859       CDM1[i] = 0.0;
1860       EDM1[i] = 0.0;
1861     }
1862 
1863     /*---------- added by TOYODA 16/FEB/2010 */
1864     if (g_exx_switch) {
1865       /* clear exx_DM */
1866       for (iep=0; iep<nep; iep++) {
1867         GA_AN = exx_ep_atom1[iep]+1;
1868         GB_AN = exx_ep_atom2[iep]+1;
1869         tnoA = Spe_Total_CNO[WhatSpecies[GA_AN]];
1870 	tnoB = Spe_Total_CNO[WhatSpecies[GB_AN]];
1871         for (i=0; i<tnoA; i++){
1872           for (j=0; j<tnoB; j++){
1873             exx_CDM[spin][iep][i][j].r = 0.0;
1874             exx_CDM[spin][iep][i][j].i = 0.0;
1875           }
1876         }
1877       }
1878     }
1879     /*---------- until here added by TOYODA 16/FEB/2010 */
1880 
1881     /* initialize CDM, EDM, and iDM */
1882 
1883     for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
1884       GA_AN = M2G[MA_AN];
1885       wanA = WhatSpecies[GA_AN];
1886       tnoA = Spe_Total_CNO[wanA];
1887       Anum = MP[GA_AN];
1888       for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
1889 	GB_AN = natn[GA_AN][LB_AN];
1890 	wanB = WhatSpecies[GB_AN];
1891 	tnoB = Spe_Total_CNO[wanB];
1892 	Bnum = MP[GB_AN];
1893 
1894 	for (i=0; i<tnoA; i++){
1895 	  for (j=0; j<tnoB; j++){
1896 	    CDM[spin][MA_AN][LB_AN][i][j] = 0.0;
1897             EDM[spin][MA_AN][LB_AN][i][j] = 0.0;
1898 	    iDM[0][0][MA_AN][LB_AN][i][j] = 0.0;
1899 	    iDM[0][1][MA_AN][LB_AN][i][j] = 0.0;
1900 	  }
1901 	}
1902       }
1903     }
1904 
1905     /* for kloop */
1906 
1907     for (kloop0=0; kloop0<num_kloop0; kloop0++){
1908 
1909       kloop = kloop0 + S_knum;
1910 
1911       k1 = T_KGrids1[kloop];
1912       k2 = T_KGrids2[kloop];
1913       k3 = T_KGrids3[kloop];
1914 
1915       /* make S and H */
1916 
1917       for (i1=1; i1<=n; i1++){
1918 	for (j1=1; j1<=n; j1++){
1919 	  S[i1][j1] = Complex(0.0,0.0);
1920 	  H[i1][j1] = Complex(0.0,0.0);
1921 	}
1922       }
1923 
1924       k = 0;
1925       for (AN=1; AN<=atomnum; AN++){
1926 	GA_AN = order_GA[AN];
1927 	wanA = WhatSpecies[GA_AN];
1928 	tnoA = Spe_Total_CNO[wanA];
1929 	Anum = MP[GA_AN];
1930 
1931 	for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
1932 	  GB_AN = natn[GA_AN][LB_AN];
1933 	  Rn = ncn[GA_AN][LB_AN];
1934 	  wanB = WhatSpecies[GB_AN];
1935 	  tnoB = Spe_Total_CNO[wanB];
1936 	  Bnum = MP[GB_AN];
1937 
1938 	  l1 = atv_ijk[Rn][1];
1939 	  l2 = atv_ijk[Rn][2];
1940 	  l3 = atv_ijk[Rn][3];
1941 	  kRn = k1*(double)l1 + k2*(double)l2 + k3*(double)l3;
1942 
1943 	  si = sin(2.0*PI*kRn);
1944 	  co = cos(2.0*PI*kRn);
1945 
1946 	  for (i=0; i<tnoA; i++){
1947 	    for (j=0; j<tnoB; j++){
1948 
1949 	      S[Anum+i][Bnum+j].r += S1[k]*co;
1950 	      S[Anum+i][Bnum+j].i += S1[k]*si;
1951 	      H[Anum+i][Bnum+j].r += H1[k]*co;
1952 	      H[Anum+i][Bnum+j].i += H1[k]*si;
1953 
1954 	      k++;
1955 
1956 	    }
1957 	  }
1958 	}
1959       }
1960 
1961       /* for blas */
1962 
1963       for (i1=1; i1<=n; i1++){
1964 	for (j1=1; j1<=n; j1++){
1965 	  BLAS_H[(j1-1)*n+i1-1] = H[i1][j1];
1966 	}
1967       }
1968 
1969       /* diagonalize S */
1970 
1971       dtime(&Stime);
1972 
1973       if (parallel_mode==0){
1974 	EigenBand_lapack(S,ko,n,n,1);
1975       }
1976       else{
1977 	Eigen_PHH(MPI_CommWD2[myworld2],S,ko,n,n,1);
1978       }
1979 
1980       dtime(&Etime);
1981       time9 += Etime - Stime;
1982 
1983       if (3<=level_stdout){
1984 	printf(" myid0=%2d kloop %2d  k1 k2 k3 %10.6f %10.6f %10.6f\n",
1985 	       myid0,kloop,T_KGrids1[kloop],T_KGrids2[kloop],T_KGrids3[kloop]);
1986 	for (i1=1; i1<=n; i1++){
1987 	  printf("  Eigenvalues of OLP  %2d  %15.12f\n",i1,ko[i1]);
1988 	}
1989       }
1990 
1991       /* minus eigenvalues to 1.0e-14 */
1992 
1993       for (l=1; l<=n; l++){
1994 	if (ko[l]<0.0) ko[l] = 1.0e-14;
1995 	koS[l] = ko[l];
1996       }
1997 
1998       /* calculate S*1/sqrt(ko) */
1999 
2000       for (l=1; l<=n; l++) ko[l] = 1.0/sqrt(ko[l]);
2001 
2002       /* S * 1.0/sqrt(ko[l])  */
2003 
2004 #pragma omp parallel shared(BLAS_S,ko,S,n) private(OMPID,Nthrds,Nprocs,i1,j1)
2005       {
2006 
2007 	/* get info. on OpenMP */
2008 
2009 	OMPID = omp_get_thread_num();
2010 	Nthrds = omp_get_num_threads();
2011 	Nprocs = omp_get_num_procs();
2012 
2013 	for (i1=1+OMPID; i1<=n; i1+=Nthrds){
2014 	  for (j1=1; j1<=n; j1++){
2015 
2016 	    S[i1][j1].r = S[i1][j1].r*ko[j1];
2017 	    S[i1][j1].i = S[i1][j1].i*ko[j1];
2018 	    BLAS_S[(j1-1)*n+i1-1] = S[i1][j1];
2019 	  }
2020 	}
2021 
2022       } /* #pragma omp parallel */
2023 
2024       /****************************************************
2025             1/sqrt(ko) * U^t * H * U * 1/sqrt(ko)
2026       ****************************************************/
2027 
2028       /* transpose S */
2029 
2030       /*
2031       for (i1=1; i1<=n; i1++){
2032 	for (j1=i1+1; j1<=n; j1++){
2033 	  Ctmp1 = S[i1][j1];
2034 	  Ctmp2 = S[j1][i1];
2035 	  S[i1][j1] = Ctmp2;
2036 	  S[j1][i1] = Ctmp1;
2037 	}
2038       }
2039       */
2040 
2041       /* H * U * 1/sqrt(ko) */
2042 
2043       /*
2044       for (j1=1; j1<=n; j1++){
2045 	for (i1=1; i1<=n; i1++){
2046 
2047 	  sum  = 0.0;
2048 	  sumi = 0.0;
2049 
2050 	  for (l=1; l<=n; l++){
2051 	    sum  += H[i1][l].r*S[j1][l].r - H[i1][l].i*S[j1][l].i;
2052 	    sumi += H[i1][l].r*S[j1][l].i + H[i1][l].i*S[j1][l].r;
2053 	  }
2054 
2055 	  C[j1][i1].r = sum;
2056 	  C[j1][i1].i = sumi;
2057 	}
2058       }
2059       */
2060 
2061       /* note for BLAS, A[M*K] * B[K*N] = C[M*N] */
2062 
2063 #pragma omp parallel shared(BLAS_S,BLAS_H,BLAS_C,n) private(OMPID,Nthrds,Nprocs,Ctmp1,Ctmp2,BM,BN,BK)
2064       {
2065 
2066 	/* get info. on OpenMP */
2067 
2068 	OMPID = omp_get_thread_num();
2069 	Nthrds = omp_get_num_threads();
2070 	Nprocs = omp_get_num_procs();
2071 
2072 	BM = n;
2073 	BN = (OMPID+1)*n/Nthrds - (OMPID*n/Nthrds+1) + 1;
2074 	BK = n;
2075 
2076 	Ctmp1.r = 1.0;
2077 	Ctmp1.i = 0.0;
2078 	Ctmp2.r = 0.0;
2079 	Ctmp2.i = 0.0;
2080 
2081         if (0<BN){
2082 	  F77_NAME(zgemm,ZGEMM)("N","N", &BM,&BN,&BK,
2083 		  	        &Ctmp1,
2084 			        BLAS_H, &BM,
2085 			        &BLAS_S[(OMPID*n/Nthrds)*n], &BK,
2086 			        &Ctmp2,
2087 			        &BLAS_C[(OMPID*n/Nthrds)*n], &BM);
2088 	}
2089 
2090       } /* #pragma omp parallel */
2091 
2092       /* 1/sqrt(ko) * U^+ H * U * 1/sqrt(ko) */
2093 
2094       /*
2095       for (i1=1; i1<=n; i1++){
2096 	for (j1=1; j1<=n; j1++){
2097 	  sum  = 0.0;
2098 	  sumi = 0.0;
2099 	  for (l=1; l<=n; l++){
2100 	    sum  +=  S[i1][l].r*C[j1][l].r + S[i1][l].i*C[j1][l].i;
2101 	    sumi +=  S[i1][l].r*C[j1][l].i - S[i1][l].i*C[j1][l].r;
2102 	  }
2103 	  H[i1][j1].r = sum;
2104 	  H[i1][j1].i = sumi;
2105 	}
2106       }
2107       */
2108 
2109       /* note for BLAS, A[M*K] * B[K*N] = C[M*N] */
2110 
2111 #pragma omp parallel shared(C,BLAS_S,BLAS_H,BLAS_C,n) private(OMPID,Nthrds,Nprocs,Ctmp1,Ctmp2,BM,BN,BK,i1,j1)
2112       {
2113 
2114 	/* get info. on OpenMP */
2115 
2116 	OMPID = omp_get_thread_num();
2117 	Nthrds = omp_get_num_threads();
2118 	Nprocs = omp_get_num_procs();
2119 
2120 	BM = n;
2121 	BN = (OMPID+1)*n/Nthrds - (OMPID*n/Nthrds+1) + 1;
2122 	BK = n;
2123 
2124 	Ctmp1.r = 1.0;
2125 	Ctmp1.i = 0.0;
2126 	Ctmp2.r = 0.0;
2127 	Ctmp2.i = 0.0;
2128 
2129         if (0<BN){
2130 	  F77_NAME(zgemm,ZGEMM)("C","N", &BM,&BN,&BK,
2131 			        &Ctmp1,
2132 			        BLAS_S, &BM,
2133 			        &BLAS_C[(OMPID*n/Nthrds)*n], &BK,
2134 			        &Ctmp2,
2135 			        &BLAS_H[(OMPID*n/Nthrds)*n], &BM);
2136 	}
2137 
2138 	for (j1=(OMPID*n/Nthrds+1); j1<=(OMPID+1)*n/Nthrds; j1++){
2139 	  for (i1=1; i1<=n; i1++){
2140 	    C[i1][j1] = BLAS_H[(j1-1)*n+i1-1];
2141 	  }
2142 	}
2143 
2144       } /* #pragma omp parallel */
2145 
2146       /* H to C */
2147 
2148       /*
2149       for (i1=1; i1<=n; i1++){
2150 	for (j1=1; j1<=n; j1++){
2151 	  C[i1][j1] = H[i1][j1];
2152 	}
2153       }
2154       */
2155 
2156       /* penalty for ill-conditioning states */
2157 
2158       EV_cut0 = Threshold_OLP_Eigen;
2159 
2160       for (i1=1; i1<=n; i1++){
2161 
2162 	if (koS[i1]<EV_cut0){
2163 	  C[i1][i1].r += pow((koS[i1]/EV_cut0),-2.0) - 1.0;
2164 	}
2165 
2166 	/* cutoff the interaction between the ill-conditioned state */
2167 
2168 	if (1.0e+3<C[i1][i1].r){
2169 	  for (j1=1; j1<=n; j1++){
2170 	    C[i1][j1] = Complex(0.0,0.0);
2171 	    C[j1][i1] = Complex(0.0,0.0);
2172 	  }
2173 	  C[i1][i1].r = 1.0e+4;
2174 	}
2175       }
2176 
2177       /* diagonalize H' */
2178 
2179       dtime(&Stime);
2180 
2181       if (parallel_mode==0){
2182 	EigenBand_lapack(C,ko,n,MaxN,1);
2183       }
2184       else{
2185 	/*  The C matrix is distributed by row */
2186 	Eigen_PHH(MPI_CommWD2[myworld2],C,ko,n,MaxN,1);
2187       }
2188 
2189       dtime(&Etime);
2190       time10 += Etime - Stime;
2191 
2192       if (3<=level_stdout && 0<=kloop){
2193 	printf("  kloop %i, k1 k2 k3 %10.6f %10.6f %10.6f\n",
2194 	       kloop,T_KGrids1[kloop],T_KGrids2[kloop],T_KGrids3[kloop]);
2195 	for (i1=1; i1<=n; i1++){
2196 	  printf("  Eigenvalues of Kohn-Sham(DM) spin=%2d i1=%2d %15.12f\n",
2197 		 spin,i1,ko[i1]);
2198 	}
2199       }
2200 
2201       /****************************************************
2202         transformation to the original eigenvectors.
2203 	     NOTE JRCAT-244p and JAIST-2122p
2204       ****************************************************/
2205 
2206       /* transpose */
2207 
2208       /*
2209       for (i1=1; i1<=n; i1++){
2210 	for (j1=i1+1; j1<=n; j1++){
2211 	  Ctmp1 = S[i1][j1];
2212 	  Ctmp2 = S[j1][i1];
2213 	  S[i1][j1] = Ctmp2;
2214 	  S[j1][i1] = Ctmp1;
2215 	}
2216       }
2217       */
2218 
2219       /* transpose */
2220 
2221       /*
2222       for (i1=1; i1<=n; i1++){
2223 	for (j1=i1+1; j1<=n; j1++){
2224 	  Ctmp1 = C[i1][j1];
2225 	  Ctmp2 = C[j1][i1];
2226 	  C[i1][j1] = Ctmp2;
2227 	  C[j1][i1] = Ctmp1;
2228 	}
2229       }
2230       */
2231 
2232       /*
2233       for (i1=1; i1<=n; i1++){
2234         for (j1=1; j1<=MaxN; j1++){
2235 
2236 	  sum  = 0.0;
2237 	  sumi = 0.0;
2238 	  for (l=1; l<=n; l++){
2239 	    sum  += S[i1][l].r*C[j1][l].r - S[i1][l].i*C[j1][l].i;
2240 	    sumi += S[i1][l].r*C[j1][l].i + S[i1][l].i*C[j1][l].r;
2241 	  }
2242 	  H[i1][j1].r = sum;
2243 	  H[i1][j1].i = sumi;
2244 	}
2245       }
2246       */
2247 
2248       for (i1=1; i1<=n; i1++){
2249 	for (j1=1; j1<=n; j1++){
2250           BLAS_H[(j1-1)*n+i1-1] = C[i1][j1];
2251 	}
2252       }
2253 
2254       /* note for BLAS, A[M*K] * B[K*N] = C[M*N] */
2255 
2256 #pragma omp parallel shared(H,BLAS_S,BLAS_H,BLAS_C,n,MaxN) private(OMPID,Nthrds,Nprocs,j1,i1,Ctmp1,Ctmp2,BM,BN,BK)
2257       {
2258 
2259 	/* get info. on OpenMP */
2260 
2261 	OMPID = omp_get_thread_num();
2262 	Nthrds = omp_get_num_threads();
2263 	Nprocs = omp_get_num_procs();
2264 
2265 	BM = n;
2266 	BN = (OMPID+1)*MaxN/Nthrds - (OMPID*MaxN/Nthrds+1) + 1;
2267 	BK = n;
2268 
2269 	Ctmp1.r = 1.0;
2270 	Ctmp1.i = 0.0;
2271 	Ctmp2.r = 0.0;
2272 	Ctmp2.i = 0.0;
2273 
2274         if (0<BN){
2275 	  F77_NAME(zgemm,ZGEMM)("N","N", &BM,&BN,&BK,
2276 			        &Ctmp1,
2277 			        BLAS_S, &BM,
2278 			        &BLAS_H[(OMPID*MaxN/Nthrds)*n], &BK,
2279 			        &Ctmp2,
2280 			        &BLAS_C[(OMPID*MaxN/Nthrds)*n], &BM);
2281 	}
2282 
2283 	for (j1=(OMPID*MaxN/Nthrds+1); j1<=(OMPID+1)*MaxN/Nthrds; j1++){
2284 	  for (i1=1; i1<=n; i1++){
2285 	    H[i1][j1] = BLAS_C[(j1-1)*n+i1-1];
2286 	  }
2287 	}
2288 
2289       } /* #pragma omp parallel */
2290 
2291       /****************************************************
2292                    calculate DM and EDM
2293       ****************************************************/
2294 
2295       dtime(&Stime);
2296 
2297       /* weight of k-point */
2298 
2299       kw = (double)T_k_op[kloop];
2300 
2301       /* store Fermi function */
2302 
2303       po = 0;
2304       l = 1;
2305       do{
2306 
2307 	eig = EIGEN[spin][kloop][l];
2308 	x = (eig - ChemP)*Beta;
2309 
2310 	if      (x<-x_cut)  FermiF = 1.0;
2311 	else if (x>x_cut)   FermiF = 0.0;
2312 	else                FermiF = 1.0/(1.0 + exp(x));
2313 
2314 	VecFkw[l]  = FermiF*kw;
2315 	VecFkwE[l] = VecFkw[l]*eig;
2316 
2317 	if ( FermiF<=FermiEps && po==0 ) {
2318 	  lmax = l;
2319 	  po = 1;
2320 	}
2321 
2322 	l++;
2323 
2324       } while(po==0 && l<=MaxN);
2325 
2326       if (po==0) lmax = MaxN;
2327 
2328       /* predetermination of k
2329          H is used as temporal array
2330       */
2331 
2332       k = 0;
2333       for (AN=1; AN<=atomnum; AN++){
2334 
2335 	GA_AN = order_GA[AN];
2336 	wanA = WhatSpecies[GA_AN];
2337 	tnoA = Spe_Total_CNO[wanA];
2338 
2339         H[0][AN].r = (double)k;
2340 
2341 	for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
2342 	  GB_AN = natn[GA_AN][LB_AN];
2343 	  Rn = ncn[GA_AN][LB_AN];
2344 	  wanB = WhatSpecies[GB_AN];
2345 	  tnoB = Spe_Total_CNO[wanB];
2346           k += tnoA*tnoB;
2347 	}
2348       }
2349 
2350       /* DM and EDM */
2351 
2352 #pragma omp parallel shared(EDM1,CDM1,VecFkwE,VecFkw,H,lmax,k1,k2,k3,atv_ijk,ncn,natn,FNAN,MP,Spe_Total_CNO,WhatSpecies,order_GA,atomnum) private(OMPID,Nthrds,Nprocs,AN,GA_AN,wanA,tnoA,Anum,k,LB_AN,GB_AN,Rn,wanB,tnoB,Bnum,l1,l2,l3,kRn,si,co,i,ia,j,jb,d1,d2,l,tmp)
2353       {
2354 
2355 	/* get info. on OpenMP */
2356 
2357 	OMPID = omp_get_thread_num();
2358 	Nthrds = omp_get_num_threads();
2359 	Nprocs = omp_get_num_procs();
2360 
2361 	for (AN=1+OMPID; AN<=atomnum; AN+=Nthrds){
2362 
2363 	  GA_AN = order_GA[AN];
2364 	  wanA = WhatSpecies[GA_AN];
2365 	  tnoA = Spe_Total_CNO[wanA];
2366 	  Anum = MP[GA_AN];
2367 
2368 	  k = (int)H[0][AN].r;
2369 
2370 	  for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
2371 	    GB_AN = natn[GA_AN][LB_AN];
2372 	    Rn = ncn[GA_AN][LB_AN];
2373 	    wanB = WhatSpecies[GB_AN];
2374 	    tnoB = Spe_Total_CNO[wanB];
2375 	    Bnum = MP[GB_AN];
2376 
2377 	    l1 = atv_ijk[Rn][1];
2378 	    l2 = atv_ijk[Rn][2];
2379 	    l3 = atv_ijk[Rn][3];
2380 	    kRn = k1*(double)l1 + k2*(double)l2 + k3*(double)l3;
2381 
2382 	    si = sin(2.0*PI*kRn);
2383 	    co = cos(2.0*PI*kRn);
2384 
2385 	    for (i=0; i<tnoA; i++){
2386 
2387 	      ia = Anum + i;
2388 
2389 	      for (j=0; j<tnoB; j++){
2390 
2391 		jb = Bnum + j;
2392 
2393 		/***************************************************************
2394                 Note that the imagiary part is zero,
2395                  since
2396 
2397                  at k
2398                  A = (co + i si)(Re + i Im) = (co*Re - si*Im) + i (co*Im + si*Re)
2399                  at -k
2400                  B = (co - i si)(Re - i Im) = (co*Re - si*Im) - i (co*Im + si*Re)
2401                  Thus, Re(A+B) = 2*(co*Re - si*Im)
2402                        Im(A+B) = 0
2403 		***************************************************************/
2404 
2405 		d1 = 0.0;
2406 		d2 = 0.0;
2407 
2408 		for (l=1; l<=lmax; l++){
2409 
2410 		  tmp = co*(H[ia][l].r*H[jb][l].r + H[ia][l].i*H[jb][l].i)
2411 		       -si*(H[ia][l].r*H[jb][l].i - H[ia][l].i*H[jb][l].r);
2412 
2413 		  d1 += VecFkw[l]*tmp;
2414 		  d2 += VecFkwE[l]*tmp;;
2415 
2416 		}
2417 
2418 		CDM1[k] += d1;
2419 		EDM1[k] += d2;
2420 
2421 		/* increment of k */
2422 
2423 		k++;
2424 
2425 	      }
2426 	    }
2427 	  }
2428 	}
2429 
2430       } /* #pragma omp parallel */
2431 
2432       dtime(&Etime);
2433       time11 += Etime - Stime;
2434 
2435       /*---------- added by TOYODA 16/FEB/2010 */
2436       if (g_exx_switch) {
2437         for (iep=0; iep<nep; iep++) {
2438           GA_AN = exx_ep_atom1[iep]+1;
2439           GB_AN = exx_ep_atom2[iep]+1;
2440           iRn   = exx_ep_cell[iep];
2441           tnoA = Spe_Total_CNO[WhatSpecies[GA_AN]];
2442           tnoB = Spe_Total_CNO[WhatSpecies[GB_AN]];
2443           Anum = MP[GA_AN];
2444           Bnum = MP[GB_AN];
2445 
2446           /* phase */
2447           iRn_x = iRn%ncd - nshell_ep;
2448           iRn_y = (iRn/ncd)%ncd - nshell_ep;
2449           iRn_z = (iRn/ncd/ncd)%ncd - nshell_ep;
2450           kRn = k1*(double)iRn_x + k2*(double)iRn_y + k3*(double)iRn_z;
2451           si = sin(2.0*PI*kRn);
2452           co = cos(2.0*PI*kRn);
2453 
2454           for (i=0; i<tnoA; i++){
2455             ia = Anum + i;
2456             for (j=0; j<tnoB; j++){
2457               jb = Bnum + j;
2458               d1 = 0.0;
2459               d2 = 0.0;
2460               for (l=1; l<=lmax; l++){
2461                 tmp = co*(H[ia][l].r*H[jb][l].r + H[ia][l].i*H[jb][l].i)
2462                       -si*(H[ia][l].r*H[jb][l].i - H[ia][l].i*H[jb][l].r);
2463                 d1 += VecFkw[l]*tmp;
2464                 tmp = si*(H[ia][l].r*H[jb][l].r + H[ia][l].i*H[jb][l].i)
2465                       +co*(H[ia][l].r*H[jb][l].i - H[ia][l].i*H[jb][l].r);
2466                 d2 += VecFkw[l]*tmp;
2467               }
2468               exx_CDM[spin][iep][i][j].r += d1;
2469               exx_CDM[spin][iep][i][j].i += d2;
2470             }
2471           }
2472         } /* loop of iep */
2473       }
2474       /*---------- until here */
2475 
2476     } /* kloop0 */
2477 
2478     /*******************************************************
2479          sum of CDM1 and EDM1 by Allreduce in MPI
2480     *******************************************************/
2481 
2482     dtime(&Stime);
2483 
2484     if (parallel_mode){
2485       tmp = 1.0/(double)numprocs2;
2486       for (i=0; i<size_H1; i++){
2487 	CDM1[i] *= tmp;
2488 	EDM1[i] *= tmp;
2489       }
2490     }
2491 
2492     MPI_Allreduce(&CDM1[0], &H1[0], size_H1, MPI_DOUBLE, MPI_SUM, MPI_CommWD1[myworld1]);
2493     MPI_Allreduce(&EDM1[0], &S1[0], size_H1, MPI_DOUBLE, MPI_SUM, MPI_CommWD1[myworld1]);
2494 
2495     /* CDM and EDM */
2496 
2497     k = 0;
2498     for (AN=1; AN<=atomnum; AN++){
2499       GA_AN = order_GA[AN];
2500       MA_AN = F_G2M[GA_AN];
2501 
2502       wanA = WhatSpecies[GA_AN];
2503       tnoA = Spe_Total_CNO[wanA];
2504 
2505       for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
2506 	GB_AN = natn[GA_AN][LB_AN];
2507 	wanB = WhatSpecies[GB_AN];
2508 	tnoB = Spe_Total_CNO[wanB];
2509 
2510 	for (i=0; i<tnoA; i++){
2511 	  for (j=0; j<tnoB; j++){
2512 
2513 	    if (1<=MA_AN && MA_AN<=Matomnum){
2514 
2515 	      CDM[spin][MA_AN][LB_AN][i][j] = H1[k];
2516               EDM[spin][MA_AN][LB_AN][i][j] = S1[k];
2517 	    }
2518 
2519 	    k++;
2520 
2521 	  }
2522 	}
2523       }
2524     }
2525 
2526     dtime(&Etime);
2527     time12 += Etime - Stime;
2528 
2529     /*---------- added by TOYODA 16/FEB/2010 */
2530     if (g_exx_switch) {
2531       EXX_Reduce_DM(exx, exx_CDM[spin]);
2532     }
2533     /*---------- until here */
2534 
2535     if (SpinP_switch==1 && numprocs0==1 && spin==0){
2536       spin++;
2537       goto diagonalize2;
2538     }
2539 
2540     /* if necessary, MPI communication of CDM and EDM */
2541 
2542     if (1<numprocs0 && SpinP_switch==1){
2543 
2544       /* set spin */
2545 
2546       if (myworld1==0){
2547 	spin = 1;
2548       }
2549       else{
2550 	spin = 0;
2551       }
2552 
2553       /* communicate CDM1 and EDM1 */
2554 
2555       for (i=0; i<=1; i++){
2556 
2557 	IDS = Comm_World_StartID1[i%2];
2558 	IDR = Comm_World_StartID1[(i+1)%2];
2559 
2560 	if (myid0==IDS){
2561 	  MPI_Isend(&H1[0], size_H1, MPI_DOUBLE, IDR, tag, mpi_comm_level1, &request);
2562 	}
2563 
2564 	if (myid0==IDR){
2565 	  MPI_Recv(&CDM1[0], size_H1, MPI_DOUBLE, IDS, tag, mpi_comm_level1, &stat);
2566 	}
2567 
2568 	if (myid0==IDS){
2569 	  MPI_Wait(&request,&stat);
2570 	}
2571 
2572 	if (myid0==IDS){
2573 	  MPI_Isend(&S1[0], size_H1, MPI_DOUBLE, IDR, tag, mpi_comm_level1, &request);
2574 	}
2575 
2576 	if (myid0==IDR){
2577 	  MPI_Recv(&EDM1[0], size_H1, MPI_DOUBLE, IDS, tag, mpi_comm_level1, &stat);
2578 	}
2579 
2580 	if (myid0==IDS){
2581 	  MPI_Wait(&request,&stat);
2582 	}
2583       }
2584 
2585       MPI_Bcast(&CDM1[0], size_H1, MPI_DOUBLE, 0, MPI_CommWD1[myworld1]);
2586       MPI_Bcast(&EDM1[0], size_H1, MPI_DOUBLE, 0, MPI_CommWD1[myworld1]);
2587 
2588       /* put CDM1 and EDM1 into CDM and EDM */
2589 
2590       k = 0;
2591       for (AN=1; AN<=atomnum; AN++){
2592 	GA_AN = order_GA[AN];
2593 	MA_AN = F_G2M[GA_AN];
2594 
2595 	wanA = WhatSpecies[GA_AN];
2596 	tnoA = Spe_Total_CNO[wanA];
2597 
2598 	for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
2599 	  GB_AN = natn[GA_AN][LB_AN];
2600 	  wanB = WhatSpecies[GB_AN];
2601 	  tnoB = Spe_Total_CNO[wanB];
2602 
2603 	  for (i=0; i<tnoA; i++){
2604 	    for (j=0; j<tnoB; j++){
2605 
2606 	      if (1<=MA_AN && MA_AN<=Matomnum){
2607 		CDM[spin][MA_AN][LB_AN][i][j] = CDM1[k];
2608    	        EDM[spin][MA_AN][LB_AN][i][j] = EDM1[k];
2609 	      }
2610 
2611 	      k++;
2612 
2613 	    }
2614 	  }
2615 	}
2616       }
2617     }
2618 
2619   } /* if (all_knum!=1) */
2620 
2621   /****************************************************
2622            normalization of CDM, EDM, and iDM
2623   ****************************************************/
2624 
2625   dtime(&EiloopTime);
2626 
2627   if (myid0==Host_ID && 0<level_stdout){
2628     printf("<Band_DFT>  DM, time=%lf\n", EiloopTime-SiloopTime);fflush(stdout);
2629   }
2630 
2631   dum = 1.0/sum_weights;
2632 
2633   for (spin=0; spin<=SpinP_switch; spin++){
2634     for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
2635       GA_AN = M2G[MA_AN];
2636       wanA = WhatSpecies[GA_AN];
2637       tnoA = Spe_Total_CNO[wanA];
2638       Anum = MP[GA_AN];
2639       for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
2640 	GB_AN = natn[GA_AN][LB_AN];
2641 	wanB = WhatSpecies[GB_AN];
2642 	tnoB = Spe_Total_CNO[wanB];
2643 	Bnum = MP[GB_AN];
2644 
2645 	for (i=0; i<tnoA; i++){
2646 	  for (j=0; j<tnoB; j++){
2647 	    CDM[spin][MA_AN][LB_AN][i][j]    = CDM[spin][MA_AN][LB_AN][i][j]*dum;
2648             EDM[spin][MA_AN][LB_AN][i][j]    = EDM[spin][MA_AN][LB_AN][i][j]*dum;
2649 	    iDM[0][spin][MA_AN][LB_AN][i][j] = iDM[0][spin][MA_AN][LB_AN][i][j]*dum;
2650 	  }
2651 	}
2652       }
2653     }
2654   }
2655 
2656   /*---------- added by TOYODA 16/FEB/2010 */
2657   if (g_exx_switch) {
2658     for (spin=0; spin<=SpinP_switch; spin++){
2659       for (iep=0; iep<nep; iep++) {
2660         GA_AN = exx_ep_atom1[iep]+1;
2661         GB_AN = exx_ep_atom2[iep]+1;
2662         tnoA = Spe_Total_CNO[WhatSpecies[GA_AN]];
2663         tnoB = Spe_Total_CNO[WhatSpecies[GB_AN]];
2664         for (i=0; i<tnoA; i++){
2665           for (j=0; j<tnoB; j++){
2666             exx_CDM[spin][iep][i][j].r *= dum;
2667             exx_CDM[spin][iep][i][j].i *= dum;
2668           }
2669         }
2670       }
2671     }
2672   }
2673   /*---------- until here */
2674 
2675   /****************************************************
2676                        bond-energies
2677   ****************************************************/
2678 
2679   My_Eele1[0] = 0.0;
2680   My_Eele1[1] = 0.0;
2681   for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
2682     GA_AN = M2G[MA_AN];
2683     wanA = WhatSpecies[GA_AN];
2684     tnoA = Spe_Total_CNO[wanA];
2685 
2686     for (j=0; j<=FNAN[GA_AN]; j++){
2687       GB_AN = natn[GA_AN][j];
2688       wanB = WhatSpecies[GB_AN];
2689       tnoB = Spe_Total_CNO[wanB];
2690 
2691       for (k=0; k<tnoA; k++){
2692 	for (l=0; l<tnoB; l++){
2693 	  for (spin=0; spin<=SpinP_switch; spin++){
2694 	    My_Eele1[spin] += CDM[spin][MA_AN][j][k][l]*nh[spin][MA_AN][j][k][l];
2695 	  }
2696 	}
2697       }
2698 
2699       /*
2700       printf("GA_AN=%2d LB_AN=%2d GB_AN=%2d RnB=%2d\n",GA_AN,LB_AN,GB_AN,ncn[GA_AN][j]);
2701       for (k=0; k<tnoA; k++){
2702 	for (l=0; l<tnoB; l++){
2703 	  for (spin=0; spin<=SpinP_switch; spin++){
2704 	    printf("%15.12f ",CDM[spin][MA_AN][j][k][l]);
2705 	  }
2706 	}
2707         printf("\n");
2708       }
2709       */
2710 
2711     }
2712   }
2713 
2714   /* MPI, My_Eele1 */
2715   MPI_Barrier(mpi_comm_level1);
2716   for (spin=0; spin<=SpinP_switch; spin++){
2717     MPI_Allreduce(&My_Eele1[spin], &Eele1[spin], 1, MPI_DOUBLE,
2718                    MPI_SUM, mpi_comm_level1);
2719   }
2720 
2721   if (SpinP_switch==0){
2722     Eele1[1] = Eele1[0];
2723   }
2724 
2725   if (3<=level_stdout && myid0==Host_ID){
2726     printf("Eele00=%15.12f Eele01=%15.12f\n",Eele0[0],Eele0[1]);
2727     printf("Eele10=%15.12f Eele11=%15.12f\n",Eele1[0],Eele1[1]);
2728   }
2729 
2730   /****************************************************
2731                         output
2732   ****************************************************/
2733 
2734   if (myid0==Host_ID){
2735 
2736     strcpy(file_EV,".EV");
2737     fnjoint(filepath,filename,file_EV);
2738 
2739     if ((fp_EV = fopen(file_EV,"w")) != NULL){
2740 
2741 #ifdef xt3
2742       setvbuf(fp_EV,buf,_IOFBF,fp_bsize);
2743 #endif
2744 
2745       fprintf(fp_EV,"\n");
2746       fprintf(fp_EV,"***********************************************************\n");
2747       fprintf(fp_EV,"***********************************************************\n");
2748       fprintf(fp_EV,"           Eigenvalues (Hartree) of SCF KS-eq.           \n");
2749       fprintf(fp_EV,"***********************************************************\n");
2750       fprintf(fp_EV,"***********************************************************\n\n");
2751       fprintf(fp_EV,"   Chemical Potential (Hatree) = %18.14f\n",ChemP);
2752       fprintf(fp_EV,"   Number of States            = %18.14f\n",Num_State);
2753       fprintf(fp_EV,"   Eigenvalues\n");
2754       fprintf(fp_EV,"              Up-spin           Down-spin\n");
2755 
2756       for (kloop=0; kloop<T_knum; kloop++){
2757 
2758 	k1 = T_KGrids1[kloop];
2759 	k2 = T_KGrids2[kloop];
2760 	k3 = T_KGrids3[kloop];
2761 
2762 	if (0<T_k_op[kloop]){
2763 
2764 	  fprintf(fp_EV,"\n");
2765 	  fprintf(fp_EV,"   kloop=%i\n",kloop);
2766 	  fprintf(fp_EV,"   k1=%10.5f k2=%10.5f k3=%10.5f\n\n",k1,k2,k3);
2767 	  for (l=1; l<=MaxN; l++){
2768 	    if (SpinP_switch==0){
2769 	      fprintf(fp_EV,"%5d  %18.14f %18.14f\n",
2770 		      l,EIGEN[0][kloop][l],EIGEN[0][kloop][l]);
2771 	    }
2772 	    else if (SpinP_switch==1){
2773 	      fprintf(fp_EV,"%5d  %18.14f %18.14f\n",
2774 		      l,EIGEN[0][kloop][l],EIGEN[1][kloop][l]);
2775 	    }
2776 	  }
2777 	}
2778       }
2779       fclose(fp_EV);
2780     }
2781     else{
2782       printf("Failure of saving the EV file.\n");
2783       fclose(fp_EV);
2784     }
2785   }
2786 
2787   /****************************************************
2788                        free arrays
2789   ****************************************************/
2790 
2791   if (all_knum==1){
2792 
2793     free(stat_send);
2794     free(request_send);
2795     free(request_recv);
2796 
2797     free(is1);
2798     free(ie1);
2799 
2800     free(is2);
2801     free(ie2);
2802 
2803     for (ID=0; ID<numprocs0; ID++){
2804       if (MPI_CDM1_flag[ID]){
2805         MPI_Comm_free(&MPI_CommWD_CDM1[ID]);
2806       }
2807     }
2808   }
2809 
2810   free(VecFkw);
2811   free(VecFkwE);
2812 
2813   free(BLAS_H);
2814   free(BLAS_C);
2815 
2816   free(My_NZeros);
2817   free(SP_NZeros);
2818   free(SP_Atoms);
2819 
2820   free(MPI_CommWD_CDM1);
2821   free(MPI_CDM1_flag);
2822 
2823   /* for PrintMemory and allocation */
2824   firsttime=0;
2825 
2826   /* for elapsed time */
2827 
2828   if (measure_time){
2829     printf("myid0=%2d time1 =%9.4f\n",myid0,time1);fflush(stdout);
2830     printf("myid0=%2d time2 =%9.4f\n",myid0,time2);fflush(stdout);
2831     printf("myid0=%2d time3 =%9.4f\n",myid0,time3);fflush(stdout);
2832     printf("myid0=%2d time4 =%9.4f\n",myid0,time4);fflush(stdout);
2833     printf("myid0=%2d time5 =%9.4f\n",myid0,time5);fflush(stdout);
2834     printf("myid0=%2d time51=%9.4f\n",myid0,time51);fflush(stdout);
2835     printf("myid0=%2d time6 =%9.4f\n",myid0,time6);fflush(stdout);
2836     printf("myid0=%2d time7 =%9.4f\n",myid0,time7);fflush(stdout);
2837     printf("myid0=%2d time8 =%9.4f\n",myid0,time8);fflush(stdout);
2838     printf("myid0=%2d time81=%9.4f\n",myid0,time81);fflush(stdout);
2839     printf("myid0=%2d time82=%9.4f\n",myid0,time82);fflush(stdout);
2840     printf("myid0=%2d time83=%9.4f\n",myid0,time83);fflush(stdout);
2841     printf("myid0=%2d time84=%9.4f\n",myid0,time84);fflush(stdout);
2842     printf("myid0=%2d time85=%9.4f\n",myid0,time85);fflush(stdout);
2843     printf("myid0=%2d time9 =%9.4f\n",myid0,time9);fflush(stdout);
2844     printf("myid0=%2d time10=%9.4f\n",myid0,time10);fflush(stdout);
2845     printf("myid0=%2d time11=%9.4f\n",myid0,time11);fflush(stdout);
2846     printf("myid0=%2d time12=%9.4f\n",myid0,time12);fflush(stdout);
2847 
2848   }
2849 
2850   MPI_Barrier(mpi_comm_level1);
2851   dtime(&TEtime);
2852   time0 = TEtime - TStime;
2853   return time0;
2854 }
2855