1 /**********************************************************************
2 Band_DFT_Col.c:
3
4 Band_DFT_Col.c is a subroutine to perform band calculations
5 based on a collinear DFT
6
7 Log of Band_DFT_Col.c:
8
9 22/Nov/2001 Released by T.Ozaki
10
11 ***********************************************************************/
12
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #include <math.h>
17 #include <time.h>
18 #include "openmx_common.h"
19 #include "lapack_prototypes.h"
20 #include "mpi.h"
21 #include <omp.h>
22
23 #define measure_time 0
24
25
26 /*---------- added by TOYODA 17/FEB/2010 */
27 #include "exx_debug.h"
28 /*---------- until here */
29
30
31
Band_DFT_Col(int SCF_iter,int knum_i,int knum_j,int knum_k,int SpinP_switch,double ***** nh,double ***** ImNL,double **** CntOLP,double ***** CDM,double ***** EDM,double Eele0[2],double Eele1[2],int * MP,int * order_GA,double * ko,double * koS,double *** EIGEN,double * H1,double * S1,double * CDM1,double * EDM1,dcomplex ** H,dcomplex ** S,dcomplex ** C,dcomplex * BLAS_S,int *** k_op,int * T_k_op,int ** T_k_ID,double * T_KGrids1,double * T_KGrids2,double * T_KGrids3,int myworld1,int * NPROCS_ID1,int * Comm_World1,int * NPROCS_WD1,int * Comm_World_StartID1,MPI_Comm * MPI_CommWD1,int myworld2,int * NPROCS_ID2,int * NPROCS_WD2,int * Comm_World2,int * Comm_World_StartID2,MPI_Comm * MPI_CommWD2,EXX_t * exx,dcomplex **** exx_CDM,double * Uexx)32 double Band_DFT_Col(int SCF_iter,
33 int knum_i, int knum_j, int knum_k,
34 int SpinP_switch,
35 double *****nh,
36 double *****ImNL,
37 double ****CntOLP,
38 double *****CDM,
39 double *****EDM,
40 double Eele0[2], double Eele1[2],
41 int *MP,
42 int *order_GA,
43 double *ko,
44 double *koS,
45 double ***EIGEN,
46 double *H1,
47 double *S1,
48 double *CDM1,
49 double *EDM1,
50 dcomplex **H,
51 dcomplex **S,
52 dcomplex **C,
53 dcomplex *BLAS_S,
54 int ***k_op,
55 int *T_k_op,
56 int **T_k_ID,
57 double *T_KGrids1,
58 double *T_KGrids2,
59 double *T_KGrids3,
60 int myworld1,
61 int *NPROCS_ID1,
62 int *Comm_World1,
63 int *NPROCS_WD1,
64 int *Comm_World_StartID1,
65 MPI_Comm *MPI_CommWD1,
66 int myworld2,
67 int *NPROCS_ID2,
68 int *NPROCS_WD2,
69 int *Comm_World2,
70 int *Comm_World_StartID2,
71 MPI_Comm *MPI_CommWD2,
72 EXX_t *exx,
73 dcomplex ****exx_CDM,
74 double *Uexx)
75 {
76 static int firsttime=1;
77 int i,j,k,l,m,n,wan,MaxN,i0,ks;
78 int i1,i1s,j1,ia,jb,lmax,po,po1,spin,s1,e1;
79 int num2,RnB,l1,l2,l3,loop_num,ns,ne;
80 int ct_AN,h_AN,wanA,tnoA,wanB,tnoB;
81 int MA_AN,GA_AN,Anum,num_kloop0;
82 int T_knum,S_knum,E_knum,kloop,kloop0;
83 double av_num,lumos;
84 double time0;
85 int LB_AN,GB_AN,Bnum;
86 double k1,k2,k3,Fkw;
87 double sum,sumi,sum_weights;
88 double Num_State;
89 double My_Num_State;
90 double FermiF;
91 double tmp,eig,kw,EV_cut0;
92 double x,Dnum,Dnum2,AcP,ChemP_MAX,ChemP_MIN;
93 double *VecFkw;
94 double *VecFkwE;
95 int *is1,*ie1;
96 int *is2,*ie2;
97 MPI_Status *stat_send;
98 MPI_Request *request_send;
99 MPI_Request *request_recv;
100 int *My_NZeros;
101 int *SP_NZeros;
102 int *SP_Atoms;
103 MPI_Comm *MPI_CommWD_CDM1;
104 int *MPI_CDM1_flag;
105
106 int all_knum;
107 dcomplex Ctmp1,Ctmp2;
108 int ii,ij,ik;
109 int BM,BN,BK;
110 double u2,v2,uv,vu;
111 double d1,d2;
112 double My_Eele1[2];
113 double TZ,dum,sumE,kRn,si,co;
114 double Resum,ResumE,Redum,Redum2;
115 double Imsum,ImsumE,Imdum,Imdum2;
116 double TStime,TEtime,SiloopTime,EiloopTime;
117 double Stime,Etime,Stime0,Etime0;
118 double FermiEps=1.0e-14;
119 double x_cut=30.0;
120 double My_Eele0[2];
121
122 char file_EV[YOUSO10];
123 FILE *fp_EV;
124 #ifdef xt3
125 char buf[fp_bsize]; /* setvbuf */
126 #endif
127 int AN,Rn,size_H1;
128 int parallel_mode;
129 int numprocs0,myid0;
130 int ID,ID0,ID1;
131 int numprocs1,myid1;
132 int numprocs2,myid2;
133 int Num_Comm_World1;
134 int Num_Comm_World2;
135
136 int tag=999,IDS,IDR;
137 MPI_Status stat;
138 MPI_Request request;
139
140 double time1,time2,time3;
141 double time4,time5,time6;
142 double time7,time8,time9;
143 double time10,time11,time12;
144 double time81,time82,time83;
145 double time84,time85;
146 double time51;
147
148 dcomplex *BLAS_H;
149 dcomplex *BLAS_C;
150
151 /* for OpenMP */
152 int OMPID,Nthrds,Nprocs;
153
154 /*---------- added by TOYODA 16/FEB/2010 */
155 const int *exx_ep_atom1;
156 const int *exx_ep_atom2;
157 const int *exx_ep_cell;
158 int iRn_x, iRn_y, iRn_z, iRn;
159 int ncd, nshell_ep, iep, nep;
160 /*---------- until here */
161
162 /* for time */
163 dtime(&TStime);
164
165 time1 = 0.0;
166 time2 = 0.0;
167 time3 = 0.0;
168 time4 = 0.0;
169 time5 = 0.0;
170 time6 = 0.0;
171 time7 = 0.0;
172 time8 = 0.0;
173 time9 = 0.0;
174 time10 = 0.0;
175 time11 = 0.0;
176 time12 = 0.0;
177 time81 = 0.0;
178 time82 = 0.0;
179 time83 = 0.0;
180 time84 = 0.0;
181 time85 = 0.0;
182 time51 = 0.0;
183
184 /*---------- added by TOYODA 16/FEB/2010 */
185 if (g_exx_switch) {
186 nep = EXX_Number_of_EP(exx);
187 nshell_ep = EXX_Number_of_EP_Shells(exx);
188 exx_ep_atom1 = EXX_Array_EP_Atom1(exx);
189 exx_ep_atom2 = EXX_Array_EP_Atom2(exx);
190 exx_ep_cell = EXX_Array_EP_Cell(exx);
191 ncd = 2*nshell_ep + 1;
192 Uexx[0] = 0.0;
193 Uexx[1] = 0.0;
194 }
195 /*---------- until here */
196
197 /* MPI */
198 MPI_Comm_size(mpi_comm_level1,&numprocs0);
199 MPI_Comm_rank(mpi_comm_level1,&myid0);
200 MPI_Barrier(mpi_comm_level1);
201
202 Num_Comm_World1 = SpinP_switch + 1;
203
204 /***********************************************
205 for pallalel calculations in myworld1
206 ***********************************************/
207
208 MPI_Comm_size(MPI_CommWD1[myworld1],&numprocs1);
209 MPI_Comm_rank(MPI_CommWD1[myworld1],&myid1);
210
211 /****************************************************
212 find the number of basis functions, n
213 ****************************************************/
214
215 n = 0;
216 for (i=1; i<=atomnum; i++){
217 wanA = WhatSpecies[i];
218 n += Spe_Total_CNO[wanA];
219 }
220
221 /****************************************************
222 find TZ
223 ****************************************************/
224
225 TZ = 0.0;
226 for (i=1; i<=atomnum; i++){
227 wan = WhatSpecies[i];
228 TZ += Spe_Core_Charge[wan];
229 }
230
231 /***********************************************
232 find the number of states to be solved
233 ***********************************************/
234
235 lumos = (double)n*0.300;
236 if (lumos<60.0) lumos = 400.0;
237 MaxN = (TZ-system_charge)/2 + (int)lumos;
238 if (n<MaxN) MaxN = n;
239
240 /***********************************************
241 allocation of arrays
242 ***********************************************/
243
244 VecFkw = (double*)malloc(sizeof(double)*(n+2));
245 VecFkwE = (double*)malloc(sizeof(double)*(n+2));
246
247 BLAS_H = (dcomplex*)malloc(sizeof(dcomplex)*n*n);
248 BLAS_C = (dcomplex*)malloc(sizeof(dcomplex)*n*n);
249
250 My_NZeros = (int*)malloc(sizeof(int)*numprocs0);
251 SP_NZeros = (int*)malloc(sizeof(int)*numprocs0);
252 SP_Atoms = (int*)malloc(sizeof(int)*numprocs0);
253
254 MPI_CommWD_CDM1 = (MPI_Comm*)malloc(sizeof(MPI_Comm)*numprocs0);
255 MPI_CDM1_flag = (int*)malloc(sizeof(int)*numprocs0);
256 for (i=0; i<numprocs0; i++) MPI_CDM1_flag[i] = 0;
257
258 if (firsttime) {
259 PrintMemory("Band_DFT: ko", sizeof(double)*(n+1),NULL);
260 PrintMemory("Band_DFT: koS",sizeof(double)*(n+1),NULL);
261 PrintMemory("Band_DFT: H", sizeof(dcomplex)*(n+1)*(n+1),NULL);
262 PrintMemory("Band_DFT: S", sizeof(dcomplex)*(n+1)*(n+1),NULL);
263 PrintMemory("Band_DFT: C", sizeof(dcomplex)*(n+1)*(n+1),NULL);
264 }
265
266 /***********************************************
267 k-points by regular mesh
268 ***********************************************/
269
270 if (way_of_kpoint==1){
271
272 /**************************************************************
273 k_op[i][j][k]: weight of DOS
274 =0 no calc.
275 =1 G-point
276 =2 which has k<->-k point
277 Now , only the relation, E(k)=E(-k), is used.
278
279 Future release: k_op will be used for symmetry operation
280 *************************************************************/
281
282 for (i=0;i<=knum_i-1;i++) {
283 for (j=0;j<=knum_j-1;j++) {
284 for (k=0;k<=knum_k-1;k++) {
285 k_op[i][j][k]=-999;
286 }
287 }
288 }
289
290 for (i=0;i<=knum_i-1;i++) {
291 for (j=0;j<=knum_j-1;j++) {
292 for (k=0;k<=knum_k-1;k++) {
293 if ( k_op[i][j][k]==-999 ) {
294 k_inversion(i,j,k,knum_i,knum_j,knum_k,&ii,&ij,&ik);
295 if ( i==ii && j==ij && k==ik ) {
296 k_op[i][j][k] = 1;
297 }
298
299 else {
300 k_op[i][j][k] = 2;
301 k_op[ii][ij][ik] = 0;
302 }
303 }
304 } /* k */
305 } /* j */
306 } /* i */
307
308 /***********************************
309 one-dimentionalize for MPI
310 ************************************/
311
312 T_knum = 0;
313 for (i=0; i<=(knum_i-1); i++){
314 for (j=0; j<=(knum_j-1); j++){
315 for (k=0; k<=(knum_k-1); k++){
316 if (0<k_op[i][j][k]){
317 T_knum++;
318 }
319 }
320 }
321 }
322
323 /* set T_KGrids1,2,3 and T_k_op */
324
325 T_knum = 0;
326 for (i=0; i<knum_i; i++){
327
328 if (knum_i==1) k1 = 0.0;
329 else k1 = -0.5 + (2.0*(double)i+1.0)/(2.0*(double)knum_i) + Shift_K_Point;
330
331 for (j=0; j<knum_j; j++){
332
333 if (knum_j==1) k2 = 0.0;
334 else k2 = -0.5 + (2.0*(double)j+1.0)/(2.0*(double)knum_j) - Shift_K_Point;
335
336 for (k=0; k<knum_k; k++){
337
338 if (knum_k==1) k3 = 0.0;
339 else k3 = -0.5 + (2.0*(double)k+1.0)/(2.0*(double)knum_k) + 2.0*Shift_K_Point;
340
341 if (0<k_op[i][j][k]){
342
343 T_KGrids1[T_knum] = k1;
344 T_KGrids2[T_knum] = k2;
345 T_KGrids3[T_knum] = k3;
346 T_k_op[T_knum] = k_op[i][j][k];
347
348 T_knum++;
349 }
350 }
351 }
352 }
353
354 if (myid0==Host_ID && 0<level_stdout){
355
356 printf(" KGrids1: ");fflush(stdout);
357 for (i=0;i<=knum_i-1;i++){
358 if (knum_i==1) k1 = 0.0;
359 else k1 = -0.5 + (2.0*(double)i+1.0)/(2.0*(double)knum_i) + Shift_K_Point;
360 printf("%9.5f ",k1);fflush(stdout);
361 }
362 printf("\n");fflush(stdout);
363
364 printf(" KGrids2: ");fflush(stdout);
365
366 for (i=0;i<=knum_j-1;i++){
367 if (knum_j==1) k2 = 0.0;
368 else k2 = -0.5 + (2.0*(double)i+1.0)/(2.0*(double)knum_j) - Shift_K_Point;
369 printf("%9.5f ",k2);fflush(stdout);
370 }
371 printf("\n");fflush(stdout);
372
373 printf(" KGrids3: ");fflush(stdout);
374 for (i=0;i<=knum_k-1;i++){
375 if (knum_k==1) k3 = 0.0;
376 else k3 = -0.5 + (2.0*(double)i+1.0)/(2.0*(double)knum_k) + 2.0*Shift_K_Point;
377 printf("%9.5f ",k3);fflush(stdout);
378 }
379 printf("\n");fflush(stdout);
380 }
381
382 }
383
384 /***********************************************
385 Monkhorst-Pack k-points
386 ***********************************************/
387
388 else if (way_of_kpoint==2){
389
390 T_knum = num_non_eq_kpt;
391
392 for (k=0; k<num_non_eq_kpt; k++){
393 T_KGrids1[k] = NE_KGrids1[k];
394 T_KGrids2[k] = NE_KGrids2[k];
395 T_KGrids3[k] = NE_KGrids3[k];
396 T_k_op[k] = NE_T_k_op[k];
397 }
398 }
399
400 /***********************************************
401 calculate the sum of weights
402 ***********************************************/
403
404 sum_weights = 0.0;
405 for (k=0; k<T_knum; k++){
406 sum_weights += (double)T_k_op[k];
407 }
408
409 /***********************************************
410 allocate k-points into processors
411 ***********************************************/
412
413 if (numprocs1<T_knum){
414
415 /* set parallel_mode */
416 parallel_mode = 0;
417
418 /* allocation of kloop to ID */
419
420 for (ID=0; ID<numprocs1; ID++){
421 tmp = (double)T_knum/(double)numprocs1;
422 S_knum = (int)((double)ID*(tmp+1.0e-12));
423 E_knum = (int)((double)(ID+1)*(tmp+1.0e-12)) - 1;
424 if (ID==(numprocs1-1)) E_knum = T_knum - 1;
425 if (E_knum<0) E_knum = 0;
426
427 for (k=S_knum; k<=E_knum; k++){
428 /* ID in the first level world */
429 T_k_ID[myworld1][k] = ID;
430 }
431 }
432
433 /* find own informations */
434
435 tmp = (double)T_knum/(double)numprocs1;
436 S_knum = (int)((double)myid1*(tmp+1.0e-12));
437 E_knum = (int)((double)(myid1+1)*(tmp+1.0e-12)) - 1;
438 if (myid1==(numprocs1-1)) E_knum = T_knum - 1;
439 if (E_knum<0) E_knum = 0;
440
441 num_kloop0 = E_knum - S_knum + 1;
442 }
443
444 else {
445
446 /* set parallel_mode */
447 parallel_mode = 1;
448 num_kloop0 = 1;
449
450 Num_Comm_World2 = T_knum;
451 MPI_Comm_size(MPI_CommWD2[myworld2],&numprocs2);
452 MPI_Comm_rank(MPI_CommWD2[myworld2],&myid2);
453
454 S_knum = myworld2;
455
456 /* allocate k-points into processors */
457
458 for (k=0; k<T_knum; k++){
459 /* ID in the first level world */
460 T_k_ID[myworld1][k] = Comm_World_StartID2[k];
461 }
462 }
463
464 /****************************************************
465 find all_knum
466 if (all_knum==1), all the calculation will be made
467 by the first diagonalization loop, and the second
468 diagonalization will be skipped.
469 ****************************************************/
470
471 MPI_Allreduce(&num_kloop0, &all_knum, 1, MPI_INT, MPI_PROD, mpi_comm_level1);
472
473 if (SpinP_switch==1 && numprocs0==1 && all_knum==1){
474 all_knum = 0;
475 }
476
477 /****************************************************
478 if (parallel_mode==1 && all_knum==1)
479 make is1, ie1, is2, ie2
480 ****************************************************/
481
482 if (all_knum==1){
483
484 /* allocation */
485
486 stat_send = (MPI_Status *)malloc(sizeof(MPI_Status)*numprocs2);
487 request_send = (MPI_Request *)malloc(sizeof(MPI_Request)*numprocs2);
488 request_recv = (MPI_Request *)malloc(sizeof(MPI_Request)*numprocs2);
489
490 is1 = (int*)malloc(sizeof(int)*numprocs2);
491 ie1 = (int*)malloc(sizeof(int)*numprocs2);
492
493 is2 = (int*)malloc(sizeof(int)*numprocs2);
494 ie2 = (int*)malloc(sizeof(int)*numprocs2);
495
496 /* make is1 and ie1 */
497
498 if ( numprocs2<=n ){
499
500 av_num = (double)n/(double)numprocs2;
501
502 for (ID=0; ID<numprocs2; ID++){
503 is1[ID] = (int)(av_num*(double)ID) + 1;
504 ie1[ID] = (int)(av_num*(double)(ID+1));
505 }
506
507 is1[0] = 1;
508 ie1[numprocs2-1] = n;
509
510 }
511
512 else{
513
514 for (ID=0; ID<n; ID++){
515 is1[ID] = ID + 1;
516 ie1[ID] = ID + 1;
517 }
518 for (ID=n; ID<numprocs2; ID++){
519 is1[ID] = 1;
520 ie1[ID] = -2;
521 }
522 }
523
524 /* make is2 and ie2 */
525
526 if ( numprocs2<=MaxN ){
527
528 av_num = (double)MaxN/(double)numprocs2;
529
530 for (ID=0; ID<numprocs2; ID++){
531 is2[ID] = (int)(av_num*(double)ID) + 1;
532 ie2[ID] = (int)(av_num*(double)(ID+1));
533 }
534
535 is2[0] = 1;
536 ie2[numprocs2-1] = MaxN;
537 }
538
539 else{
540 for (ID=0; ID<MaxN; ID++){
541 is2[ID] = ID + 1;
542 ie2[ID] = ID + 1;
543 }
544 for (ID=MaxN; ID<numprocs2; ID++){
545 is2[ID] = 1;
546 ie2[ID] = -2;
547 }
548 }
549
550 /**********************************************
551 for MPI communication of CDM1 and EDM1
552 **********************************************/
553
554 {
555 MPI_Group new_group,old_group;
556 int *new_ranks;
557
558 new_ranks = (int*)malloc(sizeof(int)*T_knum);
559
560 /* ID: zeroth world */
561
562 for (ID=Comm_World_StartID1[myworld1]; ID<(numprocs0+Comm_World_StartID1[myworld1]); ID++){
563
564 /* ID0: zeroth world */
565
566 ID0 = ID % numprocs0;
567
568 /* ID1: first world */
569
570 ID1 = (ID-Comm_World_StartID1[myworld1]) % numprocs1;
571
572 for (i=0; i<T_knum; i++){
573 if (Comm_World_StartID2[i]<=ID1){
574 ks = i;
575 }
576 }
577
578 new_ranks[0] = ID1;
579 if (myid1==ID1) MPI_CDM1_flag[ID0] = 1;
580
581 for (i=(ks+1); i<(T_knum+ks); i++){
582 i0 = i % T_knum;
583 /* id in the first world */
584 ID1 = Comm_World_StartID2[i0] + (ID0 % NPROCS_WD2[i0]);
585 new_ranks[i-ks] = ID1;
586 if (myid1==ID1) MPI_CDM1_flag[ID0] = 1;
587 }
588
589 MPI_Comm_group(MPI_CommWD1[myworld1], &old_group);
590
591 /* define a new group */
592 MPI_Group_incl(old_group,T_knum,new_ranks,&new_group);
593 MPI_Comm_create(MPI_CommWD1[myworld1], new_group, &MPI_CommWD_CDM1[ID0]);
594 MPI_Group_free(&new_group);
595 }
596
597 free(new_ranks); /* never forget cleaning! */
598 }
599
600 } /* if (all_knum==1) */
601
602 /****************************************************
603 communicate T_k_ID
604 ****************************************************/
605
606 if (numprocs0==1 && SpinP_switch==1){
607 for (k=0; k<T_knum; k++){
608 T_k_ID[1][k] = T_k_ID[0][k];
609 }
610 }
611 else{
612 for (spin=0; spin<=SpinP_switch; spin++){
613 ID = Comm_World_StartID1[spin];
614 MPI_Bcast(&T_k_ID[spin][0], T_knum, MPI_INT, ID, mpi_comm_level1);
615 }
616 }
617
618 /****************************************************
619 store in each processor all the matrix elements
620 for overlap and Hamiltonian matrices
621 ****************************************************/
622
623 dtime(&Stime);
624
625 /* spin=myworld1 */
626
627 spin = myworld1;
628
629 /* set S1 */
630
631 if (SCF_iter==1 || rediagonalize_flag_overlap_matrix==1 || all_knum!=1){
632 size_H1 = Get_OneD_HS_Col(1, CntOLP, S1, MP, order_GA, My_NZeros, SP_NZeros, SP_Atoms);
633 }
634
635 diagonalize1:
636
637 /* set H1 */
638
639 if (SpinP_switch==0){
640 size_H1 = Get_OneD_HS_Col(1, nh[0], H1, MP, order_GA, My_NZeros, SP_NZeros, SP_Atoms);
641 }
642 else if (1<numprocs0){
643 size_H1 = Get_OneD_HS_Col(1, nh[0], H1, MP, order_GA, My_NZeros, SP_NZeros, SP_Atoms);
644 size_H1 = Get_OneD_HS_Col(1, nh[1], CDM1, MP, order_GA, My_NZeros, SP_NZeros, SP_Atoms);
645
646 if (myworld1){
647 for (i=0; i<size_H1; i++){
648 H1[i] = CDM1[i];
649 }
650 }
651 }
652 else{
653 size_H1 = Get_OneD_HS_Col(1, nh[spin], H1, MP, order_GA, My_NZeros, SP_NZeros, SP_Atoms);
654 }
655
656 dtime(&Etime);
657 time1 += Etime - Stime;
658
659 /****************************************************
660 start kloop
661 ****************************************************/
662
663 dtime(&SiloopTime);
664
665 for (kloop0=0; kloop0<num_kloop0; kloop0++){
666
667 kloop = S_knum + kloop0;
668
669 k1 = T_KGrids1[kloop];
670 k2 = T_KGrids2[kloop];
671 k3 = T_KGrids3[kloop];
672
673 /* make S and H */
674
675 if (SCF_iter==1 || rediagonalize_flag_overlap_matrix==1 || all_knum!=1){
676
677 for (i1=0; i1<n*n; i1++) BLAS_S[i1] = Complex(0.0,0.0);
678
679 for (i1=1; i1<=n; i1++){
680 for (j1=1; j1<=n; j1++){
681 S[i1][j1] = Complex(0.0,0.0);
682 }
683 }
684 }
685
686 for (i1=1; i1<=n; i1++){
687 for (j1=1; j1<=n; j1++){
688 H[i1][j1] = Complex(0.0,0.0);
689 }
690 }
691
692 k = 0;
693 for (AN=1; AN<=atomnum; AN++){
694 GA_AN = order_GA[AN];
695 wanA = WhatSpecies[GA_AN];
696 tnoA = Spe_Total_CNO[wanA];
697 Anum = MP[GA_AN];
698
699 for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
700 GB_AN = natn[GA_AN][LB_AN];
701 Rn = ncn[GA_AN][LB_AN];
702 wanB = WhatSpecies[GB_AN];
703 tnoB = Spe_Total_CNO[wanB];
704 Bnum = MP[GB_AN];
705
706 l1 = atv_ijk[Rn][1];
707 l2 = atv_ijk[Rn][2];
708 l3 = atv_ijk[Rn][3];
709 kRn = k1*(double)l1 + k2*(double)l2 + k3*(double)l3;
710
711 si = sin(2.0*PI*kRn);
712 co = cos(2.0*PI*kRn);
713
714 for (i=0; i<tnoA; i++){
715
716 for (j=0; j<tnoB; j++){
717
718 H[Anum+i][Bnum+j].r += H1[k]*co;
719 H[Anum+i][Bnum+j].i += H1[k]*si;
720
721 k++;
722
723 }
724
725 if (SCF_iter==1 || rediagonalize_flag_overlap_matrix==1 || all_knum!=1){
726
727 k -= tnoB;
728
729 for (j=0; j<tnoB; j++){
730
731 S[Anum+i][Bnum+j].r += S1[k]*co;
732 S[Anum+i][Bnum+j].i += S1[k]*si;
733
734 k++;
735 }
736 }
737
738 }
739 }
740 }
741
742 /*---------- added by TOYODA 15/FEB/2010 */
743 if (5==XC_switch) {
744
745 #if 0
746 if (1<SCF_iter) { EXX_Debug_Check_DM(exx, exx_CDM, CDM); }
747 #endif
748
749 EXX_Fock_Band(H, exx, exx_CDM, MP, k1, k2, k3, spin);
750 }
751 /*---------- until here */
752
753 /* for blas */
754
755 for (i1=1; i1<=n; i1++){
756 for (j1=1; j1<=n; j1++){
757 BLAS_H[(j1-1)*n+i1-1] = H[i1][j1];
758 }
759 }
760
761 /* diagonalize S */
762
763 dtime(&Stime);
764
765 if (parallel_mode==0){
766 EigenBand_lapack(S,ko,n,n,1);
767 }
768 else if (SCF_iter==1 || rediagonalize_flag_overlap_matrix==1 || all_knum!=1){
769 Eigen_PHH(MPI_CommWD2[myworld2],S,ko,n,n,1);
770 }
771
772 dtime(&Etime);
773 time2 += Etime - Stime;
774
775 if (SCF_iter==1 || rediagonalize_flag_overlap_matrix==1 || all_knum!=1){
776
777 if (3<=level_stdout){
778 printf(" myid0=%2d spin=%2d kloop %2d k1 k2 k3 %10.6f %10.6f %10.6f\n",
779 myid0,spin,kloop,T_KGrids1[kloop],T_KGrids2[kloop],T_KGrids3[kloop]);
780 for (i1=1; i1<=n; i1++){
781 printf(" Eigenvalues of OLP %2d %15.12f\n",i1,ko[i1]);
782 }
783 }
784
785 /* minus eigenvalues to 1.0e-14 */
786
787 for (l=1; l<=n; l++){
788 if (ko[l]<0.0) ko[l] = 1.0e-14;
789 koS[l] = ko[l];
790 }
791
792 /* calculate S*1/sqrt(ko) */
793
794 for (l=1; l<=n; l++) ko[l] = 1.0/sqrt(ko[l]);
795
796 /* S * 1.0/sqrt(ko[l]) */
797
798 #pragma omp parallel shared(BLAS_S,ko,S,n) private(OMPID,Nthrds,Nprocs,i1,j1)
799 {
800
801 /* get info. on OpenMP */
802
803 OMPID = omp_get_thread_num();
804 Nthrds = omp_get_num_threads();
805 Nprocs = omp_get_num_procs();
806
807 for (i1=1+OMPID; i1<=n; i1+=Nthrds){
808 for (j1=1; j1<=n; j1++){
809
810 S[i1][j1].r = S[i1][j1].r*ko[j1];
811 S[i1][j1].i = S[i1][j1].i*ko[j1];
812
813 BLAS_S[(j1-1)*n+i1-1] = S[i1][j1];
814 }
815 }
816 } /* #pragma omp parallel */
817 }
818
819 /****************************************************
820 1.0/sqrt(ko[l]) * U^t * H * U * 1.0/sqrt(ko[l])
821 ****************************************************/
822
823 dtime(&Stime);
824
825 /* first transposition of S */
826
827 /*
828 for (i1=1; i1<=n; i1++){
829 for (j1=i1+1; j1<=n; j1++){
830 Ctmp1 = S[i1][j1];
831 Ctmp2 = S[j1][i1];
832 S[i1][j1] = Ctmp2;
833 S[j1][i1] = Ctmp1;
834 }
835 }
836 */
837
838 /**********************************
839 for parallel in the second world
840 **********************************/
841
842 if (all_knum==1){
843
844 /* H * U * 1.0/sqrt(ko[l]) */
845 /* C is distributed by row in each processor */
846
847 /*
848 for (j1=is1[myid2]; j1<=ie1[myid2]; j1++){
849 for (i1=1; i1<=n; i1++){
850
851 sum = 0.0;
852 sumi = 0.0;
853
854 for (l=1; l<=n; l++){
855 sum += H[i1][l].r*S[j1][l].r - H[i1][l].i*S[j1][l].i;
856 sumi += H[i1][l].r*S[j1][l].i + H[i1][l].i*S[j1][l].r;
857 }
858
859 C[j1][i1].r = sum;
860 C[j1][i1].i = sumi;
861
862 }
863 }
864 */
865
866 /* note for BLAS, A[M*K] * B[K*N] = C[M*N] */
867
868
869 /*
870 printf("ABC3 myid0=%2d myid1=%2d myid2=%2d numprocs2=%2d\n",myid0,myid1,myid2,numprocs2);
871 MPI_Finalize();
872 exit(0);
873 */
874
875 #pragma omp parallel shared(myid2,ie1,is1,BLAS_S,BLAS_H,BLAS_C,n) private(OMPID,Nthrds,Nprocs,Ctmp1,Ctmp2,BM,BN,BK,ns,ne)
876 {
877
878 /* get info. on OpenMP */
879
880 OMPID = omp_get_thread_num();
881 Nthrds = omp_get_num_threads();
882 Nprocs = omp_get_num_procs();
883
884 ns = is1[myid2] + OMPID*(ie1[myid2]-is1[myid2]+1)/Nthrds;
885 ne = is1[myid2] + (OMPID+1)*(ie1[myid2]-is1[myid2]+1)/Nthrds - 1;
886
887 BM = n;
888 BN = ne - ns + 1;
889 BK = n;
890
891 Ctmp1.r = 1.0;
892 Ctmp1.i = 0.0;
893 Ctmp2.r = 0.0;
894 Ctmp2.i = 0.0;
895
896 if (0<BN){
897
898 F77_NAME(zgemm,ZGEMM)("N","N", &BM,&BN,&BK,
899 &Ctmp1,
900 BLAS_H, &BM,
901 &BLAS_S[(ns-1)*n], &BK,
902 &Ctmp2,
903 &BLAS_C[(ns-1)*n], &BM);
904 }
905
906 } /* #pragma omp parallel */
907
908 /*
909 BM = n;
910 BN = ie1[myid2] - is1[myid2] + 1;
911 BK = n;
912
913 Ctmp1.r = 1.0;
914 Ctmp1.i = 0.0;
915 Ctmp2.r = 0.0;
916 Ctmp2.i = 0.0;
917
918 F77_NAME(zgemm,ZGEMM)("N","N", &BM,&BN,&BK,
919 &Ctmp1,
920 BLAS_H, &BM,
921 &BLAS_S[(is1[myid2]-1)*n], &BK,
922 &Ctmp2,
923 &BLAS_C[(is1[myid2]-1)*n], &BM);
924 */
925
926
927 /* 1.0/sqrt(ko[l]) * U^+ H * U * 1.0/sqrt(ko[l]) */
928 /* H is distributed by row in each processor */
929
930 /*
931 for (j1=is1[myid2]; j1<=ie1[myid2]; j1++){
932 for (i1=1; i1<=n; i1++){
933
934 sum = 0.0;
935 sumi = 0.0;
936
937 for (l=1; l<=n; l++){
938 sum += S[i1][l].r*C[j1][l].r + S[i1][l].i*C[j1][l].i;
939 sumi += S[i1][l].r*C[j1][l].i - S[i1][l].i*C[j1][l].r;
940 }
941
942 H[j1][i1].r = sum;
943 H[j1][i1].i = sumi;
944
945 }
946 }
947 */
948
949 /* note for BLAS, A[M*K] * B[K*N] = C[M*N] */
950
951 #pragma omp parallel shared(H,myid2,ie1,is1,BLAS_S,BLAS_H,BLAS_C,n) private(OMPID,Nthrds,Nprocs,Ctmp1,Ctmp2,BM,BN,BK,ns,ne,i1,j1)
952 {
953
954 /* get info. on OpenMP */
955
956 OMPID = omp_get_thread_num();
957 Nthrds = omp_get_num_threads();
958 Nprocs = omp_get_num_procs();
959
960 ns = is1[myid2] + OMPID*(ie1[myid2]-is1[myid2]+1)/Nthrds;
961 ne = is1[myid2] + (OMPID+1)*(ie1[myid2]-is1[myid2]+1)/Nthrds - 1;
962
963 BM = n;
964 BN = ne - ns + 1;
965 BK = n;
966
967 Ctmp1.r = 1.0;
968 Ctmp1.i = 0.0;
969 Ctmp2.r = 0.0;
970 Ctmp2.i = 0.0;
971
972 if (0<BN){
973 F77_NAME(zgemm,ZGEMM)("C","N", &BM,&BN,&BK,
974 &Ctmp1,
975 BLAS_S, &BM,
976 &BLAS_C[(ns-1)*n], &BK,
977 &Ctmp2,
978 &BLAS_H[(ns-1)*n], &BM);
979 }
980
981 for (j1=ns; j1<=ne; j1++){
982 for (i1=1; i1<=n; i1++){
983 H[j1][i1] = BLAS_H[(j1-1)*n+i1-1];
984 }
985 }
986
987 } /* #pragma omp parallel */
988
989 /* broadcast H */
990
991 BroadCast_ComplexMatrix(MPI_CommWD2[myworld2],H,n,is1,ie1,myid2,numprocs2,
992 stat_send,request_send,request_recv);
993 }
994
995 else{
996
997 /* H * U * 1.0/sqrt(ko[l]) */
998
999 /*
1000 for (j1=1; j1<=n; j1++){
1001 for (i1=1; i1<=n; i1++){
1002
1003 sum = 0.0;
1004 sumi = 0.0;
1005
1006 for (l=1; l<=n; l++){
1007 sum += H[i1][l].r*S[j1][l].r - H[i1][l].i*S[j1][l].i;
1008 sumi += H[i1][l].r*S[j1][l].i + H[i1][l].i*S[j1][l].r;
1009 }
1010
1011 C[j1][i1].r = sum;
1012 C[j1][i1].i = sumi;
1013
1014 }
1015 }
1016 */
1017
1018 /* note for BLAS, A[M*K] * B[K*N] = C[M*N] */
1019
1020 #pragma omp parallel shared(BLAS_S,BLAS_H,BLAS_C,n) private(OMPID,Nthrds,Nprocs,Ctmp1,Ctmp2,BM,BN,BK)
1021 {
1022
1023 /* get info. on OpenMP */
1024
1025 OMPID = omp_get_thread_num();
1026 Nthrds = omp_get_num_threads();
1027 Nprocs = omp_get_num_procs();
1028
1029 BM = n;
1030 BN = (OMPID+1)*n/Nthrds - (OMPID*n/Nthrds+1) + 1;
1031 BK = n;
1032
1033 Ctmp1.r = 1.0;
1034 Ctmp1.i = 0.0;
1035 Ctmp2.r = 0.0;
1036 Ctmp2.i = 0.0;
1037
1038 if (0<BN){
1039 F77_NAME(zgemm,ZGEMM)("N","N", &BM,&BN,&BK,
1040 &Ctmp1,
1041 BLAS_H, &BM,
1042 &BLAS_S[(OMPID*n/Nthrds)*n], &BK,
1043 &Ctmp2,
1044 &BLAS_C[(OMPID*n/Nthrds)*n], &BM);
1045 }
1046
1047 } /* #pragma omp parallel */
1048
1049 /* 1.0/sqrt(ko[l]) * U^+ H * U * 1.0/sqrt(ko[l]) */
1050
1051 /*
1052 for (j1=1; j1<=n; j1++){
1053 for (i1=1; i1<=n; i1++){
1054
1055 sum = 0.0;
1056 sumi = 0.0;
1057
1058 for (l=1; l<=n; l++){
1059 sum += S[i1][l].r*C[j1][l].r + S[i1][l].i*C[j1][l].i;
1060 sumi += S[i1][l].r*C[j1][l].i - S[i1][l].i*C[j1][l].r;
1061 }
1062
1063 H[j1][i1].r = sum;
1064 H[j1][i1].i = sumi;
1065 }
1066 }
1067 */
1068
1069 /* note for BLAS, A[M*K] * B[K*N] = C[M*N] */
1070
1071 #pragma omp parallel shared(H,BLAS_S,BLAS_H,BLAS_C,n) private(OMPID,Nthrds,Nprocs,Ctmp1,Ctmp2,BM,BN,BK,i1,j1)
1072 {
1073
1074 /* get info. on OpenMP */
1075
1076 OMPID = omp_get_thread_num();
1077 Nthrds = omp_get_num_threads();
1078 Nprocs = omp_get_num_procs();
1079
1080 BM = n;
1081 BN = (OMPID+1)*n/Nthrds - (OMPID*n/Nthrds+1) + 1;
1082 BK = n;
1083
1084 Ctmp1.r = 1.0;
1085 Ctmp1.i = 0.0;
1086 Ctmp2.r = 0.0;
1087 Ctmp2.i = 0.0;
1088
1089 if (0<BN){
1090 F77_NAME(zgemm,ZGEMM)("C","N", &BM,&BN,&BK,
1091 &Ctmp1,
1092 BLAS_S, &BM,
1093 &BLAS_C[(OMPID*n/Nthrds)*n], &BK,
1094 &Ctmp2,
1095 &BLAS_H[(OMPID*n/Nthrds)*n], &BM);
1096 }
1097
1098 for (j1=(OMPID*n/Nthrds+1); j1<=(OMPID+1)*n/Nthrds; j1++){
1099 for (i1=1; i1<=n; i1++){
1100 H[j1][i1] = BLAS_H[(j1-1)*n+i1-1];
1101 }
1102 }
1103
1104 } /* #pragma omp parallel */
1105
1106 } /* else */
1107
1108 /* H to C (transposition) */
1109
1110 for (i1=1; i1<=n; i1++){
1111 for (j1=1; j1<=n; j1++){
1112 C[j1][i1] = H[i1][j1];
1113 }
1114 }
1115
1116 /* penalty for ill-conditioning states */
1117
1118 EV_cut0 = Threshold_OLP_Eigen;
1119
1120 for (i1=1; i1<=n; i1++){
1121
1122 if (koS[i1]<EV_cut0){
1123 C[i1][i1].r += pow((koS[i1]/EV_cut0),-2.0) - 1.0;
1124 }
1125
1126 /* cutoff the interaction between the ill-conditioned state */
1127
1128 if (1.0e+3<C[i1][i1].r){
1129 for (j1=1; j1<=n; j1++){
1130 C[i1][j1] = Complex(0.0,0.0);
1131 C[j1][i1] = Complex(0.0,0.0);
1132 }
1133 C[i1][i1].r = 1.0e+4;
1134 }
1135 }
1136
1137 dtime(&Etime);
1138 time3 += Etime - Stime;
1139
1140 /* diagonalize H' */
1141
1142 dtime(&Stime);
1143
1144 if (parallel_mode==0){
1145 EigenBand_lapack(C,ko,n,MaxN,all_knum);
1146 }
1147 else{
1148 /* The output C matrix is distributed by column. */
1149 Eigen_PHH(MPI_CommWD2[myworld2],C,ko,n,MaxN,0);
1150 }
1151
1152 dtime(&Etime);
1153 time4 += Etime - Stime;
1154
1155 for (l=1; l<=MaxN; l++){
1156 EIGEN[spin][kloop][l] = ko[l];
1157 }
1158
1159 if (3<=level_stdout && 0<=kloop){
1160 printf(" myid0=%2d spin=%2d kloop %i, k1 k2 k3 %10.6f %10.6f %10.6f\n",
1161 myid0,spin,kloop,T_KGrids1[kloop],T_KGrids2[kloop],T_KGrids3[kloop]);
1162 for (i1=1; i1<=n; i1++){
1163 if (SpinP_switch==0)
1164 printf(" Eigenvalues of Kohn-Sham %2d %15.12f %15.12f\n",
1165 i1,EIGEN[0][kloop][i1],EIGEN[0][kloop][i1]);
1166 else
1167 printf(" Eigenvalues of Kohn-Sham %2d %15.12f %15.12f\n",
1168 i1,EIGEN[0][kloop][i1],EIGEN[1][kloop][i1]);
1169 }
1170 }
1171
1172 /* calculation of wave functions */
1173
1174 dtime(&Stime);
1175
1176 if (all_knum==1){
1177
1178 /* The H matrix is distributed by row */
1179
1180 /*
1181 for (i1=1; i1<=n; i1++){
1182 for (j1=is2[myid2]; j1<=ie2[myid2]; j1++){
1183 H[j1][i1] = C[i1][j1];
1184 }
1185 }
1186 */
1187
1188 for (i1=1; i1<=n; i1++){
1189 for (j1=is2[myid2]; j1<=ie2[myid2]; j1++){
1190 BLAS_H[(j1-1)*n+i1-1] = C[i1][j1];
1191 }
1192 }
1193
1194 /* the second transposition of S */
1195
1196 /*
1197 for (i1=1; i1<=n; i1++){
1198 for (j1=i1+1; j1<=n; j1++){
1199 Ctmp1 = S[i1][j1];
1200 Ctmp2 = S[j1][i1];
1201 S[i1][j1] = Ctmp2;
1202 S[j1][i1] = Ctmp1;
1203 }
1204 }
1205 */
1206
1207 /* C is distributed by row in each processor */
1208
1209 /*
1210 for (j1=is2[myid2]; j1<=ie2[myid2]; j1++){
1211 for (i1=1; i1<=n; i1++){
1212
1213 sum = 0.0;
1214 sumi = 0.0;
1215 for (l=1; l<=n; l++){
1216 sum += S[i1][l].r*H[j1][l].r - S[i1][l].i*H[j1][l].i;
1217 sumi += S[i1][l].r*H[j1][l].i + S[i1][l].i*H[j1][l].r;
1218 }
1219
1220 C[j1][i1].r = sum;
1221 C[j1][i1].i = sumi;
1222 }
1223 }
1224 */
1225
1226 /* note for BLAS, A[M*K] * B[K*N] = C[M*N] */
1227
1228 #pragma omp parallel shared(C,myid2,ie2,is2,BLAS_S,BLAS_H,BLAS_C,n) private(OMPID,Nthrds,Nprocs,Ctmp1,Ctmp2,BM,BN,BK,ns,ne,i1,j1)
1229 {
1230
1231 /* get info. on OpenMP */
1232
1233 OMPID = omp_get_thread_num();
1234 Nthrds = omp_get_num_threads();
1235 Nprocs = omp_get_num_procs();
1236
1237 ns = is2[myid2] + OMPID*(ie2[myid2]-is2[myid2]+1)/Nthrds;
1238 ne = is2[myid2] + (OMPID+1)*(ie2[myid2]-is2[myid2]+1)/Nthrds - 1;
1239
1240 BM = n;
1241 BN = ne - ns + 1;
1242 BK = n;
1243
1244 Ctmp1.r = 1.0;
1245 Ctmp1.i = 0.0;
1246 Ctmp2.r = 0.0;
1247 Ctmp2.i = 0.0;
1248
1249 if (0<BN){
1250 F77_NAME(zgemm,ZGEMM)("N","N", &BM,&BN,&BK,
1251 &Ctmp1,
1252 BLAS_S, &BM,
1253 &BLAS_H[(ns-1)*n], &BK,
1254 &Ctmp2,
1255 &BLAS_C[(ns-1)*n], &BM);
1256 }
1257
1258 for (j1=ns; j1<=ne; j1++){
1259 for (i1=1; i1<=n; i1++){
1260 C[j1][i1] = BLAS_C[(j1-1)*n+i1-1];
1261 }
1262 }
1263
1264 } /* #pragma omp parallel */
1265
1266 /* broadcast C:
1267 C is distributed by row in each processor
1268 */
1269
1270 dtime(&Stime0);
1271
1272 BroadCast_ComplexMatrix(MPI_CommWD2[myworld2],C,n,is2,ie2,myid2,numprocs2,
1273 stat_send,request_send,request_recv);
1274
1275 /* C to H (transposition)
1276 H consists of column vectors
1277 */
1278
1279 for (i1=1; i1<=MaxN; i1++){
1280 for (j1=1; j1<=n; j1++){
1281 H[j1][i1] = C[i1][j1];
1282 }
1283 }
1284
1285 dtime(&Etime0);
1286 time51 += Etime0 - Stime0;
1287
1288 } /* if (all_knum==1) */
1289
1290 dtime(&Etime);
1291 time5 += Etime - Stime;
1292
1293 } /* kloop0 */
1294
1295 dtime(&EiloopTime);
1296
1297 if (SpinP_switch==1 && numprocs0==1 && spin==0){
1298 spin++;
1299 goto diagonalize1;
1300 }
1301
1302 /****************************************************
1303 MPI:
1304
1305 EIGEN
1306 ****************************************************/
1307
1308 MPI_Barrier(mpi_comm_level1);
1309 dtime(&Stime);
1310
1311 for (spin=0; spin<=SpinP_switch; spin++){
1312 for (kloop=0; kloop<T_knum; kloop++){
1313
1314 /* get ID in the zeroth world */
1315 ID = Comm_World_StartID1[spin] + T_k_ID[spin][kloop];
1316 MPI_Bcast(&EIGEN[spin][kloop][0], MaxN+1, MPI_DOUBLE, ID, mpi_comm_level1);
1317 }
1318 }
1319
1320 dtime(&Etime);
1321 time6 += Etime - Stime;
1322
1323 /**************************************
1324 find chemical potential
1325 **************************************/
1326
1327 dtime(&Stime);
1328
1329 po = 0;
1330 loop_num = 0;
1331 ChemP_MAX = 10.0;
1332 ChemP_MIN =-10.0;
1333
1334 do {
1335
1336 loop_num++;
1337
1338 ChemP = 0.50*(ChemP_MAX + ChemP_MIN);
1339 Num_State = 0.0;
1340
1341 for (kloop=0; kloop<T_knum; kloop++){
1342 for (spin=0; spin<=SpinP_switch; spin++){
1343 for (l=1; l<=MaxN; l++){
1344
1345 x = (EIGEN[spin][kloop][l] - ChemP)*Beta;
1346
1347 if (x<=-x_cut) FermiF = 1.0;
1348 else if (x>=x_cut) FermiF = 0.0;
1349 else FermiF = 1.0/(1.0 + exp(x));
1350
1351 Num_State += FermiF*(double)T_k_op[kloop];
1352
1353 }
1354 }
1355 }
1356
1357 if (SpinP_switch==0)
1358 Num_State = 2.0*Num_State/sum_weights;
1359 else
1360 Num_State = Num_State/sum_weights;
1361
1362 Dnum = TZ - Num_State - system_charge;
1363
1364 if (0.0<=Dnum) ChemP_MIN = ChemP;
1365 else ChemP_MAX = ChemP;
1366 if (fabs(Dnum)<10e-14) po = 1;
1367 }
1368 while (po==0 && loop_num<2000);
1369
1370 /****************************************************
1371 band energy in a finite temperature
1372 ****************************************************/
1373
1374 Eele0[0] = 0.0;
1375 Eele0[1] = 0.0;
1376
1377 for (kloop=0; kloop<T_knum; kloop++){
1378 for (spin=0; spin<=SpinP_switch; spin++){
1379 for (l=1; l<=MaxN; l++){
1380
1381 x = (EIGEN[spin][kloop][l] - ChemP)*Beta;
1382
1383 if (x<=-x_cut) FermiF = 1.0;
1384 else if (x_cut<=x) FermiF = 0.0;
1385 else FermiF = 1.0/(1.0 + exp(x));
1386
1387 Eele0[spin] += FermiF*EIGEN[spin][kloop][l]*(double)T_k_op[kloop];
1388
1389 }
1390 }
1391 }
1392
1393 if (SpinP_switch==0){
1394 Eele0[0] = Eele0[0]/sum_weights;
1395 Eele0[1] = Eele0[0];
1396 }
1397 else {
1398 Eele0[0] = Eele0[0]/sum_weights;
1399 Eele0[1] = Eele0[1]/sum_weights;
1400 }
1401
1402 Uele = Eele0[0] + Eele0[1];
1403
1404 if (2<=level_stdout){
1405 printf("myid0=%2d ChemP=%lf, Eele0[0]=%lf, Eele0[1]=%lf\n",myid0,ChemP,Eele0[0],Eele0[1]);
1406 }
1407
1408 dtime(&Etime);
1409 time7 += Etime - Stime;
1410
1411 /*---------- added by TOYODA 16/FEB/2010 */
1412 if (g_exx_switch) {
1413 EXX_Energy_Band(Uexx, exx, exx_CDM, MP);
1414 if (0==SpinP_switch){
1415 Uexx[1] = Uexx[0];
1416 EXX_Log_Print("EXX: U= %20.12f\n", Uexx[0]);
1417 } else {
1418 EXX_Log_Print("EXX: U= %20.12f %20.2f\n", Uexx[0], Uexx[1]);
1419 }
1420 }
1421 /*---------- until here */
1422
1423 /****************************************************
1424 if all_knum==1, calculate CDM and EDM
1425 ****************************************************/
1426
1427 dtime(&Stime);
1428
1429 if (all_knum==1){
1430
1431 dtime(&Stime0);
1432
1433 /* initialize CDM1 and EDM1 */
1434
1435 for (i=0; i<size_H1; i++){
1436 CDM1[i] = 0.0;
1437 EDM1[i] = 0.0;
1438 }
1439
1440 /* calculate CDM and EDM */
1441
1442 spin = myworld1;
1443 kloop = S_knum;
1444
1445 k1 = T_KGrids1[kloop];
1446 k2 = T_KGrids2[kloop];
1447 k3 = T_KGrids3[kloop];
1448
1449 /* weight of k-point */
1450
1451 kw = (double)T_k_op[kloop];
1452
1453 /* store Fermi function */
1454
1455 po = 0;
1456 l = 1;
1457 do{
1458
1459 eig = EIGEN[spin][kloop][l];
1460 x = (eig - ChemP)*Beta;
1461
1462 if (x<-x_cut) FermiF = 1.0;
1463 else if (x>x_cut) FermiF = 0.0;
1464 else FermiF = 1.0/(1.0 + exp(x));
1465
1466 VecFkw[l] = FermiF*kw;
1467 VecFkwE[l] = VecFkw[l]*eig;
1468
1469 if ( FermiF<=FermiEps && po==0 ) {
1470 lmax = l;
1471 po = 1;
1472 }
1473
1474 l++;
1475
1476 } while(po==0 && l<=MaxN);
1477
1478 if (po==0) lmax = MaxN;
1479
1480 dtime(&Etime0);
1481 time81 += Etime0 - Stime0;
1482
1483 dtime(&Stime0);
1484
1485 /* predetermination of k
1486 H is used as temporal array
1487 */
1488
1489 k = 0;
1490 for (AN=1; AN<=atomnum; AN++){
1491
1492 GA_AN = order_GA[AN];
1493 wanA = WhatSpecies[GA_AN];
1494 tnoA = Spe_Total_CNO[wanA];
1495
1496 H[0][AN].r = (double)k;
1497
1498 for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
1499 GB_AN = natn[GA_AN][LB_AN];
1500 Rn = ncn[GA_AN][LB_AN];
1501 wanB = WhatSpecies[GB_AN];
1502 tnoB = Spe_Total_CNO[wanB];
1503 k += tnoA*tnoB;
1504 }
1505 }
1506
1507 /* DM and EDM */
1508
1509 #pragma omp parallel shared(numprocs0,My_NZeros,SP_NZeros,MPI_CDM1_flag,EDM1,CDM1,VecFkwE,VecFkw,H,lmax,k1,k2,k3,atv_ijk,ncn,natn,FNAN,MP,Spe_Total_CNO,WhatSpecies,order_GA,atomnum) private(OMPID,Nthrds,Nprocs,AN,GA_AN,wanA,tnoA,Anum,k,LB_AN,GB_AN,Rn,wanB,tnoB,Bnum,l1,l2,l3,kRn,si,co,i,ia,j,jb,d1,d2,l,tmp,po,ID)
1510 {
1511
1512 /* get info. on OpenMP */
1513
1514 OMPID = omp_get_thread_num();
1515 Nthrds = omp_get_num_threads();
1516 Nprocs = omp_get_num_procs();
1517
1518 for (AN=1+OMPID; AN<=atomnum; AN+=Nthrds){
1519
1520 GA_AN = order_GA[AN];
1521 wanA = WhatSpecies[GA_AN];
1522 tnoA = Spe_Total_CNO[wanA];
1523 Anum = MP[GA_AN];
1524
1525 k = (int)H[0][AN].r;
1526
1527 for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
1528 GB_AN = natn[GA_AN][LB_AN];
1529 Rn = ncn[GA_AN][LB_AN];
1530 wanB = WhatSpecies[GB_AN];
1531 tnoB = Spe_Total_CNO[wanB];
1532 Bnum = MP[GB_AN];
1533
1534 l1 = atv_ijk[Rn][1];
1535 l2 = atv_ijk[Rn][2];
1536 l3 = atv_ijk[Rn][3];
1537 kRn = k1*(double)l1 + k2*(double)l2 + k3*(double)l3;
1538
1539 si = sin(2.0*PI*kRn);
1540 co = cos(2.0*PI*kRn);
1541
1542 for (i=0; i<tnoA; i++){
1543
1544 ia = Anum + i;
1545
1546 for (j=0; j<tnoB; j++){
1547
1548 jb = Bnum + j;
1549
1550 /***************************************************************
1551 Note that the imagiary part is zero,
1552 since
1553
1554 at k
1555 A = (co + i si)(Re + i Im) = (co*Re - si*Im) + i (co*Im + si*Re)
1556 at -k
1557 B = (co - i si)(Re - i Im) = (co*Re - si*Im) - i (co*Im + si*Re)
1558 Thus, Re(A+B) = 2*(co*Re - si*Im)
1559 Im(A+B) = 0
1560 ***************************************************************/
1561
1562 po = 0;
1563 ID = 0;
1564 do {
1565 if (MPI_CDM1_flag[ID] && SP_NZeros[ID]<=k && k<(SP_NZeros[ID]+My_NZeros[ID])) po = 1;
1566 ID++;
1567 } while (po==0 && ID<numprocs0);
1568
1569 if (po==1){
1570
1571 d1 = 0.0;
1572 d2 = 0.0;
1573
1574 for (l=1; l<=lmax; l++){
1575
1576 tmp = co*(H[ia][l].r*H[jb][l].r + H[ia][l].i*H[jb][l].i)
1577 -si*(H[ia][l].r*H[jb][l].i - H[ia][l].i*H[jb][l].r);
1578
1579 d1 += VecFkw[l]*tmp;
1580 d2 += VecFkwE[l]*tmp;;
1581 }
1582
1583 CDM1[k] += d1;
1584 EDM1[k] += d2;
1585 }
1586
1587 /* increment of k */
1588
1589 k++;
1590
1591 }
1592 }
1593
1594 } /* LB_AN */
1595 } /* AN */
1596
1597 } /* #pragma omp parallel */
1598
1599 /* sum of CDM1 and EDM1 by Allreduce in MPI */
1600
1601 dtime(&Etime0);
1602 time82 += Etime0 - Stime0;
1603
1604 dtime(&Stime0);
1605
1606 for (ID=0; ID<numprocs0; ID++){
1607 if (MPI_CDM1_flag[ID]){
1608 k = SP_NZeros[ID];
1609 MPI_Reduce(&CDM1[k], &H1[k], My_NZeros[ID], MPI_DOUBLE, MPI_SUM, 0, MPI_CommWD_CDM1[ID]);
1610 MPI_Reduce(&EDM1[k], &S1[k], My_NZeros[ID], MPI_DOUBLE, MPI_SUM, 0, MPI_CommWD_CDM1[ID]);
1611 }
1612 }
1613
1614 dtime(&Etime0);
1615 time83 += Etime0 - Stime0;
1616
1617 dtime(&Stime0);
1618
1619 /* CDM and EDM */
1620
1621 k = 0;
1622 for (AN=1; AN<=atomnum; AN++){
1623 GA_AN = order_GA[AN];
1624 MA_AN = F_G2M[GA_AN];
1625
1626 wanA = WhatSpecies[GA_AN];
1627 tnoA = Spe_Total_CNO[wanA];
1628
1629 for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
1630 GB_AN = natn[GA_AN][LB_AN];
1631 wanB = WhatSpecies[GB_AN];
1632 tnoB = Spe_Total_CNO[wanB];
1633
1634 for (i=0; i<tnoA; i++){
1635 for (j=0; j<tnoB; j++){
1636
1637 if (1<=MA_AN && MA_AN<=Matomnum){
1638
1639 CDM[spin][MA_AN][LB_AN][i][j] = H1[k];
1640 EDM[spin][MA_AN][LB_AN][i][j] = S1[k];
1641 }
1642
1643 k++;
1644
1645 }
1646 }
1647 }
1648 }
1649
1650 dtime(&Etime0);
1651 time84 += Etime0 - Stime0;
1652
1653 dtime(&Stime0);
1654
1655 /* if necessary, MPI communication of CDM and EDM */
1656
1657 if (1<numprocs0 && SpinP_switch==1){
1658
1659 /* set spin */
1660
1661 if (myworld1==0){
1662 spin = 1;
1663 }
1664 else{
1665 spin = 0;
1666 }
1667
1668 /* communicate CDM1 and EDM1 */
1669
1670 for (i=0; i<=1; i++){
1671
1672 for (ID=Comm_World_StartID1[i]; ID<(numprocs0+Comm_World_StartID1[i]); ID++){
1673
1674 /* ID0: zeroth world */
1675
1676 ID0 = ID % numprocs0;
1677
1678 /* ID's for sending and receiving in the zeroth world */
1679
1680 IDS = Comm_World_StartID1[i] + (ID-Comm_World_StartID1[i]) % NPROCS_WD1[i];
1681 IDR = ID0;
1682
1683 k = SP_NZeros[IDR];
1684
1685 if (IDS!=IDR){
1686
1687 /* CDM1 */
1688
1689 if (myid0==IDS){
1690 MPI_Isend(&H1[k], My_NZeros[IDR], MPI_DOUBLE, IDR, tag, mpi_comm_level1, &request);
1691 }
1692
1693 if (myid0==IDR){
1694 MPI_Recv(&CDM1[k], My_NZeros[IDR], MPI_DOUBLE, IDS, tag, mpi_comm_level1, &stat);
1695 }
1696
1697 if (myid0==IDS){
1698 MPI_Wait(&request,&stat);
1699 }
1700
1701 /* EDM1 */
1702
1703 if (myid0==IDS){
1704 MPI_Isend(&S1[k], My_NZeros[IDR], MPI_DOUBLE, IDR, tag, mpi_comm_level1, &request);
1705 }
1706
1707 if (myid0==IDR){
1708 MPI_Recv(&EDM1[k], My_NZeros[IDR], MPI_DOUBLE, IDS, tag, mpi_comm_level1, &stat);
1709 }
1710
1711 if (myid0==IDS){
1712 MPI_Wait(&request,&stat);
1713 }
1714
1715 }
1716 }
1717 }
1718
1719 /* put CDM1 and EDM1 into CDM and EDM */
1720
1721 k = 0;
1722 for (AN=1; AN<=atomnum; AN++){
1723 GA_AN = order_GA[AN];
1724 MA_AN = F_G2M[GA_AN];
1725
1726 wanA = WhatSpecies[GA_AN];
1727 tnoA = Spe_Total_CNO[wanA];
1728
1729 for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
1730 GB_AN = natn[GA_AN][LB_AN];
1731 wanB = WhatSpecies[GB_AN];
1732 tnoB = Spe_Total_CNO[wanB];
1733
1734 for (i=0; i<tnoA; i++){
1735 for (j=0; j<tnoB; j++){
1736
1737 if (1<=MA_AN && MA_AN<=Matomnum){
1738 CDM[spin][MA_AN][LB_AN][i][j] = CDM1[k];
1739 EDM[spin][MA_AN][LB_AN][i][j] = EDM1[k];
1740 }
1741
1742 k++;
1743
1744 }
1745 }
1746 }
1747 }
1748 }
1749
1750 dtime(&Etime0);
1751 time85 += Etime0 - Stime0;
1752
1753 /*---------- added by TOYODA 16/FEB/2010 */
1754 if (g_exx_switch) {
1755 /* clear buffer */
1756 for (iep=0; iep<nep; iep++) {
1757 GA_AN = exx_ep_atom1[iep]+1;
1758 GB_AN = exx_ep_atom2[iep]+1;
1759 tnoA = Spe_Total_CNO[WhatSpecies[GA_AN]];
1760 tnoB = Spe_Total_CNO[WhatSpecies[GB_AN]];
1761 for (i=0; i<tnoA; i++){
1762 for (j=0; j<tnoB; j++){
1763 exx_CDM[0][iep][i][j].r = 0.0;
1764 exx_CDM[0][iep][i][j].i = 0.0;
1765 }
1766 }
1767 }
1768 /* calculate DM */
1769 for (iep=0; iep<nep; iep++) {
1770 GA_AN = exx_ep_atom1[iep]+1;
1771 GB_AN = exx_ep_atom2[iep]+1;
1772 iRn = exx_ep_cell[iep];
1773 tnoA = Spe_Total_CNO[WhatSpecies[GA_AN]];
1774 tnoB = Spe_Total_CNO[WhatSpecies[GB_AN]];
1775 Anum = MP[GA_AN];
1776 Bnum = MP[GB_AN];
1777
1778 /* phase */
1779 iRn_x = iRn%ncd - nshell_ep;
1780 iRn_y = (iRn/ncd)%ncd - nshell_ep;
1781 iRn_z = (iRn/ncd/ncd)%ncd - nshell_ep;
1782 kRn = k1*(double)iRn_x + k2*(double)iRn_y + k3*(double)iRn_z;
1783 si = sin(2.0*PI*kRn);
1784 co = cos(2.0*PI*kRn);
1785
1786 for (i=0; i<tnoA; i++){
1787 ia = Anum + i;
1788 for (j=0; j<tnoB; j++){
1789 jb = Bnum + j;
1790 d1 = 0.0;
1791 d2 = 0.0;
1792 for (l=1; l<=lmax; l++) {
1793 tmp = co*(H[ia][l].r*H[jb][l].r + H[ia][l].i*H[jb][l].i)
1794 -si*(H[ia][l].r*H[jb][l].i - H[ia][l].i*H[jb][l].r);
1795 d1 += VecFkw[l]*tmp;
1796 tmp = si*(H[ia][l].r*H[jb][l].r + H[ia][l].i*H[jb][l].i)
1797 +co*(H[ia][l].r*H[jb][l].i - H[ia][l].i*H[jb][l].r);
1798 d2 += VecFkw[l]*tmp;
1799 }
1800 exx_CDM[0][iep][i][j].r += d1;
1801 exx_CDM[0][iep][i][j].i += d2;
1802 }
1803 }
1804 } /* loop of iep */
1805 } /* end if */
1806 /*---------- until here added by TOYODA 16/FEB/2010 */
1807
1808 } /* if (all_knum==1) */
1809
1810 dtime(&Etime);
1811 time8 += Etime - Stime;
1812
1813 if (myid0==Host_ID && 0<level_stdout){
1814 printf("<Band_DFT> Eigen, time=%lf\n", EiloopTime-SiloopTime);fflush(stdout);
1815 }
1816
1817 dtime(&SiloopTime);
1818
1819 /****************************************************
1820 ****************************************************
1821 diagonalization for calculating density matrix
1822 ****************************************************
1823 ****************************************************/
1824
1825 if (all_knum!=1){
1826
1827 /* spin=myworld1 */
1828
1829 spin = myworld1;
1830
1831 diagonalize2:
1832
1833 /* set S1 */
1834
1835 size_H1 = Get_OneD_HS_Col(1, CntOLP, S1, MP, order_GA, My_NZeros, SP_NZeros, SP_Atoms);
1836
1837 /* set H1 */
1838
1839 if (SpinP_switch==0){
1840 size_H1 = Get_OneD_HS_Col(1, nh[0], H1, MP, order_GA, My_NZeros, SP_NZeros, SP_Atoms);
1841 }
1842 else if (1<numprocs0){
1843 size_H1 = Get_OneD_HS_Col(1, nh[0], H1, MP, order_GA, My_NZeros, SP_NZeros, SP_Atoms);
1844 size_H1 = Get_OneD_HS_Col(1, nh[1], CDM1, MP, order_GA, My_NZeros, SP_NZeros, SP_Atoms);
1845
1846 if (myworld1){
1847 for (i=0; i<size_H1; i++){
1848 H1[i] = CDM1[i];
1849 }
1850 }
1851 }
1852 else{
1853 size_H1 = Get_OneD_HS_Col(1, nh[spin], H1, MP, order_GA, My_NZeros, SP_NZeros, SP_Atoms);
1854 }
1855
1856 /* initialize CDM1 and EDM1 */
1857
1858 for (i=0; i<size_H1; i++){
1859 CDM1[i] = 0.0;
1860 EDM1[i] = 0.0;
1861 }
1862
1863 /*---------- added by TOYODA 16/FEB/2010 */
1864 if (g_exx_switch) {
1865 /* clear exx_DM */
1866 for (iep=0; iep<nep; iep++) {
1867 GA_AN = exx_ep_atom1[iep]+1;
1868 GB_AN = exx_ep_atom2[iep]+1;
1869 tnoA = Spe_Total_CNO[WhatSpecies[GA_AN]];
1870 tnoB = Spe_Total_CNO[WhatSpecies[GB_AN]];
1871 for (i=0; i<tnoA; i++){
1872 for (j=0; j<tnoB; j++){
1873 exx_CDM[spin][iep][i][j].r = 0.0;
1874 exx_CDM[spin][iep][i][j].i = 0.0;
1875 }
1876 }
1877 }
1878 }
1879 /*---------- until here added by TOYODA 16/FEB/2010 */
1880
1881 /* initialize CDM, EDM, and iDM */
1882
1883 for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
1884 GA_AN = M2G[MA_AN];
1885 wanA = WhatSpecies[GA_AN];
1886 tnoA = Spe_Total_CNO[wanA];
1887 Anum = MP[GA_AN];
1888 for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
1889 GB_AN = natn[GA_AN][LB_AN];
1890 wanB = WhatSpecies[GB_AN];
1891 tnoB = Spe_Total_CNO[wanB];
1892 Bnum = MP[GB_AN];
1893
1894 for (i=0; i<tnoA; i++){
1895 for (j=0; j<tnoB; j++){
1896 CDM[spin][MA_AN][LB_AN][i][j] = 0.0;
1897 EDM[spin][MA_AN][LB_AN][i][j] = 0.0;
1898 iDM[0][0][MA_AN][LB_AN][i][j] = 0.0;
1899 iDM[0][1][MA_AN][LB_AN][i][j] = 0.0;
1900 }
1901 }
1902 }
1903 }
1904
1905 /* for kloop */
1906
1907 for (kloop0=0; kloop0<num_kloop0; kloop0++){
1908
1909 kloop = kloop0 + S_knum;
1910
1911 k1 = T_KGrids1[kloop];
1912 k2 = T_KGrids2[kloop];
1913 k3 = T_KGrids3[kloop];
1914
1915 /* make S and H */
1916
1917 for (i1=1; i1<=n; i1++){
1918 for (j1=1; j1<=n; j1++){
1919 S[i1][j1] = Complex(0.0,0.0);
1920 H[i1][j1] = Complex(0.0,0.0);
1921 }
1922 }
1923
1924 k = 0;
1925 for (AN=1; AN<=atomnum; AN++){
1926 GA_AN = order_GA[AN];
1927 wanA = WhatSpecies[GA_AN];
1928 tnoA = Spe_Total_CNO[wanA];
1929 Anum = MP[GA_AN];
1930
1931 for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
1932 GB_AN = natn[GA_AN][LB_AN];
1933 Rn = ncn[GA_AN][LB_AN];
1934 wanB = WhatSpecies[GB_AN];
1935 tnoB = Spe_Total_CNO[wanB];
1936 Bnum = MP[GB_AN];
1937
1938 l1 = atv_ijk[Rn][1];
1939 l2 = atv_ijk[Rn][2];
1940 l3 = atv_ijk[Rn][3];
1941 kRn = k1*(double)l1 + k2*(double)l2 + k3*(double)l3;
1942
1943 si = sin(2.0*PI*kRn);
1944 co = cos(2.0*PI*kRn);
1945
1946 for (i=0; i<tnoA; i++){
1947 for (j=0; j<tnoB; j++){
1948
1949 S[Anum+i][Bnum+j].r += S1[k]*co;
1950 S[Anum+i][Bnum+j].i += S1[k]*si;
1951 H[Anum+i][Bnum+j].r += H1[k]*co;
1952 H[Anum+i][Bnum+j].i += H1[k]*si;
1953
1954 k++;
1955
1956 }
1957 }
1958 }
1959 }
1960
1961 /* for blas */
1962
1963 for (i1=1; i1<=n; i1++){
1964 for (j1=1; j1<=n; j1++){
1965 BLAS_H[(j1-1)*n+i1-1] = H[i1][j1];
1966 }
1967 }
1968
1969 /* diagonalize S */
1970
1971 dtime(&Stime);
1972
1973 if (parallel_mode==0){
1974 EigenBand_lapack(S,ko,n,n,1);
1975 }
1976 else{
1977 Eigen_PHH(MPI_CommWD2[myworld2],S,ko,n,n,1);
1978 }
1979
1980 dtime(&Etime);
1981 time9 += Etime - Stime;
1982
1983 if (3<=level_stdout){
1984 printf(" myid0=%2d kloop %2d k1 k2 k3 %10.6f %10.6f %10.6f\n",
1985 myid0,kloop,T_KGrids1[kloop],T_KGrids2[kloop],T_KGrids3[kloop]);
1986 for (i1=1; i1<=n; i1++){
1987 printf(" Eigenvalues of OLP %2d %15.12f\n",i1,ko[i1]);
1988 }
1989 }
1990
1991 /* minus eigenvalues to 1.0e-14 */
1992
1993 for (l=1; l<=n; l++){
1994 if (ko[l]<0.0) ko[l] = 1.0e-14;
1995 koS[l] = ko[l];
1996 }
1997
1998 /* calculate S*1/sqrt(ko) */
1999
2000 for (l=1; l<=n; l++) ko[l] = 1.0/sqrt(ko[l]);
2001
2002 /* S * 1.0/sqrt(ko[l]) */
2003
2004 #pragma omp parallel shared(BLAS_S,ko,S,n) private(OMPID,Nthrds,Nprocs,i1,j1)
2005 {
2006
2007 /* get info. on OpenMP */
2008
2009 OMPID = omp_get_thread_num();
2010 Nthrds = omp_get_num_threads();
2011 Nprocs = omp_get_num_procs();
2012
2013 for (i1=1+OMPID; i1<=n; i1+=Nthrds){
2014 for (j1=1; j1<=n; j1++){
2015
2016 S[i1][j1].r = S[i1][j1].r*ko[j1];
2017 S[i1][j1].i = S[i1][j1].i*ko[j1];
2018 BLAS_S[(j1-1)*n+i1-1] = S[i1][j1];
2019 }
2020 }
2021
2022 } /* #pragma omp parallel */
2023
2024 /****************************************************
2025 1/sqrt(ko) * U^t * H * U * 1/sqrt(ko)
2026 ****************************************************/
2027
2028 /* transpose S */
2029
2030 /*
2031 for (i1=1; i1<=n; i1++){
2032 for (j1=i1+1; j1<=n; j1++){
2033 Ctmp1 = S[i1][j1];
2034 Ctmp2 = S[j1][i1];
2035 S[i1][j1] = Ctmp2;
2036 S[j1][i1] = Ctmp1;
2037 }
2038 }
2039 */
2040
2041 /* H * U * 1/sqrt(ko) */
2042
2043 /*
2044 for (j1=1; j1<=n; j1++){
2045 for (i1=1; i1<=n; i1++){
2046
2047 sum = 0.0;
2048 sumi = 0.0;
2049
2050 for (l=1; l<=n; l++){
2051 sum += H[i1][l].r*S[j1][l].r - H[i1][l].i*S[j1][l].i;
2052 sumi += H[i1][l].r*S[j1][l].i + H[i1][l].i*S[j1][l].r;
2053 }
2054
2055 C[j1][i1].r = sum;
2056 C[j1][i1].i = sumi;
2057 }
2058 }
2059 */
2060
2061 /* note for BLAS, A[M*K] * B[K*N] = C[M*N] */
2062
2063 #pragma omp parallel shared(BLAS_S,BLAS_H,BLAS_C,n) private(OMPID,Nthrds,Nprocs,Ctmp1,Ctmp2,BM,BN,BK)
2064 {
2065
2066 /* get info. on OpenMP */
2067
2068 OMPID = omp_get_thread_num();
2069 Nthrds = omp_get_num_threads();
2070 Nprocs = omp_get_num_procs();
2071
2072 BM = n;
2073 BN = (OMPID+1)*n/Nthrds - (OMPID*n/Nthrds+1) + 1;
2074 BK = n;
2075
2076 Ctmp1.r = 1.0;
2077 Ctmp1.i = 0.0;
2078 Ctmp2.r = 0.0;
2079 Ctmp2.i = 0.0;
2080
2081 if (0<BN){
2082 F77_NAME(zgemm,ZGEMM)("N","N", &BM,&BN,&BK,
2083 &Ctmp1,
2084 BLAS_H, &BM,
2085 &BLAS_S[(OMPID*n/Nthrds)*n], &BK,
2086 &Ctmp2,
2087 &BLAS_C[(OMPID*n/Nthrds)*n], &BM);
2088 }
2089
2090 } /* #pragma omp parallel */
2091
2092 /* 1/sqrt(ko) * U^+ H * U * 1/sqrt(ko) */
2093
2094 /*
2095 for (i1=1; i1<=n; i1++){
2096 for (j1=1; j1<=n; j1++){
2097 sum = 0.0;
2098 sumi = 0.0;
2099 for (l=1; l<=n; l++){
2100 sum += S[i1][l].r*C[j1][l].r + S[i1][l].i*C[j1][l].i;
2101 sumi += S[i1][l].r*C[j1][l].i - S[i1][l].i*C[j1][l].r;
2102 }
2103 H[i1][j1].r = sum;
2104 H[i1][j1].i = sumi;
2105 }
2106 }
2107 */
2108
2109 /* note for BLAS, A[M*K] * B[K*N] = C[M*N] */
2110
2111 #pragma omp parallel shared(C,BLAS_S,BLAS_H,BLAS_C,n) private(OMPID,Nthrds,Nprocs,Ctmp1,Ctmp2,BM,BN,BK,i1,j1)
2112 {
2113
2114 /* get info. on OpenMP */
2115
2116 OMPID = omp_get_thread_num();
2117 Nthrds = omp_get_num_threads();
2118 Nprocs = omp_get_num_procs();
2119
2120 BM = n;
2121 BN = (OMPID+1)*n/Nthrds - (OMPID*n/Nthrds+1) + 1;
2122 BK = n;
2123
2124 Ctmp1.r = 1.0;
2125 Ctmp1.i = 0.0;
2126 Ctmp2.r = 0.0;
2127 Ctmp2.i = 0.0;
2128
2129 if (0<BN){
2130 F77_NAME(zgemm,ZGEMM)("C","N", &BM,&BN,&BK,
2131 &Ctmp1,
2132 BLAS_S, &BM,
2133 &BLAS_C[(OMPID*n/Nthrds)*n], &BK,
2134 &Ctmp2,
2135 &BLAS_H[(OMPID*n/Nthrds)*n], &BM);
2136 }
2137
2138 for (j1=(OMPID*n/Nthrds+1); j1<=(OMPID+1)*n/Nthrds; j1++){
2139 for (i1=1; i1<=n; i1++){
2140 C[i1][j1] = BLAS_H[(j1-1)*n+i1-1];
2141 }
2142 }
2143
2144 } /* #pragma omp parallel */
2145
2146 /* H to C */
2147
2148 /*
2149 for (i1=1; i1<=n; i1++){
2150 for (j1=1; j1<=n; j1++){
2151 C[i1][j1] = H[i1][j1];
2152 }
2153 }
2154 */
2155
2156 /* penalty for ill-conditioning states */
2157
2158 EV_cut0 = Threshold_OLP_Eigen;
2159
2160 for (i1=1; i1<=n; i1++){
2161
2162 if (koS[i1]<EV_cut0){
2163 C[i1][i1].r += pow((koS[i1]/EV_cut0),-2.0) - 1.0;
2164 }
2165
2166 /* cutoff the interaction between the ill-conditioned state */
2167
2168 if (1.0e+3<C[i1][i1].r){
2169 for (j1=1; j1<=n; j1++){
2170 C[i1][j1] = Complex(0.0,0.0);
2171 C[j1][i1] = Complex(0.0,0.0);
2172 }
2173 C[i1][i1].r = 1.0e+4;
2174 }
2175 }
2176
2177 /* diagonalize H' */
2178
2179 dtime(&Stime);
2180
2181 if (parallel_mode==0){
2182 EigenBand_lapack(C,ko,n,MaxN,1);
2183 }
2184 else{
2185 /* The C matrix is distributed by row */
2186 Eigen_PHH(MPI_CommWD2[myworld2],C,ko,n,MaxN,1);
2187 }
2188
2189 dtime(&Etime);
2190 time10 += Etime - Stime;
2191
2192 if (3<=level_stdout && 0<=kloop){
2193 printf(" kloop %i, k1 k2 k3 %10.6f %10.6f %10.6f\n",
2194 kloop,T_KGrids1[kloop],T_KGrids2[kloop],T_KGrids3[kloop]);
2195 for (i1=1; i1<=n; i1++){
2196 printf(" Eigenvalues of Kohn-Sham(DM) spin=%2d i1=%2d %15.12f\n",
2197 spin,i1,ko[i1]);
2198 }
2199 }
2200
2201 /****************************************************
2202 transformation to the original eigenvectors.
2203 NOTE JRCAT-244p and JAIST-2122p
2204 ****************************************************/
2205
2206 /* transpose */
2207
2208 /*
2209 for (i1=1; i1<=n; i1++){
2210 for (j1=i1+1; j1<=n; j1++){
2211 Ctmp1 = S[i1][j1];
2212 Ctmp2 = S[j1][i1];
2213 S[i1][j1] = Ctmp2;
2214 S[j1][i1] = Ctmp1;
2215 }
2216 }
2217 */
2218
2219 /* transpose */
2220
2221 /*
2222 for (i1=1; i1<=n; i1++){
2223 for (j1=i1+1; j1<=n; j1++){
2224 Ctmp1 = C[i1][j1];
2225 Ctmp2 = C[j1][i1];
2226 C[i1][j1] = Ctmp2;
2227 C[j1][i1] = Ctmp1;
2228 }
2229 }
2230 */
2231
2232 /*
2233 for (i1=1; i1<=n; i1++){
2234 for (j1=1; j1<=MaxN; j1++){
2235
2236 sum = 0.0;
2237 sumi = 0.0;
2238 for (l=1; l<=n; l++){
2239 sum += S[i1][l].r*C[j1][l].r - S[i1][l].i*C[j1][l].i;
2240 sumi += S[i1][l].r*C[j1][l].i + S[i1][l].i*C[j1][l].r;
2241 }
2242 H[i1][j1].r = sum;
2243 H[i1][j1].i = sumi;
2244 }
2245 }
2246 */
2247
2248 for (i1=1; i1<=n; i1++){
2249 for (j1=1; j1<=n; j1++){
2250 BLAS_H[(j1-1)*n+i1-1] = C[i1][j1];
2251 }
2252 }
2253
2254 /* note for BLAS, A[M*K] * B[K*N] = C[M*N] */
2255
2256 #pragma omp parallel shared(H,BLAS_S,BLAS_H,BLAS_C,n,MaxN) private(OMPID,Nthrds,Nprocs,j1,i1,Ctmp1,Ctmp2,BM,BN,BK)
2257 {
2258
2259 /* get info. on OpenMP */
2260
2261 OMPID = omp_get_thread_num();
2262 Nthrds = omp_get_num_threads();
2263 Nprocs = omp_get_num_procs();
2264
2265 BM = n;
2266 BN = (OMPID+1)*MaxN/Nthrds - (OMPID*MaxN/Nthrds+1) + 1;
2267 BK = n;
2268
2269 Ctmp1.r = 1.0;
2270 Ctmp1.i = 0.0;
2271 Ctmp2.r = 0.0;
2272 Ctmp2.i = 0.0;
2273
2274 if (0<BN){
2275 F77_NAME(zgemm,ZGEMM)("N","N", &BM,&BN,&BK,
2276 &Ctmp1,
2277 BLAS_S, &BM,
2278 &BLAS_H[(OMPID*MaxN/Nthrds)*n], &BK,
2279 &Ctmp2,
2280 &BLAS_C[(OMPID*MaxN/Nthrds)*n], &BM);
2281 }
2282
2283 for (j1=(OMPID*MaxN/Nthrds+1); j1<=(OMPID+1)*MaxN/Nthrds; j1++){
2284 for (i1=1; i1<=n; i1++){
2285 H[i1][j1] = BLAS_C[(j1-1)*n+i1-1];
2286 }
2287 }
2288
2289 } /* #pragma omp parallel */
2290
2291 /****************************************************
2292 calculate DM and EDM
2293 ****************************************************/
2294
2295 dtime(&Stime);
2296
2297 /* weight of k-point */
2298
2299 kw = (double)T_k_op[kloop];
2300
2301 /* store Fermi function */
2302
2303 po = 0;
2304 l = 1;
2305 do{
2306
2307 eig = EIGEN[spin][kloop][l];
2308 x = (eig - ChemP)*Beta;
2309
2310 if (x<-x_cut) FermiF = 1.0;
2311 else if (x>x_cut) FermiF = 0.0;
2312 else FermiF = 1.0/(1.0 + exp(x));
2313
2314 VecFkw[l] = FermiF*kw;
2315 VecFkwE[l] = VecFkw[l]*eig;
2316
2317 if ( FermiF<=FermiEps && po==0 ) {
2318 lmax = l;
2319 po = 1;
2320 }
2321
2322 l++;
2323
2324 } while(po==0 && l<=MaxN);
2325
2326 if (po==0) lmax = MaxN;
2327
2328 /* predetermination of k
2329 H is used as temporal array
2330 */
2331
2332 k = 0;
2333 for (AN=1; AN<=atomnum; AN++){
2334
2335 GA_AN = order_GA[AN];
2336 wanA = WhatSpecies[GA_AN];
2337 tnoA = Spe_Total_CNO[wanA];
2338
2339 H[0][AN].r = (double)k;
2340
2341 for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
2342 GB_AN = natn[GA_AN][LB_AN];
2343 Rn = ncn[GA_AN][LB_AN];
2344 wanB = WhatSpecies[GB_AN];
2345 tnoB = Spe_Total_CNO[wanB];
2346 k += tnoA*tnoB;
2347 }
2348 }
2349
2350 /* DM and EDM */
2351
2352 #pragma omp parallel shared(EDM1,CDM1,VecFkwE,VecFkw,H,lmax,k1,k2,k3,atv_ijk,ncn,natn,FNAN,MP,Spe_Total_CNO,WhatSpecies,order_GA,atomnum) private(OMPID,Nthrds,Nprocs,AN,GA_AN,wanA,tnoA,Anum,k,LB_AN,GB_AN,Rn,wanB,tnoB,Bnum,l1,l2,l3,kRn,si,co,i,ia,j,jb,d1,d2,l,tmp)
2353 {
2354
2355 /* get info. on OpenMP */
2356
2357 OMPID = omp_get_thread_num();
2358 Nthrds = omp_get_num_threads();
2359 Nprocs = omp_get_num_procs();
2360
2361 for (AN=1+OMPID; AN<=atomnum; AN+=Nthrds){
2362
2363 GA_AN = order_GA[AN];
2364 wanA = WhatSpecies[GA_AN];
2365 tnoA = Spe_Total_CNO[wanA];
2366 Anum = MP[GA_AN];
2367
2368 k = (int)H[0][AN].r;
2369
2370 for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
2371 GB_AN = natn[GA_AN][LB_AN];
2372 Rn = ncn[GA_AN][LB_AN];
2373 wanB = WhatSpecies[GB_AN];
2374 tnoB = Spe_Total_CNO[wanB];
2375 Bnum = MP[GB_AN];
2376
2377 l1 = atv_ijk[Rn][1];
2378 l2 = atv_ijk[Rn][2];
2379 l3 = atv_ijk[Rn][3];
2380 kRn = k1*(double)l1 + k2*(double)l2 + k3*(double)l3;
2381
2382 si = sin(2.0*PI*kRn);
2383 co = cos(2.0*PI*kRn);
2384
2385 for (i=0; i<tnoA; i++){
2386
2387 ia = Anum + i;
2388
2389 for (j=0; j<tnoB; j++){
2390
2391 jb = Bnum + j;
2392
2393 /***************************************************************
2394 Note that the imagiary part is zero,
2395 since
2396
2397 at k
2398 A = (co + i si)(Re + i Im) = (co*Re - si*Im) + i (co*Im + si*Re)
2399 at -k
2400 B = (co - i si)(Re - i Im) = (co*Re - si*Im) - i (co*Im + si*Re)
2401 Thus, Re(A+B) = 2*(co*Re - si*Im)
2402 Im(A+B) = 0
2403 ***************************************************************/
2404
2405 d1 = 0.0;
2406 d2 = 0.0;
2407
2408 for (l=1; l<=lmax; l++){
2409
2410 tmp = co*(H[ia][l].r*H[jb][l].r + H[ia][l].i*H[jb][l].i)
2411 -si*(H[ia][l].r*H[jb][l].i - H[ia][l].i*H[jb][l].r);
2412
2413 d1 += VecFkw[l]*tmp;
2414 d2 += VecFkwE[l]*tmp;;
2415
2416 }
2417
2418 CDM1[k] += d1;
2419 EDM1[k] += d2;
2420
2421 /* increment of k */
2422
2423 k++;
2424
2425 }
2426 }
2427 }
2428 }
2429
2430 } /* #pragma omp parallel */
2431
2432 dtime(&Etime);
2433 time11 += Etime - Stime;
2434
2435 /*---------- added by TOYODA 16/FEB/2010 */
2436 if (g_exx_switch) {
2437 for (iep=0; iep<nep; iep++) {
2438 GA_AN = exx_ep_atom1[iep]+1;
2439 GB_AN = exx_ep_atom2[iep]+1;
2440 iRn = exx_ep_cell[iep];
2441 tnoA = Spe_Total_CNO[WhatSpecies[GA_AN]];
2442 tnoB = Spe_Total_CNO[WhatSpecies[GB_AN]];
2443 Anum = MP[GA_AN];
2444 Bnum = MP[GB_AN];
2445
2446 /* phase */
2447 iRn_x = iRn%ncd - nshell_ep;
2448 iRn_y = (iRn/ncd)%ncd - nshell_ep;
2449 iRn_z = (iRn/ncd/ncd)%ncd - nshell_ep;
2450 kRn = k1*(double)iRn_x + k2*(double)iRn_y + k3*(double)iRn_z;
2451 si = sin(2.0*PI*kRn);
2452 co = cos(2.0*PI*kRn);
2453
2454 for (i=0; i<tnoA; i++){
2455 ia = Anum + i;
2456 for (j=0; j<tnoB; j++){
2457 jb = Bnum + j;
2458 d1 = 0.0;
2459 d2 = 0.0;
2460 for (l=1; l<=lmax; l++){
2461 tmp = co*(H[ia][l].r*H[jb][l].r + H[ia][l].i*H[jb][l].i)
2462 -si*(H[ia][l].r*H[jb][l].i - H[ia][l].i*H[jb][l].r);
2463 d1 += VecFkw[l]*tmp;
2464 tmp = si*(H[ia][l].r*H[jb][l].r + H[ia][l].i*H[jb][l].i)
2465 +co*(H[ia][l].r*H[jb][l].i - H[ia][l].i*H[jb][l].r);
2466 d2 += VecFkw[l]*tmp;
2467 }
2468 exx_CDM[spin][iep][i][j].r += d1;
2469 exx_CDM[spin][iep][i][j].i += d2;
2470 }
2471 }
2472 } /* loop of iep */
2473 }
2474 /*---------- until here */
2475
2476 } /* kloop0 */
2477
2478 /*******************************************************
2479 sum of CDM1 and EDM1 by Allreduce in MPI
2480 *******************************************************/
2481
2482 dtime(&Stime);
2483
2484 if (parallel_mode){
2485 tmp = 1.0/(double)numprocs2;
2486 for (i=0; i<size_H1; i++){
2487 CDM1[i] *= tmp;
2488 EDM1[i] *= tmp;
2489 }
2490 }
2491
2492 MPI_Allreduce(&CDM1[0], &H1[0], size_H1, MPI_DOUBLE, MPI_SUM, MPI_CommWD1[myworld1]);
2493 MPI_Allreduce(&EDM1[0], &S1[0], size_H1, MPI_DOUBLE, MPI_SUM, MPI_CommWD1[myworld1]);
2494
2495 /* CDM and EDM */
2496
2497 k = 0;
2498 for (AN=1; AN<=atomnum; AN++){
2499 GA_AN = order_GA[AN];
2500 MA_AN = F_G2M[GA_AN];
2501
2502 wanA = WhatSpecies[GA_AN];
2503 tnoA = Spe_Total_CNO[wanA];
2504
2505 for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
2506 GB_AN = natn[GA_AN][LB_AN];
2507 wanB = WhatSpecies[GB_AN];
2508 tnoB = Spe_Total_CNO[wanB];
2509
2510 for (i=0; i<tnoA; i++){
2511 for (j=0; j<tnoB; j++){
2512
2513 if (1<=MA_AN && MA_AN<=Matomnum){
2514
2515 CDM[spin][MA_AN][LB_AN][i][j] = H1[k];
2516 EDM[spin][MA_AN][LB_AN][i][j] = S1[k];
2517 }
2518
2519 k++;
2520
2521 }
2522 }
2523 }
2524 }
2525
2526 dtime(&Etime);
2527 time12 += Etime - Stime;
2528
2529 /*---------- added by TOYODA 16/FEB/2010 */
2530 if (g_exx_switch) {
2531 EXX_Reduce_DM(exx, exx_CDM[spin]);
2532 }
2533 /*---------- until here */
2534
2535 if (SpinP_switch==1 && numprocs0==1 && spin==0){
2536 spin++;
2537 goto diagonalize2;
2538 }
2539
2540 /* if necessary, MPI communication of CDM and EDM */
2541
2542 if (1<numprocs0 && SpinP_switch==1){
2543
2544 /* set spin */
2545
2546 if (myworld1==0){
2547 spin = 1;
2548 }
2549 else{
2550 spin = 0;
2551 }
2552
2553 /* communicate CDM1 and EDM1 */
2554
2555 for (i=0; i<=1; i++){
2556
2557 IDS = Comm_World_StartID1[i%2];
2558 IDR = Comm_World_StartID1[(i+1)%2];
2559
2560 if (myid0==IDS){
2561 MPI_Isend(&H1[0], size_H1, MPI_DOUBLE, IDR, tag, mpi_comm_level1, &request);
2562 }
2563
2564 if (myid0==IDR){
2565 MPI_Recv(&CDM1[0], size_H1, MPI_DOUBLE, IDS, tag, mpi_comm_level1, &stat);
2566 }
2567
2568 if (myid0==IDS){
2569 MPI_Wait(&request,&stat);
2570 }
2571
2572 if (myid0==IDS){
2573 MPI_Isend(&S1[0], size_H1, MPI_DOUBLE, IDR, tag, mpi_comm_level1, &request);
2574 }
2575
2576 if (myid0==IDR){
2577 MPI_Recv(&EDM1[0], size_H1, MPI_DOUBLE, IDS, tag, mpi_comm_level1, &stat);
2578 }
2579
2580 if (myid0==IDS){
2581 MPI_Wait(&request,&stat);
2582 }
2583 }
2584
2585 MPI_Bcast(&CDM1[0], size_H1, MPI_DOUBLE, 0, MPI_CommWD1[myworld1]);
2586 MPI_Bcast(&EDM1[0], size_H1, MPI_DOUBLE, 0, MPI_CommWD1[myworld1]);
2587
2588 /* put CDM1 and EDM1 into CDM and EDM */
2589
2590 k = 0;
2591 for (AN=1; AN<=atomnum; AN++){
2592 GA_AN = order_GA[AN];
2593 MA_AN = F_G2M[GA_AN];
2594
2595 wanA = WhatSpecies[GA_AN];
2596 tnoA = Spe_Total_CNO[wanA];
2597
2598 for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
2599 GB_AN = natn[GA_AN][LB_AN];
2600 wanB = WhatSpecies[GB_AN];
2601 tnoB = Spe_Total_CNO[wanB];
2602
2603 for (i=0; i<tnoA; i++){
2604 for (j=0; j<tnoB; j++){
2605
2606 if (1<=MA_AN && MA_AN<=Matomnum){
2607 CDM[spin][MA_AN][LB_AN][i][j] = CDM1[k];
2608 EDM[spin][MA_AN][LB_AN][i][j] = EDM1[k];
2609 }
2610
2611 k++;
2612
2613 }
2614 }
2615 }
2616 }
2617 }
2618
2619 } /* if (all_knum!=1) */
2620
2621 /****************************************************
2622 normalization of CDM, EDM, and iDM
2623 ****************************************************/
2624
2625 dtime(&EiloopTime);
2626
2627 if (myid0==Host_ID && 0<level_stdout){
2628 printf("<Band_DFT> DM, time=%lf\n", EiloopTime-SiloopTime);fflush(stdout);
2629 }
2630
2631 dum = 1.0/sum_weights;
2632
2633 for (spin=0; spin<=SpinP_switch; spin++){
2634 for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
2635 GA_AN = M2G[MA_AN];
2636 wanA = WhatSpecies[GA_AN];
2637 tnoA = Spe_Total_CNO[wanA];
2638 Anum = MP[GA_AN];
2639 for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
2640 GB_AN = natn[GA_AN][LB_AN];
2641 wanB = WhatSpecies[GB_AN];
2642 tnoB = Spe_Total_CNO[wanB];
2643 Bnum = MP[GB_AN];
2644
2645 for (i=0; i<tnoA; i++){
2646 for (j=0; j<tnoB; j++){
2647 CDM[spin][MA_AN][LB_AN][i][j] = CDM[spin][MA_AN][LB_AN][i][j]*dum;
2648 EDM[spin][MA_AN][LB_AN][i][j] = EDM[spin][MA_AN][LB_AN][i][j]*dum;
2649 iDM[0][spin][MA_AN][LB_AN][i][j] = iDM[0][spin][MA_AN][LB_AN][i][j]*dum;
2650 }
2651 }
2652 }
2653 }
2654 }
2655
2656 /*---------- added by TOYODA 16/FEB/2010 */
2657 if (g_exx_switch) {
2658 for (spin=0; spin<=SpinP_switch; spin++){
2659 for (iep=0; iep<nep; iep++) {
2660 GA_AN = exx_ep_atom1[iep]+1;
2661 GB_AN = exx_ep_atom2[iep]+1;
2662 tnoA = Spe_Total_CNO[WhatSpecies[GA_AN]];
2663 tnoB = Spe_Total_CNO[WhatSpecies[GB_AN]];
2664 for (i=0; i<tnoA; i++){
2665 for (j=0; j<tnoB; j++){
2666 exx_CDM[spin][iep][i][j].r *= dum;
2667 exx_CDM[spin][iep][i][j].i *= dum;
2668 }
2669 }
2670 }
2671 }
2672 }
2673 /*---------- until here */
2674
2675 /****************************************************
2676 bond-energies
2677 ****************************************************/
2678
2679 My_Eele1[0] = 0.0;
2680 My_Eele1[1] = 0.0;
2681 for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
2682 GA_AN = M2G[MA_AN];
2683 wanA = WhatSpecies[GA_AN];
2684 tnoA = Spe_Total_CNO[wanA];
2685
2686 for (j=0; j<=FNAN[GA_AN]; j++){
2687 GB_AN = natn[GA_AN][j];
2688 wanB = WhatSpecies[GB_AN];
2689 tnoB = Spe_Total_CNO[wanB];
2690
2691 for (k=0; k<tnoA; k++){
2692 for (l=0; l<tnoB; l++){
2693 for (spin=0; spin<=SpinP_switch; spin++){
2694 My_Eele1[spin] += CDM[spin][MA_AN][j][k][l]*nh[spin][MA_AN][j][k][l];
2695 }
2696 }
2697 }
2698
2699 /*
2700 printf("GA_AN=%2d LB_AN=%2d GB_AN=%2d RnB=%2d\n",GA_AN,LB_AN,GB_AN,ncn[GA_AN][j]);
2701 for (k=0; k<tnoA; k++){
2702 for (l=0; l<tnoB; l++){
2703 for (spin=0; spin<=SpinP_switch; spin++){
2704 printf("%15.12f ",CDM[spin][MA_AN][j][k][l]);
2705 }
2706 }
2707 printf("\n");
2708 }
2709 */
2710
2711 }
2712 }
2713
2714 /* MPI, My_Eele1 */
2715 MPI_Barrier(mpi_comm_level1);
2716 for (spin=0; spin<=SpinP_switch; spin++){
2717 MPI_Allreduce(&My_Eele1[spin], &Eele1[spin], 1, MPI_DOUBLE,
2718 MPI_SUM, mpi_comm_level1);
2719 }
2720
2721 if (SpinP_switch==0){
2722 Eele1[1] = Eele1[0];
2723 }
2724
2725 if (3<=level_stdout && myid0==Host_ID){
2726 printf("Eele00=%15.12f Eele01=%15.12f\n",Eele0[0],Eele0[1]);
2727 printf("Eele10=%15.12f Eele11=%15.12f\n",Eele1[0],Eele1[1]);
2728 }
2729
2730 /****************************************************
2731 output
2732 ****************************************************/
2733
2734 if (myid0==Host_ID){
2735
2736 strcpy(file_EV,".EV");
2737 fnjoint(filepath,filename,file_EV);
2738
2739 if ((fp_EV = fopen(file_EV,"w")) != NULL){
2740
2741 #ifdef xt3
2742 setvbuf(fp_EV,buf,_IOFBF,fp_bsize);
2743 #endif
2744
2745 fprintf(fp_EV,"\n");
2746 fprintf(fp_EV,"***********************************************************\n");
2747 fprintf(fp_EV,"***********************************************************\n");
2748 fprintf(fp_EV," Eigenvalues (Hartree) of SCF KS-eq. \n");
2749 fprintf(fp_EV,"***********************************************************\n");
2750 fprintf(fp_EV,"***********************************************************\n\n");
2751 fprintf(fp_EV," Chemical Potential (Hatree) = %18.14f\n",ChemP);
2752 fprintf(fp_EV," Number of States = %18.14f\n",Num_State);
2753 fprintf(fp_EV," Eigenvalues\n");
2754 fprintf(fp_EV," Up-spin Down-spin\n");
2755
2756 for (kloop=0; kloop<T_knum; kloop++){
2757
2758 k1 = T_KGrids1[kloop];
2759 k2 = T_KGrids2[kloop];
2760 k3 = T_KGrids3[kloop];
2761
2762 if (0<T_k_op[kloop]){
2763
2764 fprintf(fp_EV,"\n");
2765 fprintf(fp_EV," kloop=%i\n",kloop);
2766 fprintf(fp_EV," k1=%10.5f k2=%10.5f k3=%10.5f\n\n",k1,k2,k3);
2767 for (l=1; l<=MaxN; l++){
2768 if (SpinP_switch==0){
2769 fprintf(fp_EV,"%5d %18.14f %18.14f\n",
2770 l,EIGEN[0][kloop][l],EIGEN[0][kloop][l]);
2771 }
2772 else if (SpinP_switch==1){
2773 fprintf(fp_EV,"%5d %18.14f %18.14f\n",
2774 l,EIGEN[0][kloop][l],EIGEN[1][kloop][l]);
2775 }
2776 }
2777 }
2778 }
2779 fclose(fp_EV);
2780 }
2781 else{
2782 printf("Failure of saving the EV file.\n");
2783 fclose(fp_EV);
2784 }
2785 }
2786
2787 /****************************************************
2788 free arrays
2789 ****************************************************/
2790
2791 if (all_knum==1){
2792
2793 free(stat_send);
2794 free(request_send);
2795 free(request_recv);
2796
2797 free(is1);
2798 free(ie1);
2799
2800 free(is2);
2801 free(ie2);
2802
2803 for (ID=0; ID<numprocs0; ID++){
2804 if (MPI_CDM1_flag[ID]){
2805 MPI_Comm_free(&MPI_CommWD_CDM1[ID]);
2806 }
2807 }
2808 }
2809
2810 free(VecFkw);
2811 free(VecFkwE);
2812
2813 free(BLAS_H);
2814 free(BLAS_C);
2815
2816 free(My_NZeros);
2817 free(SP_NZeros);
2818 free(SP_Atoms);
2819
2820 free(MPI_CommWD_CDM1);
2821 free(MPI_CDM1_flag);
2822
2823 /* for PrintMemory and allocation */
2824 firsttime=0;
2825
2826 /* for elapsed time */
2827
2828 if (measure_time){
2829 printf("myid0=%2d time1 =%9.4f\n",myid0,time1);fflush(stdout);
2830 printf("myid0=%2d time2 =%9.4f\n",myid0,time2);fflush(stdout);
2831 printf("myid0=%2d time3 =%9.4f\n",myid0,time3);fflush(stdout);
2832 printf("myid0=%2d time4 =%9.4f\n",myid0,time4);fflush(stdout);
2833 printf("myid0=%2d time5 =%9.4f\n",myid0,time5);fflush(stdout);
2834 printf("myid0=%2d time51=%9.4f\n",myid0,time51);fflush(stdout);
2835 printf("myid0=%2d time6 =%9.4f\n",myid0,time6);fflush(stdout);
2836 printf("myid0=%2d time7 =%9.4f\n",myid0,time7);fflush(stdout);
2837 printf("myid0=%2d time8 =%9.4f\n",myid0,time8);fflush(stdout);
2838 printf("myid0=%2d time81=%9.4f\n",myid0,time81);fflush(stdout);
2839 printf("myid0=%2d time82=%9.4f\n",myid0,time82);fflush(stdout);
2840 printf("myid0=%2d time83=%9.4f\n",myid0,time83);fflush(stdout);
2841 printf("myid0=%2d time84=%9.4f\n",myid0,time84);fflush(stdout);
2842 printf("myid0=%2d time85=%9.4f\n",myid0,time85);fflush(stdout);
2843 printf("myid0=%2d time9 =%9.4f\n",myid0,time9);fflush(stdout);
2844 printf("myid0=%2d time10=%9.4f\n",myid0,time10);fflush(stdout);
2845 printf("myid0=%2d time11=%9.4f\n",myid0,time11);fflush(stdout);
2846 printf("myid0=%2d time12=%9.4f\n",myid0,time12);fflush(stdout);
2847
2848 }
2849
2850 MPI_Barrier(mpi_comm_level1);
2851 dtime(&TEtime);
2852 time0 = TEtime - TStime;
2853 return time0;
2854 }
2855