1 /**********************************************************************
2 Cluster_DFT_ON2.c:
3
4 Cluster_DFT_ON2.c is a subroutine to perform cluster calculations
5 with an O(N^2~) method.
6
7 Log of Cluster_DFT_ON2.c:
8
9 11/Sep./2009 Released by T.Ozaki
10
11 ***********************************************************************/
12
13 #include <complex.h>
14 #include <stdio.h>
15 #include <stdlib.h>
16 #include <string.h>
17 #include <math.h>
18 #include <time.h>
19 #include <complex.h>
20 #include "openmx_common.h"
21 #include "lapack_prototypes.h"
22 #include "mpi.h"
23 #include <omp.h>
24
25 #define measure_time 0
26
27
28 typedef struct {
29 long int a;
30 double complex b;
31 } clists;
32
33
34
35 static double Find_Trial_ChemP_by_Muller(
36 int SCF_iter, int num_loop,
37 double pTrial_ChemP[4], double pDiff_Num[5],
38 double DeltaMu[7], double DeltaN[3], double DeltaNp[3]);
39
40 static void qsort_complex(long n, long *a, double complex *b);
41 static int clists_cmp(const clists *x, const clists *y);
42 int Lapack_LU_Zinverse(int , dcomplex *);
43
44 static void Nested_Dissection( int free_flag, int Nmat, int *MP,
45 int *Depth_Level, int *Number_Division);
46
47 static double Cluster_collinear_ON2_Force(
48 int SCF_iter,
49 int SpinP_switch,
50 double ***C,
51 double **ko,
52 double *****nh, double ****CntOLP,
53 double *****CDM,
54 double *****EDM,
55 double Eele0[2], double Eele1[2]);
56
57 static double Cluster_collinear_ON2_Iter(
58 int SCF_iter,
59 int SpinP_switch,
60 double ***C,
61 double **ko,
62 double *****nh, double ****CntOLP,
63 double *****CDM,
64 double *****EDM,
65 double Eele0[2], double Eele1[2]);
66
67
68
69
70 static void OND_Solver(
71 char *mode,
72 int myid,
73 int numprocs,
74 int myid2,
75 int numprocs2,
76 int Nloop1,
77 MPI_Comm MPI_Comm,
78 int spin,
79 int Epoint,
80 double Real_Ene,
81 int Nmat,
82 int *NZE,
83 int *PZE,
84 int nl,
85 int mpmax,
86 double Trial_ChemP,
87 double **Hall,
88 double *Sall,
89 double **DMall,
90 double **WRG,
91 int *conv_index_row,
92 int *conv_index_col,
93 int **conv_ind2,
94 int TNZE);
95
96 static void Bisection_System(
97 int Latomnum,
98 double **Cell_Gxyz2,
99 int **OG2G,
100 int **G2OG,
101 int *NN_FNAN,
102 int **NN_natn,
103 int **NN_ncn,
104 int *OGmin,
105 int *OGmax,
106 int *num0,
107 int *numb,
108 int *num1);
109
110
111
112
113 int **Index_BC;
114 int **SindB,**EindB;
115 int **SindC,**EindC;
116 int *invp;
117 int ***Bnum[2],***Cnum,**Anum;
118 int ****Bi[2],****Ci,***Ai;
119 double complex ****Bx[2];
120 double complex ****Cx;
121 double complex ****IBx[2];
122 double complex ****ICx;
123 double complex ***Ax;
124 double complex ***IAx;
125 dcomplex *IA0,***IS;
126 double **IA;
127 dcomplex ***Lvec[2];
128 double complex **Vvec,**Vvec2;
129 double complex **Q0,**Q1;
130 double *VvecTmp,*ISTmp;
131
132
Cluster_DFT_ON2(char * mode,int SCF_iter,int SpinP_switch,double *** Cluster_ReCoes,double ** Cluster_ko,double ***** nh,double ***** ImNL,double **** CntOLP,double ***** CDM,double ***** EDM,double Eele0[2],double Eele1[2])133 double Cluster_DFT_ON2(char *mode,
134 int SCF_iter,
135 int SpinP_switch,
136 double ***Cluster_ReCoes,
137 double **Cluster_ko,
138 double *****nh,
139 double *****ImNL,
140 double ****CntOLP,
141 double *****CDM,
142 double *****EDM,
143 double Eele0[2], double Eele1[2])
144 {
145 static double time0;
146
147 /****************************************************
148 collinear without spin-orbit coupling
149 ****************************************************/
150
151 if ( (SpinP_switch==0 || SpinP_switch==1) && SO_switch==0 ){
152
153 if (strcasecmp(mode,"scf")==0){
154
155 time0 = Cluster_collinear_ON2_Iter( SCF_iter,SpinP_switch,Cluster_ReCoes,Cluster_ko,
156 nh,CntOLP,CDM,EDM,Eele0,Eele1 );
157
158 }
159
160 else if (strcasecmp(mode,"force")==0){
161
162 time0 = Cluster_collinear_ON2_Force( SCF_iter,SpinP_switch,Cluster_ReCoes,Cluster_ko,
163 nh,CntOLP,CDM,EDM,Eele0,Eele1 );
164
165 }
166
167 }
168
169 /****************************************************
170 collinear with spin-orbit coupling
171 ****************************************************/
172
173 else if ( (SpinP_switch==0 || SpinP_switch==1) && SO_switch==1 ){
174 printf("Spin-orbit coupling is not supported for collinear DFT calculations.\n");
175 MPI_Finalize();
176 exit(1);
177 }
178
179 /****************************************************
180 non-collinear with and without spin-orbit coupling
181 ****************************************************/
182
183 else if (SpinP_switch==3){
184 /*
185 time0 = Cluster_non_collinear(mode,SCF_iter,SpinP_switch,nh,ImNL,CntOLP,CDM,EDM,Eele0,Eele1);
186 */
187 }
188
189 return time0;
190 }
191
192
193
194
195
196
197
Nested_Dissection(int free_flag,int Nmat,int * MP,int * Depth_Level,int * Number_Division)198 static void Nested_Dissection(int free_flag, int Nmat, int *MP, int *Depth_Level, int *Number_Division)
199 {
200 int i,j,k,l,n,nmin,nmax,ct_AN;
201 int FNAN_min,po,n0,n1,nb,k0,i0,GA,p,k1;
202 int Gc_AN,AN0,Mc_AN,Gh_AN,h_AN,OGh_AN,Rn;
203 int OG_min,OG_max,OG_max0,diff_n,nmin0,nmax0;
204 int Anum,wanA;
205 int abc_n0[4],abc_nb[4],abc_n1[4];
206 int abc_OG_max[4],abc_OG_min[4];
207 int abc_nmax[4],abc_nmin[4];
208 int *OGmin,*OGmax,**G2OG,**OG2G;
209 double **Cell_Gxyz2;
210 int avsize_cluster;
211 int m,mp,mpmax,nl,np;
212 int num0,numb,num1;
213 int **Atom_Index_BC;
214 int **Atom_SindB;
215 int **Atom_EindB;
216 int **Atom_SindC;
217 int **Atom_EindC;
218 int **NN_natn,**NN_ncn,*NN_FNAN,*MP2;
219
220 /**********************************************
221 calulate Depth_Level and Number_Division
222 Depth_Level = nl
223 Number_Division = mpmax
224 **********************************************/
225
226 avsize_cluster = 20;
227
228 if (avsize_cluster<atomnum){
229 np = atomnum/avsize_cluster;
230 nl = log((double)np)/log(2.0);
231 if (nl<0) nl = 0;
232 }
233
234 else {
235 nl = 0;
236 }
237
238 mpmax = 1;
239 for (i=0; i<nl; i++) mpmax = 2*mpmax;
240
241 /************************************************
242 if (free_flag==1)
243 free Index_BC, SindB, EindB, SindC, EindC
244 else
245 allocate them
246 ************************************************/
247
248 if (free_flag==1){
249
250 for (n=0; n<(nl+1); n++){
251 free(Index_BC[n]);
252 }
253 free(Index_BC);
254
255 for (n=0; n<(nl+1); n++){
256 free(SindB[n]);
257 }
258 free(SindB);
259
260 for (n=0; n<(nl+1); n++){
261 free(EindB[n]);
262 }
263 free(EindB);
264
265 for (n=0; n<nl; n++){
266 free(SindC[n]);
267 }
268 free(SindC);
269
270 for (n=0; n<nl; n++){
271 free(EindC[n]);
272 }
273 free(EindC);
274
275 }
276
277 else {
278
279 Index_BC = (int**)malloc(sizeof(int*)*(nl+1));
280 for (n=0; n<(nl+1); n++){
281 Index_BC[n] = (int*)malloc(sizeof(int)*Nmat);
282 for (i=0; i<Nmat; i++){
283 Index_BC[n][i] = i;
284 }
285 }
286
287 SindB = (int**)malloc(sizeof(int*)*(nl+1));
288 mp = mpmax;
289 for (n=0; n<(nl+1); n++){
290 SindB[n] = (int*)malloc(sizeof(int)*mp);
291 for (m=0; m<mp; m++) SindB[n][m] = 0;
292 mp /= 2;
293 }
294
295 EindB = (int**)malloc(sizeof(int*)*(nl+1));
296 mp = mpmax;
297 for (n=0; n<(nl+1); n++){
298 EindB[n] = (int*)malloc(sizeof(int)*mp);
299 for (m=0; m<mp; m++) EindB[n][m] = 0;
300 mp /= 2;
301 }
302
303 SindC = (int**)malloc(sizeof(int*)*nl);
304 mp = mpmax/2;
305 for (n=0; n<nl; n++){
306 SindC[n] = (int*)malloc(sizeof(int)*mp);
307 for (m=0; m<mp; m++) SindC[n][m] = 0;
308 mp /= 2;
309 }
310
311 EindC = (int**)malloc(sizeof(int*)*nl);
312 mp = mpmax/2;
313 for (n=0; n<nl; n++){
314 EindC[n] = (int*)malloc(sizeof(int)*mp);
315 for (m=0; m<mp; m++) EindC[n][m] = 0;
316 mp /= 2;
317 }
318 }
319
320 if (free_flag==1) return ;
321
322 /* allocation of arrays */
323
324 Atom_Index_BC = (int**)malloc(sizeof(int*)*(nl+1));
325 for (n=0; n<(nl+1); n++){
326 Atom_Index_BC[n] = (int*)malloc(sizeof(int)*(atomnum+1));
327 for (i=0; i<(atomnum+1); i++){
328 Atom_Index_BC[n][i] = i;
329 }
330 }
331
332 Atom_SindB = (int**)malloc(sizeof(int*)*(nl+1));
333 mp = mpmax;
334 for (n=0; n<(nl+1); n++){
335 Atom_SindB[n] = (int*)malloc(sizeof(int)*mp);
336 for (m=0; m<mp; m++) Atom_SindB[n][m] = 0;
337 mp /= 2;
338 }
339
340 Atom_EindB = (int**)malloc(sizeof(int*)*(nl+1));
341 mp = mpmax;
342 for (n=0; n<(nl+1); n++){
343 Atom_EindB[n] = (int*)malloc(sizeof(int)*mp);
344 for (m=0; m<mp; m++) Atom_EindB[n][m] = 0;
345 mp /= 2;
346 }
347
348 Atom_SindC = (int**)malloc(sizeof(int*)*nl);
349 mp = mpmax/2;
350 for (n=0; n<nl; n++){
351 Atom_SindC[n] = (int*)malloc(sizeof(int)*mp);
352 for (m=0; m<mp; m++) Atom_SindC[n][m] = 0;
353 mp /= 2;
354 }
355
356 Atom_EindC = (int**)malloc(sizeof(int*)*nl);
357 mp = mpmax/2;
358 for (n=0; n<nl; n++){
359 Atom_EindC[n] = (int*)malloc(sizeof(int)*mp);
360 for (m=0; m<mp; m++) Atom_EindC[n][m] = 0;
361 mp /= 2;
362 }
363
364 Cell_Gxyz2 = (double**)malloc(sizeof(double*)*4);
365 for (k=0; k<4; k++){
366 Cell_Gxyz2[k] = (double*)malloc(sizeof(double)*(atomnum+1));
367 }
368
369 OG2G = (int**)malloc(sizeof(int*)*4);
370 for (k=0; k<4; k++){
371 OG2G[k] = (int*)malloc(sizeof(int)*(atomnum+1));
372 }
373
374 G2OG = (int**)malloc(sizeof(int*)*4);
375 for (k=0; k<4; k++){
376 G2OG[k] = (int*)malloc(sizeof(int)*(atomnum+1));
377 }
378
379 OGmin = (int*)malloc(sizeof(int)*(atomnum+1));
380 OGmax = (int*)malloc(sizeof(int)*(atomnum+1));
381
382 NN_ncn = (int**)malloc(sizeof(int*)*(atomnum+1));
383 for (ct_AN=0; ct_AN<=atomnum; ct_AN++){
384 NN_ncn[ct_AN] = (int*)malloc(sizeof(int)*((int)(Max_FSNAN*ScaleSize)+1));
385 }
386
387 NN_natn = (int**)malloc(sizeof(int*)*(atomnum+1));
388 for (ct_AN=0; ct_AN<=atomnum; ct_AN++){
389 NN_natn[ct_AN] = (int*)malloc(sizeof(int)*((int)(Max_FSNAN*ScaleSize)+1));
390 }
391
392 NN_FNAN = (int*)malloc(sizeof(int)*(atomnum+1));
393 MP2 = (int*)malloc(sizeof(int)*(atomnum+1));
394
395 /**********************************************
396 perform a nested dissection by bisection
397 **********************************************/
398
399 Atom_SindB[nl][0] = 1;
400 Atom_EindB[nl][0] = atomnum;
401
402 mp = 1;
403
404 /* n: level of hierarchy */
405
406 for (n=(nl-1); 0<=n; n--){
407
408 /* m: child number */
409
410 for (m=0; m<mp; m++){
411
412 /* construct the adjacency information */
413
414 for (i=Atom_SindB[n+1][m]; i<=Atom_EindB[n+1][m]; i++){
415
416 GA = Atom_Index_BC[n+1][i];
417 i0 = 0;
418
419 for (j=0; j<=FNAN[GA]; j++){
420
421 po = 0;
422 for (p=Atom_SindB[n+1][m]; p<=Atom_EindB[n+1][m]; p++){
423
424 if (natn[GA][j]==Atom_Index_BC[n+1][p]){
425
426 k1 = p - Atom_SindB[n+1][m] + 1;
427 po = 1;
428
429 break;
430 }
431 } /* p */
432
433 if (po==1){
434
435 NN_natn[i-Atom_SindB[n+1][m]+1][i0] = k1;
436 NN_ncn[i-Atom_SindB[n+1][m]+1][i0] = ncn[GA][j];
437 i0++;
438 }
439
440 } /* j */
441
442 NN_FNAN[i-Atom_SindB[n+1][m]+1] = i0 - 1;
443
444 for (k=1; k<=3; k++){
445 Cell_Gxyz2[k][i-Atom_SindB[n+1][m]+1] = Cell_Gxyz[GA][k];
446 OG2G[k][i-Atom_SindB[n+1][m]+1] = GA;
447 G2OG[k][i-Atom_SindB[n+1][m]+1] = i-Atom_SindB[n+1][m]+1;
448 }
449
450 } /* i */
451
452 Latomnum = Atom_EindB[n+1][m] - Atom_SindB[n+1][m] + 1;
453
454 if (0<Latomnum){
455 Bisection_System( Latomnum,Cell_Gxyz2,OG2G,G2OG,
456 NN_FNAN,NN_natn,NN_ncn,
457 OGmin,OGmax,&num0,&numb,&num1);
458 }
459 else {
460 num0 = 0;
461 numb = 0;
462 num1 = 0;
463 }
464
465 /* construct Atom_Index_BC for the C part */
466
467 Atom_EindC[n][m] = Atom_EindB[n+1][m];
468 Atom_SindC[n][m] = Atom_EindC[n][m] + 1 - numb;
469
470 for (i=Atom_SindC[n][m]; i<=Atom_EindC[n][m]; i++){
471 Atom_Index_BC[n][i] = OG2G[0][num0+i-Atom_SindC[n][m]+1];
472 }
473
474 /* construct Index_BC for the B parts */
475
476 Atom_SindB[n][2*m] = Atom_SindB[n+1][m];
477 Atom_EindB[n][2*m] = Atom_SindB[n][2*m] + num0 - 1;
478
479 Atom_SindB[n][2*m+1] = Atom_EindB[n][2*m] + 1;
480 Atom_EindB[n][2*m+1] = Atom_SindB[n][2*m+1] + num1 - 1;
481
482 for (i=Atom_SindB[n][2*m]; i<=Atom_EindB[n][2*m]; i++){
483 Atom_Index_BC[n][i] = OG2G[0][i-Atom_SindB[n][2*m]+1];
484 }
485
486 for (i=Atom_SindB[n][2*m+1]; i<=Atom_EindB[n][2*m+1]; i++){
487 Atom_Index_BC[n][i] = OG2G[0][num0+numb+i-Atom_SindB[n][2*m+1]+1];
488 }
489
490 } /* m */
491
492 mp *= 2;
493
494 } /* n */
495
496 /*************************************************************
497 reorder Atom_Index_BC so that the order of the global index
498 can be equivalent in all the Atom_Index_BCs
499 *************************************************************/
500
501 mp = mpmax;
502
503 for (n=0; n<nl; n++){
504
505 for (m=0; m<mp; m++){
506 for (i=Atom_SindB[n][m]; i<=Atom_EindB[n][m]; i++){
507 Atom_Index_BC[n+1][i] = Atom_Index_BC[n][i];
508 }
509 }
510
511 for (m=0; m<(mp/2); m++){
512 for (i=Atom_SindC[n][m]; i<=Atom_EindC[n][m]; i++){
513 Atom_Index_BC[n+1][i] = Atom_Index_BC[n][i];
514 }
515 }
516
517 mp /= 2;
518 }
519
520 /* printing Atom_Index_BC for debugging */
521
522 /*
523 printf("\n\n");
524 printf("nl=%2d mpmax=%2d\n",nl,mpmax);
525
526 mp = mpmax;
527 for (n=0; n<(nl+1); n++){
528
529 for (m=0; m<mp; m++){
530 for (i=Atom_SindB[n][m]; i<=Atom_EindB[n][m]; i++){
531 printf("B n=%2d m=%2d i=%2d Atom_Index_BC=%2d\n",n,m,i,Atom_Index_BC[n][i]);
532 }
533 }
534
535 mp /= 2;
536 }
537
538 mp = mpmax/2;
539 for (n=0; n<nl; n++){
540
541 for (m=0; m<mp; m++){
542 for (i=Atom_SindC[n][m]; i<=Atom_EindC[n][m]; i++){
543 printf("C n=%2d m=%2d i=%2d Atom_Index_BC=%2d\n",n,m,i,Atom_Index_BC[n][i]);
544 }
545 }
546
547 mp /= 2;
548 }
549 */
550
551 /*************************************************************
552 construct Index_BC using Atom_Index_BC
553 *************************************************************/
554
555 /* set up Depth_Level and Number_Division */
556
557 *Depth_Level = nl;
558 *Number_Division = mpmax;
559
560 /* set up MP2 where MP2[i=ordered index for atom]
561 gives an ordered index for orbital. */
562
563 Anum = 1;
564 for (i=1; i<=atomnum; i++){
565 Gc_AN = Atom_Index_BC[nl][i];
566 MP2[i] = Anum;
567 wanA = WhatSpecies[Gc_AN];
568 Anum += Spe_Total_CNO[wanA];
569 }
570
571 /* Index_BC for B */
572
573 mp = mpmax;
574 for (n=0; n<(nl+1); n++){
575 for (m=0; m<mp; m++){
576
577 i = Atom_SindB[n][m];
578 SindB[n][m] = MP2[i]-1;
579
580 i = Atom_EindB[n][m];
581 Gc_AN = Atom_Index_BC[n][i];
582 wanA = WhatSpecies[Gc_AN];
583 EindB[n][m] = MP2[i] + Spe_Total_CNO[wanA]-2;
584
585 for (i=Atom_SindB[n][m]; i<=Atom_EindB[n][m]; i++){
586
587 Gc_AN = Atom_Index_BC[n][i];
588 wanA = WhatSpecies[Gc_AN];
589 i0 = MP2[i];
590
591 for (j=MP[Gc_AN]; j<(MP[Gc_AN]+Spe_Total_CNO[wanA]); j++){
592 Index_BC[n][i0-1] = j-1;
593 i0++;
594 }
595 }
596 }
597
598 mp /= 2;
599 }
600
601 /* Index_BC for C */
602
603 mp = mpmax/2;
604 for (n=0; n<nl; n++){
605 for (m=0; m<mp; m++){
606
607 i = Atom_SindC[n][m];
608 SindC[n][m] = MP2[i]-1;
609
610 i = Atom_EindC[n][m];
611 Gc_AN = Atom_Index_BC[n][i];
612 wanA = WhatSpecies[Gc_AN];
613 EindC[n][m] = MP2[i] + Spe_Total_CNO[wanA]-2;
614
615 for (i=Atom_SindC[n][m]; i<=Atom_EindC[n][m]; i++){
616
617 Gc_AN = Atom_Index_BC[n][i];
618 wanA = WhatSpecies[Gc_AN];
619 i0 = MP2[i];
620
621 for (j=MP[Gc_AN]; j<(MP[Gc_AN]+Spe_Total_CNO[wanA]); j++){
622 Index_BC[n][i0-1] = j-1;
623 i0++;
624 }
625 }
626 }
627
628 mp /= 2;
629 }
630
631
632 /* printing Index_BC for debugging */
633
634 /*
635 printf("\n\n");
636 printf("nl=%2d mpmax=%2d\n",nl,mpmax);
637
638 mp = mpmax;
639 for (n=0; n<(nl+1); n++){
640
641 for (m=0; m<mp; m++){
642 for (i=SindB[n][m]; i<=EindB[n][m]; i++){
643 printf("B n=%2d m=%2d i=%2d Index_BC=%2d\n",n,m,i,Index_BC[n][i]);
644 }
645 }
646
647 mp /= 2;
648 }
649
650 mp = mpmax/2;
651 for (n=0; n<nl; n++){
652
653 for (m=0; m<mp; m++){
654 for (i=SindC[n][m]; i<=EindC[n][m]; i++){
655 printf("C n=%2d m=%2d i=%2d Index_BC=%2d\n",n,m,i,Index_BC[n][i]);
656 }
657 }
658
659 mp /= 2;
660 }
661 */
662
663
664 /* printing structures */
665
666 /*
667 {
668
669 int n3;
670 char NameC[100][100];
671
672 n3 = nl-6;
673
674 sprintf(NameC[0],"He");
675 sprintf(NameC[1],"Li");
676 sprintf(NameC[2],"Be");
677 sprintf(NameC[3],"B");
678 sprintf(NameC[4],"C");
679 sprintf(NameC[5],"N");
680 sprintf(NameC[6],"F");
681 sprintf(NameC[7],"Ne");
682 sprintf(NameC[8],"Na");
683 sprintf(NameC[9],"Mg");
684 sprintf(NameC[10],"Al");
685 sprintf(NameC[11],"Si");
686 sprintf(NameC[12],"P");
687 sprintf(NameC[13],"S");
688 sprintf(NameC[14],"Cl");
689 sprintf(NameC[15],"Ar");
690 sprintf(NameC[16],"K");
691
692 printf("%2d\n\n",atomnum);
693
694 mp = mpmax;
695 for (n=0; n<(nl+1); n++){
696
697 for (m=0; m<mp; m++){
698
699 if (n==n3){
700 for (i=SindB[n][m]; i<=EindB[n][m]; i++){
701
702 j = Index_BC[n][i] + 1;
703 printf("H %12.7f %12.7f %12.7f 0.0 0.0 0.0\n",
704 Gxyz[j][1]*BohrR,Gxyz[j][2]*BohrR,Gxyz[j][3]*BohrR);
705 }
706 }
707
708 }
709
710 mp /= 2;
711 }
712
713 mp = mpmax/2;
714 for (n=0; n<nl; n++){
715
716 for (m=0; m<mp; m++){
717
718 if (n3<=n){
719 for (i=SindC[n][m]; i<=EindC[n][m]; i++){
720
721 j = Index_BC[n][i] + 1;
722
723 printf("%s %12.7f %12.7f %12.7f 0.0 0.0 0.0\n",
724 NameC[n],Gxyz[j][1]*BohrR,Gxyz[j][2]*BohrR,Gxyz[j][3]*BohrR);
725
726
727
728 }
729 }
730
731 }
732
733 mp /= 2;
734 }
735
736 }
737 MPI_Finalize();
738 exit(0);
739 */
740
741 /* freeing of arrays */
742
743 free(MP2);
744 free(NN_FNAN);
745
746 for (ct_AN=0; ct_AN<=atomnum; ct_AN++){
747 free(NN_natn[ct_AN]);
748 }
749 free(NN_natn);
750
751 for (ct_AN=0; ct_AN<=atomnum; ct_AN++){
752 free(NN_ncn[ct_AN]);
753 }
754 free(NN_ncn);
755
756 free(OGmax);
757 free(OGmin);
758
759 for (k=0; k<4; k++){
760 free(G2OG[k]);
761 }
762 free(G2OG);
763
764 for (k=0; k<4; k++){
765 free(OG2G[k]);
766 }
767 free(OG2G);
768
769 for (k=0; k<4; k++){
770 free(Cell_Gxyz2[k]);
771 }
772 free(Cell_Gxyz2);
773
774 for (n=0; n<nl; n++){
775 free(Atom_EindC[n]);
776 }
777 free(Atom_EindC);
778
779 for (n=0; n<nl; n++){
780 free(Atom_SindC[n]);
781 }
782 free(Atom_SindC);
783
784 for (n=0; n<(nl+1); n++){
785 free(Atom_EindB[n]);
786 }
787 free(Atom_EindB);
788
789 for (n=0; n<(nl+1); n++){
790 free(Atom_SindB[n]);
791 }
792 free(Atom_SindB);
793
794 for (n=0; n<(nl+1); n++){
795 free(Atom_Index_BC[n]);
796 }
797 free(Atom_Index_BC);
798 }
799
800
801
802
803
804
805
Bisection_System(int Latomnum,double ** Cell_Gxyz2,int ** OG2G,int ** G2OG,int * NN_FNAN,int ** NN_natn,int ** NN_ncn,int * OGmin,int * OGmax,int * num0,int * numb,int * num1)806 static void Bisection_System(
807 int Latomnum,
808 double **Cell_Gxyz2,
809 int **OG2G,
810 int **G2OG,
811 int *NN_FNAN,
812 int **NN_natn,
813 int **NN_ncn,
814 int *OGmin,
815 int *OGmax,
816 int *num0,
817 int *numb,
818 int *num1)
819 {
820 int i,k,AN0,AN1,AN2,Gc_AN,h_AN,Gh_AN;
821 int nmin,nmax,l,n,Rn,OGh_AN;
822 int FNAN_min,OG_max,OG_min,po;
823 int OG_min0,OG_max0,diff_n,nmin0,nmax0;
824 int abc_n0[4],abc_nb[4],abc_n1[4];
825 int abc_OG_max[4],abc_OG_min[4];
826 int abc_nmax[4],abc_nmin[4],abc_mag[4];
827 int n0,n1,nb,n00,nb0,n10,k0,wanA;
828 int *longtail_flag;
829 double rcut,max_rcut,min_rcut;
830
831 /***************************************************************
832 eliminate atoms which have longer cutoff of basis set,
833 where we let the eliminated atoms be in the C region.
834 ***************************************************************/
835
836 longtail_flag = (int*)malloc(sizeof(int)*(Latomnum+1));
837 for (i=0; i<(Latomnum+1); i++) longtail_flag[i] = 0;
838
839 max_rcut = -100000.0;
840 min_rcut = 100000.0;
841
842 for (AN0=1; AN0<=Latomnum; AN0++){
843
844 AN1 = OG2G[1][AN0];
845 wanA = WhatSpecies[AN1];
846 rcut = Spe_Atom_Cut1[wanA];
847
848 if (max_rcut<rcut) max_rcut = rcut;
849 if (rcut<min_rcut) min_rcut = rcut;
850 }
851
852 if ( 1.5<=(max_rcut/min_rcut) ){
853
854 for (AN0=1; AN0<=Latomnum; AN0++){
855
856 AN1 = OG2G[1][AN0];
857 wanA = WhatSpecies[AN1];
858 rcut = Spe_Atom_Cut1[wanA];
859
860 if ( 1.5<=(rcut/min_rcut) ){
861 longtail_flag[AN0] = 1;
862 }
863 }
864 }
865
866 /***************************************************************
867 bisection of the system
868 ***************************************************************/
869
870 for (k=1; k<=3; k++){
871
872 /*
873 for (i=1; i<=Latomnum; i++){
874 printf("A1 k=%2d i=%2d Cell_Gxyz2=%15.12f OG2G=%2d G2OG=%2d\n",
875 k,i,Cell_Gxyz2[k][i],OG2G[k][i],G2OG[k][i]);
876 }
877 */
878
879 /* order atoms based on Cell_Gxyz2 */
880
881 qsort_double3((long)Latomnum,Cell_Gxyz2[k],OG2G[k],G2OG[k]);
882
883 /*
884 for (i=1; i<=Latomnum; i++){
885 printf("A2 k=%2d i=%2d Cell_Gxyz2=%15.12f OG2G=%2d G2OG=%2d\n",
886 k,i,Cell_Gxyz2[k][i],OG2G[k][i],G2OG[k][i]);
887 }
888 */
889
890 for (i=1; i<=Latomnum; i++){
891 AN0 = G2OG[k][i];
892 G2OG[0][AN0] = i;
893 }
894
895 /* find the reachable range of each atom */
896
897 for (AN0=1; AN0<=Latomnum; AN0++){
898
899 AN1 = G2OG[k][AN0];
900
901 if (longtail_flag[AN1]==0){ /* eliminate atoms with "longtail_flag[AN1]==0" */
902
903 nmin = 1000000;
904 nmax =-1000000;
905
906 for (h_AN=0; h_AN<=NN_FNAN[AN1]; h_AN++){
907
908 AN2 = NN_natn[AN1][h_AN];
909 Rn = NN_ncn[AN1][h_AN];
910 l = atv_ijk[Rn][k];
911
912 n = (G2OG[0][AN2]-1) + Latomnum*l;
913
914 if (longtail_flag[AN2]==0){ /* eliminate atoms with "longtail_flag[AN2]==0" */
915 if (nmax<n) nmax = n;
916 if (n<nmin) nmin = n;
917 }
918 }
919 }
920
921 else {
922 nmin = AN0-1;
923 nmax = AN0-1;
924 }
925
926 OGmin[AN0] = nmin;
927 OGmax[AN0] = nmax;
928
929 } /* AN0 */
930
931 /* find the starting atom which has the lowest NN_FNAN. */
932
933 FNAN_min = 1000000;
934 for (AN0=1; AN0<=Latomnum; AN0++){
935
936 AN1 = G2OG[k][AN0];
937
938 if (longtail_flag[AN1]==0){ /* eliminate atoms with "longtail_flag[AN1]==0" */
939 if (NN_FNAN[AN1]<FNAN_min){
940
941 FNAN_min = NN_FNAN[AN1];
942 OG_min = AN0;
943 }
944 }
945 }
946
947 /**********************************************************
948 extend the size of the part n0 step by step
949 **********************************************************/
950
951 po = 0;
952 OG_max = OG_min;
953 nmin = OGmin[OG_min];
954 nmax = OGmax[OG_min];
955 diff_n = 1000000;
956 OG_min0 = OG_min;
957 OG_max0 = OG_max;
958 nmin0 = nmin;
959 nmax0 = nmax;
960
961 do {
962
963 if (OGmin[OG_max]<nmin) nmin = OGmin[OG_max];
964 if (nmax<OGmax[OG_max]) nmax = OGmax[OG_max];
965
966 if (OGmin[OG_min]<nmin) nmin = OGmin[OG_min];
967 if (nmax<OGmax[OG_min]) nmax = OGmax[OG_min];
968
969 n0 = OG_max - OG_min + 1;
970 nb = (nmax - nmin + 1) - n0;
971 n1 = Latomnum - (n0 + nb);
972
973 /**********************************************************
974 set up flag into OG2G[0][AN],
975 where AN is the local index when the routine was called.
976 **********************************************************/
977
978 for (AN0=0; AN0<=Latomnum; AN0++) OG2G[0][AN0] = 0;
979
980 for (AN0=OG_min; AN0<=OG_max; AN0++){
981 AN1 = G2OG[k][AN0];
982 OG2G[0][AN1] = 10; /* flag in the n0 part */
983 }
984
985 for (AN0=nmin; AN0<=nmax; AN0++){
986
987 if (0<=AN0){
988 AN1 = AN0%Latomnum + 1;
989 }
990 else {
991
992 AN1 = AN0;
993 do {
994 AN1 += Latomnum;
995 } while (AN1<0);
996 AN1 = AN1 + 1;
997 }
998
999 AN2 = G2OG[k][AN1];
1000
1001 if (OG2G[0][AN2]==0) OG2G[0][AN2] = 20; /* flag in the nb part */
1002 }
1003
1004 for (AN0=1; AN0<=Latomnum; AN0++){
1005 if (OG2G[0][AN0]==0) OG2G[0][AN0] = 30; /* flag in the n1 part */
1006 }
1007
1008 /**********************************************************
1009 update n0, nb, n1 using OG2G[0] and longtail_flag
1010 **********************************************************/
1011
1012 for (AN0=1; AN0<=Latomnum; AN0++){
1013
1014 if (longtail_flag[AN0]==1 && OG2G[0][AN0]==10){
1015 n0--;
1016 nb++;
1017 }
1018
1019 if (longtail_flag[AN0]==1 && OG2G[0][AN0]==30){
1020 n1--;
1021 nb++;
1022 }
1023 }
1024
1025 /**********************************************************
1026 if (fabs(n1-n0)<=diff_n) then, update parameters
1027 **********************************************************/
1028
1029 if (fabs(n1-n0)<=diff_n){
1030
1031 diff_n = fabs(n1-n0);
1032 OG_min0 = OG_min;
1033 OG_max0 = OG_max;
1034 nmin0 = nmin;
1035 nmax0 = nmax;
1036 n00 = n0;
1037 nb0 = nb;
1038 n10 = n1;
1039 }
1040 else {
1041 po = 1;
1042 }
1043
1044 if ( Latomnum<=(nmax-nmin+1) ) {
1045 po = 2; /* partitioning is impossible along the direction. */
1046 }
1047
1048 /*
1049 printf("FFF po=%2d k=%2d OG_min=%2d OG_max=%2d nmin=%2d nmax=%2d n0=%2d nb=%2d n1=%2d\n",
1050 po,k,OG_min,OG_max,nmin,nmax,n0,nb,n1);
1051 */
1052
1053
1054 if (po==0){
1055 if (OG_max<Latomnum) OG_max++;
1056 else if (1<OG_min) OG_min--;
1057 }
1058
1059 } while (po==0);
1060
1061 abc_OG_max[k] = OG_max0;
1062 abc_OG_min[k] = OG_min0;
1063 abc_nmax[k] = nmax0;
1064 abc_nmin[k] = nmin0;
1065 abc_n0[k] = n00;
1066 abc_nb[k] = nb0;
1067 abc_n1[k] = n10;
1068
1069 /*
1070 printf("k=%2d n0=%2d nb=%2d n1=%2d\n",k,abc_n0[k],abc_nb[k],abc_n1[k]);
1071 */
1072
1073 } /* k */
1074
1075 /**********************************************************
1076 set up OG2G
1077 **********************************************************/
1078
1079 abc_mag[1] = fabs(abc_n0[1]-abc_n1[1])+abc_nb[1];
1080 abc_mag[2] = fabs(abc_n0[2]-abc_n1[2])+abc_nb[2];
1081 abc_mag[3] = fabs(abc_n0[3]-abc_n1[3])+abc_nb[3];
1082
1083 /*
1084 printf("abc_mag[1]=%2d\n",abc_mag[1]);
1085 printf("abc_mag[2]=%2d\n",abc_mag[2]);
1086 printf("abc_mag[3]=%2d\n",abc_mag[3]);
1087 */
1088
1089 if (abc_mag[1]<=abc_mag[2]) k0 = 1;
1090 else k0 = 2;
1091 if (abc_mag[3]<abc_mag[k0]) k0 = 3;
1092
1093 /*
1094 printf("k0=%2d abc_n0[k0]=%2d\n",k0,abc_n0[k0]);
1095 */
1096
1097 /* The n0 part */
1098
1099 n = 1;
1100 for (AN0=abc_OG_min[k0]; AN0<(abc_OG_min[k0]+abc_n0[k0]); AN0++){
1101
1102 AN1 = G2OG[k0][AN0];
1103
1104 if (longtail_flag[AN1]!=1){
1105 G2OG[0][n] = OG2G[k0][AN0];
1106 OG2G[k0][AN0] = -1;
1107 n++;
1108 }
1109 }
1110
1111 /* The nb part from the n0 part */
1112
1113 for (AN0=abc_OG_min[k0]; AN0<(abc_OG_min[k0]+abc_n0[k0]); AN0++){
1114
1115 AN1 = G2OG[k0][AN0];
1116
1117 if (longtail_flag[AN1]==1){
1118 G2OG[0][n] = OG2G[k0][AN0];
1119 OG2G[k0][AN0] = -1;
1120 n++;
1121 }
1122 }
1123
1124 /* The nb part from the original nb part */
1125
1126 for (AN0=abc_nmin[k0]; AN0<=abc_nmax[k0]; AN0++){
1127
1128 if (0<=AN0){
1129 AN1 = AN0%Latomnum + 1;
1130 }
1131 else {
1132
1133 AN1 = AN0;
1134 do {
1135 AN1 += Latomnum;
1136 } while (AN1<0);
1137 AN1 = AN1 + 1;
1138 }
1139
1140 if (OG2G[k0][AN1]!=-1) {
1141 G2OG[0][n] = OG2G[k0][AN1];
1142 OG2G[k0][AN1] = -1;
1143 n++;
1144 }
1145 }
1146
1147 /* The nb part from the n1 part */
1148
1149 for (AN0=1; AN0<=Latomnum; AN0++){
1150
1151 AN1 = G2OG[k0][AN0];
1152
1153 if (OG2G[k0][AN0]!=-1 && longtail_flag[AN1]==1) {
1154 G2OG[0][n] = OG2G[k0][AN0];
1155 OG2G[k0][AN0] = -1;
1156 n++;
1157 }
1158 }
1159
1160 /* The n1 part */
1161
1162 for (AN0=1; AN0<=Latomnum; AN0++){
1163
1164 AN1 = G2OG[k0][AN0];
1165
1166 if (OG2G[k0][AN0]!=-1 && longtail_flag[AN1]!=1) {
1167 G2OG[0][n] = OG2G[k0][AN0];
1168 OG2G[k0][AN0] = -1;
1169 n++;
1170 }
1171 }
1172
1173 if ( (n-1)!=Latomnum ){
1174 printf("Could not bisect\n");
1175
1176 MPI_Finalize();
1177 exit(1);
1178 }
1179
1180 /* set n0, nb, n1, and OG2G[0] */
1181
1182 n0 = abc_n0[k0];
1183 nb = abc_nb[k0];
1184 n1 = abc_n1[k0];
1185
1186 for (n=1; n<=Latomnum; n++){
1187 OG2G[0][n] = G2OG[0][n];
1188 }
1189
1190 /* print OG2G for debugging */
1191
1192 /*
1193 for (n=1; n<=n0; n++){
1194 printf("n0: n=%2d OG2G=%2d\n",n,OG2G[0][n]);
1195 }
1196
1197 for (n=n0+1; n<=n0+nb; n++){
1198 printf("nb: n=%2d OG2G=%2d\n",n,OG2G[0][n]);
1199 }
1200
1201 for (n=n0+nb+1; n<=n0+nb+n1; n++){
1202 printf("n1: n=%2d OG2G=%2d\n",n,OG2G[0][n]);
1203 }
1204 */
1205
1206 *num0 = abc_n0[k0];
1207 *numb = abc_nb[k0];
1208 *num1 = abc_n1[k0];
1209
1210 free(longtail_flag);
1211 }
1212
1213
1214
1215
1216
1217
OND_Solver(char * mode,int myid,int numprocs,int myid2,int numprocs2,int Nloop1,MPI_Comm MPI_Comm,int spin,int Epoint,double Real_Ene,int Nmat,int * NZE,int * PZE,int nl,int mpmax,double Trial_ChemP,double ** Hall,double * Sall,double ** DMall,double ** WRG,int * conv_index_row,int * conv_index_col,int ** conv_ind2,int TNZE)1218 static void OND_Solver(
1219 char *mode,
1220 int myid,
1221 int numprocs,
1222 int myid2,
1223 int numprocs2,
1224 int Nloop1,
1225 MPI_Comm MPI_Comm,
1226 int spin,
1227 int Epoint,
1228 double Real_Ene,
1229 int Nmat,
1230 int *NZE,
1231 int *PZE,
1232 int nl,
1233 int mpmax,
1234 double Trial_ChemP,
1235 double **Hall,
1236 double *Sall,
1237 double **DMall,
1238 double **WRG,
1239 int *conv_index_row,
1240 int *conv_index_col,
1241 int **conv_ind2,
1242 int TNZE)
1243 {
1244 int po,TNZE2,TNZE3,col,row;
1245 int i,j,k,j1,k1,l,p,q;
1246 double complex *Tx;
1247 int *Ti,*Tp,*Tp0;
1248 double tol,sum;
1249 double stime,etime;
1250 double stime1,etime1;
1251 double stime2,etime2;
1252 double complex alpha,weight;
1253 double complex ctmp,ctmp3;
1254 double av_num;
1255 int io,jo,jo_min,n;
1256 int m0,m1,m2,m3,m4,m5;
1257 int n0,n1,n2,mp0,nsr;
1258 int nb0,nb1,nc,k2,kk;
1259 int mp,mm,numa;
1260 int m,i1,i2,j2,i3,j3,nsa,nsc,nsc2,nsb;
1261 int max_nsc,max_nsb,max_nsa;
1262 dcomplex dcsum0,dcsum1,dcsum2,dcsum3,dcsum4;
1263 double complex csum2,csum3,csum4;
1264 double time1a,time1a2,time1a1,time1a0,time1b,time1c,time1d;
1265 double time1,time2,time3,time4,time5,time6,time41,time42;
1266 double time31,time32,time33,time34;
1267 double time51,time52,time53,time54,time55;
1268 double numop1,numop2,numop0;
1269 dcomplex al,be;
1270 /* for OpenMP */
1271 int OMPID,Nthrds,Nprocs;
1272 /* for MPI */
1273 int ID,*is1,*ie1,*Row2ID,tag=999;
1274 int size0,size1,k0;
1275 MPI_Status *stat_send;
1276 MPI_Request *request_send;
1277 MPI_Request *request_recv;
1278
1279 stat_send = (MPI_Status *)malloc(sizeof(MPI_Status)*numprocs2);
1280 request_send = (MPI_Request *)malloc(sizeof(MPI_Request)*numprocs2);
1281 request_recv = (MPI_Request *)malloc(sizeof(MPI_Request)*numprocs2);
1282
1283 al.r = 1.0;
1284 al.i = 0.0;
1285 be.r = 0.0;
1286 be.i = 0.0;
1287
1288 numop1 = 0.0;
1289 numop2 = 0.0;
1290
1291 time1 = 0.0;
1292 time1a= 0.0;
1293 time1a0= 0.0;
1294 time1a1= 0.0;
1295 time1a2= 0.0;
1296 time1b= 0.0;
1297 time1c= 0.0;
1298 time1d= 0.0;
1299 time2 = 0.0;
1300 time3 = 0.0;
1301 time31= 0.0;
1302 time32= 0.0;
1303 time33= 0.0;
1304 time34= 0.0;
1305 time4 = 0.0;
1306 time41= 0.0;
1307 time42= 0.0;
1308 time5 = 0.0;
1309 time51= 0.0;
1310 time52= 0.0;
1311 time53= 0.0;
1312 time54= 0.0;
1313 time55= 0.0;
1314 time6 = 0.0;
1315
1316 dtime(&stime);
1317 dtime(&stime1);
1318
1319 /* set alpha and weight */
1320
1321 if (strcasecmp(mode,"contour")==0){
1322
1323 if (ON2_method[Epoint]==1){ /* poles */
1324 alpha = Trial_ChemP + I*(ON2_zp[Epoint].i/Beta);
1325 weight = -2.0*ON2_Rp[Epoint].r/Beta;
1326 }
1327 else if (ON2_method[Epoint]==2){ /* zeroth moment */
1328 alpha = ON2_zp[Epoint].r + I*ON2_zp[Epoint].i;
1329 weight = I*ON2_Rp[Epoint].i;
1330 }
1331 }
1332
1333 else if (strcasecmp(mode,"retarded")==0){
1334 alpha = Real_Ene + 0.01/eV2Hartree*I;
1335 weight = I;
1336 }
1337
1338 else if (strcasecmp(mode,"force")==0){
1339
1340 if (ON2_method_f[Epoint]==1){ /* poles */
1341 alpha = Trial_ChemP + I*(ON2_zp_f[Epoint].i/Beta);
1342 weight = -2.0*ON2_Rp_f[Epoint].r/Beta*alpha;
1343 }
1344 else if (ON2_method_f[Epoint]==2){ /* zeroth and 1st order moments */
1345 alpha = ON2_zp_f[Epoint].r + I*ON2_zp_f[Epoint].i;
1346 weight = ON2_Rp_f[Epoint].r + I*ON2_Rp_f[Epoint].i;
1347 }
1348 }
1349
1350 /***********************************************************
1351 construct the A-matrix of which form is defined in
1352 CSparse as "compressed-column form".
1353 ***********************************************************/
1354
1355 Tp = (int*)malloc(sizeof(int)*(Nmat+1));
1356 Tp0 = (int*)malloc(sizeof(int)*(Nmat+1));
1357
1358 if ( 1<measure_time ){
1359 printf("myid=%2d Epoint =%2d %10.5f %10.5f\n",myid,Epoint,Trial_ChemP,cimag(alpha) );fflush(stdout);
1360 }
1361
1362 dtime(&stime2);
1363
1364 for (i=0; i<TNZE; i++){
1365 row = conv_index_row[i]; /* row index */
1366 col = conv_index_col[i]; /* column index */
1367 conv_ind2[row][col] = 1;
1368 }
1369
1370 dtime(&etime2);
1371 time1a0 = etime2 - stime2;
1372
1373 dtime(&stime2);
1374
1375 for (j=0; j<Nmat; j++) Tp0[j] = 0;
1376
1377 for (i=0; i<TNZE; i++){
1378
1379 row = conv_index_row[i]; /* row index */
1380 col = conv_index_col[i]; /* column index */
1381
1382 if (conv_ind2[row][col]==1){
1383 conv_ind2[row][col] = -(Tp0[col]+1);
1384 Tp0[col]++;
1385 }
1386 }
1387
1388 Tp[0] = 0;
1389 for (j=1; j<=Nmat; j++){ /* j: col */
1390 Tp[j] = Tp[j-1] + Tp0[j-1];
1391 }
1392 TNZE2 = Tp[Nmat];
1393
1394 for (i=0; i<TNZE; i++){
1395
1396 row = conv_index_row[i]; /* row index */
1397 col = conv_index_col[i]; /* column index */
1398
1399 if (conv_ind2[row][col]<0){
1400 conv_ind2[row][col] = -conv_ind2[row][col] - 1 + Tp[col];
1401 }
1402 }
1403
1404 Tx = (double complex*)malloc(sizeof(double complex)*TNZE2);
1405 Ti = (int*)malloc(sizeof(int)*TNZE2);
1406 for (i=0; i<TNZE2; i++) Tx[i] = 0.0;
1407
1408 dtime(&etime2);
1409 time1a1 = etime2 - stime2;
1410
1411 dtime(&stime2);
1412
1413 for (i=0; i<TNZE; i++){
1414
1415 row = conv_index_row[i]; /* row index */
1416 col = conv_index_col[i]; /* column index */
1417
1418 k = conv_ind2[row][col];
1419
1420 Ti[k] = row; /* row index */
1421 Tx[k] += (alpha*Sall[i] - Hall[spin][i]); /* value */
1422 }
1423
1424 dtime(&etime2);
1425 time1a2 = etime2 - stime2;
1426
1427 /* make invp */
1428
1429 invp = (int*)malloc(sizeof(int)*Nmat);
1430
1431 for (i=0; i<Nmat; i++){
1432 j = Index_BC[nl][i];
1433 invp[j] = i;
1434 }
1435
1436 dtime(&etime1);
1437 time1a = etime1 - stime1;
1438
1439 /***********************************************************
1440 set up block matrices, B0, B1, and C
1441 ***********************************************************/
1442
1443 dtime(&stime1);
1444
1445 Bnum[0] = (int***)malloc(sizeof(int**)*nl);
1446 Bnum[1] = (int***)malloc(sizeof(int**)*nl);
1447 Cnum = (int***)malloc(sizeof(int**)*nl);
1448 Bi[0] = (int****)malloc(sizeof(int***)*nl);
1449 Bi[1] = (int****)malloc(sizeof(int***)*nl);
1450 Ci = (int****)malloc(sizeof(int***)*nl);
1451
1452 Bx[0] = (double complex****)malloc(sizeof(double complex***)*nl);
1453 Bx[1] = (double complex****)malloc(sizeof(double complex***)*nl);
1454 Cx = (double complex****)malloc(sizeof(double complex***)*nl);
1455
1456 IBx[0] = (double complex****)malloc(sizeof(double complex***)*nl);
1457 IBx[1] = (double complex****)malloc(sizeof(double complex***)*nl);
1458 ICx = (double complex****)malloc(sizeof(double complex***)*nl);
1459
1460 mp = mpmax/2;
1461
1462 for (n=0; n<nl; n++){
1463
1464 Bnum[0][n] = (int**)malloc(sizeof(int*)*mp);
1465 Bnum[1][n] = (int**)malloc(sizeof(int*)*mp);
1466 Cnum[n] = (int**)malloc(sizeof(int*)*mp);
1467
1468 Bi[0][n] = (int***)malloc(sizeof(int**)*mp);
1469 Bi[1][n] = (int***)malloc(sizeof(int**)*mp);
1470 Ci[n] = (int***)malloc(sizeof(int**)*mp);
1471
1472 Bx[0][n] = (double complex***)malloc(sizeof(double complex**)*mp);
1473 Bx[1][n] = (double complex***)malloc(sizeof(double complex**)*mp);
1474 Cx[n] = (double complex***)malloc(sizeof(double complex**)*mp);
1475
1476 IBx[0][n] = (double complex***)malloc(sizeof(double complex**)*mp);
1477 IBx[1][n] = (double complex***)malloc(sizeof(double complex**)*mp);
1478 ICx[n] = (double complex***)malloc(sizeof(double complex**)*mp);
1479
1480 for (m=0; m<mp; m++){
1481
1482 Bnum[0][n][m] = (int*)malloc(sizeof(int)*(EindC[n][m]-SindC[n][m]+1));
1483 Bnum[1][n][m] = (int*)malloc(sizeof(int)*(EindC[n][m]-SindC[n][m]+1));
1484 Cnum[n][m] = (int*)malloc(sizeof(int)*(EindC[n][m]-SindC[n][m]+1));
1485
1486 Bi[0][n][m] = (int**)malloc(sizeof(int*)*(EindC[n][m]-SindC[n][m]+1));
1487 Bi[1][n][m] = (int**)malloc(sizeof(int*)*(EindC[n][m]-SindC[n][m]+1));
1488 Ci[n][m] = (int**)malloc(sizeof(int*)*(EindC[n][m]-SindC[n][m]+1));
1489
1490 Bx[0][n][m] = (double complex**)malloc(sizeof(double complex*)*(EindC[n][m]-SindC[n][m]+1));
1491 Bx[1][n][m] = (double complex**)malloc(sizeof(double complex*)*(EindC[n][m]-SindC[n][m]+1));
1492 Cx[n][m] = (double complex**)malloc(sizeof(double complex*)*(EindC[n][m]-SindC[n][m]+1));
1493
1494 IBx[0][n][m] = (double complex**)malloc(sizeof(double complex*)*(EindC[n][m]-SindC[n][m]+1));
1495 IBx[1][n][m] = (double complex**)malloc(sizeof(double complex*)*(EindC[n][m]-SindC[n][m]+1));
1496 ICx[n][m] = (double complex**)malloc(sizeof(double complex*)*(EindC[n][m]-SindC[n][m]+1));
1497
1498 for (i=SindC[n][m]; i<=EindC[n][m]; i++){
1499
1500 i1 = Index_BC[n][i];
1501
1502 nb0 = 0;
1503 nb1 = 0;
1504 nc = 0;
1505
1506 for (j=Tp[i1]; j<Tp[i1+1]; j++){
1507
1508 j1 = invp[Ti[j]];
1509
1510 if ( SindB[n][2*m ]<=j1 && j1<=EindB[n][2*m ] ) nb0++;
1511 else if ( SindB[n][2*m+1]<=j1 && j1<=EindB[n][2*m+1] ) nb1++;
1512 else if ( SindC[n][m ]<=j1 && j1<=EindC[n][m ] ) nc++;
1513 }
1514
1515 Bnum[0][n][m][i-SindC[n][m]] = nb0;
1516 Bnum[1][n][m][i-SindC[n][m]] = nb1;
1517 Cnum[n][m][i-SindC[n][m]] = nc;
1518
1519 Bi[0][n][m][i-SindC[n][m]] = (int*)malloc(sizeof(int)*nb0);
1520 Bi[1][n][m][i-SindC[n][m]] = (int*)malloc(sizeof(int)*nb1);
1521 Ci[n][m][i-SindC[n][m]] = (int*)malloc(sizeof(int)*nc);
1522
1523 Bx[0][n][m][i-SindC[n][m]] = (double complex*)malloc(sizeof(double complex)*nb0);
1524 Bx[1][n][m][i-SindC[n][m]] = (double complex*)malloc(sizeof(double complex)*nb1);
1525 Cx[n][m][i-SindC[n][m]] = (double complex*)malloc(sizeof(double complex)*nc);
1526
1527 IBx[0][n][m][i-SindC[n][m]] = (double complex*)malloc(sizeof(double complex)*nb0);
1528 IBx[1][n][m][i-SindC[n][m]] = (double complex*)malloc(sizeof(double complex)*nb1);
1529 ICx[n][m][i-SindC[n][m]] = (double complex*)malloc(sizeof(double complex)*nc);
1530
1531 for (k=0; k<nb0; k++) IBx[0][n][m][i-SindC[n][m]][k] = 0.0;
1532 for (k=0; k<nb1; k++) IBx[1][n][m][i-SindC[n][m]][k] = 0.0;
1533 for (k=0; k<nc; k++) ICx[n][m][i-SindC[n][m]][k] = 0.0;
1534
1535 nb0 = 0;
1536 nb1 = 0;
1537 nc = 0;
1538
1539 for (j=Tp[i1]; j<Tp[i1+1]; j++){
1540
1541 j1 = invp[Ti[j]];
1542 i2 = i - SindC[n][m];
1543
1544 if ( SindB[n][2*m]<=j1 && j1<=EindB[n][2*m] ){
1545
1546 Bi[0][n][m][i2][nb0] = j1 - SindB[n][2*m];
1547 Bx[0][n][m][i2][nb0] = Tx[j];
1548
1549 nb0++;
1550 }
1551
1552 else if ( SindB[n][2*m+1]<=j1 && j1<=EindB[n][2*m+1] ){
1553
1554 Bi[1][n][m][i2][nb1] = j1 - SindB[n][2*m+1];
1555 Bx[1][n][m][i2][nb1] = Tx[j];
1556
1557 nb1++;
1558 }
1559 else if ( SindC[n][m]<=j1 && j1<=EindC[n][m] ){
1560
1561 Ci[n][m][i2][nc] = j1 - SindC[n][m];
1562 Cx[n][m][i2][nc] = Tx[j];
1563
1564 nc++;
1565 }
1566
1567 } /* j */
1568 } /* i */
1569 } /* m */
1570
1571 mp /= 2;
1572
1573 } /* n */
1574
1575 dtime(&etime1);
1576 time1b = etime1 - stime1;
1577
1578 /***********************************************************
1579 calculate A^-1 for the smallest blocks
1580 ***********************************************************/
1581
1582 Anum = (int**)malloc(sizeof(int*)*mpmax);
1583 Ai = (int***)malloc(sizeof(int**)*mpmax);
1584 Ax = (double complex***)malloc(sizeof(double complex**)*mpmax);
1585 IAx = (double complex***)malloc(sizeof(double complex**)*mpmax);
1586
1587 for (m=0; m<mpmax; m++){
1588
1589 nsa = EindB[0][m] - SindB[0][m] + 1;
1590
1591 Anum[m] = (int*)malloc(sizeof(int)*nsa);
1592 Ai[m] = (int**)malloc(sizeof(int*)*nsa);
1593 Ax[m] = (double complex**)malloc(sizeof(double complex*)*nsa);
1594 IAx[m] = (double complex**)malloc(sizeof(double complex*)*nsa);
1595
1596 for (i=SindB[0][m]; i<=EindB[0][m]; i++){
1597
1598 i1 = Index_BC[0][i];
1599 i2 = i - SindB[0][m];
1600 numa = 0;
1601
1602 for (j=Tp[i1]; j<Tp[i1+1]; j++){
1603 j1 = invp[Ti[j]];
1604 if ( SindB[0][m]<=j1 && j1<=EindB[0][m] ) numa++;
1605 }
1606
1607 Anum[m][i2] = numa;
1608
1609 Ai[m][i2] = (int*)malloc(sizeof(int)*numa);
1610 Ax[m][i2] = (double complex*)malloc(sizeof(double complex)*numa);
1611 IAx[m][i2] = (double complex*)malloc(sizeof(double complex)*numa);
1612 for (k=0; k<numa; k++) IAx[m][i2][k] = 0.0;
1613
1614 numa = 0;
1615
1616 for (j=Tp[i1]; j<Tp[i1+1]; j++){
1617
1618 j1 = invp[Ti[j]];
1619
1620 if ( SindB[0][m]<=j1 && j1<=EindB[0][m] ){
1621
1622 Ai[m][i2][numa] = j1 - SindB[0][m];
1623 Ax[m][i2][numa] = Tx[j];
1624
1625 numa++;
1626 }
1627 }
1628 }
1629 }
1630
1631 /* calculate IA */
1632
1633 mp = mpmax;
1634 n = 0;
1635
1636 max_nsa = 0;
1637 for (m=0; m<mp; m++){
1638 nsa = EindB[n][m] - SindB[n][m] + 1;
1639 if (max_nsa<nsa) max_nsa = nsa;
1640 }
1641
1642 IA0 = (dcomplex*)malloc(sizeof(dcomplex)*max_nsa*max_nsa);
1643
1644 IA = (double**)malloc(sizeof(double*)*mp);
1645 for (m=0; m<mp; m++){
1646 nsa = EindB[n][m] - SindB[n][m] + 1;
1647 IA[m] = (double*)malloc(sizeof(double)*2*nsa*nsa);
1648 for (i=0; i<2*nsa*nsa; i++) IA[m][i] = 0.0;
1649 }
1650
1651 mp = mpmax;
1652 n = 0;
1653
1654 for ( m=myid2; m<mp; m+=numprocs2 ){
1655
1656 nsa = EindB[n][m] - SindB[n][m] + 1;
1657
1658 for (i=0; i<nsa; i++){
1659 for (j=0; j<nsa; j++){
1660 IA0[nsa*j+i].r = 0.0;
1661 IA0[nsa*j+i].i = 0.0;
1662 }
1663 }
1664
1665 for (i=0; i<nsa; i++){
1666 for (j=0; j<Anum[m][i]; j++){
1667 j2 = Ai[m][i][j];
1668 IA0[nsa*j2+i].r = creal(Ax[m][i][j]);
1669 IA0[nsa*j2+i].i = cimag(Ax[m][i][j]);
1670 }
1671 }
1672
1673 dtime(&stime1);
1674
1675 if (0<nsa) Lapack_LU_Zinverse(nsa,IA0);
1676
1677 dtime(&etime1);
1678 time1c += etime1 - stime1;
1679
1680 for (i=0; i<nsa; i++){
1681 for (j=0; j<nsa; j++){
1682 IA[m][2*nsa*j+2*i ] = IA0[nsa*j+i].r;
1683 IA[m][2*nsa*j+2*i+1] = IA0[nsa*j+i].i;
1684 }
1685 }
1686
1687 /* store IA into IAx */
1688
1689 for (i=0; i<(EindB[n][m]-SindB[n][m]+1); i++){
1690 for (j=0; j<Anum[m][i]; j++){
1691 j2 = Ai[m][i][j];
1692 IAx[m][i][j] = IA0[nsa*j2+i].r + I*IA0[nsa*j2+i].i;
1693 }
1694 }
1695
1696 } /* m */
1697
1698
1699 dtime(&stime1);
1700
1701 /* MPI: IA */
1702
1703 if (1<numprocs2){
1704 for ( m=0; m<mp; m++ ){
1705 nsa = EindB[n][m] - SindB[n][m] + 1;
1706 ID = m % numprocs2;
1707 MPI_Bcast(&IA[m][0], nsa*nsa*2, MPI_DOUBLE, ID, MPI_Comm);
1708 }
1709 }
1710
1711 /***********************************************************
1712 allocation of IS, VvecTmp, Vvec, and Lvec
1713 ***********************************************************/
1714
1715 mp = mpmax/2;
1716 IS = (dcomplex***)malloc(sizeof(dcomplex**)*nl);
1717 for (n=0; n<nl; n++){
1718 IS[n] = (dcomplex**)malloc(sizeof(dcomplex*)*mp);
1719 for (m=0; m<mp; m++){
1720 nsa = EindC[n][m] - SindC[n][m] + 1;
1721
1722 IS[n][m] = (dcomplex*)malloc(sizeof(dcomplex)*nsa*nsa);
1723 for (i=0; i<nsa*nsa; i++) IS[n][m][i] = Complex(0.0,0.0);
1724 }
1725 mp /= 2;
1726 }
1727
1728 mp = mpmax/2;
1729 max_nsc = 0;
1730 for (n=0; n<nl; n++){
1731 for (m=0; m<mp; m++){
1732 nsc = EindC[n][m] - SindC[n][m] + 1;
1733 if (max_nsc<nsc) max_nsc = nsc;
1734 }
1735 mp /= 2;
1736 }
1737
1738 mp = mpmax/2;
1739 max_nsb = 0;
1740 for (n=0; n<nl; n++){
1741 for (m=0; m<mp; m++){
1742 nsb = EindB[n][2*m]-SindB[n][2*m] + 1;
1743 if (max_nsb<nsb) max_nsb = nsb;
1744
1745 nsb = EindB[n][2*m+1]-SindB[n][2*m+1] + 1;
1746 if (max_nsb<nsb) max_nsb = nsb;
1747 }
1748 mp /= 2;
1749 }
1750
1751 ISTmp = (double*)malloc(sizeof(double)*max_nsc*max_nsc*2);
1752 VvecTmp = (double*)malloc(sizeof(double)*max_nsc*Nmat*2);
1753
1754 Vvec = (double complex**)malloc(sizeof(double complex*)*max_nsc);
1755 for (i=0; i<max_nsc; i++){
1756 Vvec[i] = (double complex*)malloc(sizeof(double complex)*Nmat);
1757 }
1758
1759 Vvec2 = (double complex**)malloc(sizeof(double complex*)*Nmat);
1760 for (i=0; i<Nmat; i++){
1761 Vvec2[i] = (double complex*)malloc(sizeof(double complex)*max_nsc);
1762 }
1763
1764 mp = mpmax/2;
1765 Lvec[0] = (dcomplex***)malloc(sizeof(dcomplex**)*nl);
1766 for (n=0; n<nl; n++){
1767 Lvec[0][n] = (dcomplex**)malloc(sizeof(dcomplex*)*mp);
1768 for (m=0; m<mp; m++){
1769 kk = (EindB[n][2*m]-SindB[n][2*m]+1)*(EindC[n][m]-SindC[n][m]+1);
1770 Lvec[0][n][m] = (dcomplex*)malloc(sizeof(dcomplex)*kk);
1771 }
1772 mp /= 2;
1773 }
1774
1775 mp = mpmax/2;
1776 Lvec[1] = (dcomplex***)malloc(sizeof(dcomplex**)*nl);
1777 for (n=0; n<nl; n++){
1778 Lvec[1][n] = (dcomplex**)malloc(sizeof(dcomplex*)*mp);
1779 for (m=0; m<mp; m++){
1780 kk = (EindB[n][2*m+1]-SindB[n][2*m+1]+1)*(EindC[n][m]-SindC[n][m]+1);
1781 Lvec[1][n][m] = (dcomplex*)malloc(sizeof(dcomplex)*kk);
1782 }
1783 mp /= 2;
1784 }
1785
1786 Q0 = (double complex**)malloc(sizeof(double complex*)*max_nsc);
1787 for (i=0; i<max_nsc; i++){
1788 Q0[i] = (double complex*)malloc(sizeof(double complex)*max_nsc);
1789 }
1790
1791 Q1 = (double complex**)malloc(sizeof(double complex*)*max_nsc);
1792 for (i=0; i<max_nsc; i++){
1793 Q1[i] = (double complex*)malloc(sizeof(double complex)*max_nsc);
1794 }
1795
1796 is1 = (int*)malloc(sizeof(int)*max_nsc);
1797 ie1 = (int*)malloc(sizeof(int)*max_nsc);
1798
1799 Row2ID = (int*)malloc(sizeof(int)*Nmat);
1800
1801 dtime(&etime1);
1802 time1d = etime1 - stime1;
1803
1804 dtime(&etime);
1805
1806 time1 = etime - stime;
1807
1808 /***********************************************************
1809 main calculations by the recurrence relations
1810 ***********************************************************/
1811
1812 mp = mpmax/2;
1813 m2 = 1;
1814
1815 for (n1=0; n1<nl; n1++){
1816
1817 /*******************************
1818 calculate the initial V0
1819 *******************************/
1820
1821 dtime(&stime);
1822
1823 for (m=0; m<mpmax/2; m++){
1824 for (m0=0; m0<2; m0++){
1825
1826 nsa = EindB[0][2*m+m0] - SindB[0][2*m+m0] + 1;
1827
1828 m3 = (2*m+m0)/(m2*2);
1829 m4 = ((2*m+m0)/m2)%2;
1830
1831 nsc = EindC[n1][m3] - SindC[n1][m3] + 1;
1832
1833 /***********************************
1834 division of nsc for
1835 MPI parallelization in MPI_Comm
1836 ***********************************/
1837
1838 if ( numprocs2<=nsc ){
1839
1840 av_num = (double)nsc/(double)numprocs2;
1841
1842 for (ID=0; ID<numprocs2; ID++){
1843 is1[ID] = (int)( (av_num+1.0e-12)*(double)ID);
1844 ie1[ID] = (int)( (av_num+1.0e-12)*(double)(ID+1)) - 1;
1845 }
1846
1847 is1[0] = 0;
1848 ie1[numprocs2-1] = nsc-1;
1849 }
1850
1851 else{
1852
1853 for (ID=0; ID<nsc; ID++){
1854 is1[ID] = ID;
1855 ie1[ID] = ID;
1856 }
1857 for (ID=nsc; ID<numprocs2; ID++){
1858 is1[ID] = 0;
1859 ie1[ID] = -1;
1860 }
1861 }
1862
1863 #pragma omp parallel shared(myid2,is1,ie1,nsc,nsa,m4,n1,m3,Bnum,Bi,SindB,EindB,m,m0,IA,Bx,Vvec) private(OMPID,Nthrds,Nprocs,j,i,k,k1,k2)
1864 {
1865
1866 double complex ctmp1,csum1;
1867
1868 /* get info. on OpenMP */
1869
1870 OMPID = omp_get_thread_num();
1871 Nthrds = omp_get_num_threads();
1872 Nprocs = omp_get_num_procs();
1873
1874 for ( j=is1[myid2]+OMPID; j<=ie1[myid2]; j+=Nthrds ){
1875
1876 for (i=0; i<nsa; i++){
1877
1878 csum1 = 0.0;
1879
1880 for (k=0; k<Bnum[m4][n1][m3][j]; k++){
1881
1882 k1 = Bi[m4][n1][m3][j][k] + SindB[n1][2*m3+m4];
1883 k2 = k1 - SindB[0][2*m+m0];
1884
1885 if (SindB[0][2*m+m0]<=k1 && k1<=EindB[0][2*m+m0]){
1886
1887 ctmp1 = IA[2*m+m0][2*nsa*k2+2*i] + I*IA[2*m+m0][2*nsa*k2+2*i+1];
1888 csum1 += ctmp1*Bx[m4][n1][m3][j][k];
1889 }
1890 }
1891
1892 Vvec[j][SindB[0][2*m+m0]+i] = csum1;
1893
1894 }
1895 } /* j */
1896
1897 } /* #pragma omp parallel */
1898
1899 } /* m0 */
1900 } /* m */
1901
1902 dtime(&etime);
1903 time2 += etime - stime;
1904
1905 /*******************************
1906 recurrence relations
1907 *******************************/
1908
1909 for (m=0; m<mp; m++){
1910
1911 dtime(&stime);
1912 dtime(&stime1);
1913
1914 nsc = EindC[n1][m] - SindC[n1][m] + 1;
1915 mp0 = m2;
1916
1917 /***********************************
1918 division of nsc for
1919 MPI parallelization in MPI_Comm
1920 ***********************************/
1921
1922 if ( numprocs2<=nsc ){
1923
1924 av_num = (double)nsc/(double)numprocs2;
1925
1926 for (ID=0; ID<numprocs2; ID++){
1927 is1[ID] = (int)( (av_num+1.0e-12)*(double)ID);
1928 ie1[ID] = (int)( (av_num+1.0e-12)*(double)(ID+1)) - 1;
1929 }
1930
1931 is1[0] = 0;
1932 ie1[numprocs2-1] = nsc-1;
1933 }
1934
1935 else{
1936
1937 for (ID=0; ID<nsc; ID++){
1938 is1[ID] = ID;
1939 ie1[ID] = ID;
1940 }
1941 for (ID=nsc; ID<numprocs2; ID++){
1942 is1[ID] = 0;
1943 ie1[ID] = -1;
1944 }
1945 }
1946
1947 dtime(&etime1);
1948 time31 += etime1 - stime1;
1949
1950 dtime(&stime2);
1951
1952 /***********************************
1953 loop for n2: which means
1954 climbing the ladder at the level n1
1955 ***********************************/
1956
1957 for (n2=0; n2<n1; n2++){
1958 for (m1=0; m1<mp0; m1++){
1959
1960 mm = mp0*m + m1;
1961
1962 dtime(&stime1);
1963
1964 nsr = EindC[n2][mm] - SindC[n2][mm] + 1;
1965
1966 /***********************************
1967 calculation of Q
1968 ***********************************/
1969
1970 #pragma omp parallel shared(is1,ie1,myid2,nsr,Bnum,n1,n2,m1,mm,SindB,Bi,Bx,Vvec,IS,Q1,Q0,SindC,numop1) private(OMPID,Nthrds,Nprocs,j,i,k,k1,m5,l,m3,m4,numop0)
1971
1972 {
1973
1974 double complex csum1;
1975
1976 numop0 = 0.0;
1977
1978 /* get info. on OpenMP */
1979
1980 OMPID = omp_get_thread_num();
1981 Nthrds = omp_get_num_threads();
1982 Nprocs = omp_get_num_procs();
1983
1984 for ( j=is1[myid2]+OMPID; j<=ie1[myid2]; j+=Nthrds ){
1985
1986 for (i=0; i<nsr; i++){
1987
1988 csum1 = 0.0;
1989
1990 /* B_{even} V_{even} */
1991
1992 for (k=0; k<Bnum[0][n2][mm][i]; k++){
1993 k1 = SindB[n2][2*mm] + Bi[0][n2][mm][i][k];
1994 csum1 += Bx[0][n2][mm][i][k]*Vvec[j][k1];
1995 }
1996
1997 numop0 += (double)Bnum[0][n2][mm][i];
1998
1999 /* B_{odd} V_{odd} */
2000
2001 for (k=0; k<Bnum[1][n2][mm][i]; k++){
2002 k1 = SindB[n2][2*mm+1] + Bi[1][n2][mm][i][k];
2003 csum1 += Bx[1][n2][mm][i][k]*Vvec[j][k1];
2004 }
2005
2006 numop0 += (double)Bnum[1][n2][mm][i];
2007
2008 /* B[C] */
2009
2010 m5 = 1;
2011 for (l=0; l<(n1-n2); l++) m5 *= 2;
2012
2013 m3 = mm/m5;
2014 m4 = (mm/(m5/2))%2;
2015
2016 for (k=0; k<Bnum[m4][n1][m3][j]; k++){
2017 k1 = Bi[m4][n1][m3][j][k] + SindB[n1][2*m3+m4] - SindC[n2][mm];
2018 if (k1==i) csum1 -= Bx[m4][n1][m3][j][k];
2019 }
2020
2021 /* store the temporal result into Q0 */
2022
2023 Q0[j][i] = csum1;
2024
2025 } /* i */
2026
2027 /* calculate Q */
2028
2029 for (i=0; i<nsr; i++){
2030
2031 csum1 = 0.0;
2032 for (k=0; k<nsr; k++){
2033 csum1 += (IS[n2][mm][i*nsr+k].r+I*IS[n2][mm][i*nsr+k].i)*Q0[j][k];
2034 }
2035
2036 Q1[j][i] = csum1;
2037 }
2038
2039 } /* j */
2040
2041 numop1 += numop0;
2042
2043 } /* #pragma omp parallel */
2044
2045 dtime(&etime1);
2046 time32 += etime1 - stime1;
2047
2048 /**********************************
2049 update V
2050 **********************************/
2051
2052 dtime(&stime1);
2053
2054 #pragma omp parallel shared(is1,ie1,myid2,EindB,SindB,n2,mm,nsr,Lvec,Q1,Vvec,SindC,numop1) \
2055 private(OMPID,Nthrds,Nprocs,j,i,i2,k,i1,numop0)
2056
2057 {
2058
2059 double complex csum0;
2060
2061 numop0 = 0.0;
2062
2063 /* get info. on OpenMP */
2064
2065 OMPID = omp_get_thread_num();
2066 Nthrds = omp_get_num_threads();
2067 Nprocs = omp_get_num_procs();
2068
2069 for ( j=is1[myid2]+OMPID; j<=ie1[myid2]; j+=Nthrds ){
2070
2071 /* V_{even} */
2072
2073 for (i=0; i<(EindB[n2][2*mm]-SindB[n2][2*mm]+1); i++){
2074
2075 i2 = i*nsr;
2076 csum0 = 0.0;
2077
2078 for (k=0; k<nsr; k++){
2079 csum0 += (Lvec[0][n2][mm][i2+k].r+I*Lvec[0][n2][mm][i2+k].i)*Q1[j][k];
2080 }
2081
2082 Vvec[j][SindB[n2][2*mm]+i] += csum0;
2083 }
2084
2085 numop0 += (double)(EindB[n2][2*mm]-SindB[n2][2*mm]+1)*nsr;
2086
2087 /* V_{odd} */
2088
2089 for (i=0; i<(EindB[n2][2*mm+1]-SindB[n2][2*mm+1]+1); i++){
2090
2091 i2 = i*nsr;
2092 csum0 = 0.0;
2093
2094 for (k=0; k<nsr; k++){
2095 csum0 += (Lvec[1][n2][mm][i2+k].r+I*Lvec[1][n2][mm][i2+k].i)*Q1[j][k];
2096 }
2097
2098 Vvec[j][SindB[n2][2*mm+1]+i] += csum0;
2099 }
2100
2101 numop0 += (double)(EindB[n2][2*mm+1]-SindB[n2][2*mm+1]+1)*nsr;
2102
2103 /* add the contribution of -Q */
2104
2105 for (i=0; i<nsr; i++){
2106 i1 = SindC[n2][mm] + i;
2107 Vvec[j][i1] = -Q1[j][i];
2108 }
2109
2110 } /* j */
2111
2112 numop1 += numop0;
2113
2114 } /* #pragma omp parallel */
2115
2116 dtime(&etime1);
2117 time33 += etime1 - stime1;
2118
2119 } /* m1 */
2120
2121 /* update mp0 */
2122
2123 mp0 /= 2;
2124
2125 } /* n2 */
2126
2127 dtime(&etime2);
2128 time34 += etime2 - stime2;
2129
2130 dtime(&etime);
2131 time3 += etime - stime;
2132
2133 /*******************************
2134 calculate the S-matrix
2135 *******************************/
2136
2137 dtime(&stime);
2138
2139 /* initialize IS */
2140
2141 for (j=0; j<nsc; j++){
2142 for (i=0; i<nsc; i++){
2143
2144 ISTmp[2*j*nsc+2*i ] = 0.0;
2145 ISTmp[2*j*nsc+2*i+1] = 0.0;
2146 }
2147 }
2148
2149 /* put C to IS */
2150
2151 for (i=0; i<nsc; i++){
2152 for (j=0; j<Cnum[n1][m][i]; j++){
2153
2154 j1 = Ci[n1][m][i][j];
2155
2156 if (is1[myid2]<=j1 && j1<=ie1[myid2]){
2157
2158 ISTmp[2*j1*nsc+2*i ] = creal(Cx[n1][m][i][j]);
2159 ISTmp[2*j1*nsc+2*i+1] = cimag(Cx[n1][m][i][j]);
2160 }
2161 }
2162 }
2163
2164 /* calculate the inner products */
2165
2166 dtime(&stime1);
2167
2168 #pragma omp parallel shared(is1,ie1,myid2,nsc,Bnum,n1,m,SindB,Bi,Bx,Vvec,ISTmp) private(OMPID,Nthrds,Nprocs,j,i,k,k1)
2169
2170 {
2171
2172 double complex csum1;
2173
2174
2175 /* get info. on OpenMP */
2176
2177 OMPID = omp_get_thread_num();
2178 Nthrds = omp_get_num_threads();
2179 Nprocs = omp_get_num_procs();
2180
2181 for ( j=is1[myid2]+OMPID; j<=ie1[myid2]; j+=Nthrds ){
2182
2183 for (i=0; i<nsc; i++){
2184
2185 csum1 = 0.0;
2186
2187 /* contribution of even m */
2188
2189 for (k=0; k<Bnum[0][n1][m][i]; k++){
2190
2191 k1 = SindB[n1][2*m] + Bi[0][n1][m][i][k];
2192 csum1 += Bx[0][n1][m][i][k]*Vvec[j][k1];
2193 }
2194
2195 /* contribution of odd m */
2196
2197 for (k=0; k<Bnum[1][n1][m][i]; k++){
2198
2199 k1 = SindB[n1][2*m+1] + Bi[1][n1][m][i][k];
2200 csum1 += Bx[1][n1][m][i][k]*Vvec[j][k1];
2201 }
2202
2203 ISTmp[2*j*nsc+2*i ] -= creal(csum1);
2204 ISTmp[2*j*nsc+2*i+1] -= cimag(csum1);
2205 }
2206 }
2207
2208 } /* #pragma omp parallel */
2209
2210 /* MPI: IS */
2211
2212 if (1<numprocs2){
2213
2214 /* sending */
2215
2216 size0 = (ie1[myid2]-is1[myid2]+1)*nsc*2;
2217 if (size0<0) size0 = 0;
2218 k0 = 2*is1[myid2]*nsc;
2219 if (k0<0) k0 = 0;
2220
2221 for (ID=0; ID<numprocs2; ID++){
2222 MPI_Isend(&ISTmp[k0], size0, MPI_DOUBLE, ID, tag, MPI_Comm, &request_send[ID]);
2223 }
2224
2225 /* receiving */
2226
2227 for (ID=0; ID<numprocs2; ID++){
2228
2229 size1 = (ie1[ID]-is1[ID]+1)*nsc*2;
2230 if (size1<0) size1 = 0;
2231 k1 = 2*is1[ID]*nsc;
2232 if (k1<0) k1 = 0;
2233
2234 MPI_Irecv(&ISTmp[k1], size1, MPI_DOUBLE, ID, tag, MPI_Comm, &request_recv[ID]);
2235 }
2236
2237 /* Waitall */
2238
2239 MPI_Waitall(numprocs2,request_recv,stat_send);
2240 MPI_Waitall(numprocs2,request_send,stat_send);
2241 }
2242
2243 /* set up IS */
2244
2245 for (j=0; j<nsc; j++){
2246 for (i=0; i<nsc; i++){
2247 IS[n1][m][j*nsc+i].r = ISTmp[2*j*nsc+2*i ];
2248 IS[n1][m][j*nsc+i].i = ISTmp[2*j*nsc+2*i+1];
2249 }
2250 }
2251
2252 dtime(&etime1);
2253 time41 += etime1 - stime1;
2254
2255 /*************************************f******************
2256 MPI communication of Vvec using VvecTmp:
2257 In order to hide the communication, the MPI_Waitall are
2258 called after the calculation of the inverse of S
2259 ********************************************************/
2260
2261 for (j=is1[myid2]; j<=ie1[myid2]; j++){
2262 for (i=0; i<Nmat; i++){
2263 VvecTmp[2*j*Nmat+2*i ] = creal(Vvec[j][i]);
2264 VvecTmp[2*j*Nmat+2*i+1] = cimag(Vvec[j][i]);
2265 }
2266 }
2267
2268 if (1<numprocs2){
2269
2270 /* sending */
2271
2272 size0 = (ie1[myid2]-is1[myid2]+1)*Nmat*2;
2273 if (size0<0) size0 = 0;
2274 k0 = 2*is1[myid2]*Nmat;
2275 if (k0<0) k0 = 0;
2276
2277 for (ID=0; ID<numprocs2; ID++){
2278 MPI_Isend(&VvecTmp[k0], size0, MPI_DOUBLE, ID, tag, MPI_Comm, &request_send[ID]);
2279 }
2280
2281 /* receiving */
2282
2283 for (ID=0; ID<numprocs2; ID++){
2284
2285 size1 = (ie1[ID]-is1[ID]+1)*Nmat*2;
2286 if (size1<0) size1 = 0;
2287 k1 = 2*is1[ID]*Nmat;
2288 if (k1<0) k1 = 0;
2289
2290 MPI_Irecv(&VvecTmp[k1], size1, MPI_DOUBLE, ID, tag, MPI_Comm, &request_recv[ID]);
2291 }
2292 }
2293
2294 /*******************************
2295 call Lapack_LU_Zinverse
2296 to calculate the inverse of S
2297 *******************************/
2298
2299 dtime(&stime1);
2300
2301 if (0<nsc) Lapack_LU_Zinverse(nsc,IS[n1][m]);
2302
2303 dtime(&etime1);
2304 time42 += etime1 - stime1;
2305
2306 dtime(&etime);
2307 time4 += etime - stime;
2308
2309 /****************************************************
2310 To end MPI communication of VvecTmp, call waitall,
2311 ****************************************************/
2312
2313 if (1<numprocs2){
2314 MPI_Waitall(numprocs2,request_recv,stat_send);
2315 MPI_Waitall(numprocs2,request_send,stat_send);
2316 }
2317
2318 /*******************************
2319 store Vvec as Lvec
2320 *******************************/
2321
2322 for (j=0; j<nsc; j++){
2323 for (i=SindB[n1][2*m]; i<=EindB[n1][2*m]; i++){
2324 Lvec[0][n1][m][ (i-SindB[n1][2*m])*nsc+j ].r = VvecTmp[2*j*Nmat+2*i ];
2325 Lvec[0][n1][m][ (i-SindB[n1][2*m])*nsc+j ].i = VvecTmp[2*j*Nmat+2*i+1];
2326 }
2327 }
2328
2329 for (j=0; j<nsc; j++){
2330 for (i=SindB[n1][2*m+1]; i<=EindB[n1][2*m+1]; i++){
2331 Lvec[1][n1][m][ (i-SindB[n1][2*m+1])*nsc+j ].r = VvecTmp[2*j*Nmat+2*i ];
2332 Lvec[1][n1][m][ (i-SindB[n1][2*m+1])*nsc+j ].i = VvecTmp[2*j*Nmat+2*i+1];
2333 }
2334 }
2335
2336 /*****************************************
2337 set up Row2ID
2338 *****************************************/
2339
2340 ID = 0;
2341 for (i=0; i<(EindB[n1][2*m+1]-SindB[n1][2*m]+1); i++){
2342
2343 i2 = i + SindB[n1][2*m];
2344 Row2ID[i2] = ID;
2345 ID++;
2346 ID = ID % numprocs2;
2347 }
2348
2349 /*****************************************
2350 update entries in the inverse of A
2351 *****************************************/
2352
2353 dtime(&stime);
2354
2355 /*
2356 printf("n1=%2d m=%2d nsc=%2d\n",n1,m,nsc);
2357 */
2358
2359 if (0<nsc){
2360
2361 /* calculate L_{even}^t S^{-1} */
2362
2363 dtime(&stime1);
2364
2365 #pragma omp parallel shared(n1,m,EindB,SindB,Row2ID,myid2,nsc,IS,Lvec,Vvec2,numop2) private(OMPID,Nthrds,Nprocs,i,i2,j,j2,k,i3,j3,dcsum0,numop0)
2366 {
2367
2368 numop0 = 0.0;
2369
2370 /* get info. on OpenMP */
2371
2372 OMPID = omp_get_thread_num();
2373 Nthrds = omp_get_num_threads();
2374 Nprocs = omp_get_num_procs();
2375
2376 for ( i=OMPID; i<(EindB[n1][2*m]-SindB[n1][2*m]+1); i+=Nthrds ){
2377
2378 if ( Row2ID[i+SindB[n1][2*m]]==myid2 ){
2379
2380 i2 = i*nsc;
2381
2382 for (j=0; j<nsc; j++){
2383
2384 j2 = j*nsc;
2385
2386 dcsum0.r = 0.0; dcsum0.i = 0.0;
2387
2388 for (k=0; k<nsc; k++){
2389
2390 i3 = i2 + k;
2391 j3 = j2 + k;
2392
2393 dcsum0.r += IS[n1][m][j3].r*Lvec[0][n1][m][i3].r - IS[n1][m][j3].i*Lvec[0][n1][m][i3].i;
2394 dcsum0.i += IS[n1][m][j3].i*Lvec[0][n1][m][i3].r + IS[n1][m][j3].r*Lvec[0][n1][m][i3].i;
2395 }
2396
2397 Vvec2[i+SindB[n1][2*m]][j] = dcsum0.r + I*dcsum0.i;
2398 }
2399
2400 numop0 += (double)nsc*nsc;
2401 }
2402 }
2403
2404 /* calculate L_{odd}^t S^{-1} */
2405
2406 for ( i=OMPID; i<(EindB[n1][2*m+1]-SindB[n1][2*m+1]+1); i+=Nthrds ){
2407
2408 if ( Row2ID[i+SindB[n1][2*m+1]]==myid2 ){
2409
2410 i2 = i*nsc;
2411
2412 for (j=0; j<nsc; j++){
2413
2414 j2 = j*nsc;
2415
2416 dcsum0.r = 0.0; dcsum0.i = 0.0;
2417
2418 for (k=0; k<nsc; k++){
2419
2420 j3 = j2 + k;
2421 i3 = i2 + k;
2422
2423 dcsum0.r += IS[n1][m][j3].r*Lvec[1][n1][m][i3].r - IS[n1][m][j3].i*Lvec[1][n1][m][i3].i;
2424 dcsum0.i += IS[n1][m][j3].i*Lvec[1][n1][m][i3].r + IS[n1][m][j3].r*Lvec[1][n1][m][i3].i;
2425 }
2426
2427 Vvec2[i+SindB[n1][2*m+1]][j] = dcsum0.r + I*dcsum0.i;
2428 }
2429
2430 numop0 += (double)nsc*nsc;
2431
2432 }
2433 }
2434
2435 numop2 += numop0;
2436
2437 } /* #pragma omp parallel */
2438
2439 dtime(&etime1);
2440 time51 += etime1 - stime1;
2441
2442 /* update IAx */
2443
2444 dtime(&stime1);
2445
2446 n2 = 0;
2447
2448 for (m1=0; m1<m2; m1++){
2449
2450 m3 = m2*m + m1;
2451
2452 #pragma omp parallel shared(m2,m1,SindB,EindB,n2,m3,Row2ID,myid2,Anum,Ai,n1,m,EindC,SindC,Lvec,Vvec2,IAx) private(OMPID,Nthrds,Nprocs,k2,i,i2,j,j2,nsc2,k)
2453 {
2454
2455 double complex csum1,ctmp2;
2456
2457 /* get info. on OpenMP */
2458
2459 OMPID = omp_get_thread_num();
2460 Nthrds = omp_get_num_threads();
2461 Nprocs = omp_get_num_procs();
2462
2463 /* for even */
2464
2465 if (m2==1) k2 = 0;
2466 else k2 = ((m2/2)<=m1);
2467
2468 for ( i=SindB[n2][2*m3]+OMPID; i<=EindB[n2][2*m3]; i+=Nthrds ){
2469
2470 if (Row2ID[i]==myid2){
2471
2472 i2 = i - SindB[n2][2*m3];
2473
2474 for (j=0; j<Anum[2*m3][i2]; j++){
2475
2476 j2 = Ai[2*m3][i2][j] + SindB[0][2*m3] - SindB[n1][2*m+k2];
2477 csum1 = 0.0;
2478
2479 nsc2 = EindC[n1][m]-SindC[n1][m]+1;
2480
2481 for (k=0; k<nsc2; k++){
2482 ctmp2 = Lvec[k2][n1][m][j2*nsc2+k].r + I*Lvec[k2][n1][m][j2*nsc2+k].i;
2483 csum1 += Vvec2[i][k]*ctmp2;
2484 }
2485
2486 IAx[2*m3][i2][j] += csum1;
2487 }
2488 }
2489 }
2490
2491 /* for odd */
2492
2493 if (m2==1) k2 = 1;
2494 else k2 = ((m2/2)<=m1);
2495
2496 for ( i=SindB[n2][2*m3+1]+OMPID; i<=EindB[n2][2*m3+1]; i+=Nthrds ){
2497
2498 if (Row2ID[i]==myid2){
2499
2500 i2 = i - SindB[n2][2*m3+1];
2501
2502 for (j=0; j<Anum[2*m3+1][i2]; j++){
2503
2504 j2 = Ai[2*m3+1][i2][j] + SindB[0][2*m3+1] - SindB[n1][2*m+k2];
2505 csum1 = 0.0;
2506
2507 nsc2 = EindC[n1][m] - SindC[n1][m] + 1;
2508
2509 for (k=0; k<nsc2; k++){
2510 ctmp2 = Lvec[k2][n1][m][j2*nsc2+k].r + I*Lvec[k2][n1][m][j2*nsc2+k].i;
2511 csum1 += Vvec2[i][k]*ctmp2;
2512 }
2513
2514 IAx[2*m3+1][i2][j] += csum1;
2515 }
2516 }
2517 }
2518
2519 } /* #pragma omp parallel */
2520
2521 } /* m1 */
2522
2523 dtime(&etime1);
2524 time52 += etime1 - stime1;
2525
2526 /* update the outer IBx */
2527
2528 dtime(&stime1);
2529
2530 #pragma omp parallel shared(EindC,SindC,n1,m,Bnum,SindB,Bi,Row2ID,myid2,IBx,Vvec2) \
2531 private(OMPID,Nthrds,Nprocs,i,j,j2)
2532 {
2533
2534 /* get info. on OpenMP */
2535
2536 OMPID = omp_get_thread_num();
2537 Nthrds = omp_get_num_threads();
2538 Nprocs = omp_get_num_procs();
2539
2540 for ( i=OMPID; i<(EindC[n1][m]-SindC[n1][m]+1); i+=Nthrds ){
2541
2542 /* for even */
2543
2544 for (j=0; j<Bnum[0][n1][m][i]; j++){
2545
2546 j2 = SindB[n1][2*m] + Bi[0][n1][m][i][j];
2547
2548 if (Row2ID[j2]==myid2){
2549 IBx[0][n1][m][i][j] = -Vvec2[j2][i];
2550 }
2551 }
2552
2553 /* for odd */
2554
2555 for (j=0; j<Bnum[1][n1][m][i]; j++){
2556
2557 j2 = SindB[n1][2*m+1] + Bi[1][n1][m][i][j];
2558
2559 if (Row2ID[j2]==myid2){
2560 IBx[1][n1][m][i][j] = -Vvec2[j2][i];
2561 }
2562 }
2563
2564 } /* i */
2565
2566 } /* #pragma omp parallel */
2567
2568 dtime(&etime1);
2569 time53 += etime1 - stime1;
2570
2571 /* update the inner IBx */
2572
2573 dtime(&stime1);
2574
2575 #pragma omp parallel shared(m2,n1,m,SindC,EindC,Row2ID,myid2,Bnum,Bi,SindB,Lvec,Vvec2,IBx) \
2576 private(OMPID,Nthrds,Nprocs,mp0,n2,m1,m3,k2,i,i2,j,j2,nsc2,k)
2577 {
2578
2579 double complex ctmp1,csum1;
2580
2581 /* get info. on OpenMP */
2582
2583 OMPID = omp_get_thread_num();
2584 Nthrds = omp_get_num_threads();
2585 Nprocs = omp_get_num_procs();
2586
2587 mp0 = m2;
2588
2589 for (n2=0; n2<n1; n2++){
2590 for (m1=0; m1<mp0; m1++){
2591
2592 m3 = mp0*m + m1;
2593 k2 = ((mp0/2)<=m1);
2594
2595 for ( i=SindC[n2][m3]+OMPID; i<=EindC[n2][m3]; i+=Nthrds ){
2596
2597 if (Row2ID[i]==myid2){
2598
2599 i2 = i - SindC[n2][m3];
2600
2601 /* for even */
2602
2603 for (j=0; j<Bnum[0][n2][m3][i2]; j++){
2604
2605 j2 = Bi[0][n2][m3][i2][j] + SindB[n2][2*m3] - SindB[n1][2*m+k2];
2606 csum1 = 0.0;
2607
2608 nsc2 = EindC[n1][m]-SindC[n1][m]+1;
2609
2610 for (k=0; k<nsc2; k++){
2611
2612 ctmp1 = Lvec[k2][n1][m][j2*nsc2+k].r + I*Lvec[k2][n1][m][j2*nsc2+k].i;
2613 csum1 += Vvec2[i][k]*ctmp1;
2614 }
2615
2616 IBx[0][n2][m3][i2][j] += csum1;
2617 }
2618
2619 /* for odd */
2620
2621 for (j=0; j<Bnum[1][n2][m3][i2]; j++){
2622
2623 j2 = Bi[1][n2][m3][i2][j] + SindB[n2][2*m3+1] - SindB[n1][2*m+k2];
2624 csum1 = 0.0;
2625
2626 nsc2 = EindC[n1][m]-SindC[n1][m]+1;
2627
2628 for (k=0; k<nsc2; k++){
2629 ctmp1 = Lvec[k2][n1][m][j2*nsc2+k].r + I*Lvec[k2][n1][m][j2*nsc2+k].i;
2630 csum1 += Vvec2[i][k]*ctmp1;
2631 }
2632
2633 IBx[1][n2][m3][i2][j] += csum1;
2634 }
2635
2636 } /* if (Row2ID[i]==myid2) */
2637 }
2638
2639 } /* m1 */
2640
2641 mp0 /= 2;
2642
2643 } /* n2 */
2644
2645 } /* #pragma omp parallel */
2646
2647 dtime(&etime1);
2648 time54 += etime1 - stime1;
2649
2650 /* update the inner ICx */
2651
2652 dtime(&stime1);
2653
2654 #pragma omp parallel shared(m2,n1,m,SindC,EindC,Row2ID,myid2,Cnum,Ci,SindB,Lvec,Vvec2,ICx) \
2655 private(OMPID,Nthrds,Nprocs,mp0,n2,m1,m3,k2,i,i2,j,j2,nsc2,k)
2656 {
2657
2658 double complex ctmp1,csum1;
2659
2660 /* get info. on OpenMP */
2661
2662 OMPID = omp_get_thread_num();
2663 Nthrds = omp_get_num_threads();
2664 Nprocs = omp_get_num_procs();
2665
2666 mp0 = m2;
2667
2668 for (n2=0; n2<n1; n2++){
2669 for (m1=0; m1<mp0; m1++){
2670
2671 m3 = mp0*m + m1;
2672 k2 = ((mp0/2)<=m1);
2673
2674 for ( i=SindC[n2][m3]+OMPID; i<=EindC[n2][m3]; i+=Nthrds ){
2675
2676 if (Row2ID[i]==myid2){
2677
2678 i2 = i - SindC[n2][m3];
2679
2680 for (j=0; j<Cnum[n2][m3][i2]; j++){
2681
2682 j2 = Ci[n2][m3][i2][j] + SindC[n2][m3] - SindB[n1][2*m+k2];
2683 csum1 = 0.0;
2684
2685 nsc2 = EindC[n1][m]-SindC[n1][m]+1;
2686
2687 for (k=0; k<nsc2; k++){
2688 ctmp1 = Lvec[k2][n1][m][j2*nsc2+k].r + I*Lvec[k2][n1][m][j2*nsc2+k].i;
2689 csum1 += Vvec2[i][k]*ctmp1;
2690 }
2691
2692 ICx[n2][m3][i2][j] += csum1;
2693
2694 } /* j */
2695 } /* if (Row2ID[i]==myid2) */
2696 } /* i */
2697 } /* m1 */
2698
2699 mp0 /= 2;
2700
2701 } /* n2 */
2702
2703 } /* #pragma omp parallel */
2704
2705 dtime(&etime1);
2706 time55 += etime1 - stime1;
2707
2708 /* update the outer ICx */
2709
2710 if (myid2==0){
2711
2712 for (i=0; i<nsc; i++){
2713 for (j=0; j<Cnum[n1][m][i]; j++){
2714
2715 j2 = Ci[n1][m][i][j];
2716 ICx[n1][m][i][j] = IS[n1][m][j2*nsc+i].r + I*IS[n1][m][j2*nsc+i].i;
2717 }
2718 }
2719 }
2720
2721 } /* if (0<nsc), end of "update entries in the inverse of A" */
2722
2723 dtime(&etime);
2724 time5 += etime - stime;
2725
2726 } /* m */
2727
2728 /* update m2 and mp */
2729
2730 m2 *= 2;
2731 mp /= 2;
2732
2733 } /* n1 */
2734
2735 /***********************************************************
2736 add the part of inverse to DMall
2737 ***********************************************************/
2738
2739 dtime(&stime);
2740
2741 mp = mpmax/2;
2742 for (n=0; n<nl; n++){
2743 for (m=0; m<mp; m++){
2744 for (i=SindC[n][m]; i<=EindC[n][m]; i++){
2745 for (j=0; j<Bnum[0][n][m][i-SindC[n][m]]; j++){
2746
2747 j1 = Bi[0][n][m][i-SindC[n][m]][j];
2748 j2 = j1 + SindB[n][2*m];
2749 i2 = i - SindC[n][m];
2750
2751 i3 = Index_BC[nl][i];
2752 j3 = Index_BC[nl][j2];
2753 k = conv_ind2[i3][j3];
2754 Tx[k] = creal(weight*IBx[0][n][m][i2][j]);
2755
2756 k = conv_ind2[j3][i3];
2757 Tx[k] = creal(weight*IBx[0][n][m][i2][j]);
2758 }
2759 }
2760 }
2761
2762 mp /= 2;
2763 }
2764
2765 mp = mpmax/2;
2766 for (n=0; n<nl; n++){
2767 for (m=0; m<mp; m++){
2768 for (i=SindC[n][m]; i<=EindC[n][m]; i++){
2769 for (j=0; j<Bnum[1][n][m][i-SindC[n][m]]; j++){
2770
2771 j1 = Bi[1][n][m][i-SindC[n][m]][j];
2772 j2 = j1 + SindB[n][2*m+1];
2773 i2 = i - SindC[n][m];
2774
2775 i3 = Index_BC[nl][i];
2776 j3 = Index_BC[nl][j2];
2777 k = conv_ind2[i3][j3];
2778 Tx[k] = creal(weight*IBx[1][n][m][i2][j]);
2779
2780 k = conv_ind2[j3][i3];
2781 Tx[k] = creal(weight*IBx[1][n][m][i2][j]);
2782 }
2783 }
2784 }
2785
2786 mp /= 2;
2787 }
2788
2789 mp = mpmax/2;
2790 for (n=0; n<nl; n++){
2791 for (m=0; m<mp; m++){
2792 for (i=SindC[n][m]; i<=EindC[n][m]; i++){
2793 for (j=0; j<Cnum[n][m][i-SindC[n][m]]; j++){
2794
2795 j1 = Ci[n][m][i-SindC[n][m]][j];
2796 j2 = j1 + SindC[n][m];
2797 i2 = i - SindC[n][m];
2798
2799 i3 = Index_BC[nl][i];
2800 j3 = Index_BC[nl][j2];
2801 k = conv_ind2[i3][j3];
2802
2803 Tx[k] = creal(weight*ICx[n][m][i2][j]);
2804 }
2805 }
2806 }
2807
2808 mp /= 2;
2809 }
2810
2811 mp = mpmax;
2812 for (m=0; m<mp; m++){
2813 for (i=SindB[0][m]; i<=EindB[0][m]; i++){
2814 for (j=0; j<Anum[m][i-SindB[0][m]]; j++){
2815
2816 j1 = Ai[m][i-SindB[0][m]][j];
2817 j2 = j1 + SindB[0][m];
2818 i2 = i - SindB[0][m];
2819
2820 i3 = Index_BC[nl][i];
2821 j3 = Index_BC[nl][j2];
2822
2823 k = conv_ind2[i3][j3];
2824
2825 Tx[k] = creal(weight*IAx[m][i2][j]);
2826 }
2827 }
2828 }
2829
2830 if (strcasecmp(mode,"contour")==0){
2831
2832 for (i=0; i<TNZE; i++){
2833
2834 row = conv_index_row[i]; /* row index */
2835 col = conv_index_col[i]; /* column index */
2836 k = conv_ind2[row][col];
2837
2838 DMall[spin][i] += Tx[k];
2839 }
2840 }
2841 else if (strcasecmp(mode,"retarded")==0){
2842
2843 sum = 0.0;
2844 for (i=0; i<TNZE; i++){
2845
2846 row = conv_index_row[i]; /* row index */
2847 col = conv_index_col[i]; /* column index */
2848 k = conv_ind2[row][col];
2849 sum += Tx[k]*Sall[i];
2850 }
2851
2852 WRG[spin][Epoint] = sum;
2853 }
2854
2855 else if (strcasecmp(mode,"force")==0){
2856
2857 for (i=0; i<TNZE; i++){
2858
2859 row = conv_index_row[i]; /* row index */
2860 col = conv_index_col[i]; /* column index */
2861 k = conv_ind2[row][col];
2862
2863 DMall[spin][i] += Tx[k];
2864 }
2865 }
2866
2867 dtime(&etime);
2868 time6 = etime - stime;
2869
2870 if (0<measure_time){
2871
2872 printf("OND myid=%2d numop1=%15.12f\n",myid,numop1); fflush(stdout);
2873 printf("OND myid=%2d numop2=%15.12f\n",myid,numop2); fflush(stdout);
2874 printf("OND time1 =%15.12f\n",time1); fflush(stdout);
2875 printf("OND time1a=%15.12f\n",time1a); fflush(stdout);
2876 printf("ONDtime1a0=%15.12f\n",time1a0); fflush(stdout);
2877 printf("ONDtime1a1=%15.12f\n",time1a1); fflush(stdout);
2878 printf("ONDtime1a2=%15.12f\n",time1a2); fflush(stdout);
2879 printf("OND time1b=%15.12f\n",time1b); fflush(stdout);
2880 printf("OND time1c=%15.12f\n",time1c); fflush(stdout);
2881 printf("OND time1d=%15.12f\n",time1d); fflush(stdout);
2882 printf("OND time2 =%15.12f\n",time2); fflush(stdout);
2883 printf("OND time3 =%15.12f\n",time3); fflush(stdout);
2884 printf("OND time31=%15.12f\n",time31); fflush(stdout);
2885 printf("OND time32=%15.12f\n",time32); fflush(stdout);
2886 printf("OND time33=%15.12f\n",time33); fflush(stdout);
2887 printf("OND time34=%15.12f\n",time34); fflush(stdout);
2888 printf("OND time4 =%15.12f\n",time4); fflush(stdout);
2889 printf("OND time41=%15.12f\n",time41); fflush(stdout);
2890 printf("OND time42=%15.12f\n",time42); fflush(stdout);
2891 printf("OND time5 =%15.12f\n",time5); fflush(stdout);
2892 printf("OND time51=%15.12f\n",time51); fflush(stdout);
2893 printf("OND time52=%15.12f\n",time52); fflush(stdout);
2894 printf("OND time53=%15.12f\n",time53); fflush(stdout);
2895 printf("OND time54=%15.12f\n",time54); fflush(stdout);
2896 printf("OND time55=%15.12f\n",time55); fflush(stdout);
2897 printf("OND myid=%2d time6 =%15.12f\n",myid,time6); fflush(stdout);
2898 }
2899
2900 /* free arrays */
2901
2902 free(Row2ID);
2903
2904 free(ie1);
2905 free(is1);
2906
2907 for (i=0; i<max_nsc; i++){
2908 free(Q0[i]);
2909 }
2910 free(Q0);
2911
2912 for (i=0; i<max_nsc; i++){
2913 free(Q1[i]);
2914 }
2915 free(Q1);
2916
2917 free(VvecTmp);
2918 free(ISTmp);
2919
2920 for (i=0; i<max_nsc; i++){
2921 free(Vvec[i]);
2922 }
2923 free(Vvec);
2924
2925 for (i=0; i<Nmat; i++){
2926 free(Vvec2[i]);
2927 }
2928 free(Vvec2);
2929
2930 mp = mpmax/2;
2931 for (n=0; n<nl; n++){
2932 for (m=0; m<mp; m++){
2933 free(Lvec[0][n][m]);
2934 }
2935 free(Lvec[0][n]);
2936 mp /= 2;
2937 }
2938 free(Lvec[0]);
2939
2940 mp = mpmax/2;
2941 for (n=0; n<nl; n++){
2942 for (m=0; m<mp; m++){
2943 free(Lvec[1][n][m]);
2944 }
2945 free(Lvec[1][n]);
2946 mp /= 2;
2947 }
2948 free(Lvec[1]);
2949
2950 for (m=0; m<mpmax; m++){
2951
2952 nsa = EindB[0][m] - SindB[0][m] + 1;
2953
2954 Anum[m] = (int*)malloc(sizeof(int)*nsa);
2955
2956 for (i=SindB[0][m]; i<=EindB[0][m]; i++){
2957
2958 i2 = i - SindB[0][m];
2959
2960 free(Ai[m][i2]);
2961 free(Ax[m][i2]);
2962 free(IAx[m][i2]);
2963 }
2964
2965 free(Ai[m]);
2966 free(Ax[m]);
2967 free(IAx[m]);
2968 free(Anum[m]);
2969 }
2970 free(Ai);
2971 free(Ax);
2972 free(IAx);
2973 free(Anum);
2974
2975 for (m=0; m<mp; m++){
2976 free(IA[m]);
2977 }
2978 free(IA);
2979
2980 free(IA0);
2981
2982 mp = mpmax/2;
2983 for (n=0; n<nl; n++){
2984 for (m=0; m<mp; m++){
2985 free(IS[n][m]);
2986 }
2987 free(IS[n]);
2988 mp /= 2;
2989 }
2990 free(IS);
2991
2992 mp = mpmax/2;
2993
2994 for (n=0; n<nl; n++){
2995 for (m=0; m<mp; m++){
2996 for (i=SindC[n][m]; i<=EindC[n][m]; i++){
2997
2998 free(IBx[0][n][m][i-SindC[n][m]]);
2999 free(IBx[1][n][m][i-SindC[n][m]]);
3000 free(ICx[n][m][i-SindC[n][m]]);
3001
3002 free(Bx[0][n][m][i-SindC[n][m]]);
3003 free(Bx[1][n][m][i-SindC[n][m]]);
3004 free(Cx[n][m][i-SindC[n][m]]);
3005
3006 free(Bi[0][n][m][i-SindC[n][m]]);
3007 free(Bi[1][n][m][i-SindC[n][m]]);
3008 free(Ci[n][m][i-SindC[n][m]]);
3009
3010 } /* i */
3011
3012 free(IBx[0][n][m]);
3013 free(IBx[1][n][m]);
3014 free(ICx[n][m]);
3015
3016 free(Bx[0][n][m]);
3017 free(Bx[1][n][m]);
3018 free(Cx[n][m]);
3019
3020 free(Bi[0][n][m]);
3021 free(Bi[1][n][m]);
3022 free(Ci[n][m]);
3023
3024 free(Bnum[0][n][m]);
3025 free(Bnum[1][n][m]);
3026 free(Cnum[n][m]);
3027
3028 } /* m */
3029
3030 free(IBx[0][n]);
3031 free(IBx[1][n]);
3032 free(ICx[n]);
3033
3034 free(Bx[0][n]);
3035 free(Bx[1][n]);
3036 free(Cx[n]);
3037
3038 free(Bi[0][n]);
3039 free(Bi[1][n]);
3040 free(Ci[n]);
3041
3042 free(Bnum[0][n]);
3043 free(Bnum[1][n]);
3044 free(Cnum[n]);
3045
3046 mp /= 2;
3047
3048 } /* n */
3049
3050 free(IBx[0]);
3051 free(IBx[1]);
3052 free(ICx);
3053
3054 free(Bx[0]);
3055 free(Bx[1]);
3056 free(Cx);
3057
3058 free(Bi[0]);
3059 free(Bi[1]);
3060 free(Ci);
3061
3062 free(Bnum[0]);
3063 free(Bnum[1]);
3064 free(Cnum);
3065
3066 free(invp);
3067
3068 free(Ti);
3069 free(Tx);
3070 free(Tp);
3071 free(Tp0);
3072
3073 free(request_recv);
3074 free(request_send);
3075 free(stat_send);
3076 }
3077
3078
3079
3080
3081
3082
3083
qsort_complex(long n,long * a,double complex * b)3084 static void qsort_complex(long n, long *a, double complex *b)
3085 {
3086 long int i;
3087 clists *AB;
3088
3089 AB = (clists*)malloc(sizeof(clists)*n);
3090
3091 for (i=0; i<n; i++){
3092 AB[i].a = a[i];
3093 AB[i].b = b[i];
3094 }
3095
3096 qsort(AB, n, sizeof(clists), (int(*)(const void*, const void*))clists_cmp);
3097
3098 for (i=0; i<n; i++){
3099 a[i] = AB[i].a;
3100 b[i] = AB[i].b;
3101 }
3102
3103 free(AB);
3104 }
3105
3106
clists_cmp(const clists * x,const clists * y)3107 static int clists_cmp(const clists *x, const clists *y)
3108 {
3109 return (x->a < y->a ? -1 :
3110 y->a < x->a ? 1 : 0);
3111 }
3112
3113
3114
3115
Cluster_collinear_ON2_Force(int SCF_iter,int SpinP_switch,double *** C,double ** ko,double ***** nh,double **** CntOLP,double ***** CDM,double ***** EDM,double Eele0[2],double Eele1[2])3116 static double Cluster_collinear_ON2_Force(
3117 int SCF_iter,
3118 int SpinP_switch,
3119 double ***C,
3120 double **ko,
3121 double *****nh, double ****CntOLP,
3122 double *****CDM,
3123 double *****EDM,
3124 double Eele0[2], double Eele1[2])
3125 {
3126 static int firsttime=1;
3127 double TStime,TEtime;
3128 double stime,etime;
3129 int numprocs,myid,ID,tag=999;
3130 int i,j,k,k2,wan,wanA,wanB,Epoint,column;
3131 int tnoA,tnoB,LB_AN,i1,j1,l;
3132 int MA_AN,GA_AN,GB_AN,Gc_AN,h_AN,ni,nj,TNZE;
3133 int Cwan,Hwan,Gh_AN,Anum,tnum,num,num4;
3134 int loop1,Nloop1,Nmat,spin,NEV,imax;
3135 int *NZE,*PZE,*MP;
3136 double **Hall,*Sall;
3137 double **EDMall;
3138 double **eval,**evec;
3139 double **WRG;
3140 double eachsum,totalsum,pChemP;
3141 double TZ,ChemP_MIN,ChemP_MAX,ymax;
3142 double Dnum,spin_degeneracy;
3143 double totalsum1,f0,f1,x1,y1,x0,y0;
3144 double x,Trial_ChemP,dE_step;
3145 double cTrial_ChemP,ChemP0;
3146 double My_Eele1[2];
3147 int po,po3,po4,po6,loopN,free_flag;
3148 int num_loop,end_flag;
3149 int Depth_Level,Number_Division;
3150 int *conv_index_row;
3151 int *conv_index_col;
3152 int **conv_ind2;
3153 int *My_NZeros;
3154 int *is1,*ie1;
3155 /* for MPI */
3156 int myworld1;
3157 int numprocs1,myid1;
3158 int numprocs2,myid2;
3159 int Num_Comm_World1;
3160 int *NPROCS_ID1,*NPROCS_WD1;
3161 int *Comm_World1;
3162 int *Comm_World_StartID1;
3163 MPI_Comm *MPI_CommWD1;
3164 int myworld2B;
3165 int numprocs2B,myid2B;
3166 int Num_Comm_World2B;
3167 int *NPROCS_ID2B,*NPROCS_WD2B;
3168 int *Comm_World2B;
3169 int *Comm_World_StartID2B;
3170 MPI_Comm *MPI_CommWD2B;
3171
3172 /* MPI */
3173 MPI_Comm_size(mpi_comm_level1,&numprocs);
3174 MPI_Comm_rank(mpi_comm_level1,&myid);
3175
3176 /* for elapsed time */
3177 MPI_Barrier(mpi_comm_level1);
3178 dtime(&TStime);
3179
3180 if (0<measure_time) dtime(&stime);
3181
3182 /****************************************************
3183 set up the first communication world
3184 ****************************************************/
3185
3186 Num_Comm_World1 = SpinP_switch + 1;
3187
3188 NPROCS_ID1 = (int*)malloc(sizeof(int)*numprocs);
3189 Comm_World1 = (int*)malloc(sizeof(int)*numprocs);
3190 NPROCS_WD1 = (int*)malloc(sizeof(int)*Num_Comm_World1);
3191 Comm_World_StartID1 = (int*)malloc(sizeof(int)*Num_Comm_World1);
3192 MPI_CommWD1 = (MPI_Comm*)malloc(sizeof(MPI_Comm)*Num_Comm_World1);
3193
3194 Make_Comm_Worlds(mpi_comm_level1, myid, numprocs, Num_Comm_World1, &myworld1, MPI_CommWD1,
3195 NPROCS_ID1, Comm_World1, NPROCS_WD1, Comm_World_StartID1);
3196
3197 /****************************************************
3198 calculation of dimension of matrix
3199 ****************************************************/
3200
3201 Nmat = 0;
3202 for (i=1; i<=atomnum; i++){
3203 wanA = WhatSpecies[i];
3204 Nmat += Spe_Total_CNO[wanA];
3205 }
3206
3207 /****************************************************
3208 total core charge
3209 ****************************************************/
3210
3211 TZ = 0.0;
3212 for (i=1; i<=atomnum; i++){
3213 wan = WhatSpecies[i];
3214 TZ = TZ + Spe_Core_Charge[wan];
3215 }
3216
3217 /****************************************************
3218 calculate TNZE
3219 ****************************************************/
3220
3221 TNZE = 0;
3222 for (Gc_AN=1; Gc_AN<=atomnum; Gc_AN++){
3223 Cwan = WhatSpecies[Gc_AN];
3224
3225 nj = 0;
3226 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
3227 Gh_AN = natn[Gc_AN][h_AN];
3228 Hwan = WhatSpecies[Gh_AN];
3229 nj += Spe_Total_NO[Hwan];
3230 }
3231
3232 TNZE += Spe_Total_NO[Cwan]*nj;
3233 }
3234
3235 /****************************************************
3236 allocate arrays
3237 ****************************************************/
3238
3239 WRG = (double**)malloc(sizeof(double*)*(SpinP_switch+1));
3240 for (spin=0; spin<(SpinP_switch+1); spin++){
3241 WRG[spin] = (double*)malloc(sizeof(double)*ON2_Npoles_f);
3242 for (i=0; i<ON2_Npoles_f; i++) WRG[spin][i] = 0.0;
3243 }
3244
3245 conv_ind2 = (int**)malloc(sizeof(int*)*Nmat);
3246 for (i=0; i<Nmat; i++){
3247 conv_ind2[i] = (int*)malloc(sizeof(int)*Nmat);
3248 }
3249
3250 EDMall = (double**)malloc(sizeof(double*)*(SpinP_switch+1));
3251 for (spin=0; spin<(SpinP_switch+1); spin++){
3252 EDMall[spin] = (double*)malloc(sizeof(double)*TNZE);
3253 for (i=0; i<TNZE; i++) EDMall[spin][i] = 0.0;
3254 }
3255
3256 Hall = (double**)malloc(sizeof(double*)*(SpinP_switch+1));
3257 for (spin=0; spin<(SpinP_switch+1); spin++){
3258 Hall[spin] = (double*)malloc(sizeof(double)*TNZE);
3259 }
3260
3261 Sall = (double*)malloc(sizeof(double)*TNZE);
3262
3263 MP = (int*)malloc(sizeof(int)*(atomnum+1));
3264
3265 /* set MP */
3266
3267 Anum = 1;
3268 for (i=1; i<=atomnum; i++){
3269 MP[i] = Anum;
3270 wanA = WhatSpecies[i];
3271 Anum += Spe_Total_CNO[wanA];
3272 }
3273
3274 conv_index_row = (int*)malloc(sizeof(int)*TNZE);
3275 conv_index_col = (int*)malloc(sizeof(int)*TNZE);
3276
3277 /******************************************
3278 MPI communication of nh and CntOLP
3279 ******************************************/
3280
3281 /* allocation of arrays */
3282
3283 My_NZeros = (int*)malloc(sizeof(int)*numprocs);
3284 is1 = (int*)malloc(sizeof(int)*numprocs);
3285 ie1 = (int*)malloc(sizeof(int)*numprocs);
3286
3287 /* find my total number of non-zero elements in myid */
3288
3289 My_NZeros[myid] = 0;
3290 for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
3291 GA_AN = M2G[MA_AN];
3292 wanA = WhatSpecies[GA_AN];
3293 tnoA = Spe_Total_CNO[wanA];
3294
3295 num = 0;
3296 for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
3297 GB_AN = natn[GA_AN][LB_AN];
3298 wanB = WhatSpecies[GB_AN];
3299 tnoB = Spe_Total_CNO[wanB];
3300 num += tnoB;
3301 }
3302
3303 My_NZeros[myid] += tnoA*num;
3304 }
3305
3306 for (ID=0; ID<numprocs; ID++){
3307 MPI_Bcast(&My_NZeros[ID],1,MPI_INT,ID,mpi_comm_level1);
3308 }
3309
3310 tnum = 0;
3311 for (ID=0; ID<numprocs; ID++){
3312 tnum += My_NZeros[ID];
3313 }
3314
3315 is1[0] = 0;
3316 ie1[0] = My_NZeros[0] - 1;
3317
3318 for (ID=1; ID<numprocs; ID++){
3319 is1[ID] = ie1[ID-1] + 1;
3320 ie1[ID] = is1[ID] + My_NZeros[ID] - 1;
3321 }
3322
3323 /******************
3324 for nh
3325 ******************/
3326
3327 for (spin=0; spin<=SpinP_switch; spin++){
3328
3329 /* set nh */
3330
3331 k = is1[myid];
3332 for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
3333 GA_AN = M2G[MA_AN];
3334 wanA = WhatSpecies[GA_AN];
3335 tnoA = Spe_Total_CNO[wanA];
3336 for (i=0; i<tnoA; i++){
3337 for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
3338 GB_AN = natn[GA_AN][LB_AN];
3339 wanB = WhatSpecies[GB_AN];
3340 tnoB = Spe_Total_CNO[wanB];
3341 for (j=0; j<tnoB; j++){
3342 Hall[spin][k] = nh[spin][MA_AN][LB_AN][i][j];
3343 k++;
3344 }
3345 }
3346 }
3347 }
3348
3349 /* MPI Hall */
3350
3351 MPI_Barrier(mpi_comm_level1);
3352
3353 for (ID=0; ID<numprocs; ID++){
3354 k = is1[ID];
3355 MPI_Bcast(&Hall[spin][k], My_NZeros[ID], MPI_DOUBLE, ID, mpi_comm_level1);
3356 }
3357
3358 }
3359
3360 /******************
3361 for CntOLP
3362 ******************/
3363
3364 k = is1[myid];
3365 for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
3366 GA_AN = M2G[MA_AN];
3367 wanA = WhatSpecies[GA_AN];
3368 tnoA = Spe_Total_CNO[wanA];
3369 for (i=0; i<tnoA; i++){
3370 for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
3371 GB_AN = natn[GA_AN][LB_AN];
3372 wanB = WhatSpecies[GB_AN];
3373 tnoB = Spe_Total_CNO[wanB];
3374 for (j=0; j<tnoB; j++){
3375 Sall[k] = CntOLP[MA_AN][LB_AN][i][j];
3376 k++;
3377 }
3378 }
3379 }
3380 }
3381
3382 /* MPI S1 */
3383
3384 MPI_Barrier(mpi_comm_level1);
3385
3386 for (ID=0; ID<numprocs; ID++){
3387 k = is1[ID];
3388 MPI_Bcast(&Sall[k], My_NZeros[ID], MPI_DOUBLE, ID, mpi_comm_level1);
3389 }
3390
3391 /******************************************
3392 set up conv_index_row and conv_index_col
3393 ******************************************/
3394
3395 k = is1[myid];
3396
3397 for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
3398 GA_AN = M2G[MA_AN];
3399 wanA = WhatSpecies[GA_AN];
3400 tnoA = Spe_Total_CNO[wanA];
3401 for (i=0; i<tnoA; i++){
3402 for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
3403 GB_AN = natn[GA_AN][LB_AN];
3404 wanB = WhatSpecies[GB_AN];
3405 tnoB = Spe_Total_CNO[wanB];
3406 for (j=0; j<tnoB; j++){
3407
3408 conv_index_row[k] = MP[GA_AN] + i - 1;
3409 conv_index_col[k] = MP[GB_AN] + j - 1;
3410 k++;
3411 }
3412 }
3413 }
3414 }
3415
3416 for (ID=0; ID<numprocs; ID++){
3417 k = is1[ID];
3418 MPI_Bcast(&conv_index_row[k], My_NZeros[ID], MPI_INT, ID, mpi_comm_level1);
3419 }
3420
3421 for (ID=0; ID<numprocs; ID++){
3422 k = is1[ID];
3423 MPI_Bcast(&conv_index_col[k], My_NZeros[ID], MPI_INT, ID, mpi_comm_level1);
3424 }
3425
3426 /****************************************************
3427 set up NZE and PZE
3428 ****************************************************/
3429
3430 NZE = (int*)malloc(sizeof(int)*Nmat);
3431 PZE = (int*)malloc(sizeof(int)*Nmat);
3432
3433 for (i=0; i<Nmat; i++){
3434 NZE[i] = 0;
3435 PZE[i] = -1;
3436 }
3437
3438 for (k=0; k<TNZE; k++){
3439
3440 i = conv_index_row[k];
3441
3442 NZE[i]++;
3443 if (PZE[i]==-1) PZE[i] = k;
3444 }
3445
3446 /* freeing of arrays */
3447
3448 free(My_NZeros);
3449 free(ie1);
3450
3451 if (0<measure_time){
3452 dtime(&etime);
3453 printf("myid=%2d OND_Solver time1=%15.12f\n",myid,etime-stime);fflush(stdout);
3454 }
3455
3456 /*************************************************************************
3457 *************************************************************************
3458
3459 Part 0: nested dissection of matrix
3460
3461 *************************************************************************
3462 *************************************************************************/
3463
3464 free_flag = 0;
3465 Nested_Dissection(free_flag,Nmat,MP,&Depth_Level,&Number_Division);
3466
3467 /*************************************************************************
3468 *************************************************************************
3469
3470 Part 1: contour integration with a chemical potential \mu_0
3471
3472 main loop which is decomposed to
3473 spin and energy point
3474
3475 *************************************************************************
3476 *************************************************************************/
3477
3478 MPI_Barrier(mpi_comm_level1);
3479
3480 Nloop1 = (SpinP_switch+1)*ON2_Npoles_f;
3481
3482 if (Nloop1<=numprocs){
3483
3484 Num_Comm_World2B = Nloop1;
3485
3486 NPROCS_ID2B = (int*)malloc(sizeof(int)*numprocs);
3487 Comm_World2B = (int*)malloc(sizeof(int)*numprocs);
3488 NPROCS_WD2B = (int*)malloc(sizeof(int)*Num_Comm_World2B);
3489 Comm_World_StartID2B = (int*)malloc(sizeof(int)*Num_Comm_World2B);
3490 MPI_CommWD2B = (MPI_Comm*)malloc(sizeof(MPI_Comm)*Num_Comm_World2B);
3491
3492 Make_Comm_Worlds(mpi_comm_level1, myid, numprocs, Num_Comm_World2B, &myworld2B, MPI_CommWD2B,
3493 NPROCS_ID2B, Comm_World2B, NPROCS_WD2B, Comm_World_StartID2B);
3494
3495 MPI_Comm_size(MPI_CommWD2B[myworld2B],&numprocs2);
3496 MPI_Comm_rank(MPI_CommWD2B[myworld2B],&myid2);
3497 }
3498 else {
3499 numprocs2 = 1;
3500 myid2 = 0;
3501 }
3502
3503 for (spin=0; spin<(SpinP_switch+1); spin++){
3504 for (i=0; i<TNZE; i++) EDMall[spin][i] = 0.0;
3505 }
3506
3507 if (Nloop1<=numprocs){
3508
3509 loop1 = myworld2B;
3510
3511 /* get spin, Epoint, and column */
3512 spin = loop1/ON2_Npoles_f;
3513 Epoint = loop1 - spin*ON2_Npoles_f;
3514
3515 OND_Solver("force",
3516 myid,numprocs,
3517 myid2,numprocs2,
3518 Nloop1,
3519 MPI_CommWD2B[myworld2B],
3520 spin,Epoint,0.0,
3521 Nmat,NZE,PZE,
3522 Depth_Level,
3523 Number_Division,
3524 ChemP,
3525 Hall,Sall,EDMall,
3526 WRG,
3527 conv_index_row,
3528 conv_index_col,
3529 conv_ind2,
3530 TNZE);
3531 }
3532
3533 else {
3534
3535 for ( loop1=myid; loop1<Nloop1; loop1+=numprocs ){
3536
3537 /* get spin, Epoint, and column */
3538 spin = loop1/ON2_Npoles_f;
3539 Epoint = loop1 - spin*ON2_Npoles_f;
3540
3541 /* call OND_Solver */
3542
3543 OND_Solver("force",
3544 myid,numprocs,
3545 myid2,numprocs2,
3546 Nloop1,
3547 mpi_comm_level1,
3548 spin,Epoint,0.0,
3549 Nmat,NZE,PZE,
3550 Depth_Level,
3551 Number_Division,
3552 ChemP,
3553 Hall,Sall,EDMall,
3554 WRG,
3555 conv_index_row,
3556 conv_index_col,
3557 conv_ind2,
3558 TNZE);
3559
3560 }
3561 } /* else */
3562
3563 /*************************************************************************
3564 store the energy density matrix into an array EDM
3565 **************************************************************************/
3566
3567 MPI_Barrier(mpi_comm_level1);
3568
3569 for (spin=0; spin<=SpinP_switch; spin++){
3570
3571 MPI_Allreduce(&EDMall[spin][0], &Sall[0], TNZE, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
3572
3573 k = is1[myid];
3574 for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
3575 GA_AN = M2G[MA_AN];
3576 wanA = WhatSpecies[GA_AN];
3577 tnoA = Spe_Total_CNO[wanA];
3578 for (i=0; i<tnoA; i++){
3579 for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
3580 GB_AN = natn[GA_AN][LB_AN];
3581 wanB = WhatSpecies[GB_AN];
3582 tnoB = Spe_Total_CNO[wanB];
3583 for (j=0; j<tnoB; j++){
3584 EDM[spin][MA_AN][LB_AN][i][j] = Sall[k];
3585 k++;
3586 }
3587 }
3588 }
3589 }
3590 }
3591
3592 /****************************************************
3593 freeing Index_BC, SindB, EindB, SindC, EindE
3594 ****************************************************/
3595
3596 free_flag = 1;
3597 Nested_Dissection(free_flag,Nmat,MP,&Depth_Level,&Number_Division);
3598
3599 /****************************************************
3600 freeing of arrays
3601 ****************************************************/
3602
3603 free(is1);
3604
3605 free(NZE);
3606 free(PZE);
3607
3608 for (spin=0; spin<(SpinP_switch+1); spin++){
3609 free(WRG[spin]);
3610 }
3611 free(WRG);
3612
3613 for (i=0; i<Nmat; i++){
3614 free(conv_ind2[i]);
3615 }
3616 free(conv_ind2);
3617
3618 for (spin=0; spin<(SpinP_switch+1); spin++){
3619 free(EDMall[spin]);
3620 }
3621 free(EDMall);
3622
3623 for (spin=0; spin<(SpinP_switch+1); spin++){
3624 free(Hall[spin]);
3625 }
3626 free(Hall);
3627
3628 free(Sall);
3629
3630 free(MP);
3631 free(conv_index_row);
3632 free(conv_index_col);
3633
3634 /* freeing of arrays for the first world */
3635
3636 if (Num_Comm_World1<=numprocs){
3637 MPI_Comm_free(&MPI_CommWD1[myworld1]);
3638 }
3639
3640 free(NPROCS_ID1);
3641 free(Comm_World1);
3642 free(NPROCS_WD1);
3643 free(Comm_World_StartID1);
3644 free(MPI_CommWD1);
3645
3646 /* freeing of arrays for the second B world */
3647
3648 if (Nloop1<=numprocs){
3649
3650 MPI_Comm_free(&MPI_CommWD2B[myworld2B]);
3651 free(NPROCS_ID2B);
3652 free(Comm_World2B);
3653 free(NPROCS_WD2B);
3654 free(Comm_World_StartID2B);
3655 free(MPI_CommWD2B);
3656 }
3657
3658 /* for elapsed time */
3659 MPI_Barrier(mpi_comm_level1);
3660 dtime(&TEtime);
3661 return (TEtime-TStime);
3662 }
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
Cluster_collinear_ON2_Iter(int SCF_iter,int SpinP_switch,double *** C,double ** ko,double ***** nh,double **** CntOLP,double ***** CDM,double ***** EDM,double Eele0[2],double Eele1[2])3675 static double Cluster_collinear_ON2_Iter(
3676 int SCF_iter,
3677 int SpinP_switch,
3678 double ***C,
3679 double **ko,
3680 double *****nh, double ****CntOLP,
3681 double *****CDM,
3682 double *****EDM,
3683 double Eele0[2], double Eele1[2])
3684 {
3685 static int firsttime=1;
3686 double TStime,TEtime;
3687 double stime,etime;
3688 int numprocs,myid,ID,tag=999;
3689 int i,j,k,k2,wan,wanA,wanB,Epoint,column;
3690 int tnoA,tnoB,LB_AN,i1,j1,l;
3691 int MA_AN,GA_AN,GB_AN,Gc_AN,h_AN,ni,nj,TNZE;
3692 int Cwan,Hwan,Gh_AN,Anum,tnum,num,num4;
3693 int loop1,Nloop1,Nmat,spin,NEV,imax;
3694 int *NZE,*PZE,*MP;
3695 double **Hall,*Sall;
3696 double **DMall;
3697 double **eval,**evec;
3698 double **WRG;
3699 double eachsum,totalsum,pChemP;
3700 double TZ,ChemP_MIN,ChemP_MAX,ymax;
3701 double Dnum,spin_degeneracy;
3702 double totalsum1,f0,f1,x1,y1,x0,y0;
3703 double x,Trial_ChemP,dE_step;
3704 double cTrial_ChemP,ChemP0;
3705 double x3,y3,startE,Real_Ene;
3706 double My_Eele1[2];
3707 static double pTrial_ChemP[4],pDiff_Num[5];
3708 static double DeltaMu[7],DeltaN[3],DeltaNp[3];
3709 int po,po3,po4,po6,loopN,free_flag;
3710 int num_loop,end_flag;
3711 int Depth_Level,Number_Division;
3712 int *conv_index_row;
3713 int *conv_index_col;
3714 int **conv_ind2;
3715 int *My_NZeros;
3716 int *is1,*ie1;
3717 /* for MPI */
3718 int myworld1;
3719 int numprocs1,myid1;
3720 int numprocs2,myid2;
3721 int Num_Comm_World1;
3722 int *NPROCS_ID1,*NPROCS_WD1;
3723 int *Comm_World1;
3724 int *Comm_World_StartID1;
3725 MPI_Comm *MPI_CommWD1;
3726 int myworld2B;
3727 int numprocs2B,myid2B;
3728 int Num_Comm_World2B;
3729 int *NPROCS_ID2B,*NPROCS_WD2B;
3730 int *Comm_World2B;
3731 int *Comm_World_StartID2B;
3732 MPI_Comm *MPI_CommWD2B;
3733
3734 /* MPI */
3735 MPI_Comm_size(mpi_comm_level1,&numprocs);
3736 MPI_Comm_rank(mpi_comm_level1,&myid);
3737
3738 /* for elapsed time */
3739 MPI_Barrier(mpi_comm_level1);
3740 dtime(&TStime);
3741
3742 if (0<measure_time) dtime(&stime);
3743
3744 /****************************************************
3745 set up the first communication world
3746 ****************************************************/
3747
3748 Num_Comm_World1 = SpinP_switch + 1;
3749
3750 NPROCS_ID1 = (int*)malloc(sizeof(int)*numprocs);
3751 Comm_World1 = (int*)malloc(sizeof(int)*numprocs);
3752 NPROCS_WD1 = (int*)malloc(sizeof(int)*Num_Comm_World1);
3753 Comm_World_StartID1 = (int*)malloc(sizeof(int)*Num_Comm_World1);
3754 MPI_CommWD1 = (MPI_Comm*)malloc(sizeof(MPI_Comm)*Num_Comm_World1);
3755
3756 Make_Comm_Worlds(mpi_comm_level1, myid, numprocs, Num_Comm_World1, &myworld1, MPI_CommWD1,
3757 NPROCS_ID1, Comm_World1, NPROCS_WD1, Comm_World_StartID1);
3758
3759 /****************************************************
3760 calculation of dimension of matrix
3761 ****************************************************/
3762
3763 Nmat = 0;
3764 for (i=1; i<=atomnum; i++){
3765 wanA = WhatSpecies[i];
3766 Nmat += Spe_Total_CNO[wanA];
3767 }
3768
3769 /****************************************************
3770 total core charge
3771 ****************************************************/
3772
3773 TZ = 0.0;
3774 for (i=1; i<=atomnum; i++){
3775 wan = WhatSpecies[i];
3776 TZ = TZ + Spe_Core_Charge[wan];
3777 }
3778
3779 /****************************************************
3780 calculate TNZE
3781 ****************************************************/
3782
3783 TNZE = 0;
3784 for (Gc_AN=1; Gc_AN<=atomnum; Gc_AN++){
3785 Cwan = WhatSpecies[Gc_AN];
3786
3787 nj = 0;
3788 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
3789 Gh_AN = natn[Gc_AN][h_AN];
3790 Hwan = WhatSpecies[Gh_AN];
3791 nj += Spe_Total_NO[Hwan];
3792 }
3793
3794 TNZE += Spe_Total_NO[Cwan]*nj;
3795 }
3796
3797 /****************************************************
3798 allocate arrays
3799 ****************************************************/
3800
3801 WRG = (double**)malloc(sizeof(double*)*(SpinP_switch+1));
3802 for (spin=0; spin<(SpinP_switch+1); spin++){
3803 WRG[spin] = (double*)malloc(sizeof(double)*ON2_Npoles);
3804 for (i=0; i<ON2_Npoles; i++) WRG[spin][i] = 0.0;
3805 }
3806
3807 conv_ind2 = (int**)malloc(sizeof(int*)*Nmat);
3808 for (i=0; i<Nmat; i++){
3809 conv_ind2[i] = (int*)malloc(sizeof(int)*Nmat);
3810 }
3811
3812 DMall = (double**)malloc(sizeof(double*)*(SpinP_switch+1));
3813 for (spin=0; spin<(SpinP_switch+1); spin++){
3814 DMall[spin] = (double*)malloc(sizeof(double)*TNZE);
3815 for (i=0; i<TNZE; i++) DMall[spin][i] = 0.0;
3816 }
3817
3818 Hall = (double**)malloc(sizeof(double*)*(SpinP_switch+1));
3819 for (spin=0; spin<(SpinP_switch+1); spin++){
3820 Hall[spin] = (double*)malloc(sizeof(double)*TNZE);
3821 }
3822
3823 Sall = (double*)malloc(sizeof(double)*TNZE);
3824
3825 MP = (int*)malloc(sizeof(int)*(atomnum+1));
3826
3827 /* set MP */
3828
3829 Anum = 1;
3830 for (i=1; i<=atomnum; i++){
3831 MP[i] = Anum;
3832 wanA = WhatSpecies[i];
3833 Anum += Spe_Total_CNO[wanA];
3834 }
3835
3836 conv_index_row = (int*)malloc(sizeof(int)*TNZE);
3837 conv_index_col = (int*)malloc(sizeof(int)*TNZE);
3838
3839 /******************************************
3840 MPI communication of nh and CntOLP
3841 ******************************************/
3842
3843 /* allocation of arrays */
3844
3845 My_NZeros = (int*)malloc(sizeof(int)*numprocs);
3846 is1 = (int*)malloc(sizeof(int)*numprocs);
3847 ie1 = (int*)malloc(sizeof(int)*numprocs);
3848
3849 /* find my total number of non-zero elements in myid */
3850
3851 My_NZeros[myid] = 0;
3852 for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
3853 GA_AN = M2G[MA_AN];
3854 wanA = WhatSpecies[GA_AN];
3855 tnoA = Spe_Total_CNO[wanA];
3856
3857 num = 0;
3858 for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
3859 GB_AN = natn[GA_AN][LB_AN];
3860 wanB = WhatSpecies[GB_AN];
3861 tnoB = Spe_Total_CNO[wanB];
3862 num += tnoB;
3863 }
3864
3865 My_NZeros[myid] += tnoA*num;
3866 }
3867
3868 for (ID=0; ID<numprocs; ID++){
3869 MPI_Bcast(&My_NZeros[ID],1,MPI_INT,ID,mpi_comm_level1);
3870 }
3871
3872 tnum = 0;
3873 for (ID=0; ID<numprocs; ID++){
3874 tnum += My_NZeros[ID];
3875 }
3876
3877 is1[0] = 0;
3878 ie1[0] = My_NZeros[0] - 1;
3879
3880 for (ID=1; ID<numprocs; ID++){
3881 is1[ID] = ie1[ID-1] + 1;
3882 ie1[ID] = is1[ID] + My_NZeros[ID] - 1;
3883 }
3884
3885 /******************
3886 for nh
3887 ******************/
3888
3889 for (spin=0; spin<=SpinP_switch; spin++){
3890
3891 /* set nh */
3892
3893 k = is1[myid];
3894 for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
3895 GA_AN = M2G[MA_AN];
3896 wanA = WhatSpecies[GA_AN];
3897 tnoA = Spe_Total_CNO[wanA];
3898 for (i=0; i<tnoA; i++){
3899 for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
3900 GB_AN = natn[GA_AN][LB_AN];
3901 wanB = WhatSpecies[GB_AN];
3902 tnoB = Spe_Total_CNO[wanB];
3903 for (j=0; j<tnoB; j++){
3904 Hall[spin][k] = nh[spin][MA_AN][LB_AN][i][j];
3905 k++;
3906 }
3907 }
3908 }
3909 }
3910
3911 /* MPI Hall */
3912
3913 MPI_Barrier(mpi_comm_level1);
3914
3915 for (ID=0; ID<numprocs; ID++){
3916 k = is1[ID];
3917 MPI_Bcast(&Hall[spin][k], My_NZeros[ID], MPI_DOUBLE, ID, mpi_comm_level1);
3918 }
3919
3920 }
3921
3922 /******************
3923 for CntOLP
3924 ******************/
3925
3926 k = is1[myid];
3927 for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
3928 GA_AN = M2G[MA_AN];
3929 wanA = WhatSpecies[GA_AN];
3930 tnoA = Spe_Total_CNO[wanA];
3931 for (i=0; i<tnoA; i++){
3932 for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
3933 GB_AN = natn[GA_AN][LB_AN];
3934 wanB = WhatSpecies[GB_AN];
3935 tnoB = Spe_Total_CNO[wanB];
3936 for (j=0; j<tnoB; j++){
3937 Sall[k] = CntOLP[MA_AN][LB_AN][i][j];
3938 k++;
3939 }
3940 }
3941 }
3942 }
3943
3944 /* MPI S1 */
3945
3946 MPI_Barrier(mpi_comm_level1);
3947
3948 for (ID=0; ID<numprocs; ID++){
3949 k = is1[ID];
3950 MPI_Bcast(&Sall[k], My_NZeros[ID], MPI_DOUBLE, ID, mpi_comm_level1);
3951 }
3952
3953 /******************************************
3954 set up conv_index_row and conv_index_col
3955 ******************************************/
3956
3957 k = is1[myid];
3958
3959 for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
3960 GA_AN = M2G[MA_AN];
3961 wanA = WhatSpecies[GA_AN];
3962 tnoA = Spe_Total_CNO[wanA];
3963 for (i=0; i<tnoA; i++){
3964 for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
3965 GB_AN = natn[GA_AN][LB_AN];
3966 wanB = WhatSpecies[GB_AN];
3967 tnoB = Spe_Total_CNO[wanB];
3968 for (j=0; j<tnoB; j++){
3969
3970 conv_index_row[k] = MP[GA_AN] + i - 1;
3971 conv_index_col[k] = MP[GB_AN] + j - 1;
3972 k++;
3973 }
3974 }
3975 }
3976 }
3977
3978 for (ID=0; ID<numprocs; ID++){
3979 k = is1[ID];
3980 MPI_Bcast(&conv_index_row[k], My_NZeros[ID], MPI_INT, ID, mpi_comm_level1);
3981 }
3982
3983 for (ID=0; ID<numprocs; ID++){
3984 k = is1[ID];
3985 MPI_Bcast(&conv_index_col[k], My_NZeros[ID], MPI_INT, ID, mpi_comm_level1);
3986 }
3987
3988 /****************************************************
3989 set up NZE and PZE
3990 ****************************************************/
3991
3992 NZE = (int*)malloc(sizeof(int)*Nmat);
3993 PZE = (int*)malloc(sizeof(int)*Nmat);
3994
3995 for (i=0; i<Nmat; i++){
3996 NZE[i] = 0;
3997 PZE[i] = -1;
3998 }
3999
4000 for (k=0; k<TNZE; k++){
4001
4002 i = conv_index_row[k];
4003
4004 NZE[i]++;
4005 if (PZE[i]==-1) PZE[i] = k;
4006 }
4007
4008 /* freeing of arrays */
4009
4010 free(My_NZeros);
4011 free(ie1);
4012
4013 if (0<measure_time){
4014 dtime(&etime);
4015 printf("myid=%2d OND_Solver time1=%15.12f\n",myid,etime-stime);fflush(stdout);
4016 }
4017
4018 /*************************************************************************
4019 *************************************************************************
4020
4021 Part 0: nested dissection of matrix
4022
4023 *************************************************************************
4024 *************************************************************************/
4025
4026 free_flag = 0;
4027 Nested_Dissection(free_flag,Nmat,MP,&Depth_Level,&Number_Division);
4028
4029 /*************************************************************************
4030 *************************************************************************
4031
4032 Part 1: contour integration with a chemical potential \mu_0
4033
4034 main loop which is decomposed to
4035 spin and energy point
4036
4037 *************************************************************************
4038 *************************************************************************/
4039
4040 MPI_Barrier(mpi_comm_level1);
4041
4042 Nloop1 = (SpinP_switch+1)*ON2_Npoles;
4043
4044 if (Nloop1<=numprocs){
4045
4046 Num_Comm_World2B = Nloop1;
4047
4048 NPROCS_ID2B = (int*)malloc(sizeof(int)*numprocs);
4049 Comm_World2B = (int*)malloc(sizeof(int)*numprocs);
4050 NPROCS_WD2B = (int*)malloc(sizeof(int)*Num_Comm_World2B);
4051 Comm_World_StartID2B = (int*)malloc(sizeof(int)*Num_Comm_World2B);
4052 MPI_CommWD2B = (MPI_Comm*)malloc(sizeof(MPI_Comm)*Num_Comm_World2B);
4053
4054 Make_Comm_Worlds(mpi_comm_level1, myid, numprocs, Num_Comm_World2B, &myworld2B, MPI_CommWD2B,
4055 NPROCS_ID2B, Comm_World2B, NPROCS_WD2B, Comm_World_StartID2B);
4056
4057 MPI_Comm_size(MPI_CommWD2B[myworld2B],&numprocs2);
4058 MPI_Comm_rank(MPI_CommWD2B[myworld2B],&myid2);
4059 }
4060 else {
4061 numprocs2 = 1;
4062 myid2 = 0;
4063 }
4064
4065 end_flag = 0;
4066 num_loop = 1;
4067 Trial_ChemP = ChemP;
4068
4069 do {
4070
4071 for (spin=0; spin<(SpinP_switch+1); spin++){
4072 for (i=0; i<TNZE; i++) DMall[spin][i] = 0.0;
4073 }
4074
4075 if (0<measure_time) dtime(&stime);
4076
4077 if (Nloop1<=numprocs){
4078
4079 loop1 = myworld2B;
4080
4081 /* get spin, Epoint, and column */
4082 spin = loop1/ON2_Npoles;
4083 Epoint = loop1 - spin*ON2_Npoles;
4084
4085 OND_Solver("contour",
4086 myid,numprocs,
4087 myid2,numprocs2,
4088 Nloop1,
4089 MPI_CommWD2B[myworld2B],
4090 spin,Epoint,Real_Ene,
4091 Nmat,NZE,PZE,
4092 Depth_Level,
4093 Number_Division,
4094 Trial_ChemP,
4095 Hall,Sall,DMall,
4096 WRG,
4097 conv_index_row,
4098 conv_index_col,
4099 conv_ind2,
4100 TNZE);
4101 }
4102
4103 else {
4104
4105 for ( loop1=myid; loop1<Nloop1; loop1+=numprocs ){
4106
4107 /* get spin, Epoint, and column */
4108 spin = loop1/ON2_Npoles;
4109 Epoint = loop1 - spin*ON2_Npoles;
4110
4111 /* call OND_Solver */
4112
4113 OND_Solver("contour",
4114 myid,numprocs,
4115 myid2,numprocs2,
4116 Nloop1,
4117 mpi_comm_level1,
4118 spin,Epoint,Real_Ene,
4119 Nmat,NZE,PZE,
4120 Depth_Level,
4121 Number_Division,
4122 Trial_ChemP,
4123 Hall,Sall,DMall,
4124 WRG,
4125 conv_index_row,
4126 conv_index_col,
4127 conv_ind2,
4128 TNZE);
4129
4130 }
4131 } /* else */
4132
4133 if (0<measure_time){
4134 dtime(&etime);
4135 printf("myid=%2d OND_Solver time2 =%15.12f\n",myid,etime-stime);fflush(stdout);
4136 }
4137
4138 /*******************************************************
4139 calculate "each" number of electrons on each processor,
4140 and MPI communicate to get the total sum.
4141 ********************************************************/
4142
4143 MPI_Barrier(mpi_comm_level1);
4144
4145 if (0<measure_time) dtime(&stime);
4146
4147 eachsum = 0.0;
4148 for (spin=0; spin<=SpinP_switch; spin++){
4149 for (i=0; i<TNZE; i++){
4150 eachsum += DMall[spin][i]*Sall[i];
4151 }
4152 }
4153
4154 MPI_Allreduce(&eachsum, &totalsum, 1, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
4155
4156 if (SpinP_switch==0){
4157 eachsum *= 2.0;
4158 totalsum *= 2.0;
4159 }
4160
4161 if (0<measure_time){
4162 dtime(&etime);
4163 printf("myid=%2d OND_Solver time3=%15.12f\n",myid,etime-stime);fflush(stdout);
4164 }
4165
4166 /* set x3 and y3 */
4167
4168 x3 = Trial_ChemP;
4169 y3 = -TZ + totalsum + system_charge;
4170 pTrial_ChemP[3] = x3;
4171 pDiff_Num[3] = y3;
4172
4173 if (0<measure_time){
4174 printf("myid=%2d eachsum=%15.12f totalsum=%15.12f\n",myid,eachsum,totalsum);
4175 if (myid==0)
4176 printf("SSS SCF_iter=%2d num_loop=%2d x3=%18.15f y3=%18.15f\n",SCF_iter,num_loop,x3,y3);
4177 }
4178
4179 /*******************************************************
4180 find the next Trial_ChemP.
4181 updating Trial_ChemP is performed by two ways:
4182
4183 (1) Step 1:
4184 if fabs(y3) is large enough so that finding a proper
4185 Trial_ChemP can be difficult, then we try to find
4186 Trial_ChemP using the retarded Green function, G(E+i0^+).
4187
4188 (2) Step 2:
4189 if fabs(y3) is small, then a linear interpolation or
4190 the Muller method is used to estimate Trial_ChemP.
4191 ********************************************************/
4192
4193 /**********************************************
4194 Step 1:
4195 ***********************************************/
4196
4197 if (0.01<fabs(y3) && num_loop==1){
4198
4199 cTrial_ChemP = Trial_ChemP;
4200 dE_step = 0.02/eV2Hartree;
4201
4202 if (0.0<y3){
4203 startE = x3 - dE_step*(double)ON2_Npoles*0.8;
4204 }
4205 else {
4206 startE = x3 - dE_step*(double)ON2_Npoles*0.2;
4207 }
4208
4209 if (Nloop1<=numprocs){
4210
4211 loop1 = myworld2B;
4212
4213 /* get spin, Epoint, and column */
4214 spin = loop1/ON2_Npoles;
4215 Epoint = loop1 - spin*ON2_Npoles;
4216 Real_Ene = startE + dE_step*(double)Epoint;
4217
4218 OND_Solver("retarded",
4219 myid,numprocs,
4220 myid2,numprocs2,
4221 Nloop1,
4222 MPI_CommWD2B[myworld2B],
4223 spin,Epoint,Real_Ene,
4224 Nmat,NZE,PZE,
4225 Depth_Level,
4226 Number_Division,
4227 Trial_ChemP,
4228 Hall,Sall,DMall,
4229 WRG,
4230 conv_index_row,
4231 conv_index_col,
4232 conv_ind2,
4233 TNZE);
4234 }
4235
4236 else {
4237
4238 for ( loop1=myid; loop1<Nloop1; loop1+=numprocs ){
4239
4240 /* get spin, Epoint, and column */
4241 spin = loop1/ON2_Npoles;
4242 Epoint = loop1 - spin*ON2_Npoles;
4243 Real_Ene = startE + dE_step*(double)Epoint;
4244
4245 /* call OND_Solver */
4246
4247 OND_Solver("retarded",
4248 myid,numprocs,
4249 myid2,numprocs2,
4250 Nloop1,
4251 mpi_comm_level1,
4252 spin,Epoint,Real_Ene,
4253 Nmat,NZE,PZE,
4254 Depth_Level,
4255 Number_Division,
4256 Trial_ChemP,
4257 Hall,Sall,DMall,
4258 WRG,
4259 conv_index_row,
4260 conv_index_col,
4261 conv_ind2,
4262 TNZE);
4263
4264 }
4265 } /* else */
4266
4267 /*******************************
4268 search the next Trial_ChemP
4269 *******************************/
4270
4271 if (0.0<y3)
4272 ChemP0 = x3 - dE_step*(double)ON2_Npoles*0.8;
4273 else
4274 ChemP0 = x3 + dE_step*(double)ON2_Npoles*0.8;
4275
4276 eachsum = 0.0;
4277
4278 for (spin=0; spin<=SpinP_switch; spin++){
4279 for (i=0; i<ON2_Npoles; i++){
4280
4281 Real_Ene = startE + dE_step*(double)i;
4282 x = (Real_Ene - Trial_ChemP)*Beta;
4283 f0 = 1.0/(1.0 + exp(x));
4284 x = (Real_Ene - ChemP0)*Beta;
4285 f1 = 1.0/(1.0 + exp(x));
4286 eachsum += WRG[spin][i]*(f1-f0);
4287 }
4288 }
4289
4290 eachsum = eachsum/PI*dE_step;
4291 if (SpinP_switch==0) eachsum *= 2.0;
4292
4293 MPI_Allreduce(&eachsum, &totalsum, 1, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
4294
4295 if ( 1<measure_time ){
4296 printf("GGG1 SCF_iter=%2d Trial_ChemP=%15.12f ChemP0=%15.12f y3=%15.12f totalsum=%15.12f\n",
4297 SCF_iter,Trial_ChemP,ChemP0,y3,totalsum);
4298 }
4299
4300 if ( 0.0<(y3*(y3+totalsum)) ){
4301
4302 /* just use ChemP0 */
4303
4304 Trial_ChemP = ChemP0;
4305 }
4306 else {
4307
4308 /* refinement of Tria_ChemP by a bisection method */
4309
4310 /* set up the range of Trial_ChemP */
4311
4312 if (0.0<y3){
4313 ChemP_MAX = Trial_ChemP;
4314 ChemP_MIN = ChemP0;
4315 }
4316 else {
4317 ChemP_MAX = ChemP0;
4318 ChemP_MIN = Trial_ChemP;
4319 }
4320
4321 /* do refinement */
4322
4323 do {
4324
4325 ChemP0 = 0.50*(ChemP_MAX + ChemP_MIN);
4326
4327 eachsum = 0.0;
4328
4329 for (spin=0; spin<=SpinP_switch; spin++){
4330 for (i=0; i<ON2_Npoles; i++){
4331
4332 Real_Ene = startE + dE_step*(double)i;
4333 x = (Real_Ene - Trial_ChemP)*Beta;
4334 f0 = 1.0/(1.0 + exp(x));
4335 x = (Real_Ene - ChemP0)*Beta;
4336 f1 = 1.0/(1.0 + exp(x));
4337 eachsum += WRG[spin][i]*(f1-f0);
4338 }
4339 }
4340
4341 eachsum = eachsum/PI*dE_step;
4342 if (SpinP_switch==0) eachsum *= 2.0;
4343
4344 MPI_Allreduce(&eachsum, &totalsum, 1, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
4345
4346 if ( 1<measure_time ){
4347 printf("EEE myid=%2d SCF_iter=%2d ChemP=%15.12f diff=%15.12f\n",myid,SCF_iter,ChemP0,y3+totalsum);
4348 }
4349
4350 if ( 0.0<=(y3+totalsum) ){
4351 ChemP_MAX = ChemP0;
4352 }
4353 else {
4354 ChemP_MIN = ChemP0;
4355 }
4356
4357 if (fabs(y3+totalsum)<1.0e-8) po6 = 1;
4358
4359 } while (po6==0);
4360
4361 /* update Trial_ChemP */
4362
4363 Trial_ChemP = cTrial_ChemP + 2.0*(ChemP0-cTrial_ChemP);
4364 }
4365
4366 pTrial_ChemP[0] = x3;
4367 pDiff_Num[0] = y3;
4368
4369 DeltaMu[6] = Trial_ChemP - cTrial_ChemP;
4370 pDiff_Num[4] = y3;
4371
4372 }
4373
4374 /**********************************************
4375 Step 2:
4376 ***********************************************/
4377
4378 else {
4379
4380 Trial_ChemP = Find_Trial_ChemP_by_Muller( SCF_iter, num_loop,
4381 pTrial_ChemP, pDiff_Num,
4382 DeltaMu, DeltaN, DeltaNp );
4383 }
4384
4385 /* check its convergence */
4386
4387 if ( fabs(y3)<1.0e-8
4388 || (fabs(y3)<1.0e-7 && SCF_iter<5) ){
4389
4390 end_flag = 1;
4391 }
4392 else {
4393 num_loop++;
4394 }
4395
4396 } while (end_flag==0 && num_loop<13);
4397
4398 /* update the new ChemP */
4399
4400 ChemP = Trial_ChemP;
4401
4402 /* show num_loop */
4403
4404 if (myid==Host_ID){
4405 printf("<Cluster2> SCF_iter=%2d ChemP_Iter=%2d\n",SCF_iter,num_loop);
4406 }
4407
4408 /*************************************************************************
4409 store the density matrix by the part 1 into an array DM
4410 **************************************************************************/
4411
4412 MPI_Barrier(mpi_comm_level1);
4413
4414 for (spin=0; spin<=SpinP_switch; spin++){
4415
4416 MPI_Allreduce(&DMall[spin][0], &Sall[0], TNZE, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
4417
4418 k = is1[myid];
4419 for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
4420 GA_AN = M2G[MA_AN];
4421 wanA = WhatSpecies[GA_AN];
4422 tnoA = Spe_Total_CNO[wanA];
4423 for (i=0; i<tnoA; i++){
4424 for (LB_AN=0; LB_AN<=FNAN[GA_AN]; LB_AN++){
4425 GB_AN = natn[GA_AN][LB_AN];
4426 wanB = WhatSpecies[GB_AN];
4427 tnoB = Spe_Total_CNO[wanB];
4428 for (j=0; j<tnoB; j++){
4429 CDM[spin][MA_AN][LB_AN][i][j] = Sall[k];
4430 k++;
4431 }
4432 }
4433 }
4434 }
4435 }
4436
4437 /****************************************************
4438 calculation of band energy
4439 ****************************************************/
4440
4441 My_Eele1[0] = 0.0;
4442 My_Eele1[1] = 0.0;
4443
4444 for (spin=0; spin<=SpinP_switch; spin++){
4445 for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
4446 GA_AN = M2G[MA_AN];
4447 wanA = WhatSpecies[GA_AN];
4448 tnoA = Spe_Total_CNO[wanA];
4449 for (j=0; j<=FNAN[GA_AN]; j++){
4450 wanB = WhatSpecies[natn[GA_AN][j]];
4451 tnoB = Spe_Total_CNO[wanB];
4452 for (k=0; k<tnoA; k++){
4453 for (l=0; l<tnoB; l++){
4454 My_Eele1[spin] += CDM[spin][MA_AN][j][k][l]*nh[spin][MA_AN][j][k][l];
4455 }
4456 }
4457 }
4458 }
4459 }
4460
4461 MPI_Barrier(mpi_comm_level1);
4462
4463 /* MPI, My_Eele1 */
4464 for (spin=0; spin<=SpinP_switch; spin++){
4465 MPI_Allreduce(&My_Eele1[spin], &Eele1[spin], 1, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
4466 }
4467
4468 if (SpinP_switch==0) Eele1[1] = Eele1[0];
4469
4470 /****************************************************
4471 freeing Index_BC, SindB, EindB, SindC, EindE
4472 ****************************************************/
4473
4474 free_flag = 1;
4475 Nested_Dissection(free_flag,Nmat,MP,&Depth_Level,&Number_Division);
4476
4477 /****************************************************
4478 freeing of arrays
4479 ****************************************************/
4480
4481 free(is1);
4482
4483 free(NZE);
4484 free(PZE);
4485
4486 for (spin=0; spin<(SpinP_switch+1); spin++){
4487 free(WRG[spin]);
4488 }
4489 free(WRG);
4490
4491 for (i=0; i<Nmat; i++){
4492 free(conv_ind2[i]);
4493 }
4494 free(conv_ind2);
4495
4496 for (spin=0; spin<(SpinP_switch+1); spin++){
4497 free(DMall[spin]);
4498 }
4499 free(DMall);
4500
4501 for (spin=0; spin<(SpinP_switch+1); spin++){
4502 free(Hall[spin]);
4503 }
4504 free(Hall);
4505
4506 free(Sall);
4507
4508 free(MP);
4509 free(conv_index_row);
4510 free(conv_index_col);
4511
4512 /* freeing of arrays for the first world */
4513
4514 if (Num_Comm_World1<=numprocs){
4515 MPI_Comm_free(&MPI_CommWD1[myworld1]);
4516 }
4517
4518 free(NPROCS_ID1);
4519 free(Comm_World1);
4520 free(NPROCS_WD1);
4521 free(Comm_World_StartID1);
4522 free(MPI_CommWD1);
4523
4524 /* freeing of arrays for the second B world */
4525
4526 if (Nloop1<=numprocs){
4527
4528 MPI_Comm_free(&MPI_CommWD2B[myworld2B]);
4529 free(NPROCS_ID2B);
4530 free(Comm_World2B);
4531 free(NPROCS_WD2B);
4532 free(Comm_World_StartID2B);
4533 free(MPI_CommWD2B);
4534 }
4535
4536 /* for elapsed time */
4537 MPI_Barrier(mpi_comm_level1);
4538 dtime(&TEtime);
4539 return (TEtime-TStime);
4540 }
4541
4542
4543
4544
Find_Trial_ChemP_by_Muller(int SCF_iter,int num_loop,double pTrial_ChemP[4],double pDiff_Num[5],double DeltaMu[7],double DeltaN[3],double DeltaNp[3])4545 static double Find_Trial_ChemP_by_Muller(
4546 int SCF_iter, int num_loop,
4547 double pTrial_ChemP[4], double pDiff_Num[5],
4548 double DeltaMu[7], double DeltaN[3], double DeltaNp[3])
4549 {
4550 static double stepw;
4551 double y31,x0,x1,x2,x3,y0,y1,y2,y3;
4552 int i,i0,i1,i2,num4,po3,po4;
4553 double a,b,c,d,g0,g1,g2,dFa,dFb,dFc,dF,dF0,F;
4554 double xx[5],yy[5],zz[5],yy0[5];
4555 double z0,z1,z2,a1,a2,a3,det;
4556 double a11,a12,a13,a21,a22,a23,a31,a32,a33;
4557 double scaling4;
4558 double Trial_ChemP,Trial_ChemP0;
4559 int flag_nan;
4560 char nanchar[300];
4561
4562 Trial_ChemP = pTrial_ChemP[3];
4563
4564 x0 = pTrial_ChemP[0];
4565 x1 = pTrial_ChemP[1];
4566 x2 = pTrial_ChemP[2];
4567 x3 = pTrial_ChemP[3];
4568
4569 y0 = pDiff_Num[0];
4570 y1 = pDiff_Num[1];
4571 y2 = pDiff_Num[2];
4572 y3 = pDiff_Num[3];
4573 y31 = pDiff_Num[4];
4574
4575 /*******************************************************
4576 store history of stepw and y3
4577 *******************************************************/
4578
4579 if (num_loop==2){
4580
4581 DeltaMu[2] = DeltaMu[1];
4582 DeltaN[2] = DeltaN[1];
4583 DeltaNp[2] = DeltaNp[1];
4584
4585 DeltaMu[1] = DeltaMu[0];
4586 DeltaN[1] = DeltaN[0];
4587 DeltaNp[1] = DeltaNp[0];
4588
4589 DeltaMu[0] = DeltaMu[6];
4590 DeltaN[0] = y31;
4591 DeltaNp[0] = y3;
4592 }
4593
4594 /*******************************************************
4595 estimate a new chemical potential
4596 *******************************************************/
4597
4598 /* num_loop==1 */
4599
4600 if (num_loop==1){
4601
4602 y31 = y3;
4603
4604 if (SCF_iter<4){
4605
4606 stepw = fabs(y3);
4607 if (0.02<stepw) stepw = -sgn(y3)*0.02;
4608 else stepw = -(y3 + 10.0*y3*y3);
4609 Trial_ChemP = Trial_ChemP + stepw;
4610 }
4611
4612 else {
4613
4614 /* if (4<=SCF_iter), estimate Trial_ChemP using an extrapolation. */
4615
4616 z0 = DeltaNp[0];
4617 z1 = DeltaNp[1];
4618 z2 = DeltaNp[2];
4619
4620 a11 = DeltaN[0]; a12 = DeltaMu[0]; a13 = DeltaN[0]*DeltaMu[0];
4621 a21 = DeltaN[1]; a22 = DeltaMu[1]; a23 = DeltaN[1]*DeltaMu[1];
4622 a31 = DeltaN[2]; a32 = DeltaMu[2]; a33 = DeltaN[2]*DeltaMu[2];
4623
4624 det = a11*a22*a33+a21*a32*a13+a31*a12*a23-a11*a32*a23-a31*a22*a13-a21*a12*a33;
4625
4626 a1 = ((a22*a33-a23*a32)*z0 + (a13*a32-a12*a33)*z1 + (a12*a23-a13*a22)*z2)/det;
4627 a2 = ((a23*a31-a21*a33)*z0 + (a11*a33-a13*a31)*z1 + (a13*a21-a11*a23)*z2)/det;
4628 a3 = ((a21*a32-a22*a31)*z0 + (a12*a31-a11*a32)*z1 + (a11*a22-a12*a21)*z2)/det;
4629
4630 stepw = -a1*y3/(a2+a3*y3);
4631
4632 flag_nan = 0;
4633 sprintf(nanchar,"%8.4f",stepw);
4634
4635 if (strstr(nanchar,"nan")!=NULL || strstr(nanchar,"NaN")!=NULL
4636 || strstr(nanchar,"inf")!=NULL || strstr(nanchar,"Inf")!=NULL){
4637
4638 flag_nan = 1;
4639
4640 stepw = fabs(y3);
4641 if (0.02<stepw) stepw = -sgn(y3)*0.02;
4642 else stepw = -(y3 + y3*y3);
4643 Trial_ChemP = Trial_ChemP + stepw;
4644 }
4645 else {
4646 Trial_ChemP = Trial_ChemP + stepw;
4647 }
4648
4649 }
4650
4651 x0 = x3;
4652 y0 = y3;
4653 }
4654
4655 /* num_loop==2 */
4656
4657 else if (num_loop==2){
4658
4659 x1 = x3;
4660 y1 = y3;
4661
4662 Trial_ChemP0 = (x1*y0-x0*y1)/(y0-y1);
4663
4664 if ( 0.1<fabs(Trial_ChemP-Trial_ChemP0) ){
4665 Trial_ChemP = Trial_ChemP + 0.1*sgn(Trial_ChemP0 - Trial_ChemP);
4666 }
4667 else {
4668
4669 Trial_ChemP = Trial_ChemP0;
4670 }
4671 }
4672
4673 /* num_loop==3 */
4674
4675 else if (num_loop==3){
4676
4677 x2 = x3;
4678 y2 = y3;
4679
4680 a = y0/(x0-x2)/(x0-x1) - y1/(x1-x2)/(x0-x1) + y2/(x0-x2)/(x1-x2);
4681 b = -(x2+x1)*y0/(x0-x2)/(x0-x1) + (x0+x2)*y1/(x1-x2)/(x0-x1) - (x1+x0)*y2/(x0-x2)/(x1-x2);
4682 c = x1*x2*y0/(x0-x2)/(x0-x1) - x2*x0*y1/(x1-x2)/(x0-x1) + x0*x1*y2/(x0-x2)/(x1-x2);
4683
4684 /* refinement of a, b, and c by the iterative method */
4685
4686 po4 = 0;
4687 num4 = 0;
4688 scaling4 = 0.3;
4689 dF = 1000.0;
4690
4691 do {
4692
4693 g0 = a*x0*x0 + b*x0 + c - y0;
4694 g1 = a*x1*x1 + b*x1 + c - y1;
4695 g2 = a*x2*x2 + b*x2 + c - y2;
4696
4697 dFa = 2.0*(g0*x0*x0 + g1*x1*x1 + g2*x2*x2);
4698 dFb = 2.0*(g0*x0 + g1*x1 + g2*x2);
4699 dFc = 2.0*(g0 + g1 + g2);
4700
4701 F = g0*g0 + g1*g1 + g2*g2;
4702 dF0 = dF;
4703 dF = sqrt(fabs(dFa*dFa + dFb*dFb + dFc*dFc));
4704
4705 if (dF0<dF) scaling4 /= 1.5;
4706
4707 /*
4708 printf("3 F=%18.15f dF=%18.14f scaling4=%15.12f\n",F,dF,scaling4);
4709 */
4710
4711 a -= scaling4*dFa;
4712 b -= scaling4*dFb;
4713 c -= scaling4*dFc;
4714
4715 if (dF<1.0e-11) po4 = 1;
4716 num4++;
4717
4718 } while (po4==0 && num4<100);
4719
4720 /* calculte Trial_ChemP */
4721
4722 /*
4723 printf("3 a=%18.15f\n",a);
4724 printf("3 b=%18.15f\n",b);
4725 printf("3 c=%18.15f\n",c);
4726 */
4727
4728 if (0.0<=b)
4729 Trial_ChemP0 = -2.0*c/(b+sqrt(fabs(b*b-4*a*c)));
4730 else
4731 Trial_ChemP0 = (-b+sqrt(fabs(b*b-4*a*c)))/(2.0*a);
4732
4733 flag_nan = 0;
4734 sprintf(nanchar,"%8.4f",Trial_ChemP0);
4735
4736 if ( strstr(nanchar,"nan")!=NULL || strstr(nanchar,"NaN")!=NULL
4737 || strstr(nanchar,"inf")!=NULL || strstr(nanchar,"Inf")!=NULL){
4738
4739 flag_nan = 1;
4740
4741 stepw = fabs(y3);
4742 if (0.02<stepw) stepw = -sgn(y3)*0.02;
4743 else stepw = -(y3 + y3*y3);
4744 Trial_ChemP = Trial_ChemP + stepw;
4745 }
4746 else {
4747
4748 if ( 0.02<fabs(Trial_ChemP-Trial_ChemP0) ){
4749 Trial_ChemP = Trial_ChemP + 0.02*sgn(Trial_ChemP0 - Trial_ChemP);
4750 }
4751 else {
4752 Trial_ChemP = Trial_ChemP0;
4753 }
4754 }
4755
4756 /*
4757 printf("3 Trial_ChemP0=%18.15f\n",Trial_ChemP0);
4758 printf("3 Trial_ChemP =%18.15f\n",Trial_ChemP);
4759 */
4760
4761 }
4762
4763 /* if (4<=num_loop) */
4764
4765 else {
4766
4767 Trial_ChemP0 = Trial_ChemP;
4768
4769 xx[1] = x0;
4770 xx[2] = x1;
4771 xx[3] = x2;
4772 xx[4] = x3;
4773
4774 yy[1] = y0;
4775 yy[2] = y1;
4776 yy[3] = y2;
4777 yy[4] = y3;
4778
4779 qsort_double(4, xx, yy);
4780
4781 /* check whether the zero point is passed or not. */
4782
4783 po3 = 0;
4784 for (i=1; i<4; i++){
4785 if (yy[i]*yy[i+1]<0.0){
4786 i0 = i;
4787 po3 = 1;
4788 }
4789 }
4790
4791 /*
4792 for (i=1; i<=4; i++){
4793 printf("num_loop=%2d i0=%2d i=%2d xx=%18.15f yy=%18.15f\n",num_loop,i0,i,xx[i],yy[i]);
4794 }
4795 */
4796
4797 if (po3==1){
4798
4799 if (i0==1){
4800 x0 = xx[i0+0];
4801 x1 = xx[i0+1];
4802 x2 = xx[i0+2];
4803
4804 y0 = yy[i0+0];
4805 y1 = yy[i0+1];
4806 y2 = yy[i0+2];
4807 }
4808
4809 else if (i0==2) {
4810
4811 if (fabs(yy[i0-1])<fabs(yy[i0+2])){
4812
4813 x0 = xx[i0-1];
4814 x1 = xx[i0+0];
4815 x2 = xx[i0+1];
4816
4817 y0 = yy[i0-1];
4818 y1 = yy[i0+0];
4819 y2 = yy[i0+1];
4820 }
4821 else {
4822 x0 = xx[i0+0];
4823 x1 = xx[i0+1];
4824 x2 = xx[i0+2];
4825
4826 y0 = yy[i0+0];
4827 y1 = yy[i0+1];
4828 y2 = yy[i0+2];
4829 }
4830 }
4831
4832 else if (i0==3) {
4833 x0 = xx[i0-1];
4834 x1 = xx[i0+0];
4835 x2 = xx[i0+1];
4836
4837 y0 = yy[i0-1];
4838 y1 = yy[i0+0];
4839 y2 = yy[i0+1];
4840 }
4841
4842 a = y0/(x0-x2)/(x0-x1) - y1/(x1-x2)/(x0-x1) + y2/(x0-x2)/(x1-x2);
4843 b = -(x2+x1)*y0/(x0-x2)/(x0-x1) + (x0+x2)*y1/(x1-x2)/(x0-x1) - (x1+x0)*y2/(x0-x2)/(x1-x2);
4844 c = x1*x2*y0/(x0-x2)/(x0-x1) - x2*x0*y1/(x1-x2)/(x0-x1) + x0*x1*y2/(x0-x2)/(x1-x2);
4845
4846 /*
4847 printf("x0=%18.15f y0=%18.15f\n",x0,y0);
4848 printf("x1=%18.15f y1=%18.15f\n",x1,y1);
4849 printf("x2=%18.15f y2=%18.15f\n",x2,y2);
4850
4851 printf("a=%18.15f\n",a);
4852 printf("b=%18.15f\n",b);
4853 printf("c=%18.15f\n",c);
4854 */
4855
4856 /* refinement of a, b, and c by the iterative method */
4857
4858 po4 = 0;
4859 num4 = 0;
4860 scaling4 = 0.3;
4861 dF = 1000.0;
4862
4863 do {
4864
4865 g0 = a*x0*x0 + b*x0 + c - y0;
4866 g1 = a*x1*x1 + b*x1 + c - y1;
4867 g2 = a*x2*x2 + b*x2 + c - y2;
4868
4869 dFa = 2.0*(g0*x0*x0 + g1*x1*x1 + g2*x2*x2);
4870 dFb = 2.0*(g0*x0 + g1*x1 + g2*x2);
4871 dFc = 2.0*(g0 + g1 + g2);
4872
4873 F = g0*g0 + g1*g1 + g2*g2;
4874 dF0 = dF;
4875 dF = sqrt(fabs(dFa*dFa + dFb*dFb + dFc*dFc));
4876
4877 if (dF0<dF) scaling4 /= 1.5;
4878
4879 /*
4880 printf("F=%18.15f dF=%18.14f scaling4=%15.12f\n",F,dF,scaling4);
4881 */
4882
4883 a -= scaling4*dFa;
4884 b -= scaling4*dFb;
4885 c -= scaling4*dFc;
4886
4887 if (dF<1.0e-11) po4 = 1;
4888 num4++;
4889
4890 } while (po4==0 && num4<100);
4891
4892 /* calculte Trial_ChemP */
4893
4894 if (0.0<=b)
4895 Trial_ChemP0 = -2.0*c/(b+sqrt(fabs(b*b-4*a*c)));
4896 else
4897 Trial_ChemP0 = (-b+sqrt(fabs(b*b-4*a*c)))/(2.0*a);
4898
4899 flag_nan = 0;
4900 sprintf(nanchar,"%8.4f",Trial_ChemP0);
4901
4902 if ( strstr(nanchar,"nan")!=NULL || strstr(nanchar,"NaN")!=NULL
4903 || strstr(nanchar,"inf")!=NULL || strstr(nanchar,"Inf")!=NULL){
4904
4905 flag_nan = 1;
4906
4907 stepw = fabs(y3);
4908 if (0.02<stepw) stepw = -sgn(y3)*0.02;
4909 else stepw = -(y3 + y3*y3);
4910 Trial_ChemP = Trial_ChemP + stepw;
4911 }
4912 else {
4913
4914 if ( 0.02<fabs(Trial_ChemP-Trial_ChemP0) ){
4915 Trial_ChemP = Trial_ChemP + 0.02*sgn(Trial_ChemP0 - Trial_ChemP);
4916 }
4917 else {
4918 Trial_ChemP = Trial_ChemP0;
4919 }
4920 }
4921
4922 /*
4923 printf("F=%18.15f dF=%18.14f Trial_ChemP=%18.15f\n",F,dF,Trial_ChemP);
4924 */
4925 }
4926
4927 else {
4928
4929 yy[1] = fabs(y0);
4930 yy[2] = fabs(y1);
4931 yy[3] = fabs(y2);
4932 yy[4] = fabs(y3);
4933
4934 zz[1] = 1;
4935 zz[2] = 2;
4936 zz[3] = 3;
4937 zz[4] = 4;
4938
4939 xx[1] = x0;
4940 xx[2] = x1;
4941 xx[3] = x2;
4942 xx[4] = x3;
4943
4944 yy0[1] = y0;
4945 yy0[2] = y1;
4946 yy0[3] = y2;
4947 yy0[4] = y3;
4948
4949 qsort_double(4, yy, zz);
4950
4951 i0 = (int)zz[1];
4952 i1 = (int)zz[2];
4953 i2 = (int)zz[3];
4954
4955 x0 = xx[i0];
4956 x1 = xx[i1];
4957 x2 = xx[i2];
4958
4959 y0 = yy0[i0];
4960 y1 = yy0[i1];
4961 y2 = yy0[i2];
4962
4963 Trial_ChemP0 = (x1*y0-x0*y1)/(y0-y1);
4964
4965 flag_nan = 0;
4966 sprintf(nanchar,"%8.4f",Trial_ChemP0);
4967
4968 if ( strstr(nanchar,"nan")!=NULL || strstr(nanchar,"NaN")!=NULL
4969 || strstr(nanchar,"inf")!=NULL || strstr(nanchar,"Inf")!=NULL){
4970
4971 flag_nan = 1;
4972
4973 stepw = fabs(y3);
4974 if (0.02<stepw) stepw = -sgn(y3)*0.02;
4975 else stepw = -(y3 + y3*y3);
4976 Trial_ChemP = Trial_ChemP + stepw;
4977 }
4978 else {
4979
4980 if ( 0.02<fabs(Trial_ChemP-Trial_ChemP0) ){
4981 Trial_ChemP = Trial_ChemP + 0.02*sgn(Trial_ChemP0 - Trial_ChemP);
4982 }
4983 else {
4984 Trial_ChemP = Trial_ChemP0;
4985 }
4986 }
4987
4988 } /* else */
4989 } /* else */
4990
4991 /* store pTrial_ChemP and pDiff_Num */
4992
4993 pTrial_ChemP[0] = x0;
4994 pTrial_ChemP[1] = x1;
4995 pTrial_ChemP[2] = x2;
4996 pTrial_ChemP[3] = x3;
4997
4998 pDiff_Num[0] = y0;
4999 pDiff_Num[1] = y1;
5000 pDiff_Num[2] = y2;
5001 pDiff_Num[3] = y3;
5002 pDiff_Num[4] = y31;
5003
5004 DeltaMu[6] = stepw;
5005
5006 /* return Traial_ChemP */
5007 return Trial_ChemP;
5008 }
5009