1 /**********************************************************************
2 Krylov.c:
3
4 Krylov.c is a subroutine to perform a Krylov subspace
5 method developed by T.Ozaki.
6
7 Log of Krylov.c
8
9 10/June/2005 Released by T.Ozaki
10
11 ***********************************************************************/
12
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #include <math.h>
17 #include <time.h>
18 #include "openmx_common.h"
19 #include "lapack_prototypes.h"
20 #include "mpi.h"
21 #include <omp.h>
22
23 #define measure_time 0
24 #define error_check 0
25 #define cutoff_value Threshold_OLP_Eigen
26
27 #include "tran_prototypes.h"
28
29 #ifdef nosse
30 #include "mimic_sse.h"
31 #else
32 #include <emmintrin.h>
33 #endif
34
35 #ifdef kcomp
36 #define _mm_loadu_pd _mm_load_pd
37 #define _mm_storeu_pd _mm_store_pd
38 #endif
39
40 static void Generate_pMatrix( int myid, int spin, int Mc_AN, double *****Hks, double ****OLP0, double **invS,
41 double ***Krylov_U, double ***Krylov_U_OLP, double **inv_RS, int *MP,
42 int *Msize, int *Msize2, int *Msize3, int *Msize4, int Msize2_max,
43 double **tmpvec0, double **tmpvec1, double **tmpvec2 );
44
45 static void Generate_pMatrix2( int myid, int spin, int Mc_AN, double *****Hks, double ****OLP0, double ***Krylov_U,
46 int *MP, int *Msize, int *Msize2, int *Msize3,
47 double **tmpvec1 );
48 static void Krylov_IOLP( int Mc_AN, double ****OLP0, double ***Krylov_U_OLP, double **inv_RS,
49 int *MP, int *Msize2, int *Msize4, int Msize2_max,
50 double **tmpvec0, double **tmpvec1 );
51 static void S_orthonormalize_vec( int Mc_AN, int ct_on, double **vec,
52 double **workvec, double ****OLP0,
53 double **tmpmat0, double *ko, double *iko, int *MP, int *Msize2 );
54 static void Embedding_Matrix(int spin, int Mc_AN, double *****Hks,
55 double ***Krylov_U, double ****EC_matrix,
56 int *MP, int *Msize, int *Msize2, int *Msize3,
57 double **tmpvec1);
58
59 static void Inverse_S_by_Cholesky(int Mc_AN, double ****OLP0, double **invS, int *MP, int NUM, double *LoS);
60
61 static void Save_DOS_Col(double ******Residues, double ****OLP0, double ***EVal, int **LO_TC, int **HO_TC);
62
63 static double Krylov_Col(char *mode,
64 int SCF_iter,
65 double *****Hks,
66 double ****OLP0,
67 double *****CDM,
68 double *****EDM,
69 double Eele0[2],
70 double Eele1[2]);
71
72
73 /*******************************************************
74 The following subroutines are called only when
75 Matomnum==1 && openmp_threads_num>1
76 *******************************************************/
77
78 static double Krylov_Col_trd(char *mode,
79 int SCF_iter,
80 double *****Hks,
81 double ****OLP0,
82 double *****CDM,
83 double *****EDM,
84 double Eele0[2],
85 double Eele1[2]);
86
87 static void Generate_pMatrix_trd( int myid, int spin, int Mc_AN, double *****Hks, double ****OLP0, double **invS,
88 double ***Krylov_U, double ***Krylov_U_OLP, double **inv_RS, int *MP,
89 int *Msize, int *Msize2, int *Msize3, int *Msize4, int Msize2_max,
90 double **tmpvec0, double **tmpvec1, double **tmpvec2 );
91
92 static void Generate_pMatrix2_trd( int myid, int spin, int Mc_AN, double *****Hks, double ****OLP0, double ***Krylov_U,
93 int *MP, int *Msize, int *Msize2, int *Msize3,
94 double **tmpvec1 );
95 static void Krylov_IOLP_trd( int Mc_AN, double ****OLP0, double ***Krylov_U_OLP, double **inv_RS,
96 int *MP, int *Msize2, int *Msize4, int Msize2_max,
97 double **tmpvec0, double **tmpvec1 );
98 static void S_orthonormalize_vec_trd( int Mc_AN, int ct_on, double **vec,
99 double **workvec, double ****OLP0,
100 double **tmpmat0, double *ko, double *iko, int *MP, int *Msize2 );
101 static void Embedding_Matrix_trd(int spin, int Mc_AN, double *****Hks,
102 double ***Krylov_U, double ****EC_matrix,
103 int *MP, int *Msize, int *Msize2, int *Msize3,
104 double **tmpvec1, int EKC_core_size_max, int Msize2_max);
105
106
107 static int Eigen_lapack_x(double **a, double *ko, int n, int EVmax);
108 static int Eigen_lapack_d(double **a, double *ko, int n, int EVmax);
109 static int Eigen_lapack_r(double **a, double *ko, int n, int EVmax);
110
111
112
113
Krylov(char * mode,int SCF_iter,double ***** Hks,double ***** ImNL,double **** OLP0,double ***** CDM,double ***** EDM,double Eele0[2],double Eele1[2])114 double Krylov(char *mode,
115 int SCF_iter,
116 double *****Hks,
117 double *****ImNL,
118 double ****OLP0,
119 double *****CDM,
120 double *****EDM,
121 double Eele0[2], double Eele1[2])
122 {
123 double time0;
124
125 /****************************************************
126 collinear without spin-orbit coupling
127 ****************************************************/
128
129 if ( (SpinP_switch==0 || SpinP_switch==1) && SO_switch==0 ){
130
131 if(atomnum<=NUMPROCS_MPI_COMM_WORLD && 1<openmp_threads_num)
132 time0 = Krylov_Col_trd(mode,SCF_iter, Hks, OLP0, CDM, EDM, Eele0, Eele1);
133 else
134 time0 = Krylov_Col(mode,SCF_iter, Hks, OLP0, CDM, EDM, Eele0, Eele1);
135 }
136
137 /****************************************************
138 collinear with spin-orbit coupling
139 ****************************************************/
140
141 else if ( (SpinP_switch==0 || SpinP_switch==1) && SO_switch==1 ){
142 printf("Spin-orbit coupling is not supported for collinear DFT calculations.\n");
143 MPI_Finalize();
144 exit(1);
145 }
146
147 /****************************************************
148 non-collinear with and without spin-orbit coupling
149 ****************************************************/
150
151 else if (SpinP_switch==3){
152 printf("The O(N) Krylov subspace method is not supported for non-collinear DFT calculations.\n");
153 MPI_Finalize();
154 exit(1);
155 }
156
157 return time0;
158 }
159
160
161
162
163
164
165
Krylov_Col(char * mode,int SCF_iter,double ***** Hks,double **** OLP0,double ***** CDM,double ***** EDM,double Eele0[2],double Eele1[2])166 static double Krylov_Col(char *mode,
167 int SCF_iter,
168 double *****Hks,
169 double ****OLP0,
170 double *****CDM,
171 double *****EDM,
172 double Eele0[2],
173 double Eele1[2])
174 {
175 static int firsttime=1,recalc_firsttime=1,recalc_flag;
176 int Mc_AN,Gc_AN,i,is,js,Gi,wan,wanA,wanB,Anum;
177 int num,NUM0,NUM,NUM1,n2,Cwan,Hwan,Rn2;
178 int size1,size2,max_size1,max_size2;
179 int ih,ig,ian,j,kl,jg,jan,Bnum,m,n,spin,i2,ip;
180 int k,l,i1,j1,P_min,m_size,q1,q2,csize,Residues_size;
181 int h_AN1,Mh_AN1,h_AN2,Gh_AN1,Gh_AN2,wan1,wan2;
182 int po,po1,loopN,tno1,tno2,h_AN,Gh_AN,rl1,rl2,rl;
183 int MA_AN,GA_AN,tnoA,GB_AN,tnoB,ct_on;
184 int Msize2_max;
185 static double TZ;
186 double My_TZ,sum,FermiF,time0,srt;
187 double sum00,sum10,sum20,sum30;
188 double sum01,sum11,sum21,sum31;
189 double sum02,sum12,sum22,sum32;
190 double sum03,sum13,sum23,sum33;
191 double tmp0,tmp1,tmp2,tmp3,b2,co,x0,Sx,Dx,xmin,xmax;
192 double My_Num_State,Num_State,x,Dnum;
193 double emin,emax,de;
194 double TStime,TEtime,Stime1,Etime1;
195 double Stime2,Etime2;
196 double time1,time2,time3,time4,time5;
197 double time6,time7,time8,time9,time10;
198 double time11,time12,time13,time14,time15,time16;
199 double Erange;
200 double My_Eele0[2],My_Eele1[2];
201 double max_x=30.0;
202 double ChemP_MAX,ChemP_MIN,spin_degeneracy;
203 double spetrum_radius;
204 double ***EVal;
205 double ******Residues;
206 double ***PDOS_DC;
207 double *tmp_array;
208 double *tmp_array2;
209
210 int *MP;
211
212 /*****************************************************
213 Msize: \sum_FNAN Spe_Total_CNO
214 Msize2: \sum_FNAN+SNAN Spe_Total_CNO
215 Msize3: rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]
216 Msize4: rlmax_EC2[Mc_AN]*EKC_core_size[Mc_AN]
217 Msize5: dimension for the last column of Residues
218 *****************************************************/
219
220 int *Msize;
221 int *Msize2;
222 int *Msize3;
223 int *Msize4;
224 int *Msize5;
225 int **LO_TC;
226 int **HO_TC;
227 int numprocs,myid,ID,IDS,IDR,tag=999;
228 double Stime_atom, Etime_atom;
229
230 MPI_Status stat;
231 MPI_Request request;
232
233 /* for OpenMP */
234 int OMPID,Nthrds,Nprocs;
235
236 /* MPI */
237 MPI_Barrier(mpi_comm_level1);
238 MPI_Comm_size(mpi_comm_level1,&numprocs);
239 MPI_Comm_rank(mpi_comm_level1,&myid);
240
241 dtime(&TStime);
242
243 if (measure_time==1){
244 time1 = 0.0;
245 time2 = 0.0;
246 time3 = 0.0;
247 time4 = 0.0;
248 time5 = 0.0;
249 time6 = 0.0;
250 time7 = 0.0;
251 time8 = 0.0;
252 time9 = 0.0;
253 time10 = 0.0;
254 time11 = 0.0;
255 time12 = 0.0;
256 time13 = 0.0;
257 time14 = 0.0;
258 time15 = 0.0;
259 }
260
261 /****************************************************
262 allocation of arrays:
263 ****************************************************/
264
265 Msize = (int*)malloc(sizeof(int)*(Matomnum+1));
266 Msize2 = (int*)malloc(sizeof(int)*(Matomnum+1));
267 Msize3 = (int*)malloc(sizeof(int)*(Matomnum+1));
268 Msize4 = (int*)malloc(sizeof(int)*(Matomnum+1));
269 Msize2_max = 0;
270
271 /* find Msize */
272
273 for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
274
275 if (Mc_AN==0){
276 Msize[Mc_AN] = 1;
277 }
278 else{
279
280 Gc_AN = M2G[Mc_AN];
281
282 NUM = 0;
283 for (i=0; i<=FNAN[Gc_AN]; i++){
284 Gi = natn[Gc_AN][i];
285 wanA = WhatSpecies[Gi];
286 NUM += Spe_Total_CNO[wanA];
287 }
288 Msize[Mc_AN] = NUM;
289 }
290 }
291
292 /* find Msize2 and Msize2_max */
293
294 for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
295
296 if (Mc_AN==0){
297 Msize2[Mc_AN] = 1;
298 }
299 else{
300
301 Gc_AN = M2G[Mc_AN];
302
303 NUM = 0;
304 for (i=0; i<=(FNAN[Gc_AN]+SNAN[Gc_AN]); i++){
305 Gi = natn[Gc_AN][i];
306 wanA = WhatSpecies[Gi];
307 NUM += Spe_Total_CNO[wanA];
308 }
309 Msize2[Mc_AN] = NUM;
310 }
311
312 if (Msize2_max<Msize2[Mc_AN]) Msize2_max = Msize2[Mc_AN] + 4;
313 }
314
315 /* find Msize3 and Msize4 */
316
317 Msize3[0] = 1;
318 Msize4[0] = 1;
319
320 for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
321 Gc_AN = M2G[Mc_AN];
322 wan = WhatSpecies[Gc_AN];
323 ct_on = Spe_Total_CNO[wan];
324 Msize3[Mc_AN] = rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN];
325 Msize4[Mc_AN] = rlmax_EC2[Mc_AN]*EKC_core_size[Mc_AN];
326 }
327
328 m_size = 0;
329
330 EVal = (double***)malloc(sizeof(double**)*(SpinP_switch+1));
331 for (spin=0; spin<=SpinP_switch; spin++){
332 EVal[spin] = (double**)malloc(sizeof(double*)*(Matomnum+1));
333
334 for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
335 n2 = Msize3[Mc_AN] + 2;
336 m_size += n2;
337 EVal[spin][Mc_AN] = (double*)malloc(sizeof(double)*n2);
338 }
339 }
340
341 if (firsttime){
342 PrintMemory("Krylov: EVal", sizeof(double)*m_size,NULL);
343 }
344
345 if (2<=level_stdout){
346 for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
347 printf("<Krylov> myid=%4d Mc_AN=%4d Gc_AN=%4d Msize=%4d\n",
348 myid,Mc_AN,M2G[Mc_AN],Msize[Mc_AN]);
349 }
350 }
351
352 /****************************************************
353 allocation of arrays:
354
355 double PDOS[SpinP_switch+1]
356 [Matomnum+1]
357 [n2]
358 ****************************************************/
359
360 m_size = 0;
361 PDOS_DC = (double***)malloc(sizeof(double**)*(SpinP_switch+1));
362 for (spin=0; spin<=SpinP_switch; spin++){
363 PDOS_DC[spin] = (double**)malloc(sizeof(double*)*(Matomnum+1));
364 for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
365 n2 = Msize3[Mc_AN] + 4;
366 m_size += n2;
367 PDOS_DC[spin][Mc_AN] = (double*)malloc(sizeof(double)*n2);
368 }
369 }
370
371 if (firsttime){
372 PrintMemory("Krylov: PDOS_DC",sizeof(double)*m_size,NULL);
373 }
374
375 /****************************************************
376 allocation of arrays:
377
378 int LO_TC[SpinP_switch+1][Matomnum+1]
379 int HO_TC[SpinP_switch+1][Matomnum+1]
380 ****************************************************/
381
382 LO_TC = (int**)malloc(sizeof(int*)*(SpinP_switch+1));
383 for (spin=0; spin<(SpinP_switch+1); spin++){
384 LO_TC[spin] = (int*)malloc(sizeof(int)*(Matomnum+1));
385 }
386
387 HO_TC = (int**)malloc(sizeof(int*)*(SpinP_switch+1));
388 for (spin=0; spin<(SpinP_switch+1); spin++){
389 HO_TC[spin] = (int*)malloc(sizeof(int)*(Matomnum+1));
390 }
391
392 /****************************************************
393 allocation of array:
394
395 double Residues[SpinP_switch+1]
396 [Matomnum+1]
397 [FNAN[Gc_AN]+1]
398 [Spe_Total_CNO[Gc_AN]]
399 [Spe_Total_CNO[Gh_AN]]
400 [HO_TC-LO_TC+3]
401 ****************************************************/
402
403 Residues = (double******)malloc(sizeof(double*****)*(SpinP_switch+1));
404 for (spin=0; spin<=SpinP_switch; spin++){
405 Residues[spin] = (double*****)malloc(sizeof(double****)*(Matomnum+1));
406 for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
407
408 if (Mc_AN==0){
409 Gc_AN = 0;
410 FNAN[0] = 0;
411 tno1 = 1;
412 }
413 else{
414 Gc_AN = M2G[Mc_AN];
415 wanA = WhatSpecies[Gc_AN];
416 tno1 = Spe_Total_CNO[wanA];
417 }
418
419 Residues[spin][Mc_AN] = (double****)malloc(sizeof(double***)*(FNAN[Gc_AN]+1));
420
421 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
422
423 if (Mc_AN==0){
424 tno2 = 1;
425 }
426 else {
427 Gh_AN = natn[Gc_AN][h_AN];
428 wanB = WhatSpecies[Gh_AN];
429 tno2 = Spe_Total_CNO[wanB];
430 }
431
432 Residues[spin][Mc_AN][h_AN] = (double***)malloc(sizeof(double**)*tno1);
433 for (i=0; i<tno1; i++){
434 Residues[spin][Mc_AN][h_AN][i] = (double**)malloc(sizeof(double*)*tno2);
435 /* note that the array is allocated once more in the loop */
436
437 /*
438 for (j=0; j<tno2; j++){
439 Residues[spin][Mc_AN][h_AN][i][j] = (double*)malloc(sizeof(double)*4100);
440 }
441 */
442
443 }
444 }
445 }
446 }
447
448 for (spin=0; spin<=SpinP_switch; spin++){
449 Residues[spin][0][0][0][0] = (double*)malloc(sizeof(double)*1);
450 }
451
452 /****************************************************
453 initialize density and energy density matrices
454 ****************************************************/
455
456 for (spin=0; spin<=SpinP_switch; spin++){
457 for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
458 Gc_AN = M2G[Mc_AN];
459 wanA = WhatSpecies[Gc_AN];
460 tno1 = Spe_Total_CNO[wanA];
461 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
462 Gh_AN = natn[Gc_AN][h_AN];
463 wanB = WhatSpecies[Gh_AN];
464 tno2 = Spe_Total_CNO[wanB];
465 for (i=0; i<tno1; i++){
466 for (j=0; j<tno2; j++){
467 CDM[spin][Mc_AN][h_AN][i][j] = 0.0;
468 EDM[spin][Mc_AN][h_AN][i][j] = 0.0;
469 }
470 }
471 }
472 }
473 }
474
475 /****************************************************
476 MPI
477
478 Hks
479 ****************************************************/
480
481 if (measure_time==1) dtime(&Stime1);
482
483 if (SCF_iter==1){
484
485 /***********************************
486 set data size
487 ************************************/
488
489 for (ID=0; ID<numprocs; ID++){
490
491 IDS = (myid + ID) % numprocs;
492 IDR = (myid - ID + numprocs) % numprocs;
493
494 if (ID!=0){
495 tag = 999;
496
497 /* find data size to send block data */
498 if ((F_Snd_Num[IDS]+S_Snd_Num[IDS])!=0){
499
500 size1 = 0;
501 for (spin=0; spin<=SpinP_switch; spin++){
502 for (n=0; n<(F_Snd_Num[IDS]+S_Snd_Num[IDS]); n++){
503 Mc_AN = Snd_MAN[IDS][n];
504 Gc_AN = Snd_GAN[IDS][n];
505 Cwan = WhatSpecies[Gc_AN];
506 tno1 = Spe_Total_CNO[Cwan];
507 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
508 Gh_AN = natn[Gc_AN][h_AN];
509 Hwan = WhatSpecies[Gh_AN];
510 tno2 = Spe_Total_CNO[Hwan];
511 size1 += tno1*tno2;
512 }
513 }
514 }
515
516 Snd_HFS_Size[IDS] = size1;
517 MPI_Isend(&size1, 1, MPI_INT, IDS, tag, mpi_comm_level1, &request);
518 }
519 else{
520 Snd_HFS_Size[IDS] = 0;
521 }
522
523 /* receiving of size of data */
524
525 if ((F_Rcv_Num[IDR]+S_Rcv_Num[IDR])!=0){
526
527 MPI_Recv(&size2, 1, MPI_INT, IDR, tag, mpi_comm_level1, &stat);
528 Rcv_HFS_Size[IDR] = size2;
529 }
530 else{
531 Rcv_HFS_Size[IDR] = 0;
532 }
533
534 if ((F_Snd_Num[IDS]+S_Snd_Num[IDS])!=0) MPI_Wait(&request,&stat);
535 }
536 else{
537 Snd_HFS_Size[IDS] = 0;
538 Rcv_HFS_Size[IDR] = 0;
539 }
540 }
541 }
542
543 /***********************************
544 data transfer
545 ************************************/
546
547 /* find maximum size of size1 */
548
549 max_size1 = 0;
550 max_size2 = 0;
551 for (ID=0; ID<numprocs; ID++){
552 size1 = Snd_HFS_Size[ID];
553 if (max_size1<size1) max_size1 = size1;
554 size2 = Rcv_HFS_Size[ID];
555 if (max_size2<size2) max_size2 = size2;
556 }
557
558 /* allocation of arrays */
559
560 tmp_array = (double*)malloc(sizeof(double)*max_size1);
561 tmp_array2 = (double*)malloc(sizeof(double)*max_size2);
562
563 /* MPI communication */
564
565 tag = 999;
566 for (ID=0; ID<numprocs; ID++){
567
568 IDS = (myid + ID) % numprocs;
569 IDR = (myid - ID + numprocs) % numprocs;
570
571 if (ID!=0){
572
573 /*****************************
574 sending of data
575 *****************************/
576
577 if ((F_Snd_Num[IDS]+S_Snd_Num[IDS])!=0){
578
579 size1 = Snd_HFS_Size[IDS];
580
581 /* multidimentional array to vector array */
582
583 num = 0;
584 for (spin=0; spin<=SpinP_switch; spin++){
585 for (n=0; n<(F_Snd_Num[IDS]+S_Snd_Num[IDS]); n++){
586 Mc_AN = Snd_MAN[IDS][n];
587 Gc_AN = Snd_GAN[IDS][n];
588 Cwan = WhatSpecies[Gc_AN];
589 tno1 = Spe_Total_CNO[Cwan];
590 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
591 Gh_AN = natn[Gc_AN][h_AN];
592 Hwan = WhatSpecies[Gh_AN];
593 tno2 = Spe_Total_CNO[Hwan];
594 for (i=0; i<tno1; i++){
595 for (j=0; j<tno2; j++){
596 tmp_array[num] = Hks[spin][Mc_AN][h_AN][i][j];
597 num++;
598 }
599 }
600 }
601 }
602 }
603
604 MPI_Isend(&tmp_array[0], size1, MPI_DOUBLE, IDS, tag, mpi_comm_level1, &request);
605 }
606
607 /*****************************
608 receiving of block data
609 *****************************/
610
611 if ((F_Rcv_Num[IDR]+S_Rcv_Num[IDR])!=0){
612
613 size2 = Rcv_HFS_Size[IDR];
614
615 MPI_Recv(&tmp_array2[0], size2, MPI_DOUBLE, IDR, tag, mpi_comm_level1, &stat);
616
617 num = 0;
618 for (spin=0; spin<=SpinP_switch; spin++){
619 Mc_AN = S_TopMAN[IDR] - 1; /* S_TopMAN should be used. */
620 for (n=0; n<(F_Rcv_Num[IDR]+S_Rcv_Num[IDR]); n++){
621 Mc_AN++;
622 Gc_AN = Rcv_GAN[IDR][n];
623 Cwan = WhatSpecies[Gc_AN];
624 tno1 = Spe_Total_CNO[Cwan];
625
626 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
627 Gh_AN = natn[Gc_AN][h_AN];
628 Hwan = WhatSpecies[Gh_AN];
629 tno2 = Spe_Total_CNO[Hwan];
630 for (i=0; i<tno1; i++){
631 for (j=0; j<tno2; j++){
632 Hks[spin][Mc_AN][h_AN][i][j] = tmp_array2[num];
633 num++;
634 }
635 }
636 }
637 }
638 }
639 }
640
641 if ((F_Snd_Num[IDS]+S_Snd_Num[IDS])!=0){
642 MPI_Wait(&request,&stat);
643 }
644 }
645 }
646
647 /****************************************************
648 MPI
649
650 OLP0
651 ****************************************************/
652
653 /***********************************
654 set data size
655 ************************************/
656
657 if (SCF_iter==1){
658
659 /***********************************
660 data transfer
661 ************************************/
662
663 tag = 999;
664 for (ID=0; ID<numprocs; ID++){
665
666 IDS = (myid + ID) % numprocs;
667 IDR = (myid - ID + numprocs) % numprocs;
668
669 if (ID!=0){
670
671 /*****************************
672 sending of data
673 *****************************/
674
675 if ((F_Snd_Num[IDS]+S_Snd_Num[IDS])!=0){
676
677 size1 = Snd_HFS_Size[IDS]/(1+SpinP_switch);
678
679 /* multidimentional array to vector array */
680
681 num = 0;
682
683 for (n=0; n<(F_Snd_Num[IDS]+S_Snd_Num[IDS]); n++){
684 Mc_AN = Snd_MAN[IDS][n];
685 Gc_AN = Snd_GAN[IDS][n];
686 Cwan = WhatSpecies[Gc_AN];
687 tno1 = Spe_Total_CNO[Cwan];
688 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
689 Gh_AN = natn[Gc_AN][h_AN];
690 Hwan = WhatSpecies[Gh_AN];
691 tno2 = Spe_Total_CNO[Hwan];
692 for (i=0; i<tno1; i++){
693 for (j=0; j<tno2; j++){
694 tmp_array[num] = OLP0[Mc_AN][h_AN][i][j];
695 num++;
696 }
697 }
698 }
699 }
700
701 MPI_Isend(&tmp_array[0], size1, MPI_DOUBLE, IDS, tag, mpi_comm_level1, &request);
702 }
703
704 /*****************************
705 receiving of block data
706 *****************************/
707
708 if ((F_Rcv_Num[IDR]+S_Rcv_Num[IDR])!=0){
709
710 size2 = Rcv_HFS_Size[IDR]/(1+SpinP_switch);
711
712 MPI_Recv(&tmp_array2[0], size2, MPI_DOUBLE, IDR, tag, mpi_comm_level1, &stat);
713
714 num = 0;
715 Mc_AN = S_TopMAN[IDR] - 1; /* S_TopMAN should be used. */
716 for (n=0; n<(F_Rcv_Num[IDR]+S_Rcv_Num[IDR]); n++){
717 Mc_AN++;
718 Gc_AN = Rcv_GAN[IDR][n];
719 Cwan = WhatSpecies[Gc_AN];
720 tno1 = Spe_Total_CNO[Cwan];
721
722 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
723 Gh_AN = natn[Gc_AN][h_AN];
724 Hwan = WhatSpecies[Gh_AN];
725 tno2 = Spe_Total_CNO[Hwan];
726 for (i=0; i<tno1; i++){
727 for (j=0; j<tno2; j++){
728 OLP0[Mc_AN][h_AN][i][j] = tmp_array2[num];
729 num++;
730 }
731 }
732 }
733 }
734 }
735
736 if ((F_Snd_Num[IDS]+S_Snd_Num[IDS])!=0){
737 MPI_Wait(&request,&stat);
738 }
739 }
740 }
741 }
742
743 /* freeing of arrays */
744 free(tmp_array);
745 free(tmp_array2);
746
747 if (measure_time==1){
748 dtime(&Etime1);
749 time1 = Etime1 - Stime1;
750 }
751
752 /***********************************************
753 for regeneration of the buffer matrix
754 ***********************************************/
755
756 if (sqrt(fabs(NormRD[0]))<(0.2+0.10*(double)atomnum) && recalc_firsttime){
757 recalc_flag = 1;
758 recalc_firsttime = 0;
759 }
760 else{
761 recalc_flag = 0;
762 }
763
764 if (SCF_iter==1) recalc_firsttime = 1;
765
766 if (error_check==1){
767 printf("SCF_iter=%2d recalc_firsttime=%2d\n",SCF_iter,recalc_firsttime);
768 }
769
770 if (measure_time==1) dtime(&Stime2);
771
772 #pragma omp parallel shared(EKC_invS_flag,List_YOUSO,Residues,EDM,CDM,HO_TC,LO_TC,ChemP,EVal,RMI1,S_G2M,EC_matrix,recalc_flag,recalc_EM,Krylov_U,SpinP_switch,EKC_core_size,EKC_core_size_max,rlmax_EC,rlmax_EC2,time11,time10,time9,time8,time7,time6,time5,time4,time3,time2,Hks,OLP0,EKC_Exact_invS_flag,SCF_iter,Msize4,Msize3,Msize2,Msize,Msize2_max,natn,FNAN,SNAN,Spe_Total_CNO,WhatSpecies,M2G,Matomnum,myid,time_per_atom,firsttime)
773 {
774 int OMPID,Nthrds,Nprocs;
775 int Mc_AN,Gc_AN,wan,spin;
776 int ig,ian,ih,kl,jg,jan,Bnum,m,n,rl;
777 int Anum,i,j,k,Gi,wanA,NUM,n2,csize,is,i2;
778 int i1,rl1,js,ip,po1,tno1,h_AN,Gh_AN,wanB,tno2;
779 int *MP;
780 int KU_d1, KU_d2,lda,ldb,ldc,M,N,K;
781 double alpha, beta;
782 double **invS;
783 double *LoS;
784 double *C,*KU;
785 double *H_DC,*ko;
786 double ***Krylov_U_OLP;
787 double **inv_RS;
788 double sum00,sum10,sum20,sum30,sum;
789 double tmp0,tmp1,tmp2,tmp3;
790 double Erange;
791 double Stime_atom,Etime_atom;
792 double Stime1,Etime1;
793 double **tmpvec0;
794 double **tmpvec1;
795 double **tmpvec2;
796
797 /* get info. on OpenMP */
798
799 OMPID = omp_get_thread_num();
800 Nthrds = omp_get_num_threads();
801 Nprocs = omp_get_num_procs();
802
803 /* allocation of arrays */
804
805 MP = (int*)malloc(sizeof(int)*List_YOUSO[2]);
806
807 tmpvec0 = (double**)malloc(sizeof(double*)*EKC_core_size_max);
808 for (i=0; i<EKC_core_size_max; i++){
809 tmpvec0[i] = (double*)malloc(sizeof(double)*Msize2_max);
810 }
811
812 tmpvec1 = (double**)malloc(sizeof(double*)*EKC_core_size_max);
813 for (i=0; i<EKC_core_size_max; i++){
814 tmpvec1[i] = (double*)malloc(sizeof(double)*Msize2_max);
815 }
816
817 tmpvec2 = (double**)malloc(sizeof(double*)*EKC_core_size_max);
818 for (i=0; i<EKC_core_size_max; i++){
819 tmpvec2[i] = (double*)malloc(sizeof(double)*Msize2_max);
820 }
821
822 if (firsttime && OMPID==0){
823 PrintMemory("Krylov: tmpvec0",sizeof(double)*EKC_core_size_max*Msize2_max,NULL);
824 PrintMemory("Krylov: tmpvec1",sizeof(double)*EKC_core_size_max*Msize2_max,NULL);
825 PrintMemory("Krylov: tmpvec2",sizeof(double)*EKC_core_size_max*Msize2_max,NULL);
826 }
827
828 /***********************************************
829 main loop of calculation
830 ***********************************************/
831
832 for (Mc_AN=1+OMPID; Mc_AN<=Matomnum; Mc_AN+=Nthrds){
833
834 dtime(&Stime_atom);
835
836 Gc_AN = M2G[Mc_AN];
837 wan = WhatSpecies[Gc_AN];
838
839 /* MP array */
840
841 Anum = 1;
842 for (i=0; i<=(FNAN[Gc_AN]+SNAN[Gc_AN]); i++){
843 MP[i] = Anum;
844 Gi = natn[Gc_AN][i];
845 wanA = WhatSpecies[Gi];
846 Anum += Spe_Total_CNO[wanA];
847 }
848 NUM = Anum - 1;
849 n2 = NUM + 40;
850
851 /***********************************************
852 allocation of arrays:
853 ***********************************************/
854
855 if (Msize[Mc_AN]<Msize3[Mc_AN])
856 csize = Msize3[Mc_AN] + 40;
857 else
858 csize = Msize[Mc_AN] + 40;
859
860 KU_d1 = EKC_core_size[Mc_AN]*Msize2[Mc_AN];
861 KU_d2 = Msize2[Mc_AN];
862
863 H_DC = (double*)malloc(sizeof(double)*csize*csize);
864 ko = (double*)malloc(sizeof(double)*csize);
865 C = (double*)malloc(sizeof(double)*csize*csize);
866 KU = (double*)malloc(sizeof(double)*(Msize2[Mc_AN]+2)*Msize3[Mc_AN]);
867
868 /***********************************************
869 calculate the inverse of overlap matrix
870 ***********************************************/
871
872 if (SCF_iter==1 && Msize3[Mc_AN]<Msize2[Mc_AN] && EKC_Exact_invS_flag==1){
873
874 LoS = (double*)malloc(sizeof(double)*(Msize2[Mc_AN]+3)*(Msize2[Mc_AN]+3));
875
876 invS = (double**)malloc(sizeof(double*)*(Msize2[Mc_AN]+3));
877 for (i=0; i<(Msize2[Mc_AN]+3); i++){
878 invS[i] = (double*)malloc(sizeof(double)*(Msize2[Mc_AN]+3));
879 }
880
881 if (measure_time==1) dtime(&Stime1);
882
883 Inverse_S_by_Cholesky(Mc_AN, OLP0, invS, MP, NUM, LoS);
884
885 if (measure_time==1 && OMPID==0){
886 dtime(&Etime1);
887 time2 += Etime1 - Stime1;
888 }
889 }
890
891 else if (SCF_iter==1 && Msize3[Mc_AN]<Msize2[Mc_AN] && EKC_invS_flag==1){
892
893 Krylov_U_OLP = (double***)malloc(sizeof(double**)*rlmax_EC2[Mc_AN]);
894 for (i=0; i<rlmax_EC2[Mc_AN]; i++){
895 Krylov_U_OLP[i] = (double**)malloc(sizeof(double*)*EKC_core_size[Mc_AN]);
896 for (j=0; j<EKC_core_size[Mc_AN]; j++){
897 Krylov_U_OLP[i][j] = (double*)malloc(sizeof(double)*(Msize2[Mc_AN]+3));
898 for (k=0; k<(Msize2[Mc_AN]+3); k++) Krylov_U_OLP[i][j][k] = 0.0;
899 }
900 }
901
902 inv_RS = (double**)malloc(sizeof(double*)*(rlmax_EC2[Mc_AN]+1)*EKC_core_size[Mc_AN]);
903 for (i=0; i<(rlmax_EC2[Mc_AN]+1)*EKC_core_size[Mc_AN]; i++){
904 inv_RS[i] = (double*)malloc(sizeof(double)*(rlmax_EC2[Mc_AN]+1)*EKC_core_size[Mc_AN]);
905 }
906
907 if (measure_time==1) dtime(&Stime1);
908
909 Krylov_IOLP( Mc_AN, OLP0, Krylov_U_OLP, inv_RS, MP, Msize2, Msize4, Msize2_max, tmpvec0, tmpvec1 );
910
911 if (measure_time==1 && OMPID==0){
912 dtime(&Etime1);
913 time2 += Etime1 - Stime1;
914 }
915 }
916
917 for (spin=0; spin<=SpinP_switch; spin++){
918
919 /****************************************************
920 generate a preconditioning matrix
921 ****************************************************/
922
923 if (measure_time==1) dtime(&Stime1);
924
925 if (SCF_iter==1 && Msize3[Mc_AN]<Msize2[Mc_AN]){
926
927 Generate_pMatrix( myid, spin, Mc_AN, Hks, OLP0, invS, Krylov_U, Krylov_U_OLP, inv_RS, MP,
928 Msize, Msize2, Msize3, Msize4, Msize2_max, tmpvec0, tmpvec1, tmpvec2 );
929 }
930 else if (SCF_iter==1){
931
932 Generate_pMatrix2( myid, spin, Mc_AN, Hks, OLP0, Krylov_U, MP, Msize, Msize2, Msize3, tmpvec1 );
933 }
934
935 if (measure_time==1 && OMPID==0){
936 dtime(&Etime1);
937 time3 += Etime1 - Stime1;
938 }
939
940 if (measure_time==1) dtime(&Stime1);
941
942 if (recalc_EM==1 || SCF_iter==1 || recalc_flag==1){
943
944 Embedding_Matrix( spin, Mc_AN, Hks, Krylov_U, EC_matrix, MP, Msize, Msize2, Msize3, tmpvec1);
945 }
946
947 if (measure_time==1 && OMPID==0){
948 dtime(&Etime1);
949 time4 += Etime1 - Stime1;
950 }
951
952 /****************************************************
953 construct the Hamiltonian matrix
954 ****************************************************/
955
956 if (measure_time==1) dtime(&Stime1);
957
958 for (i=0; i<=FNAN[Gc_AN]; i++){
959 ig = natn[Gc_AN][i];
960 ian = Spe_Total_CNO[WhatSpecies[ig]];
961 Anum = MP[i] - 1;
962 ih = S_G2M[ig]; /* S_G2M should be used */
963
964 for (j=0; j<=FNAN[Gc_AN]; j++){
965
966 kl = RMI1[Mc_AN][i][j];
967 jg = natn[Gc_AN][j];
968 jan = Spe_Total_CNO[WhatSpecies[jg]];
969 Bnum = MP[j] - 1;
970
971 if (0<=kl){
972 for (m=0; m<ian; m++){
973 for (n=0; n<jan; n++){
974 H_DC[(Anum+m)*Msize[Mc_AN]+Bnum+n] = Hks[spin][ih][kl][m][n];
975 }
976 }
977 }
978
979 else{
980 for (m=0; m<ian; m++){
981 for (n=0; n<jan; n++){
982 H_DC[(Anum+m)*Msize[Mc_AN]+Bnum+n] = 0.0;
983 }
984 }
985 }
986 }
987 }
988
989 if (measure_time==1 && OMPID==0){
990 dtime(&Etime1);
991 time5 += Etime1 - Stime1;
992 }
993
994 /****************************************************
995 transform u1^+ * H_DC * u1
996 ****************************************************/
997
998 /* H_DC * u1 */
999
1000 if (measure_time==1) dtime(&Stime1);
1001
1002 /* original version
1003
1004 for (i=1; i<=Msize[Mc_AN]; i++){
1005 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
1006 for (n=0; n<EKC_core_size[Mc_AN]; n++){
1007
1008 sum = 0.0;
1009 for (j=1; j<=Msize[Mc_AN]; j++){
1010 sum += H_DC[i][j]*Krylov_U[spin][Mc_AN][rl][n][j];
1011 }
1012
1013 C[rl*EKC_core_size[Mc_AN]+n+1][i] = sum;
1014 }
1015 }
1016 }
1017 */
1018
1019 /* BLAS3 version */
1020
1021 for (i=0; i<Msize[Mc_AN]; i++){
1022 for (j=0; j<Msize3[Mc_AN]; j++){
1023 KU[j*Msize[Mc_AN]+i] = Krylov_U[spin][Mc_AN][j*Msize2[Mc_AN]+i+1];
1024 }
1025 }
1026
1027 alpha = 1.0; beta = 0.0;
1028 M = Msize[Mc_AN]; N = Msize3[Mc_AN]; K = Msize[Mc_AN];
1029 lda = M; ldb = K; ldc = Msize[Mc_AN];
1030
1031 F77_NAME(dgemm,DGEMM)( "N", "N", &M, &N, &K, &alpha,
1032 H_DC, &lda, KU, &ldb, &beta, C, &ldc);
1033
1034 if (measure_time==1 && OMPID==0){
1035 dtime(&Etime1);
1036 time6 += Etime1 - Stime1;
1037 }
1038
1039 /* u1^+ * H_DC * u1 */
1040
1041 if (measure_time==1) dtime(&Stime1);
1042
1043 /* original version */
1044
1045 /*
1046 for (rl1=0; rl1<rlmax_EC[Mc_AN]; rl1++){
1047 for (m=0; m<EKC_core_size[Mc_AN]; m++){
1048 for (rl2=rl1; rl2<rlmax_EC[Mc_AN]; rl2++){
1049 for (n=0; n<EKC_core_size[Mc_AN]; n++){
1050 sum = 0.0;
1051 i2 = rl2*EKC_core_size[Mc_AN] + n + 1;
1052 for (i=1; i<=Msize[Mc_AN]; i++){
1053 sum += Krylov_U[spin][Mc_AN][rl1][m][i]*C[i2][i];
1054 }
1055
1056 H_DC[rl1*EKC_core_size[Mc_AN]+m+1][rl2*EKC_core_size[Mc_AN]+n+1] = sum;
1057 H_DC[rl2*EKC_core_size[Mc_AN]+n+1][rl1*EKC_core_size[Mc_AN]+m+1] = sum;
1058 }
1059 }
1060 }
1061 }
1062 */
1063
1064 /* BLAS3 version */
1065
1066 alpha = 1.0; beta = 0.0;
1067 M = Msize3[Mc_AN]; N = Msize3[Mc_AN]; K = Msize[Mc_AN];
1068 lda = K; ldb = K; ldc = Msize3[Mc_AN];
1069
1070 F77_NAME(dgemm,DGEMM)("T", "N", &M, &N, &K, &alpha,
1071 KU, &lda, C, &ldb, &beta, H_DC, &ldc);
1072
1073 if (measure_time==1 && OMPID==0){
1074 dtime(&Etime1);
1075 time7 += Etime1 - Stime1;
1076 }
1077
1078 /* correction for ZeroNum */
1079
1080 m = (int)Krylov_U[spin][Mc_AN][0];
1081 for (i=0; i<m; i++){
1082 H_DC[i*Msize3[Mc_AN]+i] = 1.0e+3;
1083 }
1084
1085 /****************************************************
1086 H0 = u1^+ * H_DC * u1 + D
1087 ****************************************************/
1088
1089 if (measure_time==1) dtime(&Stime1);
1090
1091 for (i=(Msize3[Mc_AN]-1); 0<=i; i--){
1092 for (j=0; j<Msize3[Mc_AN]; j++){
1093 H_DC[(i+1)*(Msize3[Mc_AN]+1)+(j+1)] = H_DC[i*Msize3[Mc_AN]+j] + EC_matrix[spin][Mc_AN][i+1][j+1];
1094 }
1095 }
1096
1097 if (measure_time==1 && OMPID==0){
1098 dtime(&Etime1);
1099 time8 += Etime1 - Stime1;
1100 }
1101
1102 /****************************************************
1103 diagonalize
1104 ****************************************************/
1105
1106 if (measure_time==1) dtime(&Stime1);
1107
1108 Eigen_lapack2(H_DC,Msize3[Mc_AN]+1,ko,Msize3[Mc_AN],Msize3[Mc_AN]);
1109
1110 if (measure_time==1 && OMPID==0){
1111 dtime(&Etime1);
1112 time9 += Etime1 - Stime1;
1113 }
1114
1115 /********************************************
1116 back transformation of eigenvectors
1117 c = u1 * b
1118 *********************************************/
1119
1120 if (measure_time==1) dtime(&Stime1);
1121
1122 /* original version */
1123
1124 /*
1125 for (i=1; i<=Msize[Mc_AN]; i++){
1126 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
1127 for (n=0; n<EKC_core_size[Mc_AN]; n++){
1128
1129 tmp1 = Krylov_U[spin][Mc_AN][rl][n][i];
1130 i1 = rl*EKC_core_size[Mc_AN] + n + 1;
1131
1132 for (j=1; j<=Msize3[Mc_AN]; j++){
1133 C[i][j] += tmp1*H_DC[i1][j];
1134 }
1135 }
1136 }
1137 }
1138 */
1139
1140 /* BLAS3 version */
1141
1142 alpha = 1.0; beta = 0.0;
1143 M = Msize3[Mc_AN]; N = Msize[Mc_AN]; K = Msize3[Mc_AN];
1144 lda = K; ldb = N; ldc = Msize3[Mc_AN];
1145
1146 F77_NAME(dgemm,DGEMM)("T", "T", &M, &N, &K, &alpha,
1147 H_DC, &lda, KU, &ldb, &beta, C, &ldc);
1148
1149 if (measure_time==1 && OMPID==0){
1150 dtime(&Etime1);
1151 time10 += Etime1 - Stime1;
1152 }
1153
1154 if (measure_time==1) dtime(&Stime1);
1155
1156 /***********************************************
1157 store eigenvalues and residues of poles
1158 ***********************************************/
1159
1160 for (i=1; i<=Msize3[Mc_AN]; i++){
1161 EVal[spin][Mc_AN][i-1] = ko[i];
1162 }
1163
1164 /******************************************************
1165 set an energy range (-Erange+ChemP to Erange+ChemP)
1166 of eigenvalues used to store the Residues.
1167 ******************************************************/
1168
1169 Erange = 0.367493245; /* in hartree, corresponds to 10 eV */
1170
1171 /***********************************************
1172 find LO_TC and HO_TC
1173 ***********************************************/
1174
1175 /* LO_TC */
1176 i = 0;
1177 ip = 0;
1178 po1 = 0;
1179 do{
1180 if ( (ChemP-Erange)<EVal[spin][Mc_AN][i]){
1181 ip = i;
1182 po1 = 1;
1183 }
1184 i++;
1185 } while (po1==0 && i<Msize3[Mc_AN]);
1186
1187 LO_TC[spin][Mc_AN] = ip;
1188
1189 /* HO_TC */
1190 i = 0;
1191 ip = Msize3[Mc_AN]-1;
1192 po1 = 0;
1193 do{
1194 if ( (ChemP+Erange)<EVal[spin][Mc_AN][i]){
1195 ip = i;
1196 po1 = 1;
1197 }
1198 i++;
1199 } while (po1==0 && i<Msize3[Mc_AN]);
1200
1201 HO_TC[spin][Mc_AN] = ip;
1202
1203 /***********************************************
1204 store residues of poles
1205 ***********************************************/
1206
1207 n2 = HO_TC[spin][Mc_AN] - LO_TC[spin][Mc_AN] + 3;
1208 if (n2<1) n2 = 1;
1209
1210 wanA = WhatSpecies[Gc_AN];
1211 tno1 = Spe_Total_CNO[wanA];
1212
1213 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
1214
1215 Gh_AN = natn[Gc_AN][h_AN];
1216 wanB = WhatSpecies[Gh_AN];
1217 tno2 = Spe_Total_CNO[wanB];
1218 Bnum = MP[h_AN] - 1;
1219
1220 for (i=0; i<tno1; i++){
1221 for (j=0; j<tno2; j++){
1222
1223 for (i1=0; i1<LO_TC[spin][Mc_AN]; i1++){
1224 tmp1 = C[i*Msize3[Mc_AN]+i1]*C[(Bnum+j)*Msize3[Mc_AN]+i1];
1225 CDM[spin][Mc_AN][h_AN][i][j] += tmp1;
1226 EDM[spin][Mc_AN][h_AN][i][j] += tmp1*EVal[spin][Mc_AN][i1];
1227 }
1228
1229 /* <allocation of Residues */
1230 Residues[spin][Mc_AN][h_AN][i][j] = (double*)malloc(sizeof(double)*n2);
1231 /* allocation of Residues> */
1232
1233 for (i1=LO_TC[spin][Mc_AN]; i1<=HO_TC[spin][Mc_AN]; i1++){
1234 Residues[spin][Mc_AN][h_AN][i][j][i1-LO_TC[spin][Mc_AN]]
1235 = C[i*Msize3[Mc_AN]+i1]*C[(Bnum+j)*Msize3[Mc_AN]+i1];
1236 }
1237 }
1238 }
1239 }
1240
1241 if (measure_time==1 && OMPID==0){
1242 dtime(&Etime1);
1243 time11 += Etime1 - Stime1;
1244 }
1245
1246 } /* spin */
1247
1248 /***********************************************
1249 freeing of arrays:
1250 ***********************************************/
1251
1252 free(H_DC);
1253 free(ko);
1254 free(C);
1255 free(KU);
1256
1257 if (SCF_iter==1 && Msize3[Mc_AN]<Msize2[Mc_AN] && EKC_Exact_invS_flag==1){
1258
1259 free(LoS);
1260
1261 for (i=0; i<(Msize2[Mc_AN]+3); i++){
1262 free(invS[i]);
1263 }
1264 free(invS);
1265 }
1266
1267 else if (SCF_iter==1 && Msize3[Mc_AN]<Msize2[Mc_AN] && EKC_invS_flag==1){
1268
1269 for (i=0; i<rlmax_EC2[Mc_AN]; i++){
1270 for (j=0; j<EKC_core_size[Mc_AN]; j++){
1271 free(Krylov_U_OLP[i][j]);
1272 }
1273 free(Krylov_U_OLP[i]);
1274 }
1275 free(Krylov_U_OLP);
1276
1277 for (i=0; i<(rlmax_EC2[Mc_AN]+1)*EKC_core_size[Mc_AN]; i++){
1278 free(inv_RS[i]);
1279 }
1280 free(inv_RS);
1281 }
1282
1283 dtime(&Etime_atom);
1284 time_per_atom[Gc_AN] += Etime_atom - Stime_atom;
1285
1286 } /* Mc_AN */
1287
1288 /* freeing of array */
1289
1290 free(MP);
1291
1292 for (i=0; i<EKC_core_size_max; i++){
1293 free(tmpvec0[i]);
1294 }
1295 free(tmpvec0);
1296
1297 for (i=0; i<EKC_core_size_max; i++){
1298 free(tmpvec1[i]);
1299 }
1300 free(tmpvec1);
1301
1302 for (i=0; i<EKC_core_size_max; i++){
1303 free(tmpvec2[i]);
1304 }
1305 free(tmpvec2);
1306
1307 } /* #pragma omp parallel */
1308
1309 if (measure_time==1){
1310 dtime(&Etime2);
1311 time16 = Etime2 - Stime2;
1312 }
1313
1314 if (firsttime){
1315
1316 Residues_size = 1;
1317
1318 for (spin=0; spin<=SpinP_switch; spin++){
1319 for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
1320 Gc_AN = M2G[Mc_AN];
1321 wan = WhatSpecies[Gc_AN];
1322 tno1 = Spe_Total_CNO[wan];
1323 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
1324 Gh_AN = natn[Gc_AN][h_AN];
1325 wanB = WhatSpecies[Gh_AN];
1326 tno2 = Spe_Total_CNO[wanB];
1327 n2 = HO_TC[spin][Mc_AN] - LO_TC[spin][Mc_AN] + 3;
1328 Residues_size += tno1*tno2*n2;
1329 }
1330 }
1331 }
1332
1333 PrintMemory("Krylov: Residues",sizeof(double)*Residues_size,NULL);
1334 }
1335
1336 /****************************************************
1337 calculate the projected DOS
1338 ****************************************************/
1339
1340 if (measure_time==1) dtime(&Stime1);
1341
1342 #pragma omp parallel shared(time_per_atom,Residues,LO_TC,HO_TC,EDM,CDM,OLP0,natn,FNAN,PDOS_DC,Msize3,Spe_Total_CNO,WhatSpecies,M2G,SpinP_switch,Matomnum) private(OMPID,Nthrds,Nprocs,Mc_AN,Stime_atom,spin,Gc_AN,wanA,tno1,i1,i,h_AN,Gh_AN,wanB,tno2,j,tmp1,Etime_atom)
1343 {
1344
1345 /* get info. on OpenMP */
1346
1347 OMPID = omp_get_thread_num();
1348 Nthrds = omp_get_num_threads();
1349 Nprocs = omp_get_num_procs();
1350
1351 for (Mc_AN=1+OMPID; Mc_AN<=Matomnum; Mc_AN+=Nthrds){
1352
1353 dtime(&Stime_atom);
1354
1355 for (spin=0; spin<=SpinP_switch; spin++){
1356
1357 Gc_AN = M2G[Mc_AN];
1358 wanA = WhatSpecies[Gc_AN];
1359 tno1 = Spe_Total_CNO[wanA];
1360
1361 for (i1=0; i1<=(Msize3[Mc_AN]+1); i1++){
1362 PDOS_DC[spin][Mc_AN][i1] = 0.0;
1363 }
1364
1365 for (i=0; i<tno1; i++){
1366 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
1367 Gh_AN = natn[Gc_AN][h_AN];
1368 wanB = WhatSpecies[Gh_AN];
1369 tno2 = Spe_Total_CNO[wanB];
1370 for (j=0; j<tno2; j++){
1371
1372 tmp1 = OLP0[Mc_AN][h_AN][i][j];
1373
1374 PDOS_DC[spin][Mc_AN][0] += tmp1*CDM[spin][Mc_AN][h_AN][i][j];
1375 PDOS_DC[spin][Mc_AN][1] += tmp1*EDM[spin][Mc_AN][h_AN][i][j];
1376
1377 for (i1=0; i1<(HO_TC[spin][Mc_AN]-LO_TC[spin][Mc_AN]+1); i1++){
1378 PDOS_DC[spin][Mc_AN][i1+2] += Residues[spin][Mc_AN][h_AN][i][j][i1]*tmp1;
1379 }
1380
1381 }
1382 }
1383 }
1384
1385 } /* spin */
1386
1387 dtime(&Etime_atom);
1388 time_per_atom[Gc_AN] += Etime_atom - Stime_atom;
1389
1390 } /* Mc_AN */
1391
1392 } /* #pragma omp parallel */
1393
1394 if (measure_time==1){
1395 dtime(&Etime1);
1396 time12 += Etime1 - Stime1;
1397 }
1398
1399 /****************************************************
1400 find the total number of electrons
1401 ****************************************************/
1402
1403 if (measure_time==1) dtime(&Stime1);
1404
1405 My_TZ = 0.0;
1406 for (i=1; i<=Matomnum; i++){
1407 Gc_AN = M2G[i];
1408 wan = WhatSpecies[Gc_AN];
1409 My_TZ += Spe_Core_Charge[wan];
1410 }
1411
1412 /* MPI, My_TZ */
1413
1414 MPI_Barrier(mpi_comm_level1);
1415 MPI_Allreduce(&My_TZ, &TZ, 1, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
1416
1417 /****************************************************
1418 find the chemical potential
1419 ****************************************************/
1420
1421 po = 0;
1422 loopN = 0;
1423
1424 ChemP_MAX = 10.0;
1425 ChemP_MIN =-10.0;
1426 if (SpinP_switch==0) spin_degeneracy = 2.0;
1427 else if (SpinP_switch==1) spin_degeneracy = 1.0;
1428
1429 do {
1430 ChemP = 0.50*(ChemP_MAX + ChemP_MIN);
1431
1432 My_Num_State = 0.0;
1433
1434 for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
1435 for (spin=0; spin<=SpinP_switch; spin++){
1436
1437 dtime(&Stime_atom);
1438
1439 Gc_AN = M2G[Mc_AN];
1440
1441 My_Num_State += spin_degeneracy*PDOS_DC[spin][Mc_AN][0];
1442
1443 for (i=0; i<(HO_TC[spin][Mc_AN]-LO_TC[spin][Mc_AN]+1); i++){
1444
1445 x = (EVal[spin][Mc_AN][i+LO_TC[spin][Mc_AN]] - ChemP)*Beta;
1446 if (x<=-max_x) x = -max_x;
1447 if (max_x<=x) x = max_x;
1448 FermiF = 1.0/(1.0 + exp(x));
1449 My_Num_State += spin_degeneracy*FermiF*PDOS_DC[spin][Mc_AN][i+2];
1450 }
1451
1452 dtime(&Etime_atom);
1453 time_per_atom[Gc_AN] += Etime_atom - Stime_atom;
1454 }
1455 }
1456
1457 /* MPI, My_Num_State */
1458
1459 MPI_Barrier(mpi_comm_level1);
1460 MPI_Allreduce(&My_Num_State, &Num_State, 1, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
1461
1462 Dnum = (TZ - Num_State) - system_charge;
1463 if (0.0<=Dnum) ChemP_MIN = ChemP;
1464 else ChemP_MAX = ChemP;
1465 if (fabs(Dnum)<1.0e-11) po = 1;
1466
1467
1468 if (myid==Host_ID && 3<=level_stdout){
1469 printf(" ChemP=%15.12f TZ=%15.12f Num_state=%15.12f\n",ChemP,TZ,Num_State);
1470 }
1471
1472 loopN++;
1473 }
1474 while (po==0 && loopN<1000);
1475
1476 if (measure_time==1){
1477 dtime(&Etime1);
1478 time13 += Etime1 - Stime1;
1479 }
1480
1481 /****************************************************
1482 eigenenergy by summing up eigenvalues
1483 ****************************************************/
1484
1485 if (measure_time==1) dtime(&Stime1);
1486
1487 My_Eele0[0] = 0.0;
1488 My_Eele0[1] = 0.0;
1489 for (spin=0; spin<=SpinP_switch; spin++){
1490 for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
1491
1492 dtime(&Stime_atom);
1493
1494 Gc_AN = M2G[Mc_AN];
1495 My_Eele0[spin] += PDOS_DC[spin][Mc_AN][1];
1496
1497 for (i=0; i<(HO_TC[spin][Mc_AN]-LO_TC[spin][Mc_AN]+1); i++){
1498
1499 x = (EVal[spin][Mc_AN][i+LO_TC[spin][Mc_AN]] - ChemP)*Beta;
1500
1501 if (x<=-max_x) x = -max_x;
1502 if (max_x<=x) x = max_x;
1503 FermiF = 1.0/(1.0 + exp(x));
1504 My_Eele0[spin] += FermiF*EVal[spin][Mc_AN][i+LO_TC[spin][Mc_AN]]*PDOS_DC[spin][Mc_AN][i+2];
1505 }
1506
1507 dtime(&Etime_atom);
1508 time_per_atom[Gc_AN] += Etime_atom - Stime_atom;
1509 }
1510 }
1511
1512 /* MPI, My_Eele0 */
1513 for (spin=0; spin<=SpinP_switch; spin++){
1514 MPI_Barrier(mpi_comm_level1);
1515 MPI_Allreduce(&My_Eele0[spin], &Eele0[spin], 1, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
1516 }
1517
1518 if (SpinP_switch==0){
1519 Eele0[1] = Eele0[0];
1520 }
1521
1522 if (measure_time==1){
1523 dtime(&Etime1);
1524 time14 += Etime1 - Stime1;
1525 }
1526
1527 if (measure_time==1) dtime(&Stime1);
1528
1529 #pragma omp parallel shared(FNAN,time_per_atom,EDM,CDM,Residues,natn,max_x,Beta,ChemP,EVal,LO_TC,HO_TC,Spe_Total_CNO,WhatSpecies,M2G,SpinP_switch,Matomnum) private(OMPID,Nthrds,Nprocs,Mc_AN,spin,Stime_atom,Gc_AN,wanA,tno1,i1,x,FermiF,h_AN,Gh_AN,wanB,tno2,i,j,tmp1,Etime_atom)
1530 {
1531
1532 /* get info. on OpenMP */
1533
1534 OMPID = omp_get_thread_num();
1535 Nthrds = omp_get_num_threads();
1536 Nprocs = omp_get_num_procs();
1537
1538 for (Mc_AN=1+OMPID; Mc_AN<=Matomnum; Mc_AN+=Nthrds){
1539 for (spin=0; spin<=SpinP_switch; spin++){
1540
1541 dtime(&Stime_atom);
1542
1543 Gc_AN = M2G[Mc_AN];
1544 wanA = WhatSpecies[Gc_AN];
1545 tno1 = Spe_Total_CNO[wanA];
1546
1547 for (i1=0; i1<(HO_TC[spin][Mc_AN]-LO_TC[spin][Mc_AN]+1); i1++){
1548
1549 x = (EVal[spin][Mc_AN][i1+LO_TC[spin][Mc_AN]] - ChemP)*Beta;
1550 if (x<=-max_x) x = -max_x;
1551 if (max_x<=x) x = max_x;
1552 FermiF = 1.0/(1.0 + exp(x));
1553
1554 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
1555 Gh_AN = natn[Gc_AN][h_AN];
1556 wanB = WhatSpecies[Gh_AN];
1557 tno2 = Spe_Total_CNO[wanB];
1558 for (i=0; i<tno1; i++){
1559 for (j=0; j<tno2; j++){
1560 tmp1 = FermiF*Residues[spin][Mc_AN][h_AN][i][j][i1];
1561 CDM[spin][Mc_AN][h_AN][i][j] += tmp1;
1562 EDM[spin][Mc_AN][h_AN][i][j] += tmp1*EVal[spin][Mc_AN][i1+LO_TC[spin][Mc_AN]];
1563 }
1564 }
1565 }
1566 }
1567
1568 dtime(&Etime_atom);
1569 time_per_atom[Gc_AN] += Etime_atom - Stime_atom;
1570 }
1571 }
1572
1573 } /* #pragma omp parallel */
1574
1575 /****************************************************
1576 bond energies
1577 ****************************************************/
1578
1579 My_Eele1[0] = 0.0;
1580 My_Eele1[1] = 0.0;
1581 for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
1582 GA_AN = M2G[MA_AN];
1583 wanA = WhatSpecies[GA_AN];
1584 tnoA = Spe_Total_CNO[wanA];
1585
1586 for (j=0; j<=FNAN[GA_AN]; j++){
1587 GB_AN = natn[GA_AN][j];
1588 wanB = WhatSpecies[GB_AN];
1589 tnoB = Spe_Total_CNO[wanB];
1590
1591 for (k=0; k<tnoA; k++){
1592 for (l=0; l<tnoB; l++){
1593 for (spin=0; spin<=SpinP_switch; spin++){
1594 My_Eele1[spin] += CDM[spin][MA_AN][j][k][l]*Hks[spin][MA_AN][j][k][l];
1595 }
1596 }
1597 }
1598
1599 }
1600 }
1601
1602 /* MPI, My_Eele1 */
1603 MPI_Barrier(mpi_comm_level1);
1604 for (spin=0; spin<=SpinP_switch; spin++){
1605 MPI_Allreduce(&My_Eele1[spin], &Eele1[spin], 1, MPI_DOUBLE,
1606 MPI_SUM, mpi_comm_level1);
1607 }
1608
1609 if (SpinP_switch==0){
1610 Eele1[1] = Eele1[0];
1611 }
1612
1613 if (3<=level_stdout && myid==Host_ID){
1614 printf(" Eele00=%15.12f Eele01=%15.12f\n",Eele0[0],Eele0[1]);
1615 printf(" Eele10=%15.12f Eele11=%15.12f\n",Eele1[0],Eele1[1]);
1616 }
1617
1618 if (measure_time==1){
1619 dtime(&Etime1);
1620 time15 += Etime1 - Stime1;
1621 }
1622
1623 if ( strcasecmp(mode,"dos")==0 ){
1624 Save_DOS_Col(Residues,OLP0,EVal,LO_TC,HO_TC);
1625 }
1626
1627 if (measure_time==1){
1628 printf("myid=%2d time1 =%5.3f time2 =%5.3f time3 =%5.3f time4 =%5.3f time5 =%5.3f\n",
1629 myid,time1,time2,time3,time4,time5);
1630 printf("myid=%2d time6 =%5.3f time7 =%5.3f time8 =%5.3f time9 =%5.3f time10=%5.3f\n",
1631 myid,time6,time7,time8,time9,time10);
1632 printf("myid=%2d time11=%5.3f time12=%5.3f time13=%5.3f time14=%5.3f time15=%5.3f\n",
1633 myid,time11,time12,time13,time14,time15);
1634 printf("myid=%2d time16=%5.3f\n",myid,time16);
1635 }
1636
1637 /****************************************************
1638 freeing of arrays:
1639
1640 ****************************************************/
1641
1642 free(Msize);
1643 free(Msize2);
1644 free(Msize3);
1645 free(Msize4);
1646
1647 for (spin=0; spin<(SpinP_switch+1); spin++){
1648 free(LO_TC[spin]);
1649 }
1650 free(LO_TC);
1651
1652 for (spin=0; spin<(SpinP_switch+1); spin++){
1653 free(HO_TC[spin]);
1654 }
1655 free(HO_TC);
1656
1657 for (spin=0; spin<=SpinP_switch; spin++){
1658 for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
1659 free(EVal[spin][Mc_AN]);
1660 }
1661 free(EVal[spin]);
1662 }
1663 free(EVal);
1664
1665 for (spin=0; spin<=SpinP_switch; spin++){
1666 for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
1667
1668 if (Mc_AN==0){
1669 Gc_AN = 0;
1670 FNAN[0] = 0;
1671 tno1 = 1;
1672 }
1673 else{
1674 Gc_AN = M2G[Mc_AN];
1675 wanA = WhatSpecies[Gc_AN];
1676 tno1 = Spe_Total_CNO[wanA];
1677 }
1678
1679 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
1680
1681 if (Mc_AN==0){
1682 tno2 = 1;
1683 }
1684 else {
1685 Gh_AN = natn[Gc_AN][h_AN];
1686 wanB = WhatSpecies[Gh_AN];
1687 tno2 = Spe_Total_CNO[wanB];
1688 }
1689
1690 for (i=0; i<tno1; i++){
1691 for (j=0; j<tno2; j++){
1692 free(Residues[spin][Mc_AN][h_AN][i][j]);
1693 }
1694 free(Residues[spin][Mc_AN][h_AN][i]);
1695 }
1696 free(Residues[spin][Mc_AN][h_AN]);
1697 }
1698 free(Residues[spin][Mc_AN]);
1699 }
1700 free(Residues[spin]);
1701 }
1702 free(Residues);
1703
1704 for (spin=0; spin<=SpinP_switch; spin++){
1705 for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
1706 free(PDOS_DC[spin][Mc_AN]);
1707 }
1708 free(PDOS_DC[spin]);
1709 }
1710 free(PDOS_DC);
1711
1712 /* for time */
1713 dtime(&TEtime);
1714 time0 = TEtime - TStime;
1715
1716 if (measure_time==1){
1717 printf("total time=%15.12f\n",time0);
1718 }
1719
1720 /* for PrintMemory */
1721 firsttime=0;
1722
1723 return time0;
1724 }
1725
1726
1727
1728
1729
1730
1731
1732
1733
Generate_pMatrix(int myid,int spin,int Mc_AN,double ***** Hks,double **** OLP0,double ** invS,double *** Krylov_U,double *** Krylov_U_OLP,double ** inv_RS,int * MP,int * Msize,int * Msize2,int * Msize3,int * Msize4,int Msize2_max,double ** tmpvec0,double ** tmpvec1,double ** tmpvec2)1734 void Generate_pMatrix( int myid, int spin, int Mc_AN, double *****Hks,
1735 double ****OLP0, double **invS,
1736 double ***Krylov_U, double ***Krylov_U_OLP, double **inv_RS, int *MP,
1737 int *Msize, int *Msize2, int *Msize3, int *Msize4, int Msize2_max,
1738 double **tmpvec0, double **tmpvec1, double **tmpvec2 )
1739 {
1740 int rl,rl0,rl1,ct_AN,fan,san,can,wan,ct_on,i,j;
1741 int n,Anum,Bnum,k,ian,ih,kl,jg,ig,jan,m,m1,n1;
1742 int ZeroNum,Gh_AN,wanB,m1s,is;
1743 int rl00,rl01,rl02,rl03,rl04,rl05,rl06,rl07;
1744 int mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7;
1745
1746 int KU_d1, KU_d2, csize;
1747 __m128d mmSum00,mmSum01,mmSum10,mmSum11,mmSum20,mmSum21,mmSum30,mmSum31, mmTmp0, mmTmp1, mmTmp2, mmTmp3, mmTmp4, mmTmp5;
1748
1749 double mmArr[8];
1750 double time1,time2,time3,time4,time5;
1751 double time6,time7,time8,time9,time10;
1752 double Stime1,Etime1;
1753 double sum0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1754 double sum,dum,tmp0,tmp1,tmp2,tmp3,rcutA,r0;
1755 double **Utmp,**matRS0,**matRS1;
1756 double **tmpmat0;
1757 double *ko,*iko;
1758 double **FS;
1759 double ***U0;
1760
1761 ct_AN = M2G[Mc_AN];
1762 fan = FNAN[ct_AN];
1763 san = SNAN[ct_AN];
1764 can = fan + san;
1765 wan = WhatSpecies[ct_AN];
1766 ct_on = Spe_Total_CNO[wan];
1767 rcutA = Spe_Atom_Cut1[wan];
1768
1769 if (Msize[Mc_AN]<Msize3[Mc_AN])
1770 csize = Msize3[Mc_AN] + 40;
1771 else
1772 csize = Msize[Mc_AN] + 40;
1773
1774 KU_d1 = EKC_core_size[Mc_AN]*Msize2[Mc_AN];
1775 KU_d2 = Msize2[Mc_AN];
1776
1777 /* allocation of arrays */
1778
1779 Utmp = (double**)malloc(sizeof(double*)*rlmax_EC[Mc_AN]);
1780 for (i=0; i<rlmax_EC[Mc_AN]; i++){
1781 Utmp[i] = (double*)malloc(sizeof(double)*EKC_core_size[Mc_AN]);
1782 }
1783
1784 U0 = (double***)malloc(sizeof(double**)*rlmax_EC[Mc_AN]);
1785 for (i=0; i<rlmax_EC[Mc_AN]; i++){
1786 U0[i] = (double**)malloc(sizeof(double*)*EKC_core_size[Mc_AN]);
1787 for (j=0; j<EKC_core_size[Mc_AN]; j++){
1788 U0[i][j] = (double*)malloc(sizeof(double)*(Msize2[Mc_AN]+3));
1789 for (k=0; k<(Msize2[Mc_AN]+3); k++) U0[i][j][k] = 0.0;
1790 }
1791 }
1792
1793 tmpmat0 = (double**)malloc(sizeof(double*)*(EKC_core_size[Mc_AN]+4));
1794 for (i=0; i<(EKC_core_size[Mc_AN]+4); i++){
1795 tmpmat0[i] = (double*)malloc(sizeof(double)*(EKC_core_size[Mc_AN]+4));
1796 }
1797
1798 FS = (double**)malloc(sizeof(double*)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
1799 for (i=0; i<(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]; i++){
1800 FS[i] = (double*)malloc(sizeof(double)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
1801 }
1802
1803 ko = (double*)malloc(sizeof(double)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
1804 iko = (double*)malloc(sizeof(double)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
1805
1806 matRS0 = (double**)malloc(sizeof(double*)*(EKC_core_size[Mc_AN]+2));
1807 for (i=0; i<(EKC_core_size[Mc_AN]+2); i++){
1808 matRS0[i] = (double*)malloc(sizeof(double)*(Msize4[Mc_AN]+3));
1809 }
1810
1811 matRS1 = (double**)malloc(sizeof(double*)*(Msize4[Mc_AN]+3));
1812 for (i=0; i<(Msize4[Mc_AN]+3); i++){
1813 matRS1[i] = (double*)malloc(sizeof(double)*(EKC_core_size[Mc_AN]+2));
1814 }
1815
1816 /****************************************************
1817 initialize
1818 ****************************************************/
1819
1820 if (measure_time==1){
1821 time1 = 0.0;
1822 time2 = 0.0;
1823 time3 = 0.0;
1824 time4 = 0.0;
1825 time5 = 0.0;
1826 time6 = 0.0;
1827 time7 = 0.0;
1828 time8 = 0.0;
1829 time9 = 0.0;
1830 time10 = 0.0;
1831 }
1832
1833 if (measure_time==1) dtime(&Stime1);
1834
1835 for (i=0; i<EKC_core_size_max; i++){
1836 for (j=0; j<Msize2_max; j++){
1837 tmpvec0[i][j] = 0.0;
1838 tmpvec1[i][j] = 0.0;
1839 }
1840 }
1841
1842 /* find the nearest atom with distance of r0 */
1843
1844 r0 = 1.0e+10;
1845 for (k=1; k<=FNAN[ct_AN]; k++){
1846 Gh_AN = natn[ct_AN][k];
1847 wanB = WhatSpecies[Gh_AN];
1848 if (Dis[ct_AN][k]<r0) r0 = Dis[ct_AN][k];
1849 }
1850
1851 /* starting vector */
1852
1853 m = 0;
1854 for (k=0; k<=FNAN[ct_AN]; k++){
1855
1856 Gh_AN = natn[ct_AN][k];
1857 wanB = WhatSpecies[Gh_AN];
1858
1859 if ( Dis[ct_AN][k]<(scale_rc_EKC[Mc_AN]*r0) ){
1860
1861 Anum = MP[k] - 1;
1862
1863 for (i=0; i<Spe_Total_CNO[wanB]; i++){
1864
1865 tmpvec0[m][Anum+i] = 1.0;
1866
1867 m++;
1868 }
1869 }
1870 }
1871
1872 S_orthonormalize_vec( Mc_AN, ct_on, tmpvec0, tmpvec1, OLP0, tmpmat0, ko, iko, MP, Msize2 );
1873
1874 for (n=0; n<EKC_core_size[Mc_AN]; n++){
1875 for (i=0; i<Msize2[Mc_AN]; i++){
1876 U0[0][n][i] = tmpvec0[n][i];
1877 }
1878 }
1879
1880 if (measure_time==1){
1881 dtime(&Etime1);
1882 time1 = Etime1 - Stime1;
1883 }
1884
1885 /****************************************************
1886 generate Krylov subspace vectors
1887 ****************************************************/
1888
1889 for (rl=0; rl<(rlmax_EC[Mc_AN]-1); rl++){
1890
1891 if (measure_time==1) dtime(&Stime1);
1892
1893 /*******************************************************
1894 H * |Wn)
1895 *******************************************************/
1896
1897 for (n=0; n<EKC_core_size[Mc_AN]; n++){
1898 for (i=0; i<Msize2[Mc_AN]; i++){
1899 tmpvec1[n][i] = 0.0;
1900 }
1901 }
1902
1903 for (i=0; i<=can; i++){
1904
1905 ig = natn[ct_AN][i];
1906 ian = Spe_Total_CNO[WhatSpecies[ig]];
1907 Anum = MP[i] - 1;
1908 ih = S_G2M[ig];
1909
1910 for (j=0; j<=can; j++){
1911
1912 kl = RMI1[Mc_AN][i][j];
1913 jg = natn[ct_AN][j];
1914 jan = Spe_Total_CNO[WhatSpecies[jg]];
1915 Bnum = MP[j] - 1;
1916
1917 if (0<=kl){
1918
1919 #ifdef nosse
1920
1921 /* original version */
1922
1923 for (m=0; m<ian; m++){
1924 for (n=0; n<EKC_core_size[Mc_AN]; n++){
1925
1926 sum = 0.0;
1927 for (k=0; k<jan; k++){
1928 sum += Hks[spin][ih][kl][m][k]*tmpvec0[n][Bnum+k];
1929 }
1930
1931 tmpvec1[n][Anum+m] += sum;
1932 }
1933 }
1934
1935 #else
1936 /* Loop Unrolling + SSE version */
1937
1938 for (m=0; m<(ian-3); m+=4){
1939 for (n=0; n<EKC_core_size[Mc_AN]; n++){
1940
1941 mmSum00 = _mm_setzero_pd();
1942 mmSum01 = _mm_setzero_pd();
1943 mmSum10 = _mm_setzero_pd();
1944 mmSum11 = _mm_setzero_pd();
1945 mmSum20 = _mm_setzero_pd();
1946 mmSum21 = _mm_setzero_pd();
1947 mmSum30 = _mm_setzero_pd();
1948 mmSum31 = _mm_setzero_pd();
1949
1950 for (k=0; k<(jan-3); k+=4){
1951 mmTmp0 = _mm_loadu_pd(&tmpvec0[n][Bnum+k+0]);
1952 mmTmp1 = _mm_loadu_pd(&tmpvec0[n][Bnum+k+2]);
1953
1954 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+0][k+0]),mmTmp0));
1955 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+0][k+2]),mmTmp1));
1956
1957 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+1][k+0]),mmTmp0));
1958 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+1][k+2]),mmTmp1));
1959
1960 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+2][k+0]),mmTmp0));
1961 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+2][k+2]),mmTmp1));
1962
1963 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+3][k+0]),mmTmp0));
1964 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+3][k+2]),mmTmp1));
1965 }
1966
1967 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
1968 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
1969 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
1970 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
1971
1972 _mm_storeu_pd(&mmArr[0], mmSum00);
1973 _mm_storeu_pd(&mmArr[2], mmSum10);
1974 _mm_storeu_pd(&mmArr[4], mmSum20);
1975 _mm_storeu_pd(&mmArr[6], mmSum30);
1976
1977 sum0 = mmArr[0] + mmArr[1];
1978 sum1 = mmArr[2] + mmArr[3];
1979 sum2 = mmArr[4] + mmArr[5];
1980 sum3 = mmArr[6] + mmArr[7];
1981
1982 for (; k<jan; k++){
1983 sum0 += Hks[spin][ih][kl][m+0][k]*tmpvec0[n][Bnum+k];
1984 sum1 += Hks[spin][ih][kl][m+1][k]*tmpvec0[n][Bnum+k];
1985 sum2 += Hks[spin][ih][kl][m+2][k]*tmpvec0[n][Bnum+k];
1986 sum3 += Hks[spin][ih][kl][m+3][k]*tmpvec0[n][Bnum+k];
1987 }
1988
1989 tmpvec1[n][Anum+m+0] += sum0;
1990 tmpvec1[n][Anum+m+1] += sum1;
1991 tmpvec1[n][Anum+m+2] += sum2;
1992 tmpvec1[n][Anum+m+3] += sum3;
1993 }
1994 }
1995
1996 for (; m<ian; m++){
1997 for (n=0; n<EKC_core_size[Mc_AN]; n++){
1998
1999 sum = 0.0;
2000 for (k=0; k<jan; k++){
2001 sum += Hks[spin][ih][kl][m][k]*tmpvec0[n][Bnum+k];
2002 }
2003
2004 tmpvec1[n][Anum+m] += sum;
2005 }
2006 }
2007 #endif
2008 }
2009 }
2010 }
2011
2012 if (measure_time==1){
2013 dtime(&Etime1);
2014 time2 += Etime1 - Stime1;
2015 }
2016
2017 /*******************************************************
2018 S^{-1} * H * |Wn)
2019 *******************************************************/
2020
2021 if (EKC_Exact_invS_flag==1){
2022
2023 if (measure_time==1) dtime(&Stime1);
2024
2025 #ifdef nosse
2026
2027 /* original version */
2028 /*
2029 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2030 for (i=0; i<Msize2[Mc_AN]; i++){
2031 sum = 0.0;
2032 for (j=0; j<Msize2[Mc_AN]; j++){
2033 sum += invS[i][j]*tmpvec1[n][j];
2034 }
2035 tmpvec0[n][i] = sum;
2036 }
2037 }
2038 */
2039 /* unrolling version */
2040
2041 for (i=0; i<(Msize2[Mc_AN]-3); i+=4){
2042 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2043
2044 sum0 = 0.0;
2045 sum1 = 0.0;
2046 sum2 = 0.0;
2047 sum3 = 0.0;
2048
2049 for (j=0; j<Msize2[Mc_AN]; j++){
2050 sum0 += invS[i+0][j]*tmpvec1[n][j];
2051 sum1 += invS[i+1][j]*tmpvec1[n][j];
2052 sum2 += invS[i+2][j]*tmpvec1[n][j];
2053 sum3 += invS[i+3][j]*tmpvec1[n][j];
2054 }
2055
2056 tmpvec0[n][i+0] = sum0;
2057 tmpvec0[n][i+1] = sum1;
2058 tmpvec0[n][i+2] = sum2;
2059 tmpvec0[n][i+3] = sum3;
2060 }
2061 }
2062
2063 is = Msize2[Mc_AN] - Msize2[Mc_AN]%4;
2064
2065 for (i=is; i<Msize2[Mc_AN]; i++){
2066 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2067 sum = 0.0;
2068 for (j=0; j<Msize2[Mc_AN]; j++){
2069 sum += invS[i][j]*tmpvec1[n][j];
2070 }
2071 tmpvec0[n][i] = sum;
2072 }
2073 }
2074
2075 #else
2076 /* unrolling + SSE version */
2077
2078 for (i=0; i<(Msize2[Mc_AN]-3); i+=4){
2079 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2080
2081 mmSum00 = _mm_setzero_pd();
2082 mmSum01 = _mm_setzero_pd();
2083 mmSum10 = _mm_setzero_pd();
2084 mmSum11 = _mm_setzero_pd();
2085 mmSum20 = _mm_setzero_pd();
2086 mmSum21 = _mm_setzero_pd();
2087 mmSum30 = _mm_setzero_pd();
2088 mmSum31 = _mm_setzero_pd();
2089
2090 for (j=0; j<(Msize2[Mc_AN]-3); j+=4){
2091 mmTmp0 = _mm_loadu_pd(&tmpvec1[n][j+0]);
2092 mmTmp1 = _mm_loadu_pd(&tmpvec1[n][j+2]);
2093
2094 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&invS[i+0][j+0]),mmTmp0));
2095 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&invS[i+0][j+2]),mmTmp1));
2096
2097 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&invS[i+1][j+0]),mmTmp0));
2098 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&invS[i+1][j+2]),mmTmp1));
2099
2100 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&invS[i+2][j+0]),mmTmp0));
2101 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&invS[i+2][j+2]),mmTmp1));
2102
2103 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&invS[i+3][j+0]),mmTmp0));
2104 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&invS[i+3][j+2]),mmTmp1));
2105 }
2106
2107 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
2108 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
2109 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
2110 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
2111
2112 _mm_storeu_pd(&mmArr[0], mmSum00);
2113 _mm_storeu_pd(&mmArr[2], mmSum10);
2114 _mm_storeu_pd(&mmArr[4], mmSum20);
2115 _mm_storeu_pd(&mmArr[6], mmSum30);
2116
2117 sum0 = mmArr[0] + mmArr[1];
2118 sum1 = mmArr[2] + mmArr[3];
2119 sum2 = mmArr[4] + mmArr[5];
2120 sum3 = mmArr[6] + mmArr[7];
2121
2122 for (; j<Msize2[Mc_AN]; j++){
2123 sum0 += invS[i+0][j]*tmpvec1[n][j];
2124 sum1 += invS[i+1][j]*tmpvec1[n][j];
2125 sum2 += invS[i+2][j]*tmpvec1[n][j];
2126 sum3 += invS[i+3][j]*tmpvec1[n][j];
2127 }
2128
2129 tmpvec0[n][i+0] = sum0;
2130 tmpvec0[n][i+1] = sum1;
2131 tmpvec0[n][i+2] = sum2;
2132 tmpvec0[n][i+3] = sum3;
2133 }
2134 }
2135
2136 is = Msize2[Mc_AN] - Msize2[Mc_AN]%4;
2137
2138 for (i=is; i<Msize2[Mc_AN]; i++){
2139 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2140 sum = 0.0;
2141 for (j=0; j<Msize2[Mc_AN]; j++){
2142 sum += invS[i][j]*tmpvec1[n][j];
2143 }
2144 tmpvec0[n][i] = sum;
2145 }
2146 }
2147
2148 #endif
2149
2150 if (measure_time==1){
2151 dtime(&Etime1);
2152 time3 += Etime1 - Stime1;
2153 }
2154 }
2155
2156 /*******************************************************
2157 U * RS^-1 * U^+ * H * |Wn)
2158 *******************************************************/
2159
2160 else if (EKC_invS_flag==1){
2161
2162 /* U^+ * H * |Wn) */
2163
2164 for (rl0=0; rl0<rlmax_EC2[Mc_AN]; rl0++){
2165 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2166 for (m=0; m<EKC_core_size[Mc_AN]; m++){
2167
2168 sum = 0.0;
2169 for (i=0; i<Msize2[Mc_AN]; i++){
2170 sum += Krylov_U_OLP[rl0][n][i]*tmpvec1[m][i];
2171 }
2172
2173 /* transpose the later calcualtion */
2174 matRS0[m][rl0*EKC_core_size[Mc_AN]+n] = sum;
2175 }
2176 }
2177 }
2178
2179 /* RS^-1 * U^+ * H * |Wn) */
2180
2181 for (rl0=0; rl0<rlmax_EC2[Mc_AN]; rl0++){
2182 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2183
2184 for (m=0; m<EKC_core_size[Mc_AN]; m++){
2185
2186 sum = 0.0;
2187 for (i=0; i<Msize4[Mc_AN]; i++){
2188 sum += inv_RS[rl0*EKC_core_size[Mc_AN]+n][i]*matRS0[m][i];
2189 }
2190
2191 matRS1[rl0*EKC_core_size[Mc_AN]+n][m] = sum;
2192 }
2193 }
2194 }
2195
2196 /* U * RS^-1 * U^+ * H * |Wn) */
2197
2198 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2199 for (i=0; i<Msize2[Mc_AN]; i++){
2200 tmpvec0[n][i] = 0.0;
2201 }
2202 }
2203
2204 for (rl0=0; rl0<rlmax_EC2[Mc_AN]; rl0++){
2205 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2206 for (m=0; m<EKC_core_size[Mc_AN]; m++){
2207 tmp0 = matRS1[rl0*EKC_core_size[Mc_AN]+n][m];
2208 for (i=0; i<Msize2[Mc_AN]; i++){
2209 tmpvec0[m][i] += Krylov_U_OLP[rl0][n][i]*tmp0;
2210 }
2211 }
2212 }
2213 }
2214 }
2215
2216 else {
2217 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2218 for (i=0; i<Msize2[Mc_AN]; i++){
2219 tmpvec0[n][i] = tmpvec1[n][i];
2220 }
2221 }
2222 }
2223
2224 if (measure_time==1) dtime(&Stime1);
2225
2226 /*************************************************************
2227 S-orthogonalization by a classical block Gram-Schmidt method
2228 *************************************************************/
2229
2230 /* |tmpvec2) = S * |tmpvec0) */
2231
2232 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2233 for (i=0; i<Msize2[Mc_AN]; i++){
2234 tmpvec2[n][i] = 0.0;
2235 }
2236 }
2237
2238 for (i=0; i<=can; i++){
2239
2240 ig = natn[ct_AN][i];
2241 ian = Spe_Total_CNO[WhatSpecies[ig]];
2242 Anum = MP[i] - 1;
2243 ih = S_G2M[ig];
2244
2245 for (j=0; j<=can; j++){
2246
2247 kl = RMI1[Mc_AN][i][j];
2248 jg = natn[ct_AN][j];
2249 jan = Spe_Total_CNO[WhatSpecies[jg]];
2250 Bnum = MP[j] - 1;
2251
2252 if (0<=kl){
2253
2254 #ifdef nosse
2255
2256 /* Original version */
2257
2258 for (m=0; m<ian; m++){
2259 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2260
2261 sum = 0.0;
2262 for (k=0; k<jan; k++){
2263 sum += OLP0[ih][kl][m][k]*tmpvec0[n][Bnum+k];
2264 }
2265
2266 tmpvec2[n][Anum+m] += sum;
2267 }
2268 }
2269
2270 #else
2271
2272 /* Unrolling + SSE version */
2273
2274 for (m=0; m<(ian-3); m+=4){
2275 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2276
2277 mmSum00 = _mm_setzero_pd();
2278 mmSum01 = _mm_setzero_pd();
2279 mmSum10 = _mm_setzero_pd();
2280 mmSum11 = _mm_setzero_pd();
2281 mmSum20 = _mm_setzero_pd();
2282 mmSum21 = _mm_setzero_pd();
2283 mmSum30 = _mm_setzero_pd();
2284 mmSum31 = _mm_setzero_pd();
2285
2286 for (k=0; k<(jan-3); k+=4){
2287 mmTmp0 = _mm_loadu_pd(&tmpvec0[n][Bnum+k+0]);
2288 mmTmp1 = _mm_loadu_pd(&tmpvec0[n][Bnum+k+2]);
2289
2290 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+0]),mmTmp0));
2291 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+2]),mmTmp1));
2292
2293 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+0]),mmTmp0));
2294 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+2]),mmTmp1));
2295
2296 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+0]),mmTmp0));
2297 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+2]),mmTmp1));
2298
2299 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+0]),mmTmp0));
2300 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+2]),mmTmp1));
2301 }
2302
2303 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
2304 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
2305 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
2306 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
2307
2308 _mm_storeu_pd(&mmArr[0], mmSum00);
2309 _mm_storeu_pd(&mmArr[2], mmSum10);
2310 _mm_storeu_pd(&mmArr[4], mmSum20);
2311 _mm_storeu_pd(&mmArr[6], mmSum30);
2312
2313 sum0 = mmArr[0] + mmArr[1];
2314 sum1 = mmArr[2] + mmArr[3];
2315 sum2 = mmArr[4] + mmArr[5];
2316 sum3 = mmArr[6] + mmArr[7];
2317
2318 for (; k<jan; k++){
2319 sum0 += OLP0[ih][kl][m+0][k]*tmpvec0[n][Bnum+k];
2320 sum1 += OLP0[ih][kl][m+1][k]*tmpvec0[n][Bnum+k];
2321 sum2 += OLP0[ih][kl][m+2][k]*tmpvec0[n][Bnum+k];
2322 sum3 += OLP0[ih][kl][m+3][k]*tmpvec0[n][Bnum+k];
2323 }
2324
2325 tmpvec2[n][Anum+m+0] += sum0;
2326 tmpvec2[n][Anum+m+1] += sum1;
2327 tmpvec2[n][Anum+m+2] += sum2;
2328 tmpvec2[n][Anum+m+3] += sum3;
2329 }
2330 }
2331
2332 for (; m<ian; m++){
2333 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2334
2335 sum = 0.0;
2336 for (k=0; k<jan; k++){
2337 sum += OLP0[ih][kl][m][k]*tmpvec0[n][Bnum+k];
2338 }
2339
2340 tmpvec2[n][Anum+m] += sum;
2341 }
2342 }
2343
2344 #endif
2345
2346 }
2347 }
2348 }
2349
2350 if (measure_time==1){
2351 dtime(&Etime1);
2352 time4 += Etime1 - Stime1;
2353 }
2354
2355 if (measure_time==1) dtime(&Stime1);
2356
2357 #ifdef nosse
2358
2359 /* Original version */
2360
2361 for (rl0=0; rl0<=rl; rl0++){
2362
2363 /* (U_rl0|tmpvec2) */
2364
2365 for (m=0; m<EKC_core_size[Mc_AN]; m++){
2366 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2367 sum = 0.0;
2368 for (i=0; i<Msize2[Mc_AN]; i++){
2369 sum += U0[rl0][m][i]*tmpvec2[n][i];
2370 }
2371 tmpmat0[m][n] = sum;
2372 }
2373 }
2374
2375 /* |tmpvec0) - |U_rl0) * (U_rl0|tmpvec2) */
2376
2377 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2378 for (k=0; k<EKC_core_size[Mc_AN]; k++){
2379 dum = tmpmat0[k][n];
2380 for (i=0; i<Msize2[Mc_AN]; i++) tmpvec0[n][i] -= U0[rl0][k][i]*dum;
2381 }
2382 }
2383
2384 }
2385
2386 #else
2387
2388 /* Unrolling + SSE version */
2389
2390 for (rl0=0; rl0<=rl; rl0++){
2391
2392 /* (U_rl0|tmpvec2) */
2393
2394 for (m=0; m<(EKC_core_size[Mc_AN]-3); m+=4){
2395 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2396
2397 mmSum00 = _mm_setzero_pd();
2398 mmSum01 = _mm_setzero_pd();
2399 mmSum10 = _mm_setzero_pd();
2400 mmSum11 = _mm_setzero_pd();
2401 mmSum20 = _mm_setzero_pd();
2402 mmSum21 = _mm_setzero_pd();
2403 mmSum30 = _mm_setzero_pd();
2404 mmSum31 = _mm_setzero_pd();
2405
2406 for (i=0; i<(Msize2[Mc_AN]-3); i+=4){
2407 mmTmp0 = _mm_loadu_pd(&tmpvec2[n][i+0]);
2408 mmTmp1 = _mm_loadu_pd(&tmpvec2[n][i+2]);
2409
2410 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+0][i+0]),mmTmp0));
2411 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+0][i+2]),mmTmp1));
2412
2413 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+1][i+0]),mmTmp0));
2414 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+1][i+2]),mmTmp1));
2415
2416 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+2][i+0]),mmTmp0));
2417 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+2][i+2]),mmTmp1));
2418
2419 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+3][i+0]),mmTmp0));
2420 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+3][i+2]),mmTmp1));
2421 }
2422
2423 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
2424 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
2425 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
2426 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
2427
2428 _mm_storeu_pd(&mmArr[0], mmSum00);
2429 _mm_storeu_pd(&mmArr[2], mmSum10);
2430 _mm_storeu_pd(&mmArr[4], mmSum20);
2431 _mm_storeu_pd(&mmArr[6], mmSum30);
2432
2433 sum0 = mmArr[0] + mmArr[1];
2434 sum1 = mmArr[2] + mmArr[3];
2435 sum2 = mmArr[4] + mmArr[5];
2436 sum3 = mmArr[6] + mmArr[7];
2437
2438 for (; i<Msize2[Mc_AN]; i++){
2439 sum0 += U0[rl0][m+0][i]*tmpvec2[n][i];
2440 sum1 += U0[rl0][m+1][i]*tmpvec2[n][i];
2441 sum2 += U0[rl0][m+2][i]*tmpvec2[n][i];
2442 sum3 += U0[rl0][m+3][i]*tmpvec2[n][i];
2443 }
2444
2445 tmpmat0[m+0][n] = sum0;
2446 tmpmat0[m+1][n] = sum1;
2447 tmpmat0[m+2][n] = sum2;
2448 tmpmat0[m+3][n] = sum3;
2449 }
2450 }
2451
2452 for (; m<EKC_core_size[Mc_AN]; m++){
2453 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2454 sum = 0.0;
2455 for (i=0; i<Msize2[Mc_AN]; i++){
2456 sum += U0[rl0][m][i]*tmpvec2[n][i];
2457 }
2458 tmpmat0[m][n] = sum;
2459 }
2460 }
2461
2462 /* |tmpvec0) - |U_rl0) * (U_rl0|tmpvec2) */
2463
2464 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2465 for (k=0; k<EKC_core_size[Mc_AN]; k++){
2466 dum = tmpmat0[k][n];
2467 for (i=0; i<Msize2[Mc_AN]; i++) tmpvec0[n][i] -= U0[rl0][k][i]*dum;
2468 }
2469 }
2470
2471 }
2472
2473 #endif
2474
2475 if (measure_time==1){
2476 dtime(&Etime1);
2477 time5 += Etime1 - Stime1;
2478 }
2479
2480 /*************************************************************
2481 S-orthonormalization of tmpvec0
2482 *************************************************************/
2483
2484 if (measure_time==1) dtime(&Stime1);
2485
2486 S_orthonormalize_vec( Mc_AN, ct_on, tmpvec0, tmpvec1, OLP0, tmpmat0, ko, iko, MP, Msize2 );
2487
2488 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2489 for (i=0; i<Msize2[Mc_AN]; i++){
2490 U0[rl+1][n][i] = tmpvec0[n][i];
2491 }
2492 }
2493
2494 if (measure_time==1){
2495 dtime(&Etime1);
2496 time6 += Etime1 - Stime1;
2497 }
2498
2499 } /* rl */
2500
2501 /************************************************************
2502 orthogonalization by diagonalization
2503 ************************************************************/
2504
2505 if (measure_time==1) dtime(&Stime1);
2506
2507 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
2508
2509 /* S * |Vn) */
2510
2511 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2512 for (i=0; i<Msize2[Mc_AN]; i++){
2513 tmpvec1[n][i] = 0.0;
2514 }
2515 }
2516
2517 for (i=0; i<=can; i++){
2518
2519 ig = natn[ct_AN][i];
2520 ian = Spe_Total_CNO[WhatSpecies[ig]];
2521 Anum = MP[i] - 1;
2522 ih = S_G2M[ig];
2523
2524 for (j=0; j<=can; j++){
2525
2526 kl = RMI1[Mc_AN][i][j];
2527 jg = natn[ct_AN][j];
2528 jan = Spe_Total_CNO[WhatSpecies[jg]];
2529 Bnum = MP[j] - 1;
2530
2531 if (0<=kl){
2532
2533 #ifdef nosse
2534
2535 /* Original version */
2536 /**/
2537 for (m=0; m<ian; m++){
2538 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2539
2540 sum = 0.0;
2541 for (k=0; k<jan; k++){
2542 sum += OLP0[ih][kl][m][k]*U0[rl][n][Bnum+k];
2543 }
2544 tmpvec1[n][Anum+m] += sum;
2545 }
2546 }
2547 /**/
2548
2549 #else
2550
2551 /* Unrolling + SSE version */
2552 /**/
2553 for (m=0; m<(ian-3); m+=4){
2554 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2555
2556 mmSum00 = _mm_setzero_pd();
2557 mmSum01 = _mm_setzero_pd();
2558 mmSum10 = _mm_setzero_pd();
2559 mmSum11 = _mm_setzero_pd();
2560 mmSum20 = _mm_setzero_pd();
2561 mmSum21 = _mm_setzero_pd();
2562 mmSum30 = _mm_setzero_pd();
2563 mmSum31 = _mm_setzero_pd();
2564
2565 for (k=0; k<(jan-3); k+=4){
2566 mmTmp0 = _mm_loadu_pd(&U0[rl][n][Bnum+k+0]);
2567 mmTmp1 = _mm_loadu_pd(&U0[rl][n][Bnum+k+2]);
2568
2569 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+0]),mmTmp0));
2570 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+2]),mmTmp1));
2571
2572 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+0]),mmTmp0));
2573 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+2]),mmTmp1));
2574
2575 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+0]),mmTmp0));
2576 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+2]),mmTmp1));
2577
2578 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+0]),mmTmp0));
2579 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+2]),mmTmp1));
2580 }
2581
2582 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
2583 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
2584 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
2585 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
2586
2587 _mm_storeu_pd(&mmArr[0], mmSum00);
2588 _mm_storeu_pd(&mmArr[2], mmSum10);
2589 _mm_storeu_pd(&mmArr[4], mmSum20);
2590 _mm_storeu_pd(&mmArr[6], mmSum30);
2591
2592 sum0 = mmArr[0] + mmArr[1];
2593 sum1 = mmArr[2] + mmArr[3];
2594 sum2 = mmArr[4] + mmArr[5];
2595 sum3 = mmArr[6] + mmArr[7];
2596
2597 for (; k<jan; k++){
2598 sum0 += OLP0[ih][kl][m+0][k]*U0[rl][n][Bnum+k];
2599 sum1 += OLP0[ih][kl][m+1][k]*U0[rl][n][Bnum+k];
2600 sum2 += OLP0[ih][kl][m+2][k]*U0[rl][n][Bnum+k];
2601 sum3 += OLP0[ih][kl][m+3][k]*U0[rl][n][Bnum+k];
2602 }
2603
2604 tmpvec1[n][Anum+m+0] += sum0;
2605 tmpvec1[n][Anum+m+1] += sum1;
2606 tmpvec1[n][Anum+m+2] += sum2;
2607 tmpvec1[n][Anum+m+3] += sum3;
2608 }
2609 }
2610
2611 for (; m<ian; m++){
2612 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2613
2614 sum = 0.0;
2615 for (k=0; k<jan; k++){
2616 sum += OLP0[ih][kl][m][k]*U0[rl][n][Bnum+k];
2617 }
2618 tmpvec1[n][Anum+m] += sum;
2619 }
2620 }
2621 /**/
2622 #endif
2623
2624 }
2625 }
2626 }
2627
2628 #ifdef nosse
2629
2630 /* Original version */
2631 /**/
2632 for (rl0=rl; rl0<rlmax_EC[Mc_AN]; rl0++){
2633 for (m=0; m<EKC_core_size[Mc_AN]; m++){
2634 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2635 sum = 0.0;
2636 for (i=0; i<Msize2[Mc_AN]; i++){
2637 sum += U0[rl0][m][i]*tmpvec1[n][i];
2638 }
2639 FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
2640 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum;
2641 }
2642 }
2643 }
2644 /**/
2645
2646 #else
2647
2648 /* Unrolling + SSE version */
2649 /**/
2650 for (rl0=rl; rl0<rlmax_EC[Mc_AN]; rl0++){
2651 for (m=0; m<(EKC_core_size[Mc_AN]-3); m+=4){
2652 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2653
2654 mmSum00 = _mm_setzero_pd();
2655 mmSum01 = _mm_setzero_pd();
2656 mmSum10 = _mm_setzero_pd();
2657 mmSum11 = _mm_setzero_pd();
2658 mmSum20 = _mm_setzero_pd();
2659 mmSum21 = _mm_setzero_pd();
2660 mmSum30 = _mm_setzero_pd();
2661 mmSum31 = _mm_setzero_pd();
2662
2663 for (i=0; i<(Msize2[Mc_AN]-3); i+=4){
2664 mmTmp0 = _mm_loadu_pd(&tmpvec1[n][i+0]);
2665 mmTmp1 = _mm_loadu_pd(&tmpvec1[n][i+2]);
2666
2667 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+0][i+0]),mmTmp0));
2668 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+0][i+2]),mmTmp1));
2669
2670 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+1][i+0]),mmTmp0));
2671 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+1][i+2]),mmTmp1));
2672
2673 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+2][i+0]),mmTmp0));
2674 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+2][i+2]),mmTmp1));
2675
2676 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+3][i+0]),mmTmp0));
2677 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+3][i+2]),mmTmp1));
2678 }
2679
2680 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
2681 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
2682 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
2683 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
2684
2685 _mm_storeu_pd(&mmArr[0], mmSum00);
2686 _mm_storeu_pd(&mmArr[2], mmSum10);
2687 _mm_storeu_pd(&mmArr[4], mmSum20);
2688 _mm_storeu_pd(&mmArr[6], mmSum30);
2689
2690 sum0 = mmArr[0] + mmArr[1];
2691 sum1 = mmArr[2] + mmArr[3];
2692 sum2 = mmArr[4] + mmArr[5];
2693 sum3 = mmArr[6] + mmArr[7];
2694
2695 for (; i<Msize2[Mc_AN]; i++){
2696 sum0 += U0[rl0][m+0][i]*tmpvec1[n][i];
2697 sum1 += U0[rl0][m+1][i]*tmpvec1[n][i];
2698 sum2 += U0[rl0][m+2][i]*tmpvec1[n][i];
2699 sum3 += U0[rl0][m+3][i]*tmpvec1[n][i];
2700 }
2701
2702 FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum0;
2703 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum0;
2704
2705 FS[rl0*EKC_core_size[Mc_AN]+m+2][rl*EKC_core_size[Mc_AN]+n+1] = sum1;
2706 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+2] = sum1;
2707
2708 FS[rl0*EKC_core_size[Mc_AN]+m+3][rl*EKC_core_size[Mc_AN]+n+1] = sum2;
2709 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+3] = sum2;
2710
2711 FS[rl0*EKC_core_size[Mc_AN]+m+4][rl*EKC_core_size[Mc_AN]+n+1] = sum3;
2712 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+4] = sum3;
2713 }
2714 }
2715
2716 for (; m<EKC_core_size[Mc_AN]; m++){
2717 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2718 sum = 0.0;
2719 for (i=0; i<Msize2[Mc_AN]; i++){
2720 sum += U0[rl0][m][i]*tmpvec1[n][i];
2721 }
2722 FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
2723 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum;
2724 }
2725 }
2726
2727 }
2728 /**/
2729
2730 #endif
2731
2732 }
2733
2734 if (measure_time==1){
2735 dtime(&Etime1);
2736 time7 += Etime1 - Stime1;
2737 }
2738
2739 if (measure_time==1) dtime(&Stime1);
2740
2741 Eigen_lapack(FS,ko,Msize3[Mc_AN],Msize3[Mc_AN]);
2742
2743 if (measure_time==1){
2744 dtime(&Etime1);
2745 time8 += Etime1 - Stime1;
2746 }
2747
2748 ZeroNum = 0;
2749
2750 for (i=1; i<=Msize3[Mc_AN]; i++){
2751
2752 if (error_check==1){
2753 printf("spin=%2d Mc_AN=%2d i=%3d ko[i]=%18.15f\n",spin,Mc_AN,i,ko[i]);
2754 }
2755
2756 if (cutoff_value<ko[i]){
2757 ko[i] = sqrt(fabs(ko[i]));
2758 iko[i] = 1.0/ko[i];
2759 }
2760 else{
2761 ZeroNum++;
2762 ko[i] = 0.0;
2763 iko[i] = 0.0;
2764 }
2765 }
2766
2767 if (error_check==1){
2768 printf("spin=%2d Mc_AN=%2d ZeroNum=%2d\n",spin,Mc_AN,ZeroNum);
2769 }
2770
2771 for (i=1; i<=Msize3[Mc_AN]; i++){
2772 for (j=1; j<=Msize3[Mc_AN]; j++){
2773 FS[i][j] = FS[i][j]*iko[j];
2774 }
2775 }
2776
2777 /* transpose for later calculation */
2778 for (i=1; i<=Msize3[Mc_AN]; i++){
2779 for (j=i+1; j<=Msize3[Mc_AN]; j++){
2780 tmp1 = FS[i][j];
2781 tmp2 = FS[j][i];
2782 FS[i][j] = tmp2;
2783 FS[j][i] = tmp1;
2784 }
2785 }
2786
2787 if (measure_time==1) dtime(&Stime1);
2788
2789 /* U0 * U * lamda^{-1/2} */
2790
2791 #ifdef nosse
2792
2793 /* original version */
2794 /*
2795 for (i=0; i<Msize2[Mc_AN]; i++){
2796 for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
2797 for (m=0; m<EKC_core_size[Mc_AN]; m++){
2798
2799 m1 = rl0*EKC_core_size[Mc_AN] + m + 1;
2800
2801 sum = 0.0;
2802 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
2803
2804 n1 = rl*EKC_core_size[Mc_AN] + 1;
2805
2806 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2807 sum += U0[rl][n][i]*FS[m1][n1+n];
2808 }
2809 }
2810
2811 Utmp[rl0][m] = sum;
2812 }
2813 }
2814
2815 for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
2816 for (m=0; m<EKC_core_size[Mc_AN]; m++){
2817 U0[rl0][m][i] = Utmp[rl0][m];
2818 }
2819 }
2820 }
2821 */
2822
2823 /* unrolling version */
2824
2825 for (i=0; i<Msize2[Mc_AN]; i++){
2826
2827 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
2828 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2829 Utmp[rl][n] = U0[rl][n][i];
2830 }
2831 }
2832
2833 for (m1=1; m1<=(rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]-3); m1+=4){
2834
2835 rl00 = (m1+0-1)/EKC_core_size[Mc_AN];
2836 rl01 = (m1+1-1)/EKC_core_size[Mc_AN];
2837 rl02 = (m1+2-1)/EKC_core_size[Mc_AN];
2838 rl03 = (m1+3-1)/EKC_core_size[Mc_AN];
2839
2840 mm0 = (m1+0-1)%EKC_core_size[Mc_AN];
2841 mm1 = (m1+1-1)%EKC_core_size[Mc_AN];
2842 mm2 = (m1+2-1)%EKC_core_size[Mc_AN];
2843 mm3 = (m1+3-1)%EKC_core_size[Mc_AN];
2844
2845 sum0 = 0.0;
2846 sum1 = 0.0;
2847 sum2 = 0.0;
2848 sum3 = 0.0;
2849
2850 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
2851
2852 n1 = rl*EKC_core_size[Mc_AN] + 1;
2853
2854 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2855 sum0 += Utmp[rl][n]*FS[m1+0][n1+n];
2856 sum1 += Utmp[rl][n]*FS[m1+1][n1+n];
2857 sum2 += Utmp[rl][n]*FS[m1+2][n1+n];
2858 sum3 += Utmp[rl][n]*FS[m1+3][n1+n];
2859 }
2860 }
2861
2862 U0[rl00][mm0][i] = sum0;
2863 U0[rl01][mm1][i] = sum1;
2864 U0[rl02][mm2][i] = sum2;
2865 U0[rl03][mm3][i] = sum3;
2866 }
2867
2868 m1s = rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN] - (rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN])%4 + 1;
2869
2870 for (m1=m1s; m1<=rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]; m1++){
2871
2872 rl0 = (m1-1)/EKC_core_size[Mc_AN];
2873 m = (m1-1)%EKC_core_size[Mc_AN];
2874
2875 sum = 0.0;
2876
2877 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
2878
2879 n1 = rl*EKC_core_size[Mc_AN] + 1;
2880
2881 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2882 sum += Utmp[rl][n]*FS[m1][n1+n];
2883 }
2884 }
2885
2886 U0[rl0][m][i] = sum;
2887 }
2888 } /* i */
2889
2890 #else
2891
2892 /* Unrolling + SSE version */
2893 /**/
2894
2895 for (i=0; i<Msize2[Mc_AN]; i++){
2896
2897 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
2898 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2899 Utmp[rl][n] = U0[rl][n][i];
2900 }
2901 }
2902
2903 for (m1=1; m1<=(rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]-3); m1+=4){
2904
2905 rl00 = (m1+0-1)/EKC_core_size[Mc_AN];
2906 rl01 = (m1+1-1)/EKC_core_size[Mc_AN];
2907 rl02 = (m1+2-1)/EKC_core_size[Mc_AN];
2908 rl03 = (m1+3-1)/EKC_core_size[Mc_AN];
2909
2910 mm0 = (m1+0-1)%EKC_core_size[Mc_AN];
2911 mm1 = (m1+1-1)%EKC_core_size[Mc_AN];
2912 mm2 = (m1+2-1)%EKC_core_size[Mc_AN];
2913 mm3 = (m1+3-1)%EKC_core_size[Mc_AN];
2914
2915 sum0 = 0.0;
2916 sum1 = 0.0;
2917 sum2 = 0.0;
2918 sum3 = 0.0;
2919
2920 mmSum00 = _mm_setzero_pd();
2921 mmSum01 = _mm_setzero_pd();
2922 mmSum10 = _mm_setzero_pd();
2923 mmSum11 = _mm_setzero_pd();
2924 mmSum20 = _mm_setzero_pd();
2925 mmSum21 = _mm_setzero_pd();
2926 mmSum30 = _mm_setzero_pd();
2927 mmSum31 = _mm_setzero_pd();
2928
2929 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
2930
2931 n1 = rl*EKC_core_size[Mc_AN] + 1;
2932
2933 if (0){
2934 mmTmp0 = _mm_loadu_pd(&Utmp[rl][n+0]);
2935 mmTmp1 = _mm_loadu_pd(&Utmp[rl][n+2]);
2936 }
2937
2938 for (n=0; n<(EKC_core_size[Mc_AN]-3); n+=4){
2939 mmTmp0 = _mm_loadu_pd(&Utmp[rl][n+0]);
2940 mmTmp1 = _mm_loadu_pd(&Utmp[rl][n+2]);
2941
2942 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&FS[m1+0][n1+n+0]),mmTmp0));
2943 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&FS[m1+0][n1+n+2]),mmTmp1));
2944
2945 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&FS[m1+1][n1+n+0]),mmTmp0));
2946 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&FS[m1+1][n1+n+2]),mmTmp1));
2947
2948 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&FS[m1+2][n1+n+0]),mmTmp0));
2949 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&FS[m1+2][n1+n+2]),mmTmp1));
2950
2951 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&FS[m1+3][n1+n+0]),mmTmp0));
2952 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&FS[m1+3][n1+n+2]),mmTmp1));
2953 }
2954
2955 for (; n<EKC_core_size[Mc_AN]; n++){
2956 sum0 += Utmp[rl][n]*FS[m1+0][n1+n];
2957 sum1 += Utmp[rl][n]*FS[m1+1][n1+n];
2958 sum2 += Utmp[rl][n]*FS[m1+2][n1+n];
2959 sum3 += Utmp[rl][n]*FS[m1+3][n1+n];
2960 }
2961
2962 }
2963
2964 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
2965 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
2966 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
2967 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
2968
2969 _mm_storeu_pd(&mmArr[0], mmSum00);
2970 _mm_storeu_pd(&mmArr[2], mmSum10);
2971 _mm_storeu_pd(&mmArr[4], mmSum20);
2972 _mm_storeu_pd(&mmArr[6], mmSum30);
2973
2974 sum0 += mmArr[0] + mmArr[1];
2975 sum1 += mmArr[2] + mmArr[3];
2976 sum2 += mmArr[4] + mmArr[5];
2977 sum3 += mmArr[6] + mmArr[7];
2978
2979 U0[rl00][mm0][i] = sum0;
2980 U0[rl01][mm1][i] = sum1;
2981 U0[rl02][mm2][i] = sum2;
2982 U0[rl03][mm3][i] = sum3;
2983 }
2984
2985 m1s = rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN] - (rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN])%4 + 1;
2986
2987 for (m1=m1s; m1<=rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]; m1++){
2988
2989 rl0 = (m1-1)/EKC_core_size[Mc_AN];
2990 m = (m1-1)%EKC_core_size[Mc_AN];
2991
2992 sum = 0.0;
2993
2994 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
2995
2996 n1 = rl*EKC_core_size[Mc_AN] + 1;
2997
2998 for (n=0; n<EKC_core_size[Mc_AN]; n++){
2999 sum += Utmp[rl][n]*FS[m1][n1+n];
3000 }
3001 }
3002
3003 U0[rl0][m][i] = sum;
3004 }
3005 } /* i */
3006
3007 #endif
3008
3009 Krylov_U[spin][Mc_AN][0] = ZeroNum;
3010
3011 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
3012 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3013 for (i=0; i<Msize2[Mc_AN]; i++){
3014 Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+i+1] = U0[rl][n][i];
3015 }
3016 }
3017 }
3018
3019 if (measure_time==1){
3020 dtime(&Etime1);
3021 time9 += Etime1 - Stime1;
3022 }
3023
3024 /************************************************************
3025 check the orthonormality of Krylov vectors
3026 ************************************************************/
3027
3028 if (error_check==1){
3029
3030 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
3031
3032 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3033 for (i=0; i<Msize2[Mc_AN]; i++){
3034 tmpvec1[n][i] = 0.0;
3035 }
3036 }
3037
3038 for (i=0; i<=can; i++){
3039
3040 ig = natn[ct_AN][i];
3041 ian = Spe_Total_CNO[WhatSpecies[ig]];
3042 Anum = MP[i] - 1;
3043 ih = S_G2M[ig];
3044
3045 for (j=0; j<=can; j++){
3046
3047 kl = RMI1[Mc_AN][i][j];
3048 jg = natn[ct_AN][j];
3049 jan = Spe_Total_CNO[WhatSpecies[jg]];
3050 Bnum = MP[j] - 1;
3051
3052 if (0<=kl){
3053
3054 for (m=0; m<ian; m++){
3055 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3056
3057 sum = 0.0;
3058 for (k=0; k<jan; k++){
3059 sum += OLP0[ih][kl][m][k]*U0[rl][n][Bnum+k];
3060 }
3061
3062 tmpvec1[n][Anum+m] += sum;
3063 }
3064 }
3065 }
3066 }
3067 }
3068
3069 for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
3070 for (m=0; m<EKC_core_size[Mc_AN]; m++){
3071 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3072 sum = 0.0;
3073 for (i=0; i<Msize2[Mc_AN]; i++){
3074 sum += U0[rl0][m][i]*tmpvec1[n][i];
3075 }
3076
3077 if (rl==rl0 && m==n){
3078 if ( 1.0e-10<fabs(sum-1.0) ) {
3079 printf("A spin=%2d Mc_AN=%2d rl=%2d rl0=%2d m=%2d n=%2d sum=%18.15f\n",
3080 spin,Mc_AN,rl,rl0,m,n,sum);
3081 }
3082 }
3083 else{
3084 if ( 1.0e-10<fabs(sum) ) {
3085 printf("B spin=%2d Mc_AN=%2d rl=%2d rl0=%2d m=%2d n=%2d sum=%18.15f\n",
3086 spin,Mc_AN,rl,rl0,m,n,sum);
3087 }
3088 }
3089
3090 }
3091 }
3092 }
3093 }
3094 }
3095
3096 if (measure_time==1){
3097 printf("pMatrix myid=%2d time1 =%5.3f time2 =%5.3f time3 =%5.3f time4 =%5.3f\n",
3098 myid,time1,time2,time3,time4);
3099 printf("pMatrix myid=%2d time5 =%5.3f time6 =%5.3f time7 =%5.3f time8 =%5.3f\n",
3100 myid,time5,time6,time7,time8);
3101 printf("pMatrix myid=%2d time9 =%5.3f\n",myid,time9);
3102 }
3103
3104 /* freeing of arrays */
3105
3106 for (i=0; i<rlmax_EC[Mc_AN]; i++){
3107 free(Utmp[i]);
3108 }
3109 free(Utmp);
3110
3111 for (i=0; i<rlmax_EC[Mc_AN]; i++){
3112 for (j=0; j<EKC_core_size[Mc_AN]; j++){
3113 free(U0[i][j]);
3114 }
3115 free(U0[i]);
3116 }
3117 free(U0);
3118
3119 for (i=0; i<(EKC_core_size[Mc_AN]+4); i++){
3120 free(tmpmat0[i]);
3121 }
3122 free(tmpmat0);
3123
3124 for (i=0; i<(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]; i++){
3125 free(FS[i]);
3126 }
3127 free(FS);
3128
3129 free(ko);
3130 free(iko);
3131
3132 for (i=0; i<(EKC_core_size[Mc_AN]+2); i++){
3133 free(matRS0[i]);
3134 }
3135 free(matRS0);
3136
3137 for (i=0; i<(Msize4[Mc_AN]+3); i++){
3138 free(matRS1[i]);
3139 }
3140 free(matRS1);
3141 }
3142
3143
3144
3145
3146
3147
3148
3149
Generate_pMatrix2(int myid,int spin,int Mc_AN,double ***** Hks,double **** OLP0,double *** Krylov_U,int * MP,int * Msize,int * Msize2,int * Msize3,double ** tmpvec1)3150 void Generate_pMatrix2( int myid, int spin, int Mc_AN, double *****Hks, double ****OLP0,
3151 double ***Krylov_U, int *MP, int *Msize, int *Msize2, int *Msize3,
3152 double **tmpvec1)
3153 {
3154 int rl,rl0,rl1,ct_AN,fan,san,can,wan,ct_on,i,j;
3155 int n,Anum,Bnum,k,ian,ih,kl,jg,ig,jan,m,m1,n1;
3156 int ZeroNum,rl_half;
3157 int KU_d1,KU_d2,csize;
3158 double sum,dum,tmp0,tmp1,tmp2,tmp3;
3159 double **Utmp;
3160 double *ko,*iko;
3161 double **FS;
3162 double ***U0;
3163
3164 int rl00,rl01,rl02,rl03,rl04,rl05,rl06,rl07;
3165 int mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7,m1s;
3166 __m128d mmSum00,mmSum01,mmSum10,mmSum11,mmSum20,mmSum21,mmSum30,mmSum31, mmTmp0, mmTmp1, mmTmp2, mmTmp3, mmTmp4, mmTmp5;
3167
3168 double mmArr[8];
3169 double sum0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
3170
3171 ct_AN = M2G[Mc_AN];
3172 fan = FNAN[ct_AN];
3173 san = SNAN[ct_AN];
3174 can = fan + san;
3175 wan = WhatSpecies[ct_AN];
3176 ct_on = Spe_Total_CNO[wan];
3177
3178 if (Msize[Mc_AN]<Msize3[Mc_AN])
3179 csize = Msize3[Mc_AN] + 40;
3180 else
3181 csize = Msize[Mc_AN] + 40;
3182
3183 KU_d1 = EKC_core_size[Mc_AN]*Msize2[Mc_AN];
3184 KU_d2 = Msize2[Mc_AN];
3185
3186 /* allocation of arrays */
3187
3188 Utmp = (double**)malloc(sizeof(double*)*rlmax_EC[Mc_AN]);
3189 for (i=0; i<rlmax_EC[Mc_AN]; i++){
3190 Utmp[i] = (double*)malloc(sizeof(double)*EKC_core_size[Mc_AN]);
3191 }
3192
3193 U0 = (double***)malloc(sizeof(double**)*rlmax_EC[Mc_AN]);
3194 for (i=0; i<rlmax_EC[Mc_AN]; i++){
3195 U0[i] = (double**)malloc(sizeof(double*)*EKC_core_size[Mc_AN]);
3196 for (j=0; j<EKC_core_size[Mc_AN]; j++){
3197 U0[i][j] = (double*)malloc(sizeof(double)*(Msize2[Mc_AN]+3));
3198 for (k=0; k<(Msize2[Mc_AN]+3); k++) U0[i][j][k] = 0.0;
3199 }
3200 }
3201
3202 FS = (double**)malloc(sizeof(double*)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
3203 for (i=0; i<(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]; i++){
3204 FS[i] = (double*)malloc(sizeof(double)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
3205 }
3206
3207 ko = (double*)malloc(sizeof(double)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
3208 iko = (double*)malloc(sizeof(double)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
3209
3210 /****************************************************
3211 initialize
3212 ****************************************************/
3213
3214 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
3215 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3216 for (i=0; i<Msize2[Mc_AN]; i++){
3217 U0[rl][n][i] = 0.0;
3218 }
3219 }
3220 }
3221
3222 i = 0;
3223 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
3224 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3225 if (i<Msize2[Mc_AN]) U0[rl][n][i] = 1.0;
3226 i++;
3227 }
3228 }
3229
3230 /************************************************************
3231 orthogonalization by diagonalization
3232 ************************************************************/
3233
3234 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
3235
3236 /* S * |Vn) */
3237
3238 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3239 for (i=0; i<Msize2[Mc_AN]; i++){
3240 tmpvec1[n][i] = 0.0;
3241 }
3242 }
3243
3244 for (i=0; i<=can; i++){
3245
3246 ig = natn[ct_AN][i];
3247 ian = Spe_Total_CNO[WhatSpecies[ig]];
3248 Anum = MP[i] - 1;
3249 ih = S_G2M[ig];
3250
3251 for (j=0; j<=can; j++){
3252
3253 kl = RMI1[Mc_AN][i][j];
3254 jg = natn[ct_AN][j];
3255 jan = Spe_Total_CNO[WhatSpecies[jg]];
3256 Bnum = MP[j] - 1;
3257
3258 if (0<=kl){
3259
3260 #ifdef nosse
3261
3262 /* Original version */
3263 /**/
3264 for (m=0; m<ian; m++){
3265 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3266
3267 sum = 0.0;
3268 for (k=0; k<jan; k++){
3269 sum += OLP0[ih][kl][m][k]*U0[rl][n][Bnum+k];
3270 }
3271 tmpvec1[n][Anum+m] += sum;
3272 }
3273 }
3274 /**/
3275
3276 #else
3277
3278 /* Unrolling + SSE version */
3279
3280 for (m=0; m<(ian-3); m+=4){
3281 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3282
3283 mmSum00 = _mm_setzero_pd();
3284 mmSum01 = _mm_setzero_pd();
3285 mmSum10 = _mm_setzero_pd();
3286 mmSum11 = _mm_setzero_pd();
3287 mmSum20 = _mm_setzero_pd();
3288 mmSum21 = _mm_setzero_pd();
3289 mmSum30 = _mm_setzero_pd();
3290 mmSum31 = _mm_setzero_pd();
3291
3292 for (k=0; k<(jan-3); k+=4){
3293 mmTmp0 = _mm_loadu_pd(&U0[rl][n][Bnum+k+0]);
3294 mmTmp1 = _mm_loadu_pd(&U0[rl][n][Bnum+k+2]);
3295
3296 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+0]),mmTmp0));
3297 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+2]),mmTmp1));
3298
3299 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+0]),mmTmp0));
3300 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+2]),mmTmp1));
3301
3302 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+0]),mmTmp0));
3303 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+2]),mmTmp1));
3304
3305 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+0]),mmTmp0));
3306 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+2]),mmTmp1));
3307 }
3308
3309 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
3310 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
3311 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
3312 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
3313
3314 _mm_storeu_pd(&mmArr[0], mmSum00);
3315 _mm_storeu_pd(&mmArr[2], mmSum10);
3316 _mm_storeu_pd(&mmArr[4], mmSum20);
3317 _mm_storeu_pd(&mmArr[6], mmSum30);
3318
3319 sum0 = mmArr[0] + mmArr[1];
3320 sum1 = mmArr[2] + mmArr[3];
3321 sum2 = mmArr[4] + mmArr[5];
3322 sum3 = mmArr[6] + mmArr[7];
3323
3324 for (; k<jan; k++){
3325 sum0 += OLP0[ih][kl][m+0][k]*U0[rl][n][Bnum+k];
3326 sum1 += OLP0[ih][kl][m+1][k]*U0[rl][n][Bnum+k];
3327 sum2 += OLP0[ih][kl][m+2][k]*U0[rl][n][Bnum+k];
3328 sum3 += OLP0[ih][kl][m+3][k]*U0[rl][n][Bnum+k];
3329 }
3330
3331 tmpvec1[n][Anum+m+0] += sum0;
3332 tmpvec1[n][Anum+m+1] += sum1;
3333 tmpvec1[n][Anum+m+2] += sum2;
3334 tmpvec1[n][Anum+m+3] += sum3;
3335 }
3336 }
3337
3338 for (; m<ian; m++){
3339 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3340
3341 sum = 0.0;
3342 for (k=0; k<jan; k++){
3343 sum += OLP0[ih][kl][m][k]*U0[rl][n][Bnum+k];
3344 }
3345 tmpvec1[n][Anum+m] += sum;
3346 }
3347 }
3348
3349 #endif
3350
3351 }
3352 }
3353 }
3354
3355 #ifdef nosse
3356
3357 /* Original version */
3358 /**/
3359 for (rl0=rl; rl0<rlmax_EC[Mc_AN]; rl0++){
3360 for (m=0; m<EKC_core_size[Mc_AN]; m++){
3361 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3362 sum = 0.0;
3363 for (i=0; i<Msize2[Mc_AN]; i++){
3364 sum += U0[rl0][m][i]*tmpvec1[n][i];
3365 }
3366 FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
3367 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum;
3368 }
3369 }
3370 }
3371 /**/
3372
3373 #else
3374
3375 /* Unrolling + SSE version */
3376
3377 for (rl0=rl; rl0<rlmax_EC[Mc_AN]; rl0++){
3378 for (m=0; m<(EKC_core_size[Mc_AN]-3); m+=4){
3379 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3380
3381 mmSum00 = _mm_setzero_pd();
3382 mmSum01 = _mm_setzero_pd();
3383 mmSum10 = _mm_setzero_pd();
3384 mmSum11 = _mm_setzero_pd();
3385 mmSum20 = _mm_setzero_pd();
3386 mmSum21 = _mm_setzero_pd();
3387 mmSum30 = _mm_setzero_pd();
3388 mmSum31 = _mm_setzero_pd();
3389
3390 for (i=0; i<(Msize2[Mc_AN]-3); i+=4){
3391 mmTmp0 = _mm_loadu_pd(&tmpvec1[n][i+0]);
3392 mmTmp1 = _mm_loadu_pd(&tmpvec1[n][i+2]);
3393
3394 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+0][i+0]),mmTmp0));
3395 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+0][i+2]),mmTmp1));
3396
3397 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+1][i+0]),mmTmp0));
3398 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+1][i+2]),mmTmp1));
3399
3400 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+2][i+0]),mmTmp0));
3401 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+2][i+2]),mmTmp1));
3402
3403 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+3][i+0]),mmTmp0));
3404 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+3][i+2]),mmTmp1));
3405 }
3406
3407 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
3408 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
3409 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
3410 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
3411
3412 _mm_storeu_pd(&mmArr[0], mmSum00);
3413 _mm_storeu_pd(&mmArr[2], mmSum10);
3414 _mm_storeu_pd(&mmArr[4], mmSum20);
3415 _mm_storeu_pd(&mmArr[6], mmSum30);
3416
3417 sum0 = mmArr[0] + mmArr[1];
3418 sum1 = mmArr[2] + mmArr[3];
3419 sum2 = mmArr[4] + mmArr[5];
3420 sum3 = mmArr[6] + mmArr[7];
3421
3422 for (; i<Msize2[Mc_AN]; i++){
3423 sum0 += U0[rl0][m+0][i]*tmpvec1[n][i];
3424 sum1 += U0[rl0][m+1][i]*tmpvec1[n][i];
3425 sum2 += U0[rl0][m+2][i]*tmpvec1[n][i];
3426 sum3 += U0[rl0][m+3][i]*tmpvec1[n][i];
3427 }
3428
3429 FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum0;
3430 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum0;
3431
3432 FS[rl0*EKC_core_size[Mc_AN]+m+2][rl*EKC_core_size[Mc_AN]+n+1] = sum1;
3433 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+2] = sum1;
3434
3435 FS[rl0*EKC_core_size[Mc_AN]+m+3][rl*EKC_core_size[Mc_AN]+n+1] = sum2;
3436 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+3] = sum2;
3437
3438 FS[rl0*EKC_core_size[Mc_AN]+m+4][rl*EKC_core_size[Mc_AN]+n+1] = sum3;
3439 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+4] = sum3;
3440 }
3441 }
3442
3443 for (; m<EKC_core_size[Mc_AN]; m++){
3444 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3445 sum = 0.0;
3446 for (i=0; i<Msize2[Mc_AN]; i++){
3447 sum += U0[rl0][m][i]*tmpvec1[n][i];
3448 }
3449 FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
3450 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum;
3451 }
3452 }
3453 }
3454
3455 #endif
3456
3457 }
3458
3459 Eigen_lapack(FS,ko,Msize3[Mc_AN],Msize3[Mc_AN]);
3460 ZeroNum = 0;
3461
3462 for (i=1; i<=Msize3[Mc_AN]; i++){
3463
3464 if (error_check==1){
3465 printf("spin=%2d Mc_AN=%2d i=%3d ko[i]=%18.15f\n",spin,Mc_AN,i,ko[i]);
3466 }
3467
3468 if (cutoff_value<ko[i]){
3469 ko[i] = sqrt(fabs(ko[i]));
3470 iko[i] = 1.0/ko[i];
3471 }
3472 else{
3473 ZeroNum++;
3474 ko[i] = 0.0;
3475 iko[i] = 0.0;
3476 }
3477 }
3478
3479 if (error_check==1){
3480 printf("spin=%2d Mc_AN=%2d ZeroNum=%2d\n",spin,Mc_AN,ZeroNum);
3481 }
3482
3483 for (i=1; i<=Msize3[Mc_AN]; i++){
3484 for (j=1; j<=Msize3[Mc_AN]; j++){
3485 FS[i][j] = FS[i][j]*iko[j];
3486 }
3487 }
3488
3489 /* transpose for later calculation */
3490 for (i=1; i<=Msize3[Mc_AN]; i++){
3491 for (j=i+1; j<=Msize3[Mc_AN]; j++){
3492 tmp1 = FS[i][j];
3493 tmp2 = FS[j][i];
3494 FS[i][j] = tmp2;
3495 FS[j][i] = tmp1;
3496 }
3497 }
3498
3499 /* U0 * U * lamda^{-1/2} */
3500
3501 #ifdef nosse
3502
3503 /* original version */
3504
3505 /**/
3506 for (i=0; i<Msize2[Mc_AN]; i++){
3507 for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
3508 for (m=0; m<EKC_core_size[Mc_AN]; m++){
3509
3510 m1 = rl0*EKC_core_size[Mc_AN] + m + 1;
3511
3512 sum = 0.0;
3513 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
3514
3515 n1 = rl*EKC_core_size[Mc_AN] + 1;
3516
3517 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3518 sum += U0[rl][n][i]*FS[m1][n1+n];
3519 }
3520 }
3521
3522 Utmp[rl0][m] = sum;
3523 }
3524 }
3525
3526 for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
3527 for (m=0; m<EKC_core_size[Mc_AN]; m++){
3528 U0[rl0][m][i] = Utmp[rl0][m];
3529 }
3530 }
3531 }
3532 /**/
3533
3534 #else
3535
3536 /* Unrolling + SSE version */
3537 /**/
3538 for (i=0; i<Msize2[Mc_AN]; i++){
3539
3540 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
3541 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3542 Utmp[rl][n] = U0[rl][n][i];
3543 }
3544 }
3545
3546 for (m1=1; m1<=(rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]-3); m1+=4){
3547
3548 rl00 = (m1+0-1)/EKC_core_size[Mc_AN];
3549 rl01 = (m1+1-1)/EKC_core_size[Mc_AN];
3550 rl02 = (m1+2-1)/EKC_core_size[Mc_AN];
3551 rl03 = (m1+3-1)/EKC_core_size[Mc_AN];
3552
3553 mm0 = (m1+0-1)%EKC_core_size[Mc_AN];
3554 mm1 = (m1+1-1)%EKC_core_size[Mc_AN];
3555 mm2 = (m1+2-1)%EKC_core_size[Mc_AN];
3556 mm3 = (m1+3-1)%EKC_core_size[Mc_AN];
3557
3558 sum0 = 0.0;
3559 sum1 = 0.0;
3560 sum2 = 0.0;
3561 sum3 = 0.0;
3562
3563 mmSum00 = _mm_setzero_pd();
3564 mmSum01 = _mm_setzero_pd();
3565 mmSum10 = _mm_setzero_pd();
3566 mmSum11 = _mm_setzero_pd();
3567 mmSum20 = _mm_setzero_pd();
3568 mmSum21 = _mm_setzero_pd();
3569 mmSum30 = _mm_setzero_pd();
3570 mmSum31 = _mm_setzero_pd();
3571
3572 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
3573
3574 n1 = rl*EKC_core_size[Mc_AN] + 1;
3575
3576 if (0){
3577 mmTmp0 = _mm_loadu_pd(&Utmp[rl][n+0]); /*???????????*/
3578 mmTmp1 = _mm_loadu_pd(&Utmp[rl][n+2]); /*???????????*/
3579 }
3580
3581 for (n=0; n<(EKC_core_size[Mc_AN]-3); n+=4){
3582 mmTmp0 = _mm_loadu_pd(&Utmp[rl][n+0]);
3583 mmTmp1 = _mm_loadu_pd(&Utmp[rl][n+2]);
3584 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&FS[m1+0][n1+n+0]),mmTmp0));
3585 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&FS[m1+0][n1+n+2]),mmTmp1));
3586
3587 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&FS[m1+1][n1+n+0]),mmTmp0));
3588 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&FS[m1+1][n1+n+2]),mmTmp1));
3589
3590 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&FS[m1+2][n1+n+0]),mmTmp0));
3591 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&FS[m1+2][n1+n+2]),mmTmp1));
3592
3593 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&FS[m1+3][n1+n+0]),mmTmp0));
3594 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&FS[m1+3][n1+n+2]),mmTmp1));
3595 }
3596
3597 for (; n<EKC_core_size[Mc_AN]; n++){
3598 sum0 += Utmp[rl][n]*FS[m1+0][n1+n];
3599 sum1 += Utmp[rl][n]*FS[m1+1][n1+n];
3600 sum2 += Utmp[rl][n]*FS[m1+2][n1+n];
3601 sum3 += Utmp[rl][n]*FS[m1+3][n1+n];
3602 }
3603 }
3604
3605 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
3606 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
3607 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
3608 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
3609
3610 _mm_storeu_pd(&mmArr[0], mmSum00);
3611 _mm_storeu_pd(&mmArr[2], mmSum10);
3612 _mm_storeu_pd(&mmArr[4], mmSum20);
3613 _mm_storeu_pd(&mmArr[6], mmSum30);
3614
3615 sum0 += mmArr[0] + mmArr[1];
3616 sum1 += mmArr[2] + mmArr[3];
3617 sum2 += mmArr[4] + mmArr[5];
3618 sum3 += mmArr[6] + mmArr[7];
3619
3620 U0[rl00][mm0][i] = sum0;
3621 U0[rl01][mm1][i] = sum1;
3622 U0[rl02][mm2][i] = sum2;
3623 U0[rl03][mm3][i] = sum3;
3624 }
3625
3626 m1s = rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN] - (rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN])%4 + 1;
3627
3628 for (m1=m1s; m1<=rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]; m1++){
3629
3630 rl0 = (m1-1)/EKC_core_size[Mc_AN];
3631 m = (m1-1)%EKC_core_size[Mc_AN];
3632
3633 sum = 0.0;
3634
3635 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
3636
3637 n1 = rl*EKC_core_size[Mc_AN] + 1;
3638
3639 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3640 sum += Utmp[rl][n]*FS[m1][n1+n];
3641 }
3642 }
3643
3644 U0[rl0][m][i] = sum;
3645 }
3646 } /* i */
3647
3648 #endif
3649
3650 Krylov_U[spin][Mc_AN][0] = ZeroNum;
3651
3652 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
3653 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3654 for (i=0; i<Msize2[Mc_AN]; i++){
3655 Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+i+1] = U0[rl][n][i];
3656 }
3657 }
3658 }
3659
3660 /************************************************************
3661 check the orthonormality of Krylov vectors
3662 ************************************************************/
3663
3664 if (error_check==1){
3665
3666 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
3667
3668 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3669 for (i=0; i<Msize2[Mc_AN]; i++){
3670 tmpvec1[n][i] = 0.0;
3671 }
3672 }
3673
3674 for (i=0; i<=can; i++){
3675
3676 ig = natn[ct_AN][i];
3677 ian = Spe_Total_CNO[WhatSpecies[ig]];
3678 Anum = MP[i] - 1;
3679 ih = S_G2M[ig];
3680
3681 for (j=0; j<=can; j++){
3682
3683 kl = RMI1[Mc_AN][i][j];
3684 jg = natn[ct_AN][j];
3685 jan = Spe_Total_CNO[WhatSpecies[jg]];
3686 Bnum = MP[j] - 1;
3687
3688 if (0<=kl){
3689
3690 for (m=0; m<ian; m++){
3691 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3692
3693 sum = 0.0;
3694 for (k=0; k<jan; k++){
3695 sum += OLP0[ih][kl][m][k]*U0[rl][n][Bnum+k];
3696 }
3697
3698 tmpvec1[n][Anum+m] += sum;
3699 }
3700 }
3701 }
3702 }
3703 }
3704
3705 for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
3706 for (m=0; m<EKC_core_size[Mc_AN]; m++){
3707 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3708 sum = 0.0;
3709 for (i=0; i<Msize2[Mc_AN]; i++){
3710 sum += U0[rl0][m][i]*tmpvec1[n][i];
3711 }
3712
3713 if (rl==rl0 && m==n){
3714 if ( 1.0e-10<fabs(sum-1.0) ) {
3715 printf("A spin=%2d Mc_AN=%2d rl=%2d rl0=%2d m=%2d n=%2d sum=%18.15f\n",
3716 spin,Mc_AN,rl,rl0,m,n,sum);
3717 }
3718 }
3719 else{
3720 if ( 1.0e-10<fabs(sum) ) {
3721 printf("B spin=%2d Mc_AN=%2d rl=%2d rl0=%2d m=%2d n=%2d sum=%18.15f\n",
3722 spin,Mc_AN,rl,rl0,m,n,sum);
3723 }
3724 }
3725
3726 }
3727 }
3728 }
3729 }
3730 }
3731
3732 /* freeing of arrays */
3733
3734 for (i=0; i<rlmax_EC[Mc_AN]; i++){
3735 free(Utmp[i]);
3736 }
3737 free(Utmp);
3738
3739 for (i=0; i<rlmax_EC[Mc_AN]; i++){
3740 for (j=0; j<EKC_core_size[Mc_AN]; j++){
3741 free(U0[i][j]);
3742 }
3743 free(U0[i]);
3744 }
3745 free(U0);
3746
3747 for (i=0; i<(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]; i++){
3748 free(FS[i]);
3749 }
3750 free(FS);
3751
3752 free(ko);
3753 free(iko);
3754 }
3755
3756
3757
3758
3759
3760
3761
Embedding_Matrix(int spin,int Mc_AN,double ***** Hks,double *** Krylov_U,double **** EC_matrix,int * MP,int * Msize,int * Msize2,int * Msize3,double ** tmpvec1)3762 void Embedding_Matrix(int spin, int Mc_AN, double *****Hks,
3763 double ***Krylov_U, double ****EC_matrix,
3764 int *MP, int *Msize, int *Msize2, int *Msize3,
3765 double **tmpvec1 )
3766 {
3767 int ct_AN,fan,san,can,wan,ct_on;
3768 int rl,rl0,m,n,i,j,k,kl,jg,jan,ih,ian;
3769 int Anum,Bnum,ig;
3770 int KU_d1,KU_d2,csize;
3771 double sum,tmp1,tmp2,tmp3;
3772 __m128d mmSum00,mmSum01,mmSum10,mmSum11,mmSum20,mmSum21,mmSum30,mmSum31, mmTmp0, mmTmp1, mmTmp2, mmTmp3, mmTmp4, mmTmp5;
3773
3774 double mmArr[8];
3775 double sum0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
3776 double stime,etime;
3777 double time1,time2,time3,time4,time5,time6,time7,time8;
3778
3779 ct_AN = M2G[Mc_AN];
3780 fan = FNAN[ct_AN];
3781 san = SNAN[ct_AN];
3782 can = fan + san;
3783 wan = WhatSpecies[ct_AN];
3784 ct_on = Spe_Total_CNO[wan];
3785
3786 if (Msize[Mc_AN]<Msize3[Mc_AN])
3787 csize = Msize3[Mc_AN] + 40;
3788 else
3789 csize = Msize[Mc_AN] + 40;
3790
3791 KU_d1 = EKC_core_size[Mc_AN]*Msize2[Mc_AN];
3792 KU_d2 = Msize2[Mc_AN];
3793
3794 if (measure_time==1){
3795 time1 = 0.0;
3796 time2 = 0.0;
3797 time3 = 0.0;
3798 time4 = 0.0;
3799 time5 = 0.0;
3800 time6 = 0.0;
3801 time7 = 0.0;
3802 time8 = 0.0;
3803 }
3804
3805 /*******************************
3806 u1^+ C^+ u2
3807 *******************************/
3808
3809 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
3810
3811 /* C^+ u2 */
3812
3813 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3814 for (i=0; i<Msize2[Mc_AN]; i++){
3815 tmpvec1[n][i] = 0.0;
3816 }
3817 }
3818
3819 for (i=0; i<=fan; i++){
3820
3821 ig = natn[ct_AN][i];
3822 ian = Spe_Total_CNO[WhatSpecies[ig]];
3823 Anum = MP[i] - 1;
3824 ih = S_G2M[ig];
3825
3826 for (j=fan+1; j<=can; j++){
3827
3828 kl = RMI1[Mc_AN][i][j];
3829 jg = natn[ct_AN][j];
3830 jan = Spe_Total_CNO[WhatSpecies[jg]];
3831 Bnum = MP[j];
3832
3833 if (0<=kl){
3834
3835 #ifdef nosse
3836
3837 if (measure_time==1) dtime(&stime);
3838
3839 /* Original version */
3840 /**/
3841 for (m=0; m<ian; m++){
3842 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3843
3844 sum = 0.0;
3845 for (k=0; k<jan; k++){
3846 sum += Hks[spin][ih][kl][m][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
3847 }
3848
3849 tmpvec1[n][Anum+m] += sum;
3850 }
3851 }
3852 /**/
3853
3854 if (measure_time==1){
3855 dtime(&etime);
3856 time1 += etime - stime;
3857 }
3858
3859 #else
3860 if (measure_time==1) dtime(&stime);
3861
3862 /* Unrolling + SSE version */
3863 /**/
3864 for (m=0; m<(ian-3); m+=4){
3865 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3866
3867 mmSum00 = _mm_setzero_pd();
3868 mmSum01 = _mm_setzero_pd();
3869 mmSum10 = _mm_setzero_pd();
3870 mmSum11 = _mm_setzero_pd();
3871 mmSum20 = _mm_setzero_pd();
3872 mmSum21 = _mm_setzero_pd();
3873 mmSum30 = _mm_setzero_pd();
3874 mmSum31 = _mm_setzero_pd();
3875
3876 for (k=0; k<(jan-3); k+=4){
3877 mmTmp0 = _mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+0]);
3878 mmTmp1 = _mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+2]);
3879
3880 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+0][k+0]),mmTmp0));
3881 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+0][k+2]),mmTmp1));
3882
3883 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+1][k+0]),mmTmp0));
3884 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+1][k+2]),mmTmp1));
3885
3886 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+2][k+0]),mmTmp0));
3887 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+2][k+2]),mmTmp1));
3888
3889 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+3][k+0]),mmTmp0));
3890 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+3][k+2]),mmTmp1));
3891 }
3892
3893 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
3894 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
3895 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
3896 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
3897
3898 _mm_storeu_pd(&mmArr[0], mmSum00);
3899 _mm_storeu_pd(&mmArr[2], mmSum10);
3900 _mm_storeu_pd(&mmArr[4], mmSum20);
3901 _mm_storeu_pd(&mmArr[6], mmSum30);
3902
3903 sum0 = mmArr[0] + mmArr[1];
3904 sum1 = mmArr[2] + mmArr[3];
3905 sum2 = mmArr[4] + mmArr[5];
3906 sum3 = mmArr[6] + mmArr[7];
3907
3908 for (; k<jan; k++){
3909 sum0 += Hks[spin][ih][kl][m+0][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
3910 sum1 += Hks[spin][ih][kl][m+1][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
3911 sum2 += Hks[spin][ih][kl][m+2][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
3912 sum3 += Hks[spin][ih][kl][m+3][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
3913 }
3914
3915 tmpvec1[n][Anum+m+0] += sum0;
3916 tmpvec1[n][Anum+m+1] += sum1;
3917 tmpvec1[n][Anum+m+2] += sum2;
3918 tmpvec1[n][Anum+m+3] += sum3;
3919 }
3920 }
3921
3922 for (; m<ian; m++){
3923 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3924
3925 sum = 0.0;
3926 for (k=0; k<jan; k++){
3927 sum += Hks[spin][ih][kl][m][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
3928 }
3929
3930 tmpvec1[n][Anum+m] += sum;
3931 }
3932 }
3933
3934 if (measure_time==1){
3935 dtime(&etime);
3936 time1 += etime - stime;
3937 }
3938 #endif
3939
3940 }
3941 }
3942 }
3943
3944 /* u1^+ C^+ u2 */
3945
3946 #ifdef nosse
3947
3948 if (measure_time==1) dtime(&stime);
3949
3950 /* Original version */
3951 /**/
3952 for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
3953 for (m=0; m<EKC_core_size[Mc_AN]; m++){
3954 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3955 sum = 0.0;
3956 for (i=0; i<Msize[Mc_AN]; i++){
3957 sum += Krylov_U[spin][Mc_AN][rl0*KU_d1+m*KU_d2+i+1]*tmpvec1[n][i];
3958 }
3959 EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
3960 }
3961 }
3962 }
3963 /**/
3964
3965 if (measure_time==1){
3966 dtime(&etime);
3967 time2 += etime - stime;
3968 }
3969
3970 #else
3971
3972 /*
3973 printf("ABC1 Mc_AN=%2d %2d %2d %2d\n",Mc_AN,rlmax_EC[Mc_AN],EKC_core_size[Mc_AN],Msize[Mc_AN]);
3974 */
3975
3976 if (measure_time==1) dtime(&stime);
3977
3978 /* Unrolling + SSE version */
3979 /**/
3980 for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
3981 for (m=0; m<(EKC_core_size[Mc_AN]-3); m+=4){
3982 for (n=0; n<EKC_core_size[Mc_AN]; n++){
3983
3984 mmSum00 = _mm_setzero_pd();
3985 mmSum01 = _mm_setzero_pd();
3986 mmSum10 = _mm_setzero_pd();
3987 mmSum11 = _mm_setzero_pd();
3988 mmSum20 = _mm_setzero_pd();
3989 mmSum21 = _mm_setzero_pd();
3990 mmSum30 = _mm_setzero_pd();
3991 mmSum31 = _mm_setzero_pd();
3992
3993 for (i=0; i<(Msize[Mc_AN]-3); i+=4){
3994 mmTmp0 = _mm_loadu_pd(&tmpvec1[n][i+0]);
3995 mmTmp1 = _mm_loadu_pd(&tmpvec1[n][i+2]);
3996
3997 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+1]),mmTmp0));
3998 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+3]),mmTmp1));
3999
4000 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+1)*KU_d2+i+1]),mmTmp0));
4001 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+1)*KU_d2+i+3]),mmTmp1));
4002
4003 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+2)*KU_d2+i+1]),mmTmp0));
4004 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+2)*KU_d2+i+3]),mmTmp1));
4005
4006 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+3)*KU_d2+i+1]),mmTmp0));
4007 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+3)*KU_d2+i+3]),mmTmp1));
4008 }
4009
4010 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
4011 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
4012 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
4013 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
4014
4015 _mm_storeu_pd(&mmArr[0], mmSum00);
4016 _mm_storeu_pd(&mmArr[2], mmSum10);
4017 _mm_storeu_pd(&mmArr[4], mmSum20);
4018 _mm_storeu_pd(&mmArr[6], mmSum30);
4019
4020 sum0 = mmArr[0] + mmArr[1];
4021 sum1 = mmArr[2] + mmArr[3];
4022 sum2 = mmArr[4] + mmArr[5];
4023 sum3 = mmArr[6] + mmArr[7];
4024
4025 for (; i<Msize[Mc_AN]; i++){
4026 sum0 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+1]*tmpvec1[n][i];
4027 sum1 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+1)*KU_d2+i+1]*tmpvec1[n][i];
4028 sum2 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+2)*KU_d2+i+1]*tmpvec1[n][i];
4029 sum3 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+3)*KU_d2+i+1]*tmpvec1[n][i];
4030 }
4031
4032 EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum0;
4033 EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+2][rl*EKC_core_size[Mc_AN]+n+1] = sum1;
4034 EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+3][rl*EKC_core_size[Mc_AN]+n+1] = sum2;
4035 EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+4][rl*EKC_core_size[Mc_AN]+n+1] = sum3;
4036 }
4037 }
4038
4039 for (; m<EKC_core_size[Mc_AN]; m++){
4040 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4041 sum = 0.0;
4042 for (i=0; i<Msize[Mc_AN]; i++){
4043 sum += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+1]*tmpvec1[n][i];
4044 }
4045 EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
4046 }
4047 }
4048
4049 }
4050 /**/
4051
4052 if (measure_time==1){
4053 dtime(&etime);
4054 time2 += etime - stime;
4055 }
4056
4057 #endif
4058
4059 } /* rl */
4060
4061 /*******************************
4062 u2^+ C u1
4063 *******************************/
4064
4065 for (i=1; i<=Msize3[Mc_AN]; i++){
4066 for (j=i; j<=Msize3[Mc_AN]; j++){
4067
4068 tmp1 = EC_matrix[spin][Mc_AN][i][j];
4069 tmp2 = EC_matrix[spin][Mc_AN][j][i];
4070 tmp3 = tmp1 + tmp2;
4071
4072 EC_matrix[spin][Mc_AN][i][j] = tmp3;
4073 EC_matrix[spin][Mc_AN][j][i] = tmp3;
4074 }
4075 }
4076
4077 /*******************************
4078 u2^+ B u2
4079 *******************************/
4080
4081 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
4082
4083 /* B u2 */
4084
4085 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4086 for (i=0; i<Msize2[Mc_AN]; i++){
4087 tmpvec1[n][i] = 0.0;
4088 }
4089 }
4090
4091 for (i=fan+1; i<=can; i++){
4092
4093 ig = natn[ct_AN][i];
4094 ian = Spe_Total_CNO[WhatSpecies[ig]];
4095 Anum = MP[i] - 1;
4096 ih = S_G2M[ig];
4097
4098 for (j=fan+1; j<=can; j++){
4099
4100 kl = RMI1[Mc_AN][i][j];
4101 jg = natn[ct_AN][j];
4102 jan = Spe_Total_CNO[WhatSpecies[jg]];
4103 Bnum = MP[j];
4104
4105 if (0<=kl){
4106
4107 #ifdef nosse
4108
4109 if (measure_time==1) dtime(&stime);
4110
4111 /* Original version */
4112 /**/
4113 for (m=0; m<ian; m++){
4114 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4115
4116 sum = 0.0;
4117 for (k=0; k<jan; k++){
4118 sum += Hks[spin][ih][kl][m][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
4119 }
4120
4121 tmpvec1[n][Anum+m] += sum;
4122 }
4123 }
4124 /**/
4125
4126 if (measure_time==1){
4127 dtime(&etime);
4128 time3 += etime - stime;
4129 }
4130
4131 #else
4132
4133 if (measure_time==1) dtime(&stime);
4134
4135 /* Unrolling + SSE version */
4136 /**/
4137 for (m=0; m<(ian-3); m+=4){
4138 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4139
4140 mmSum00 = _mm_setzero_pd();
4141 mmSum01 = _mm_setzero_pd();
4142 mmSum10 = _mm_setzero_pd();
4143 mmSum11 = _mm_setzero_pd();
4144 mmSum20 = _mm_setzero_pd();
4145 mmSum21 = _mm_setzero_pd();
4146 mmSum30 = _mm_setzero_pd();
4147 mmSum31 = _mm_setzero_pd();
4148
4149 for (k=0; k<(jan-3); k+=4){
4150 mmTmp0 = _mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+0]);
4151 mmTmp1 = _mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+2]);
4152
4153 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+0][k+0]),mmTmp0));
4154 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+0][k+2]),mmTmp1));
4155
4156 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+1][k+0]),mmTmp0));
4157 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+1][k+2]),mmTmp1));
4158
4159 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+2][k+0]),mmTmp0));
4160 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+2][k+2]),mmTmp1));
4161
4162 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+3][k+0]),mmTmp0));
4163 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+3][k+2]),mmTmp1));
4164 }
4165
4166 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
4167 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
4168 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
4169 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
4170
4171 _mm_storeu_pd(&mmArr[0], mmSum00);
4172 _mm_storeu_pd(&mmArr[2], mmSum10);
4173 _mm_storeu_pd(&mmArr[4], mmSum20);
4174 _mm_storeu_pd(&mmArr[6], mmSum30);
4175
4176 sum0 = mmArr[0] + mmArr[1];
4177 sum1 = mmArr[2] + mmArr[3];
4178 sum2 = mmArr[4] + mmArr[5];
4179 sum3 = mmArr[6] + mmArr[7];
4180
4181 for (; k<jan; k++){
4182 sum0 += Hks[spin][ih][kl][m+0][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+0];
4183 sum1 += Hks[spin][ih][kl][m+1][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+0];
4184 sum2 += Hks[spin][ih][kl][m+2][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+0];
4185 sum3 += Hks[spin][ih][kl][m+3][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+0];
4186 }
4187
4188 tmpvec1[n][Anum+m+0] += sum0;
4189 tmpvec1[n][Anum+m+1] += sum1;
4190 tmpvec1[n][Anum+m+2] += sum2;
4191 tmpvec1[n][Anum+m+3] += sum3;
4192 }
4193 }
4194
4195 for (; m<ian; m++){
4196 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4197
4198 sum = 0.0;
4199 for (k=0; k<jan; k++){
4200 sum += Hks[spin][ih][kl][m][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
4201 }
4202
4203 tmpvec1[n][Anum+m] += sum;
4204 }
4205 }
4206 /**/
4207
4208 if (measure_time==1){
4209 dtime(&etime);
4210 time3 += etime - stime;
4211 }
4212
4213 #endif
4214
4215 }
4216 }
4217 }
4218
4219 /* u2^+ B u2 */
4220
4221 #ifdef nosse
4222
4223 if (measure_time==1) dtime(&stime);
4224
4225 /* Original version */
4226 /**/
4227 for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
4228 for (m=0; m<EKC_core_size[Mc_AN]; m++){
4229 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4230 sum = 0.0;
4231 for (i=Msize[Mc_AN]; i<Msize2[Mc_AN]; i++){
4232 sum += Krylov_U[spin][Mc_AN][rl0*KU_d1+m*KU_d2+i+1]*tmpvec1[n][i];
4233 }
4234 EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] += sum;
4235 }
4236 }
4237 }
4238 /**/
4239
4240 if (measure_time==1){
4241 dtime(&etime);
4242 time4 += etime - stime;
4243 }
4244
4245 #else
4246
4247 if (measure_time==1) dtime(&stime);
4248
4249 /* Unrolling + SSE version */
4250 /**/
4251 for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
4252 for (m=0; m<(EKC_core_size[Mc_AN]-3); m+=4){
4253 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4254
4255 mmSum00 = _mm_setzero_pd();
4256 mmSum01 = _mm_setzero_pd();
4257 mmSum10 = _mm_setzero_pd();
4258 mmSum11 = _mm_setzero_pd();
4259 mmSum20 = _mm_setzero_pd();
4260 mmSum21 = _mm_setzero_pd();
4261 mmSum30 = _mm_setzero_pd();
4262 mmSum31 = _mm_setzero_pd();
4263
4264 for (i=Msize[Mc_AN]; i<(Msize2[Mc_AN]-3); i+=4){
4265 mmTmp0 = _mm_loadu_pd(&tmpvec1[n][i+0]);
4266 mmTmp1 = _mm_loadu_pd(&tmpvec1[n][i+2]);
4267
4268 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+1]),mmTmp0));
4269 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+3]),mmTmp1));
4270
4271 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+1)*KU_d2+i+1]),mmTmp0));
4272 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+1)*KU_d2+i+3]),mmTmp1));
4273
4274 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+2)*KU_d2+i+1]),mmTmp0));
4275 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+2)*KU_d2+i+3]),mmTmp1));
4276
4277 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+3)*KU_d2+i+1]),mmTmp0));
4278 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+3)*KU_d2+i+3]),mmTmp1));
4279 }
4280
4281 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
4282 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
4283 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
4284 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
4285
4286 _mm_storeu_pd(&mmArr[0], mmSum00);
4287 _mm_storeu_pd(&mmArr[2], mmSum10);
4288 _mm_storeu_pd(&mmArr[4], mmSum20);
4289 _mm_storeu_pd(&mmArr[6], mmSum30);
4290
4291 sum0 = mmArr[0] + mmArr[1];
4292 sum1 = mmArr[2] + mmArr[3];
4293 sum2 = mmArr[4] + mmArr[5];
4294 sum3 = mmArr[6] + mmArr[7];
4295
4296 for (; i<Msize2[Mc_AN]; i++){
4297 sum0 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+1]*tmpvec1[n][i];
4298 sum1 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+1)*KU_d2+i+1]*tmpvec1[n][i];
4299 sum2 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+2)*KU_d2+i+1]*tmpvec1[n][i];
4300 sum3 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+3)*KU_d2+i+1]*tmpvec1[n][i];
4301 }
4302
4303 EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] += sum0;
4304 EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+2][rl*EKC_core_size[Mc_AN]+n+1] += sum1;
4305 EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+3][rl*EKC_core_size[Mc_AN]+n+1] += sum2;
4306 EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+4][rl*EKC_core_size[Mc_AN]+n+1] += sum3;
4307 }
4308 }
4309
4310 for (; m<EKC_core_size[Mc_AN]; m++){
4311 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4312 sum = 0.0;
4313 for (i=Msize[Mc_AN]; i<Msize2[Mc_AN]; i++){
4314 sum += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+1]*tmpvec1[n][i];
4315 }
4316 EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] += sum;
4317 }
4318 }
4319
4320 }
4321 /**/
4322
4323 if (measure_time==1){
4324 dtime(&etime);
4325 time4 += etime - stime;
4326 }
4327
4328 #endif
4329
4330 } /* rl */
4331
4332 if (measure_time==1){
4333 printf("Embedding_Matrix time1=%5.3f time2=%5.3f time3=%5.3f time4=%5.3f\n",
4334 time1,time2,time3,time4);
4335 }
4336 }
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
Krylov_IOLP(int Mc_AN,double **** OLP0,double *** Krylov_U_OLP,double ** inv_RS,int * MP,int * Msize2,int * Msize4,int Msize2_max,double ** tmpvec0,double ** tmpvec1)4347 void Krylov_IOLP( int Mc_AN, double ****OLP0, double ***Krylov_U_OLP, double **inv_RS, int *MP,
4348 int *Msize2, int *Msize4, int Msize2_max, double **tmpvec0, double **tmpvec1 )
4349 {
4350 int rl,ct_AN,fan,san,can,wan,ct_on,i,j;
4351 int n,Anum,Bnum,k,ian,ih,kl,jg,ig,jan,m,m1,n1;
4352 int rl0,ZeroNum,Neumann_series,ns,Gh_AN,wanB;
4353 double sum,dum,tmp0,tmp1,tmp2,tmp3,rcutA,r0;
4354 double **tmpmat0;
4355 double *ko,*iko;
4356 double **FS;
4357
4358 ct_AN = M2G[Mc_AN];
4359 fan = FNAN[ct_AN];
4360 san = SNAN[ct_AN];
4361 can = fan + san;
4362 wan = WhatSpecies[ct_AN];
4363 ct_on = Spe_Total_CNO[wan];
4364 rcutA = Spe_Atom_Cut1[wan];
4365
4366 /* allocation of arrays */
4367
4368 tmpmat0 = (double**)malloc(sizeof(double*)*(EKC_core_size[Mc_AN]+4));
4369 for (i=0; i<(EKC_core_size[Mc_AN]+4); i++){
4370 tmpmat0[i] = (double*)malloc(sizeof(double)*(EKC_core_size[Mc_AN]+4));
4371 }
4372
4373 FS = (double**)malloc(sizeof(double*)*(rlmax_EC2[Mc_AN]+2)*EKC_core_size[Mc_AN]);
4374 for (i=0; i<(rlmax_EC2[Mc_AN]+2)*EKC_core_size[Mc_AN]; i++){
4375 FS[i] = (double*)malloc(sizeof(double)*(rlmax_EC2[Mc_AN]+2)*EKC_core_size[Mc_AN]);
4376 }
4377
4378 ko = (double*)malloc(sizeof(double)*(rlmax_EC2[Mc_AN]+2)*EKC_core_size[Mc_AN]);
4379 iko = (double*)malloc(sizeof(double)*(rlmax_EC2[Mc_AN]+2)*EKC_core_size[Mc_AN]);
4380
4381 /****************************************************
4382 initialize
4383 ****************************************************/
4384
4385 for (i=0; i<EKC_core_size_max; i++){
4386 for (j=0; j<Msize2_max; j++){
4387 tmpvec0[i][j] = 0.0;
4388 tmpvec1[i][j] = 0.0;
4389 }
4390 }
4391
4392 /* find the nearest atom with distance of r0 */
4393
4394 r0 = 1.0e+10;
4395 for (k=1; k<=FNAN[ct_AN]; k++){
4396 Gh_AN = natn[ct_AN][k];
4397 wanB = WhatSpecies[Gh_AN];
4398 if (Dis[ct_AN][k]<r0) r0 = Dis[ct_AN][k];
4399 }
4400
4401 /* starting vector */
4402
4403 m = 0;
4404 for (k=0; k<=FNAN[ct_AN]; k++){
4405
4406 Gh_AN = natn[ct_AN][k];
4407 wanB = WhatSpecies[Gh_AN];
4408
4409 if ( Dis[ct_AN][k]<(scale_rc_EKC[Mc_AN]*r0) ){
4410
4411 Anum = MP[k] - 1;
4412
4413 for (i=0; i<Spe_Total_CNO[wanB]; i++){
4414
4415 tmpvec0[m][Anum+i] = 1.0;
4416 Krylov_U_OLP[0][m][Anum+i] = 1.0;
4417
4418 m++;
4419 }
4420 }
4421 }
4422
4423 /*
4424 for (i=0; i<EKC_core_size[Mc_AN]; i++){
4425 tmpvec0[i][i] = 1.0;
4426 Krylov_U_OLP[0][i][i] = 1.0;
4427 }
4428 */
4429
4430 /****************************************************
4431 generate Krylov subspace vectors
4432 ****************************************************/
4433
4434 for (rl=0; rl<(rlmax_EC2[Mc_AN]-1); rl++){
4435
4436 /*******************************************************
4437 S * |Wn)
4438 *******************************************************/
4439
4440 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4441 for (i=0; i<Msize2[Mc_AN]; i++){
4442 tmpvec1[n][i] = 0.0;
4443 }
4444 }
4445
4446 for (i=0; i<=can; i++){
4447
4448 ig = natn[ct_AN][i];
4449 ian = Spe_Total_CNO[WhatSpecies[ig]];
4450 Anum = MP[i] - 1;
4451 ih = S_G2M[ig];
4452
4453 for (j=0; j<=can; j++){
4454
4455 kl = RMI1[Mc_AN][i][j];
4456 jg = natn[ct_AN][j];
4457 jan = Spe_Total_CNO[WhatSpecies[jg]];
4458 Bnum = MP[j] - 1;
4459
4460 if (0<=kl){
4461
4462 for (m=0; m<ian; m++){
4463 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4464
4465 sum = 0.0;
4466 for (k=0; k<jan; k++){
4467 sum += OLP0[ih][kl][m][k]*tmpvec0[n][Bnum+k];
4468 }
4469
4470 tmpvec1[n][Anum+m] += sum;
4471 }
4472 }
4473 }
4474 }
4475 }
4476
4477 /*************************************************************
4478 orthogonalization by a modified block Gram-Schmidt method
4479 *************************************************************/
4480
4481 /* |tmpvec1) := (I - \sum_{rl0} |U_rl0)(U_rl0|)|tmpvec1) */
4482
4483 for (rl0=0; rl0<=rl; rl0++){
4484
4485 /* (U_rl0|tmpvec1) */
4486
4487 for (m=0; m<EKC_core_size[Mc_AN]; m++){
4488 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4489 sum = 0.0;
4490 for (i=0; i<Msize2[Mc_AN]; i++){
4491 sum += Krylov_U_OLP[rl0][m][i]*tmpvec1[n][i];
4492 }
4493 tmpmat0[m][n] = sum;
4494 }
4495 }
4496
4497 /* |tmpvec1) := |tmpvec1) - |U_rl0)(U_rl0|tmpvec1) */
4498
4499 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4500 for (k=0; k<EKC_core_size[Mc_AN]; k++){
4501 dum = tmpmat0[k][n];
4502 for (i=0; i<Msize2[Mc_AN]; i++) tmpvec1[n][i] -= Krylov_U_OLP[rl0][k][i]*dum;
4503 }
4504 }
4505 }
4506
4507 /*************************************************************
4508 normalization of tmpvec1
4509 *************************************************************/
4510
4511 for (m=0; m<EKC_core_size[Mc_AN]; m++){
4512 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4513
4514 sum = 0.0;
4515 for (i=0; i<Msize2[Mc_AN]; i++){
4516 sum += tmpvec1[m][i]*tmpvec1[n][i];
4517 }
4518
4519 tmpmat0[m+1][n+1] = sum;
4520 }
4521 }
4522
4523 /* diagonalize tmpmat0 */
4524
4525 if ( EKC_core_size[Mc_AN]==1){
4526 ko[1] = tmpmat0[1][1];
4527 tmpmat0[1][1] = 1.0;
4528 }
4529 else{
4530 Eigen_lapack( tmpmat0, ko, EKC_core_size[Mc_AN], EKC_core_size[Mc_AN] );
4531 }
4532
4533 ZeroNum = 0;
4534
4535 for (n=1; n<=EKC_core_size[Mc_AN]; n++){
4536 if (cutoff_value<ko[n]){
4537 ko[n] = sqrt(fabs(ko[n]));
4538 iko[n] = 1.0/ko[n];
4539 }
4540 else{
4541 ZeroNum++;
4542 ko[n] = 0.0;
4543 iko[n] = 0.0;
4544 }
4545
4546 if (error_check==1){
4547 printf("rl=%3d ko=%18.15f\n",rl,ko[n]);
4548 }
4549 }
4550
4551 if (error_check==1){
4552 printf("rl=%3d ZeroNum=%3d\n",rl,ZeroNum);
4553 }
4554
4555 /* tmpvec0 = tmpvec1 * tmpmat0^{-1/2} */
4556
4557 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4558 for (i=0; i<Msize2[Mc_AN]; i++){
4559 tmpvec0[n][i] = 0.0;
4560 }
4561 }
4562
4563 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4564 for (k=0; k<EKC_core_size[Mc_AN]; k++){
4565 dum = tmpmat0[k+1][n+1]*iko[n+1];
4566 for (i=0; i<Msize2[Mc_AN]; i++) tmpvec0[n][i] += tmpvec1[k][i]*dum;
4567 }
4568 }
4569
4570 /*************************************************************
4571 store Krylov vectors
4572 *************************************************************/
4573
4574 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4575 for (i=0; i<Msize2[Mc_AN]; i++){
4576 Krylov_U_OLP[rl+1][n][i] = tmpvec0[n][i];
4577 }
4578 }
4579
4580 } /* rl */
4581
4582 /************************************************************
4583 calculate the inverse of the reduced overlap matrix
4584 ************************************************************/
4585
4586 /* construct the reduced overlap matrix */
4587
4588 for (rl=0; rl<rlmax_EC2[Mc_AN]; rl++){
4589
4590 /* S * |Vn) */
4591
4592 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4593 for (i=0; i<Msize2[Mc_AN]; i++){
4594 tmpvec1[n][i] = 0.0;
4595 }
4596 }
4597
4598 for (i=0; i<=can; i++){
4599
4600 ig = natn[ct_AN][i];
4601 ian = Spe_Total_CNO[WhatSpecies[ig]];
4602 Anum = MP[i] - 1;
4603 ih = S_G2M[ig];
4604
4605 for (j=0; j<=can; j++){
4606
4607 kl = RMI1[Mc_AN][i][j];
4608 jg = natn[ct_AN][j];
4609 jan = Spe_Total_CNO[WhatSpecies[jg]];
4610 Bnum = MP[j] - 1;
4611
4612 if (0<=kl){
4613
4614 for (m=0; m<ian; m++){
4615 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4616
4617 sum = 0.0;
4618 for (k=0; k<jan; k++){
4619 sum += OLP0[ih][kl][m][k]*Krylov_U_OLP[rl][n][Bnum+k];
4620 }
4621 tmpvec1[n][Anum+m] += sum;
4622 }
4623 }
4624 }
4625 }
4626 }
4627
4628 for (rl0=rl; rl0<rlmax_EC2[Mc_AN]; rl0++){
4629 for (m=0; m<EKC_core_size[Mc_AN]; m++){
4630 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4631 sum = 0.0;
4632 for (i=0; i<Msize2[Mc_AN]; i++){
4633 sum += Krylov_U_OLP[rl0][m][i]*tmpvec1[n][i];
4634 }
4635 FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
4636 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum;
4637 }
4638 }
4639 }
4640 }
4641
4642 /* diagonalize FS */
4643
4644 Eigen_lapack(FS,ko,Msize4[Mc_AN],Msize4[Mc_AN]);
4645
4646 /* find ill-conditioned eigenvalues */
4647
4648 ZeroNum = 0;
4649 for (i=1; i<=Msize4[Mc_AN]; i++){
4650
4651 if (error_check==1){
4652 printf("Mc_AN=%2d i=%3d ko[i]=%18.15f\n",Mc_AN,i,ko[i]);
4653 }
4654
4655 if (cutoff_value<ko[i]){
4656 iko[i] = 1.0/ko[i];
4657 }
4658 else{
4659 ZeroNum++;
4660 ko[i] = 0.0;
4661 iko[i] = 0.0;
4662 }
4663 }
4664
4665 if (error_check==1){
4666 printf("Mc_AN=%2d ZeroNum=%2d\n",Mc_AN,ZeroNum);
4667 }
4668
4669 /* construct the inverse */
4670
4671 for (i=1; i<=Msize4[Mc_AN]; i++){
4672 for (j=1; j<=Msize4[Mc_AN]; j++){
4673 sum = 0.0;
4674 for (k=1; k<=Msize4[Mc_AN]; k++){
4675 sum += FS[i][k]*iko[k]*FS[j][k];
4676 }
4677 inv_RS[i-1][j-1] = sum;
4678 }
4679 }
4680
4681 /* symmetrization of inv_RS */
4682
4683 for (i=1; i<=Msize4[Mc_AN]; i++){
4684 for (j=i+1; j<=Msize4[Mc_AN]; j++){
4685 tmp0 = inv_RS[i-1][j-1];
4686 tmp1 = inv_RS[j-1][i-1];
4687 tmp2 = 0.5*(tmp0 + tmp1);
4688 inv_RS[i-1][j-1] = tmp2;
4689 inv_RS[j-1][i-1] = tmp2;
4690 }
4691 }
4692
4693
4694
4695 /*
4696 {
4697
4698
4699 double mat1[1000][1000];
4700 double tsum;
4701 int myid;
4702
4703 MPI_Comm_rank(mpi_comm_level1,&myid);
4704
4705 if (myid==0){
4706
4707 printf("check normalization\n");
4708
4709 for (rl=0; rl<rlmax_EC2[Mc_AN]; rl++){
4710 for (m=0; m<EKC_core_size[Mc_AN]; m++){
4711 for (rl0=0; rl0<rlmax_EC2[Mc_AN]; rl0++){
4712 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4713
4714 sum = 0.0;
4715 for (i=0; i<Msize2[Mc_AN]; i++){
4716 sum += Krylov_U_OLP[rl][m][i]*Krylov_U_OLP[rl0][n][i];
4717 }
4718 printf("rl=%3d rl0=%3d m=%3d n=%3d <|>=%18.15f\n",rl,rl0,m,n,sum);
4719 }
4720 }
4721 }
4722 }
4723
4724
4725 printf("\n\ninvS\n");
4726
4727
4728 for (rl=0; rl<rlmax_EC2[Mc_AN]; rl++){
4729 for (m=0; m<EKC_core_size[Mc_AN]; m++){
4730
4731 for (i=0; i<Msize2[Mc_AN]; i++){
4732
4733 sum = 0.0;
4734
4735 for (rl0=0; rl0<rlmax_EC2[Mc_AN]; rl0++){
4736 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4737 sum += inv_RS[rl*EKC_core_size[Mc_AN]+m][rl0*EKC_core_size[Mc_AN]+n]*Krylov_U_OLP[rl0][n][i];
4738 }
4739 }
4740
4741 mat1[rl*EKC_core_size[Mc_AN]+m][i] = sum;
4742 }
4743 }
4744 }
4745
4746
4747 tsum = 0.0;
4748
4749 for (i=0; i<Msize2[Mc_AN]; i++){
4750 for (j=0; j<Msize2[Mc_AN]; j++){
4751
4752 sum = 0.0;
4753 for (rl=0; rl<rlmax_EC2[Mc_AN]; rl++){
4754 for (m=0; m<EKC_core_size[Mc_AN]; m++){
4755 sum += Krylov_U_OLP[rl][m][i]*mat1[rl*EKC_core_size[Mc_AN]+m][j];
4756 }
4757 }
4758
4759 printf("i=%4d j=%4d %18.15f\n",i,j,sum);
4760
4761 tsum += fabs(sum);
4762
4763 }
4764 }
4765
4766
4767 printf("\n\ntsum=%18.15f\n",tsum);
4768
4769 }
4770
4771
4772 MPI_Finalize();
4773 exit(0);
4774 }
4775 */
4776
4777 /* freeing of arrays */
4778
4779 for (i=0; i<(EKC_core_size[Mc_AN]+4); i++){
4780 free(tmpmat0[i]);
4781 }
4782 free(tmpmat0);
4783
4784 for (i=0; i<(rlmax_EC2[Mc_AN]+2)*EKC_core_size[Mc_AN]; i++){
4785 free(FS[i]);
4786 }
4787 free(FS);
4788
4789 free(ko);
4790 free(iko);
4791 }
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
S_orthonormalize_vec(int Mc_AN,int ct_on,double ** vec,double ** workvec,double **** OLP0,double ** tmpmat0,double * ko,double * iko,int * MP,int * Msize2)4804 void S_orthonormalize_vec( int Mc_AN, int ct_on, double **vec,
4805 double **workvec, double ****OLP0,
4806 double **tmpmat0, double *ko, double *iko,
4807 int *MP, int *Msize2 )
4808 {
4809 int n,i,j,can,san,fan,ct_AN;
4810 int k,m,ZeroNum,Anum,Bnum,ih;
4811 int kl,jg,jan,ig,ian;
4812 double dum,sum;
4813 double tmp0,tmp1,tmp2,tmp3;
4814 __m128d mmSum00,mmSum01,mmSum10,mmSum11,mmSum20,mmSum21,mmSum30,mmSum31, mmTmp0, mmTmp1, mmTmp2, mmTmp3, mmTmp4, mmTmp5;
4815 double mmArr[8];
4816 double sum0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
4817
4818 ct_AN = M2G[Mc_AN];
4819 fan = FNAN[ct_AN];
4820 san = SNAN[ct_AN];
4821 can = fan + san;
4822
4823 /* S|Vn) */
4824
4825 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4826 for (i=0; i<Msize2[Mc_AN]; i++){
4827 workvec[n][i] = 0.0;
4828 }
4829 }
4830
4831 for (i=0; i<=can; i++){
4832
4833 ig = natn[ct_AN][i];
4834 ian = Spe_Total_CNO[WhatSpecies[ig]];
4835 Anum = MP[i] - 1;
4836 ih = S_G2M[ig];
4837
4838 for (j=0; j<=can; j++){
4839
4840 kl = RMI1[Mc_AN][i][j];
4841 jg = natn[ct_AN][j];
4842 jan = Spe_Total_CNO[WhatSpecies[jg]];
4843 Bnum = MP[j] - 1;
4844
4845 if (0<=kl){
4846
4847 #ifdef nosse
4848
4849 /* Original version */
4850 /**/
4851 for (m=0; m<ian; m++){
4852 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4853
4854 sum = 0.0;
4855 for (k=0; k<jan; k++){
4856 sum += OLP0[ih][kl][m][k]*vec[n][Bnum+k];
4857 }
4858
4859 workvec[n][Anum+m] += sum;
4860 }
4861 }
4862 /**/
4863
4864 #else
4865
4866 /* Unrolling + SSE version */
4867 /**/
4868 for (m=0; m<(ian-3); m+=4){
4869 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4870
4871 mmSum00 = _mm_setzero_pd();
4872 mmSum01 = _mm_setzero_pd();
4873 mmSum10 = _mm_setzero_pd();
4874 mmSum11 = _mm_setzero_pd();
4875 mmSum20 = _mm_setzero_pd();
4876 mmSum21 = _mm_setzero_pd();
4877 mmSum30 = _mm_setzero_pd();
4878 mmSum31 = _mm_setzero_pd();
4879
4880 for (k=0; k<(jan-3); k+=4){
4881 mmTmp0 = _mm_loadu_pd(&vec[n][Bnum+k+0]);
4882 mmTmp1 = _mm_loadu_pd(&vec[n][Bnum+k+2]);
4883
4884 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+0]),mmTmp0));
4885 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+2]),mmTmp1));
4886
4887 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+0]),mmTmp0));
4888 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+2]),mmTmp1));
4889
4890 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+0]),mmTmp0));
4891 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+2]),mmTmp1));
4892
4893 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+0]),mmTmp0));
4894 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+2]),mmTmp1));
4895 }
4896
4897 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
4898 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
4899 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
4900 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
4901
4902 _mm_storeu_pd(&mmArr[0], mmSum00);
4903 _mm_storeu_pd(&mmArr[2], mmSum10);
4904 _mm_storeu_pd(&mmArr[4], mmSum20);
4905 _mm_storeu_pd(&mmArr[6], mmSum30);
4906
4907 sum0 = mmArr[0] + mmArr[1];
4908 sum1 = mmArr[2] + mmArr[3];
4909 sum2 = mmArr[4] + mmArr[5];
4910 sum3 = mmArr[6] + mmArr[7];
4911
4912 for (; k<jan; k++){
4913 sum0 += OLP0[ih][kl][m+0][k]*vec[n][Bnum+k];
4914 sum1 += OLP0[ih][kl][m+1][k]*vec[n][Bnum+k];
4915 sum2 += OLP0[ih][kl][m+2][k]*vec[n][Bnum+k];
4916 sum3 += OLP0[ih][kl][m+3][k]*vec[n][Bnum+k];
4917 }
4918
4919 workvec[n][Anum+m+0] += sum0;
4920 workvec[n][Anum+m+1] += sum1;
4921 workvec[n][Anum+m+2] += sum2;
4922 workvec[n][Anum+m+3] += sum3;
4923 }
4924 }
4925
4926 for (; m<ian; m++){
4927 for (n=0; n<EKC_core_size[Mc_AN]; n++){
4928
4929 sum = 0.0;
4930 for (k=0; k<jan; k++){
4931 sum += OLP0[ih][kl][m][k]*vec[n][Bnum+k];
4932 }
4933
4934 workvec[n][Anum+m] += sum;
4935 }
4936 }
4937 /**/
4938
4939 #endif
4940
4941 }
4942 }
4943 }
4944
4945 /* (Vn|S|Vn) */
4946
4947 #ifdef nosse
4948
4949 /* Original version */
4950 /**/
4951 for (m=0; m<EKC_core_size[Mc_AN]; m++){
4952 for (n=m; n<EKC_core_size[Mc_AN]; n++){
4953 sum = 0.0;
4954 for (i=0; i<Msize2[Mc_AN]; i++){
4955 sum += vec[m][i]*workvec[n][i];
4956 }
4957 tmpmat0[m+1][n+1] = sum;
4958 tmpmat0[n+1][m+1] = sum;
4959 }
4960 }
4961 /**/
4962
4963 #else
4964
4965 /* Unrolling + SSE version */
4966 /**/
4967 for (m=0; m<(EKC_core_size[Mc_AN]-3); m+=4){
4968 for (n=m; n<EKC_core_size[Mc_AN]; n++){
4969 mmSum00 = _mm_setzero_pd();
4970 mmSum01 = _mm_setzero_pd();
4971 mmSum10 = _mm_setzero_pd();
4972 mmSum11 = _mm_setzero_pd();
4973 mmSum20 = _mm_setzero_pd();
4974 mmSum21 = _mm_setzero_pd();
4975 mmSum30 = _mm_setzero_pd();
4976 mmSum31 = _mm_setzero_pd();
4977
4978 for (i=0; i<(Msize2[Mc_AN]-3); i+=4){
4979 mmTmp0 = _mm_loadu_pd(&workvec[n][i+0]);
4980 mmTmp1 = _mm_loadu_pd(&workvec[n][i+2]);
4981
4982 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&vec[m+0][i+0]),mmTmp0));
4983 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&vec[m+0][i+2]),mmTmp1));
4984
4985 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&vec[m+1][i+0]),mmTmp0));
4986 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&vec[m+1][i+2]),mmTmp1));
4987
4988 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&vec[m+2][i+0]),mmTmp0));
4989 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&vec[m+2][i+2]),mmTmp1));
4990
4991 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&vec[m+3][i+0]),mmTmp0));
4992 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&vec[m+3][i+2]),mmTmp1));
4993 }
4994
4995 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
4996 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
4997 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
4998 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
4999
5000 _mm_storeu_pd(&mmArr[0], mmSum00);
5001 _mm_storeu_pd(&mmArr[2], mmSum10);
5002 _mm_storeu_pd(&mmArr[4], mmSum20);
5003 _mm_storeu_pd(&mmArr[6], mmSum30);
5004
5005 sum0 = mmArr[0] + mmArr[1];
5006 sum1 = mmArr[2] + mmArr[3];
5007 sum2 = mmArr[4] + mmArr[5];
5008 sum3 = mmArr[6] + mmArr[7];
5009
5010 for (; i<Msize2[Mc_AN]; i++){
5011 sum0 += vec[m+0][i]*workvec[n][i];
5012 sum1 += vec[m+1][i]*workvec[n][i];
5013 sum2 += vec[m+2][i]*workvec[n][i];
5014 sum3 += vec[m+3][i]*workvec[n][i];
5015 }
5016
5017 tmpmat0[m+1][n+1] = sum0;
5018 tmpmat0[n+1][m+1] = sum0;
5019
5020 tmpmat0[m+2][n+1] = sum1;
5021 tmpmat0[n+1][m+2] = sum1;
5022
5023 tmpmat0[m+3][n+1] = sum2;
5024 tmpmat0[n+1][m+3] = sum2;
5025
5026 tmpmat0[m+4][n+1] = sum3;
5027 tmpmat0[n+1][m+4] = sum3;
5028 }
5029 }
5030
5031 for (; m<EKC_core_size[Mc_AN]; m++){
5032 for (n=m; n<EKC_core_size[Mc_AN]; n++){
5033 sum = 0.0;
5034 for (i=0; i<Msize2[Mc_AN]; i++){
5035 sum += vec[m][i]*workvec[n][i];
5036 }
5037 tmpmat0[m+1][n+1] = sum;
5038 tmpmat0[n+1][m+1] = sum;
5039 }
5040 }
5041 /**/
5042
5043 #endif
5044
5045 /* diagonalize tmpmat0 */
5046
5047 if ( EKC_core_size[Mc_AN]==1){
5048 ko[1] = tmpmat0[1][1];
5049 tmpmat0[1][1] = 1.0;
5050 }
5051 else{
5052 Eigen_lapack( tmpmat0, ko, EKC_core_size[Mc_AN], EKC_core_size[Mc_AN] );
5053 }
5054
5055 ZeroNum = 0;
5056
5057 for (n=1; n<=EKC_core_size[Mc_AN]; n++){
5058 if (cutoff_value<ko[n]){
5059 ko[n] = sqrt(fabs(ko[n]));
5060 iko[n] = 1.0/ko[n];
5061 }
5062 else{
5063 ZeroNum++;
5064 ko[n] = 0.0;
5065 iko[n] = 0.0;
5066 }
5067 }
5068
5069 /* U0 = vec * tmpmat0^{-1/2} */
5070
5071 for (n=0; n<EKC_core_size[Mc_AN]; n++){
5072 for (i=0; i<Msize2[Mc_AN]; i++){
5073 workvec[n][i] = 0.0;
5074 }
5075 }
5076
5077 for (n=0; n<EKC_core_size[Mc_AN]; n++){
5078 for (k=0; k<EKC_core_size[Mc_AN]; k++){
5079 dum = tmpmat0[k+1][n+1]*iko[n+1];
5080 for (i=0; i<Msize2[Mc_AN]; i++) workvec[n][i] += vec[k][i]*dum;
5081 }
5082 }
5083
5084 for (n=0; n<EKC_core_size[Mc_AN]; n++){
5085 for (i=0; i<Msize2[Mc_AN]; i++){
5086 vec[n][i] = workvec[n][i];
5087 }
5088 }
5089 }
5090
5091
5092
Inverse_S_by_Cholesky(int Mc_AN,double **** OLP0,double ** invS,int * MP,int NUM,double * LoS)5093 void Inverse_S_by_Cholesky(int Mc_AN, double ****OLP0, double **invS, int *MP, int NUM, double *LoS)
5094 {
5095 char *UPLO="U";
5096 INTEGER N,lda,info,lwork;
5097 int Gc_AN,i,j,k,Anum,Gi,wanA,time1,time2;
5098 int ig,ian,ih,kl,jg,jan,Bnum,m,n;
5099 double tmp0,tmp1;
5100
5101 N = NUM;
5102 Gc_AN = M2G[Mc_AN];
5103
5104 /* OLP0 to invS */
5105
5106 for (i=0; i<=(FNAN[Gc_AN]+SNAN[Gc_AN]); i++){
5107
5108 ig = natn[Gc_AN][i];
5109 ian = Spe_Total_CNO[WhatSpecies[ig]];
5110 Anum = MP[i] - 1;
5111 ih = S_G2M[ig];
5112
5113 for (j=0; j<=(FNAN[Gc_AN]+SNAN[Gc_AN]); j++){
5114
5115 kl = RMI1[Mc_AN][i][j];
5116 jg = natn[Gc_AN][j];
5117 jan = Spe_Total_CNO[WhatSpecies[jg]];
5118 Bnum = MP[j] - 1;
5119
5120 if (0<=kl){
5121 for (m=0; m<ian; m++){
5122 for (n=0; n<jan; n++){
5123 invS[Anum+m][Bnum+n] = OLP0[ih][kl][m][n];
5124 }
5125 }
5126 }
5127
5128 else{
5129 for (m=0; m<ian; m++){
5130 for (n=0; n<jan; n++){
5131 invS[Anum+m][Bnum+n] = 0.0;
5132 }
5133 }
5134 }
5135 }
5136 }
5137
5138 /* invS -> LoS */
5139
5140 for (i=0;i<N;i++) {
5141 for (j=0;j<N;j++) {
5142 LoS[j*N+i]= invS[i][j];
5143 }
5144 }
5145
5146 /* call dpotrf_() in clapack */
5147
5148 lda = N;
5149 F77_NAME(dpotrf,DPOTRF)(UPLO, &N, LoS, &lda, &info);
5150
5151 if (info!=0){
5152 printf("Error in dpotrf_() which is called from Embedding_Cluster.c info=%2d\n",info);
5153 }
5154
5155 /* call dpotri_() in clapack */
5156
5157 lwork = N;
5158 F77_NAME(dpotri,DPOTRI)(UPLO, &N, LoS, &lda, &info);
5159
5160 if (info!=0){
5161 printf("Error in dpotri_() which is called from Embedding_Cluster.c info=%2d\n",info);
5162 }
5163
5164 /* LoS -> invS */
5165
5166 for (j=0; j<N; j++) {
5167 m = j*N;
5168 for (i=0; i<=j; i++) {
5169 invS[i][j] = LoS[m+i];
5170 invS[j][i] = LoS[m+i];
5171 }
5172 }
5173
5174 }
5175
5176
Save_DOS_Col(double ****** Residues,double **** OLP0,double *** EVal,int ** LO_TC,int ** HO_TC)5177 void Save_DOS_Col(double ******Residues, double ****OLP0, double ***EVal, int **LO_TC, int **HO_TC)
5178 {
5179 int spin,Mc_AN,wanA,Gc_AN,tno1;
5180 int i1,i,j,MaxL,l,h_AN,Gh_AN,wanB,tno2;
5181 double Stime_atom,Etime_atom;
5182 double sum;
5183 int i_vec[10];
5184 char file_eig[YOUSO10],file_ev[YOUSO10];
5185 FILE *fp_eig, *fp_ev;
5186 int numprocs,myid,ID,tag;
5187
5188 /* for OpenMP */
5189 int OMPID,Nthrds,Nprocs;
5190
5191 /* MPI */
5192
5193 MPI_Comm_size(mpi_comm_level1,&numprocs);
5194 MPI_Comm_rank(mpi_comm_level1,&myid);
5195
5196 if (myid==Host_ID){
5197 printf("The DOS is supported for a range from -10 to 10 eV for the O(N) Krylov subspace method.\n");
5198 }
5199
5200 /* open file pointers */
5201
5202 if (myid==Host_ID){
5203
5204 sprintf(file_eig,"%s%s.Dos.val",filepath,filename);
5205 if ( (fp_eig=fopen(file_eig,"w"))==NULL ) {
5206 printf("cannot open a file %s\n",file_eig);
5207 }
5208 }
5209
5210 sprintf(file_ev, "%s%s.Dos.vec%i",filepath,filename,myid);
5211 if ( (fp_ev=fopen(file_ev,"w"))==NULL ) {
5212 printf("cannot open a file %s\n",file_ev);
5213 }
5214
5215 /****************************************************
5216 save *.Dos.vec
5217 ****************************************************/
5218
5219 for (spin=0; spin<=SpinP_switch; spin++){
5220 for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
5221
5222 dtime(&Stime_atom);
5223
5224 Gc_AN = M2G[Mc_AN];
5225 wanA = WhatSpecies[Gc_AN];
5226 tno1 = Spe_Total_CNO[wanA];
5227
5228 fprintf(fp_ev,"<AN%dAN%d\n",Gc_AN,spin);
5229 fprintf(fp_ev,"%d\n",(HO_TC[spin][Mc_AN]-LO_TC[spin][Mc_AN]+1));
5230
5231 for (i1=0; i1<(HO_TC[spin][Mc_AN]-LO_TC[spin][Mc_AN]+1); i1++){
5232
5233 fprintf(fp_ev,"%4d %10.6f ",i1,EVal[spin][Mc_AN][i1-1+LO_TC[spin][Mc_AN]]);
5234
5235 for (i=0; i<tno1; i++){
5236
5237 sum = 0.0;
5238 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
5239 Gh_AN = natn[Gc_AN][h_AN];
5240 wanB = WhatSpecies[Gh_AN];
5241 tno2 = Spe_Total_CNO[wanB];
5242 for (j=0; j<tno2; j++){
5243 sum += Residues[spin][Mc_AN][h_AN][i][j][i1]*
5244 OLP0[Mc_AN][h_AN][i][j];
5245 }
5246 }
5247
5248 fprintf(fp_ev,"%8.5f",sum);
5249 }
5250 fprintf(fp_ev,"\n");
5251 }
5252
5253 fprintf(fp_ev,"AN%dAN%d>\n",Gc_AN,spin);
5254
5255 dtime(&Etime_atom);
5256 time_per_atom[Gc_AN] += Etime_atom - Stime_atom;
5257 }
5258 }
5259
5260 /****************************************************
5261 save *.Dos.val
5262 ****************************************************/
5263
5264 if ( (fp_eig=fopen(file_eig,"w")) != NULL ) {
5265
5266 fprintf(fp_eig,"mode 5\n");
5267 fprintf(fp_eig,"NonCol 0\n");
5268 /* fprintf(fp_eig,"N %d\n",n); */
5269 fprintf(fp_eig,"Nspin %d\n",SpinP_switch);
5270 fprintf(fp_eig,"Erange %lf %lf\n",Dos_Erange[0],Dos_Erange[1]);
5271 fprintf(fp_eig,"atomnum %d\n",atomnum);
5272
5273 fprintf(fp_eig,"<WhatSpecies\n");
5274 for (i=1;i<=atomnum;i++) {
5275 fprintf(fp_eig,"%d ",WhatSpecies[i]);
5276 }
5277 fprintf(fp_eig,"\nWhatSpecies>\n");
5278
5279 fprintf(fp_eig,"SpeciesNum %d\n",SpeciesNum);
5280 fprintf(fp_eig,"<Spe_Total_CNO\n");
5281 for (i=0;i<SpeciesNum;i++) {
5282 fprintf(fp_eig,"%d ",Spe_Total_CNO[i]);
5283 }
5284 fprintf(fp_eig,"\nSpe_Total_CNO>\n");
5285
5286 MaxL=Supported_MaxL;
5287 fprintf(fp_eig,"MaxL %d\n",Supported_MaxL);
5288 fprintf(fp_eig,"<Spe_Num_CBasis\n");
5289 for (i=0;i<SpeciesNum;i++) {
5290 for (l=0;l<=MaxL;l++) {
5291 fprintf(fp_eig,"%d ",Spe_Num_CBasis[i][l]);
5292 }
5293 fprintf(fp_eig,"\n");
5294 }
5295 fprintf(fp_eig,"Spe_Num_CBasis>\n");
5296 fprintf(fp_eig,"ChemP %lf\n",ChemP);
5297
5298 }
5299
5300 /* close file pointers */
5301
5302 if (myid==Host_ID){
5303 if (fp_eig) fclose(fp_eig);
5304 }
5305
5306 if (fp_ev) fclose(fp_ev);
5307 }
5308
5309
Krylov_Col_trd(char * mode,int SCF_iter,double ***** Hks,double **** OLP0,double ***** CDM,double ***** EDM,double Eele0[2],double Eele1[2])5310 static double Krylov_Col_trd(char *mode,
5311 int SCF_iter,
5312 double *****Hks,
5313 double ****OLP0,
5314 double *****CDM,
5315 double *****EDM,
5316 double Eele0[2],
5317 double Eele1[2])
5318
5319 /**********************************************************************************
5320 The structure of this subroutine is exactly the same as the original Krylov_Col,
5321 except for the main loop where OMP parallelization at the atom level is removed.
5322 **********************************************************************************/
5323
5324 {
5325 static int firsttime=1,recalc_firsttime=1,recalc_flag;
5326 int Mc_AN,Gc_AN,i,is,js,Gi,wan,wanA,wanB,Anum;
5327 int num,NUM0,NUM,NUM1,n2,Cwan,Hwan,Rn2;
5328 int size1,size2,max_size1,max_size2;
5329 int ih,ig,ian,j,kl,jg,jan,Bnum,m,n,spin,i2,ip;
5330 int k,l,i1,j1,P_min,m_size,q1,q2,csize,Residues_size;
5331 int h_AN1,Mh_AN1,h_AN2,Gh_AN1,Gh_AN2,wan1,wan2;
5332 int po,po1,loopN,tno1,tno2,h_AN,Gh_AN,rl1,rl2,rl;
5333 int MA_AN,GA_AN,tnoA,GB_AN,tnoB,ct_on;
5334 int Msize2_max;
5335 static double TZ;
5336 double My_TZ,sum,FermiF,time0,srt;
5337 double sum00,sum10,sum20,sum30;
5338 double sum01,sum11,sum21,sum31;
5339 double sum02,sum12,sum22,sum32;
5340 double sum03,sum13,sum23,sum33;
5341 double tmp0,tmp1,tmp2,tmp3,b2,co,x0,Sx,Dx,xmin,xmax;
5342 double My_Num_State,Num_State,x,Dnum;
5343 double emin,emax,de;
5344 double TStime,TEtime,Stime1,Etime1;
5345 double Stime2,Etime2;
5346 double time1,time2,time3,time4,time5;
5347 double time6,time7,time8,time9,time10;
5348 double time11,time12,time13,time14,time15,time16;
5349 double Erange;
5350 double My_Eele0[2],My_Eele1[2];
5351 double max_x=30.0;
5352 double ChemP_MAX,ChemP_MIN,spin_degeneracy;
5353 double spetrum_radius;
5354 double ***EVal;
5355 double ******Residues;
5356 double ***PDOS_DC;
5357 double *tmp_array;
5358 double *tmp_array2;
5359
5360 int *MP;
5361
5362 /*****************************************************
5363 Msize: \sum_FNAN Spe_Total_CNO
5364 Msize2: \sum_FNAN+SNAN Spe_Total_CNO
5365 Msize3: rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]
5366 Msize4: rlmax_EC2[Mc_AN]*EKC_core_size[Mc_AN]
5367 Msize5: dimension for the last column of Residues
5368 *****************************************************/
5369
5370 int *Msize;
5371 int *Msize2;
5372 int *Msize3;
5373 int *Msize4;
5374 int *Msize5;
5375 int **LO_TC;
5376 int **HO_TC;
5377 int numprocs,myid,ID,IDS,IDR,tag=999;
5378 double Stime_atom, Etime_atom;
5379
5380 MPI_Status stat;
5381 MPI_Request request;
5382
5383 /* for OpenMP */
5384 int OMPID,Nthrds,Nprocs;
5385
5386 /* MPI */
5387 MPI_Barrier(mpi_comm_level1);
5388 MPI_Comm_size(mpi_comm_level1,&numprocs);
5389 MPI_Comm_rank(mpi_comm_level1,&myid);
5390
5391 dtime(&TStime);
5392
5393 if (measure_time==1){
5394 time1 = 0.0;
5395 time2 = 0.0;
5396 time3 = 0.0;
5397 time4 = 0.0;
5398 time5 = 0.0;
5399 time6 = 0.0;
5400 time7 = 0.0;
5401 time8 = 0.0;
5402 time9 = 0.0;
5403 time10 = 0.0;
5404 time11 = 0.0;
5405 time12 = 0.0;
5406 time13 = 0.0;
5407 time14 = 0.0;
5408 time15 = 0.0;
5409 }
5410
5411 /****************************************************
5412 allocation of arrays:
5413 ****************************************************/
5414
5415 Msize = (int*)malloc(sizeof(int)*(Matomnum+1));
5416 Msize2 = (int*)malloc(sizeof(int)*(Matomnum+1));
5417 Msize3 = (int*)malloc(sizeof(int)*(Matomnum+1));
5418 Msize4 = (int*)malloc(sizeof(int)*(Matomnum+1));
5419 Msize2_max = 0;
5420
5421 /* find Msize */
5422
5423 for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
5424
5425 if (Mc_AN==0){
5426 Msize[Mc_AN] = 1;
5427 }
5428 else{
5429
5430 Gc_AN = M2G[Mc_AN];
5431
5432 NUM = 0;
5433 for (i=0; i<=FNAN[Gc_AN]; i++){
5434 Gi = natn[Gc_AN][i];
5435 wanA = WhatSpecies[Gi];
5436 NUM += Spe_Total_CNO[wanA];
5437 }
5438 Msize[Mc_AN] = NUM;
5439 }
5440 }
5441
5442 /* find Msize2 and Msize2_max */
5443
5444 for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
5445
5446 if (Mc_AN==0){
5447 Msize2[Mc_AN] = 1;
5448 }
5449 else{
5450
5451 Gc_AN = M2G[Mc_AN];
5452
5453 NUM = 0;
5454 for (i=0; i<=(FNAN[Gc_AN]+SNAN[Gc_AN]); i++){
5455 Gi = natn[Gc_AN][i];
5456 wanA = WhatSpecies[Gi];
5457 NUM += Spe_Total_CNO[wanA];
5458 }
5459 Msize2[Mc_AN] = NUM;
5460 }
5461
5462 if (Msize2_max<Msize2[Mc_AN]) Msize2_max = Msize2[Mc_AN] + 4;
5463 }
5464
5465 /* find Msize3 and Msize4 */
5466
5467 Msize3[0] = 1;
5468 Msize4[0] = 1;
5469
5470 for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
5471 Gc_AN = M2G[Mc_AN];
5472 wan = WhatSpecies[Gc_AN];
5473 ct_on = Spe_Total_CNO[wan];
5474 Msize3[Mc_AN] = rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN];
5475 Msize4[Mc_AN] = rlmax_EC2[Mc_AN]*EKC_core_size[Mc_AN];
5476 }
5477
5478 m_size = 0;
5479
5480 EVal = (double***)malloc(sizeof(double**)*(SpinP_switch+1));
5481 for (spin=0; spin<=SpinP_switch; spin++){
5482 EVal[spin] = (double**)malloc(sizeof(double*)*(Matomnum+1));
5483
5484 for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
5485 n2 = Msize3[Mc_AN] + 2;
5486 m_size += n2;
5487 EVal[spin][Mc_AN] = (double*)malloc(sizeof(double)*n2);
5488 }
5489 }
5490
5491 if (firsttime){
5492 PrintMemory("Krylov: EVal", sizeof(double)*m_size,NULL);
5493 }
5494
5495 if (2<=level_stdout){
5496 for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
5497 printf("<Krylov> myid=%4d Mc_AN=%4d Gc_AN=%4d Msize=%4d\n",
5498 myid,Mc_AN,M2G[Mc_AN],Msize[Mc_AN]);
5499 }
5500 }
5501
5502 /****************************************************
5503 allocation of arrays:
5504
5505 double PDOS[SpinP_switch+1]
5506 [Matomnum+1]
5507 [n2]
5508 ****************************************************/
5509
5510 m_size = 0;
5511 PDOS_DC = (double***)malloc(sizeof(double**)*(SpinP_switch+1));
5512 for (spin=0; spin<=SpinP_switch; spin++){
5513 PDOS_DC[spin] = (double**)malloc(sizeof(double*)*(Matomnum+1));
5514 for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
5515 n2 = Msize3[Mc_AN] + 4;
5516 m_size += n2;
5517 PDOS_DC[spin][Mc_AN] = (double*)malloc(sizeof(double)*n2);
5518 }
5519 }
5520
5521 if (firsttime){
5522 PrintMemory("Krylov: PDOS_DC",sizeof(double)*m_size,NULL);
5523 }
5524
5525 /****************************************************
5526 allocation of arrays:
5527
5528 int LO_TC[SpinP_switch+1][Matomnum+1]
5529 int HO_TC[SpinP_switch+1][Matomnum+1]
5530 ****************************************************/
5531
5532 LO_TC = (int**)malloc(sizeof(int*)*(SpinP_switch+1));
5533 for (spin=0; spin<(SpinP_switch+1); spin++){
5534 LO_TC[spin] = (int*)malloc(sizeof(int)*(Matomnum+1));
5535 }
5536
5537 HO_TC = (int**)malloc(sizeof(int*)*(SpinP_switch+1));
5538 for (spin=0; spin<(SpinP_switch+1); spin++){
5539 HO_TC[spin] = (int*)malloc(sizeof(int)*(Matomnum+1));
5540 }
5541
5542 /****************************************************
5543 allocation of array:
5544
5545 double Residues[SpinP_switch+1]
5546 [Matomnum+1]
5547 [FNAN[Gc_AN]+1]
5548 [Spe_Total_CNO[Gc_AN]]
5549 [Spe_Total_CNO[Gh_AN]]
5550 [HO_TC-LO_TC+3]
5551 ****************************************************/
5552
5553 Residues = (double******)malloc(sizeof(double*****)*(SpinP_switch+1));
5554 for (spin=0; spin<=SpinP_switch; spin++){
5555 Residues[spin] = (double*****)malloc(sizeof(double****)*(Matomnum+1));
5556 for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
5557
5558 if (Mc_AN==0){
5559 Gc_AN = 0;
5560 FNAN[0] = 0;
5561 tno1 = 1;
5562 }
5563 else{
5564
5565 Gc_AN = M2G[Mc_AN];
5566 wanA = WhatSpecies[Gc_AN];
5567 tno1 = Spe_Total_CNO[wanA];
5568 }
5569
5570 Residues[spin][Mc_AN] = (double****)malloc(sizeof(double***)*(FNAN[Gc_AN]+1));
5571
5572 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
5573
5574 if (Mc_AN==0){
5575 tno2 = 1;
5576 }
5577 else {
5578 Gh_AN = natn[Gc_AN][h_AN];
5579 wanB = WhatSpecies[Gh_AN];
5580 tno2 = Spe_Total_CNO[wanB];
5581 }
5582
5583 Residues[spin][Mc_AN][h_AN] = (double***)malloc(sizeof(double**)*tno1);
5584 for (i=0; i<tno1; i++){
5585 Residues[spin][Mc_AN][h_AN][i] = (double**)malloc(sizeof(double*)*tno2);
5586 /* note that the array is allocated once more in the loop */
5587 }
5588 }
5589 }
5590 }
5591
5592 for (spin=0; spin<=SpinP_switch; spin++){
5593 Residues[spin][0][0][0][0] = (double*)malloc(sizeof(double)*1);
5594 }
5595
5596 /****************************************************
5597 initialize density and energy density matrices
5598 ****************************************************/
5599
5600 for (spin=0; spin<=SpinP_switch; spin++){
5601 for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
5602 Gc_AN = M2G[Mc_AN];
5603 wanA = WhatSpecies[Gc_AN];
5604 tno1 = Spe_Total_CNO[wanA];
5605 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
5606 Gh_AN = natn[Gc_AN][h_AN];
5607 wanB = WhatSpecies[Gh_AN];
5608 tno2 = Spe_Total_CNO[wanB];
5609 for (i=0; i<tno1; i++){
5610 for (j=0; j<tno2; j++){
5611 CDM[spin][Mc_AN][h_AN][i][j] = 0.0;
5612 EDM[spin][Mc_AN][h_AN][i][j] = 0.0;
5613 }
5614 }
5615 }
5616 }
5617 }
5618
5619 /****************************************************
5620 MPI
5621
5622 Hks
5623 ****************************************************/
5624
5625 if (measure_time==1) dtime(&Stime1);
5626
5627 if (SCF_iter==1){
5628
5629 /***********************************
5630 set data size
5631 ************************************/
5632
5633 for (ID=0; ID<numprocs; ID++){
5634
5635 IDS = (myid + ID) % numprocs;
5636 IDR = (myid - ID + numprocs) % numprocs;
5637
5638 if (ID!=0){
5639 tag = 999;
5640
5641 /* find data size to send block data */
5642 if ((F_Snd_Num[IDS]+S_Snd_Num[IDS])!=0){
5643
5644 size1 = 0;
5645 for (spin=0; spin<=SpinP_switch; spin++){
5646 for (n=0; n<(F_Snd_Num[IDS]+S_Snd_Num[IDS]); n++){
5647 Mc_AN = Snd_MAN[IDS][n];
5648 Gc_AN = Snd_GAN[IDS][n];
5649 Cwan = WhatSpecies[Gc_AN];
5650 tno1 = Spe_Total_CNO[Cwan];
5651 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
5652 Gh_AN = natn[Gc_AN][h_AN];
5653 Hwan = WhatSpecies[Gh_AN];
5654 tno2 = Spe_Total_CNO[Hwan];
5655 size1 += tno1*tno2;
5656 }
5657 }
5658 }
5659
5660 Snd_HFS_Size[IDS] = size1;
5661 MPI_Isend(&size1, 1, MPI_INT, IDS, tag, mpi_comm_level1, &request);
5662 }
5663 else{
5664 Snd_HFS_Size[IDS] = 0;
5665 }
5666
5667 /* receiving of size of data */
5668
5669 if ((F_Rcv_Num[IDR]+S_Rcv_Num[IDR])!=0){
5670
5671 MPI_Recv(&size2, 1, MPI_INT, IDR, tag, mpi_comm_level1, &stat);
5672 Rcv_HFS_Size[IDR] = size2;
5673 }
5674 else{
5675 Rcv_HFS_Size[IDR] = 0;
5676 }
5677
5678 if ((F_Snd_Num[IDS]+S_Snd_Num[IDS])!=0) MPI_Wait(&request,&stat);
5679 }
5680 else{
5681 Snd_HFS_Size[IDS] = 0;
5682 Rcv_HFS_Size[IDR] = 0;
5683 }
5684 }
5685 }
5686
5687 /***********************************
5688 data transfer
5689 ************************************/
5690
5691 /* find maximum size of size1 */
5692
5693 max_size1 = 0;
5694 max_size2 = 0;
5695 for (ID=0; ID<numprocs; ID++){
5696 size1 = Snd_HFS_Size[ID];
5697 if (max_size1<size1) max_size1 = size1;
5698 size2 = Rcv_HFS_Size[ID];
5699 if (max_size2<size2) max_size2 = size2;
5700 }
5701
5702 /* allocation of arrays */
5703
5704 tmp_array = (double*)malloc(sizeof(double)*max_size1);
5705 tmp_array2 = (double*)malloc(sizeof(double)*max_size2);
5706
5707 /* MPI communication */
5708
5709 tag = 999;
5710 for (ID=0; ID<numprocs; ID++){
5711
5712 IDS = (myid + ID) % numprocs;
5713 IDR = (myid - ID + numprocs) % numprocs;
5714
5715 if (ID!=0){
5716
5717 /*****************************
5718 sending of data
5719 *****************************/
5720
5721 if ((F_Snd_Num[IDS]+S_Snd_Num[IDS])!=0){
5722
5723 size1 = Snd_HFS_Size[IDS];
5724
5725 /* multidimentional array to vector array */
5726
5727 num = 0;
5728 for (spin=0; spin<=SpinP_switch; spin++){
5729 for (n=0; n<(F_Snd_Num[IDS]+S_Snd_Num[IDS]); n++){
5730 Mc_AN = Snd_MAN[IDS][n];
5731 Gc_AN = Snd_GAN[IDS][n];
5732 Cwan = WhatSpecies[Gc_AN];
5733 tno1 = Spe_Total_CNO[Cwan];
5734 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
5735 Gh_AN = natn[Gc_AN][h_AN];
5736 Hwan = WhatSpecies[Gh_AN];
5737 tno2 = Spe_Total_CNO[Hwan];
5738 for (i=0; i<tno1; i++){
5739 for (j=0; j<tno2; j++){
5740 tmp_array[num] = Hks[spin][Mc_AN][h_AN][i][j];
5741 num++;
5742 }
5743 }
5744 }
5745 }
5746 }
5747
5748 MPI_Isend(&tmp_array[0], size1, MPI_DOUBLE, IDS, tag, mpi_comm_level1, &request);
5749 }
5750
5751 /*****************************
5752 receiving of block data
5753 *****************************/
5754
5755 if ((F_Rcv_Num[IDR]+S_Rcv_Num[IDR])!=0){
5756
5757 size2 = Rcv_HFS_Size[IDR];
5758
5759 MPI_Recv(&tmp_array2[0], size2, MPI_DOUBLE, IDR, tag, mpi_comm_level1, &stat);
5760
5761 num = 0;
5762 for (spin=0; spin<=SpinP_switch; spin++){
5763 Mc_AN = S_TopMAN[IDR] - 1; /* S_TopMAN should be used. */
5764 for (n=0; n<(F_Rcv_Num[IDR]+S_Rcv_Num[IDR]); n++){
5765 Mc_AN++;
5766 Gc_AN = Rcv_GAN[IDR][n];
5767 Cwan = WhatSpecies[Gc_AN];
5768 tno1 = Spe_Total_CNO[Cwan];
5769
5770 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
5771 Gh_AN = natn[Gc_AN][h_AN];
5772 Hwan = WhatSpecies[Gh_AN];
5773 tno2 = Spe_Total_CNO[Hwan];
5774 for (i=0; i<tno1; i++){
5775 for (j=0; j<tno2; j++){
5776 Hks[spin][Mc_AN][h_AN][i][j] = tmp_array2[num];
5777 num++;
5778 }
5779 }
5780 }
5781 }
5782 }
5783 }
5784
5785 if ((F_Snd_Num[IDS]+S_Snd_Num[IDS])!=0){
5786 MPI_Wait(&request,&stat);
5787 }
5788 }
5789 }
5790
5791 /****************************************************
5792 MPI
5793
5794 OLP0
5795 ****************************************************/
5796
5797 /***********************************
5798 set data size
5799 ************************************/
5800
5801 if (SCF_iter==1){
5802
5803 /***********************************
5804 data transfer
5805 ************************************/
5806
5807 tag = 999;
5808 for (ID=0; ID<numprocs; ID++){
5809
5810 IDS = (myid + ID) % numprocs;
5811 IDR = (myid - ID + numprocs) % numprocs;
5812
5813 if (ID!=0){
5814
5815 /*****************************
5816 sending of data
5817 *****************************/
5818
5819 if ((F_Snd_Num[IDS]+S_Snd_Num[IDS])!=0){
5820
5821 size1 = Snd_HFS_Size[IDS]/(1+SpinP_switch);
5822
5823 /* multidimentional array to vector array */
5824
5825 num = 0;
5826
5827 for (n=0; n<(F_Snd_Num[IDS]+S_Snd_Num[IDS]); n++){
5828 Mc_AN = Snd_MAN[IDS][n];
5829 Gc_AN = Snd_GAN[IDS][n];
5830 Cwan = WhatSpecies[Gc_AN];
5831 tno1 = Spe_Total_CNO[Cwan];
5832 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
5833 Gh_AN = natn[Gc_AN][h_AN];
5834 Hwan = WhatSpecies[Gh_AN];
5835 tno2 = Spe_Total_CNO[Hwan];
5836 for (i=0; i<tno1; i++){
5837 for (j=0; j<tno2; j++){
5838 tmp_array[num] = OLP0[Mc_AN][h_AN][i][j];
5839 num++;
5840 }
5841 }
5842 }
5843 }
5844
5845 MPI_Isend(&tmp_array[0], size1, MPI_DOUBLE, IDS, tag, mpi_comm_level1, &request);
5846 }
5847
5848 /*****************************
5849 receiving of block data
5850 *****************************/
5851
5852 if ((F_Rcv_Num[IDR]+S_Rcv_Num[IDR])!=0){
5853
5854 size2 = Rcv_HFS_Size[IDR]/(1+SpinP_switch);
5855
5856 MPI_Recv(&tmp_array2[0], size2, MPI_DOUBLE, IDR, tag, mpi_comm_level1, &stat);
5857
5858 num = 0;
5859 Mc_AN = S_TopMAN[IDR] - 1; /* S_TopMAN should be used. */
5860 for (n=0; n<(F_Rcv_Num[IDR]+S_Rcv_Num[IDR]); n++){
5861 Mc_AN++;
5862 Gc_AN = Rcv_GAN[IDR][n];
5863 Cwan = WhatSpecies[Gc_AN];
5864 tno1 = Spe_Total_CNO[Cwan];
5865
5866 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
5867 Gh_AN = natn[Gc_AN][h_AN];
5868 Hwan = WhatSpecies[Gh_AN];
5869 tno2 = Spe_Total_CNO[Hwan];
5870 for (i=0; i<tno1; i++){
5871 for (j=0; j<tno2; j++){
5872 OLP0[Mc_AN][h_AN][i][j] = tmp_array2[num];
5873 num++;
5874 }
5875 }
5876 }
5877 }
5878 }
5879
5880 if ((F_Snd_Num[IDS]+S_Snd_Num[IDS])!=0){
5881 MPI_Wait(&request,&stat);
5882 }
5883 }
5884 }
5885 }
5886
5887 /* freeing of arrays */
5888 free(tmp_array);
5889 free(tmp_array2);
5890
5891 if (measure_time==1){
5892 dtime(&Etime1);
5893 time1 = Etime1 - Stime1;
5894 }
5895
5896 /***********************************************
5897 for regeneration of the buffer matrix
5898 ***********************************************/
5899
5900 if (sqrt(fabs(NormRD[0]))<(0.2+0.10*(double)atomnum) && recalc_firsttime){
5901 recalc_flag = 1;
5902 recalc_firsttime = 0;
5903 }
5904 else{
5905 recalc_flag = 0;
5906 }
5907
5908 if (SCF_iter==1) recalc_firsttime = 1;
5909
5910 if (error_check==1){
5911 printf("SCF_iter=%2d recalc_firsttime=%2d\n",SCF_iter,recalc_firsttime);
5912 }
5913
5914 if (measure_time==1) dtime(&Stime2);
5915
5916 /*
5917 #pragma omp parallel shared(List_YOUSO,Residues,EDM,CDM,HO_TC,LO_TC,ChemP,EVal,RMI1,S_G2M,EC_matrix,recalc_flag,recalc_EM,Krylov_U,SpinP_switch,EKC_core_size,rlmax_EC,rlmax_EC2,time11,time10,time9,time8,time7,time6,time5,time4,time3,time2,Hks,OLP0,EKC_Exact_invS_flag,SCF_iter,Msize3,Msize2,Msize,natn,FNAN,SNAN,Spe_Total_CNO,WhatSpecies,M2G,Matomnum,myid,time_per_atom,firsttime)
5918 */
5919
5920 {
5921 int OMPID,Nthrds,Nprocs;
5922 int Mc_AN,Gc_AN,wan,spin;
5923 int ig,ian,ih,kl,jg,jan,Bnum,m,n,rl;
5924 int Anum,i,j,k,Gi,wanA,NUM,n2,csize,is,i2;
5925 int i1,rl1,js,ip,po1,tno1,h_AN,Gh_AN,wanB,tno2;
5926 int *MP;
5927 int KU_d1, KU_d2,lda,ldb,ldc,M,N,K;
5928 double alpha, beta;
5929 double **invS;
5930 double *LoS;
5931 double *C,*KU;
5932 double *H_DC,*ko;
5933 double ***Krylov_U_OLP;
5934 double **inv_RS;
5935 double sum00,sum10,sum20,sum30,sum;
5936 double tmp0,tmp1,tmp2,tmp3;
5937 double Erange;
5938 double Stime_atom,Etime_atom;
5939 double Stime1,Etime1;
5940 double **tmpvec0;
5941 double **tmpvec1;
5942 double **tmpvec2;
5943
5944 __m128d mmSum00,mmSum01,mmSum10,mmSum11;
5945 __m128d mmSum20,mmSum21,mmSum30,mmSum31;
5946 __m128d mmTmp0, mmTmp1, mmTmp2, mmTmp3;
5947 __m128d mmTmp4, mmTmp5;
5948
5949 /* get info. on OpenMP */
5950
5951 OMPID = omp_get_thread_num();
5952
5953 /*
5954 Nthrds = omp_get_num_threads();
5955 Nprocs = omp_get_num_procs();
5956 */
5957
5958 /* allocation of arrays */
5959
5960 MP = (int*)malloc(sizeof(int)*List_YOUSO[2]);
5961
5962 tmpvec0 = (double**)malloc(sizeof(double*)*EKC_core_size_max);
5963 for (i=0; i<EKC_core_size_max; i++){
5964 tmpvec0[i] = (double*)malloc(sizeof(double)*Msize2_max);
5965 }
5966
5967 tmpvec1 = (double**)malloc(sizeof(double*)*EKC_core_size_max);
5968 for (i=0; i<EKC_core_size_max; i++){
5969 tmpvec1[i] = (double*)malloc(sizeof(double)*Msize2_max);
5970 }
5971
5972 tmpvec2 = (double**)malloc(sizeof(double*)*EKC_core_size_max);
5973 for (i=0; i<EKC_core_size_max; i++){
5974 tmpvec2[i] = (double*)malloc(sizeof(double)*Msize2_max);
5975 }
5976
5977 if (firsttime && OMPID==0){
5978 PrintMemory("Krylov: tmpvec0",sizeof(double)*EKC_core_size_max*Msize2_max,NULL);
5979 PrintMemory("Krylov: tmpvec1",sizeof(double)*EKC_core_size_max*Msize2_max,NULL);
5980 PrintMemory("Krylov: tmpvec2",sizeof(double)*EKC_core_size_max*Msize2_max,NULL);
5981 }
5982
5983 /***********************************************
5984 main loop of calculation
5985 ***********************************************/
5986
5987 for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
5988
5989 dtime(&Stime_atom);
5990
5991 Gc_AN = M2G[Mc_AN];
5992 wan = WhatSpecies[Gc_AN];
5993
5994 /* MP array */
5995
5996 Anum = 1;
5997 for (i=0; i<=(FNAN[Gc_AN]+SNAN[Gc_AN]); i++){
5998 MP[i] = Anum;
5999 Gi = natn[Gc_AN][i];
6000 wanA = WhatSpecies[Gi];
6001 Anum += Spe_Total_CNO[wanA];
6002 }
6003 NUM = Anum - 1;
6004 n2 = NUM + 40;
6005
6006 /***********************************************
6007 allocation of arrays:
6008 ***********************************************/
6009
6010 if (Msize[Mc_AN]<Msize3[Mc_AN])
6011 csize = Msize3[Mc_AN] + 40;
6012 else
6013 csize = Msize[Mc_AN] + 40;
6014
6015 KU_d1 = EKC_core_size[Mc_AN]*Msize2[Mc_AN];
6016 KU_d2 = Msize2[Mc_AN];
6017
6018 H_DC = (double*)malloc(sizeof(double)*csize*csize);
6019 ko = (double*)malloc(sizeof(double)*csize);
6020 C = (double*)malloc(sizeof(double)*csize*csize);
6021 KU = (double*)malloc(sizeof(double)*(Msize2[Mc_AN]+2)*Msize3[Mc_AN]);
6022
6023 /***********************************************
6024 calculate the inverse of overlap matrix
6025 ***********************************************/
6026
6027 if (SCF_iter==1 && Msize3[Mc_AN]<Msize2[Mc_AN] && EKC_Exact_invS_flag==1){
6028
6029 LoS = (double*)malloc(sizeof(double)*(Msize2[Mc_AN]+3)*(Msize2[Mc_AN]+3));
6030
6031 invS = (double**)malloc(sizeof(double*)*(Msize2[Mc_AN]+3));
6032 for (i=0; i<(Msize2[Mc_AN]+3); i++){
6033 invS[i] = (double*)malloc(sizeof(double)*(Msize2[Mc_AN]+3));
6034 }
6035
6036 if (measure_time==1) dtime(&Stime1);
6037
6038 Inverse_S_by_Cholesky(Mc_AN, OLP0, invS, MP, NUM, LoS);
6039
6040 if (measure_time==1){
6041 dtime(&Etime1);
6042 time2 += Etime1 - Stime1;
6043 }
6044 }
6045
6046 else if (SCF_iter==1 && Msize3[Mc_AN]<Msize2[Mc_AN] && EKC_invS_flag==1){
6047
6048 Krylov_U_OLP = (double***)malloc(sizeof(double**)*rlmax_EC2[Mc_AN]);
6049 for (i=0; i<rlmax_EC2[Mc_AN]; i++){
6050 Krylov_U_OLP[i] = (double**)malloc(sizeof(double*)*EKC_core_size[Mc_AN]);
6051 for (j=0; j<EKC_core_size[Mc_AN]; j++){
6052 Krylov_U_OLP[i][j] = (double*)malloc(sizeof(double)*(Msize2[Mc_AN]+3));
6053 for (k=0; k<(Msize2[Mc_AN]+3); k++) Krylov_U_OLP[i][j][k] = 0.0;
6054 }
6055 }
6056
6057 inv_RS = (double**)malloc(sizeof(double*)*(rlmax_EC2[Mc_AN]+1)*EKC_core_size[Mc_AN]);
6058 for (i=0; i<(rlmax_EC2[Mc_AN]+1)*EKC_core_size[Mc_AN]; i++){
6059 inv_RS[i] = (double*)malloc(sizeof(double)*(rlmax_EC2[Mc_AN]+1)*EKC_core_size[Mc_AN]);
6060 }
6061
6062 if (measure_time==1) dtime(&Stime1);
6063
6064 Krylov_IOLP_trd( Mc_AN, OLP0, Krylov_U_OLP, inv_RS, MP, Msize2, Msize4, Msize2_max, tmpvec0, tmpvec1 );
6065
6066 if (measure_time==1){
6067 dtime(&Etime1);
6068 time2 += Etime1 - Stime1;
6069 }
6070 }
6071
6072 for (spin=0; spin<=SpinP_switch; spin++){
6073
6074 /****************************************************
6075 generate a preconditioning matrix
6076 ****************************************************/
6077
6078 if (measure_time==1) dtime(&Stime1);
6079
6080 if (SCF_iter==1 && Msize3[Mc_AN]<Msize2[Mc_AN]){
6081
6082 Generate_pMatrix_trd( myid, spin, Mc_AN, Hks, OLP0, invS, Krylov_U, Krylov_U_OLP, inv_RS, MP,
6083 Msize, Msize2, Msize3, Msize4, Msize2_max, tmpvec0, tmpvec1, tmpvec2 );
6084 }
6085 else if (SCF_iter==1){
6086
6087 Generate_pMatrix2_trd( myid, spin, Mc_AN, Hks, OLP0, Krylov_U, MP, Msize, Msize2, Msize3, tmpvec1 );
6088 }
6089
6090 if (measure_time==1){
6091 dtime(&Etime1);
6092 time3 += Etime1 - Stime1;
6093 }
6094
6095 if (measure_time==1) dtime(&Stime1);
6096
6097 if (recalc_EM==1 || SCF_iter<=3 || recalc_flag==1){
6098
6099 Embedding_Matrix_trd( spin, Mc_AN, Hks, Krylov_U, EC_matrix, MP, Msize, Msize2, Msize3, tmpvec1, EKC_core_size_max, Msize2_max);
6100 }
6101
6102 if (measure_time==1){
6103 dtime(&Etime1);
6104 time4 += Etime1 - Stime1;
6105 }
6106
6107 /****************************************************
6108 construct the Hamiltonian matrix
6109 ****************************************************/
6110
6111 if (measure_time==1) dtime(&Stime1);
6112
6113 for (i=0; i<=FNAN[Gc_AN]; i++){
6114 ig = natn[Gc_AN][i];
6115 ian = Spe_Total_CNO[WhatSpecies[ig]];
6116 Anum = MP[i] - 1;
6117 ih = S_G2M[ig]; /* S_G2M should be used */
6118
6119 for (j=0; j<=FNAN[Gc_AN]; j++){
6120
6121 kl = RMI1[Mc_AN][i][j];
6122 jg = natn[Gc_AN][j];
6123 jan = Spe_Total_CNO[WhatSpecies[jg]];
6124 Bnum = MP[j] - 1;
6125
6126 if (0<=kl){
6127 for (m=0; m<ian; m++){
6128 for (n=0; n<jan; n++){
6129 H_DC[(Anum+m)*Msize[Mc_AN]+Bnum+n] = Hks[spin][ih][kl][m][n];
6130 }
6131 }
6132 }
6133
6134 else{
6135 for (m=0; m<ian; m++){
6136 for (n=0; n<jan; n++){
6137 H_DC[(Anum+m)*Msize[Mc_AN]+Bnum+n] = 0.0;
6138 }
6139 }
6140 }
6141 }
6142 }
6143
6144 if (measure_time==1){
6145 dtime(&Etime1);
6146 time5 += Etime1 - Stime1;
6147 }
6148
6149 /****************************************************
6150 transform u1^+ * H_DC * u1
6151 ****************************************************/
6152
6153 /* H_DC * u1 */
6154
6155 if (measure_time==1) dtime(&Stime1);
6156
6157 /* original version
6158
6159 for (i=1; i<=Msize[Mc_AN]; i++){
6160 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
6161 for (n=0; n<EKC_core_size[Mc_AN]; n++){
6162
6163 sum = 0.0;
6164 for (j=1; j<=Msize[Mc_AN]; j++){
6165 sum += H_DC[i][j]*Krylov_U[spin][Mc_AN][rl][n][j];
6166 }
6167
6168 C[rl*EKC_core_size[Mc_AN]+n+1][i] = sum;
6169 }
6170 }
6171 }
6172 */
6173
6174 /* BLAS3 version */
6175
6176 for (i=0; i<Msize[Mc_AN]; i++){
6177 for (j=0; j<Msize3[Mc_AN]; j++){
6178 KU[j*Msize[Mc_AN]+i] = Krylov_U[spin][Mc_AN][j*Msize2[Mc_AN]+i+1];
6179 }
6180 }
6181
6182 alpha = 1.0; beta = 0.0;
6183 M = Msize[Mc_AN]; N = Msize3[Mc_AN]; K = Msize[Mc_AN];
6184 lda = M; ldb = K; ldc = Msize[Mc_AN];
6185
6186 F77_NAME(dgemm,DGEMM)( "N", "N", &M, &N, &K, &alpha,
6187 H_DC, &lda, KU, &ldb, &beta, C, &ldc);
6188
6189 if (measure_time==1){
6190 dtime(&Etime1);
6191 time6 += Etime1 - Stime1;
6192 }
6193
6194 /* u1^+ * H_DC * u1 */
6195
6196 if (measure_time==1) dtime(&Stime1);
6197
6198 /* original version */
6199
6200 /*
6201 for (rl1=0; rl1<rlmax_EC[Mc_AN]; rl1++){
6202 for (m=0; m<EKC_core_size[Mc_AN]; m++){
6203 for (rl2=rl1; rl2<rlmax_EC[Mc_AN]; rl2++){
6204 for (n=0; n<EKC_core_size[Mc_AN]; n++){
6205 sum = 0.0;
6206 i2 = rl2*EKC_core_size[Mc_AN] + n + 1;
6207 for (i=1; i<=Msize[Mc_AN]; i++){
6208 sum += Krylov_U[spin][Mc_AN][rl1][m][i]*C[i2][i];
6209 }
6210
6211 H_DC[rl1*EKC_core_size[Mc_AN]+m+1][rl2*EKC_core_size[Mc_AN]+n+1] = sum;
6212 H_DC[rl2*EKC_core_size[Mc_AN]+n+1][rl1*EKC_core_size[Mc_AN]+m+1] = sum;
6213 }
6214 }
6215 }
6216 }
6217 */
6218
6219 /* BLAS3 version */
6220
6221 alpha = 1.0; beta = 0.0;
6222 M = Msize3[Mc_AN]; N = Msize3[Mc_AN]; K = Msize[Mc_AN];
6223 lda = K; ldb = K; ldc = Msize3[Mc_AN];
6224
6225 F77_NAME(dgemm,DGEMM)("T", "N", &M, &N, &K, &alpha,
6226 KU, &lda, C, &ldb, &beta, H_DC, &ldc);
6227
6228 if (measure_time==1){
6229 dtime(&Etime1);
6230 time7 += Etime1 - Stime1;
6231 }
6232
6233 /* correction for ZeroNum */
6234
6235 m = (int)Krylov_U[spin][Mc_AN][0];
6236 for (i=0; i<m; i++){
6237 H_DC[i*Msize3[Mc_AN]+i] = 1.0e+3;
6238 }
6239
6240 /****************************************************
6241 H0 = u1^+ * H_DC * u1 + D
6242 ****************************************************/
6243
6244 if (measure_time==1) dtime(&Stime1);
6245
6246 for (i=(Msize3[Mc_AN]-1); 0<=i; i--){
6247 for (j=0; j<Msize3[Mc_AN]; j++){
6248 H_DC[(i+1)*(Msize3[Mc_AN]+1)+(j+1)] = H_DC[i*Msize3[Mc_AN]+j] + EC_matrix[spin][Mc_AN][i+1][j+1];
6249 }
6250 }
6251
6252 if (measure_time==1){
6253 dtime(&Etime1);
6254 time8 += Etime1 - Stime1;
6255 }
6256
6257 /****************************************************
6258 diagonalize
6259 ****************************************************/
6260
6261 if (measure_time==1) dtime(&Stime1);
6262
6263 Eigen_lapack2(H_DC,Msize3[Mc_AN]+1,ko,Msize3[Mc_AN],Msize3[Mc_AN]);
6264
6265 if (measure_time==1){
6266 dtime(&Etime1);
6267 time9 += Etime1 - Stime1;
6268 }
6269
6270 /********************************************
6271 back transformation of eigenvectors
6272 c = u1 * b
6273 *********************************************/
6274
6275 if (measure_time==1) dtime(&Stime1);
6276
6277 /* original version */
6278
6279 /*
6280 for (i=1; i<=Msize[Mc_AN]; i++){
6281 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
6282 for (n=0; n<EKC_core_size[Mc_AN]; n++){
6283
6284 tmp1 = Krylov_U[spin][Mc_AN][rl][n][i];
6285 i1 = rl*EKC_core_size[Mc_AN] + n + 1;
6286
6287 for (j=1; j<=Msize3[Mc_AN]; j++){
6288 C[i][j] += tmp1*H_DC[i1][j];
6289 }
6290 }
6291 }
6292 }
6293 */
6294
6295 /* BLAS3 version */
6296
6297 alpha = 1.0; beta = 0.0;
6298 M = Msize3[Mc_AN]; N = Msize[Mc_AN]; K = Msize3[Mc_AN];
6299 lda = K; ldb = N; ldc = Msize3[Mc_AN];
6300
6301 F77_NAME(dgemm,DGEMM)("T", "T", &M, &N, &K, &alpha,
6302 H_DC, &lda, KU, &ldb, &beta, C, &ldc);
6303
6304 if (measure_time==1){
6305 dtime(&Etime1);
6306 time10 += Etime1 - Stime1;
6307 }
6308
6309 if (measure_time==1) dtime(&Stime1);
6310
6311 /***********************************************
6312 store eigenvalues and residues of poles
6313 ***********************************************/
6314
6315 for (i=1; i<=Msize3[Mc_AN]; i++){
6316 EVal[spin][Mc_AN][i-1] = ko[i];
6317 }
6318
6319 /******************************************************
6320 set an energy range (-Erange+ChemP to Erange+ChemP)
6321 of eigenvalues used to store the Residues.
6322 ******************************************************/
6323
6324 Erange = 0.367493245; /* in hartree, corresponds to 10 eV */
6325
6326 /***********************************************
6327 find LO_TC and HO_TC
6328 ***********************************************/
6329
6330 /* LO_TC */
6331 i = 0;
6332 ip = 0;
6333 po1 = 0;
6334 do{
6335 if ( (ChemP-Erange)<EVal[spin][Mc_AN][i]){
6336 ip = i;
6337 po1 = 1;
6338 }
6339 i++;
6340 } while (po1==0 && i<Msize3[Mc_AN]);
6341
6342 LO_TC[spin][Mc_AN] = ip;
6343
6344 /* HO_TC */
6345 i = 0;
6346 ip = Msize3[Mc_AN]-1;
6347 po1 = 0;
6348 do{
6349 if ( (ChemP+Erange)<EVal[spin][Mc_AN][i]){
6350 ip = i;
6351 po1 = 1;
6352 }
6353 i++;
6354 } while (po1==0 && i<Msize3[Mc_AN]);
6355
6356 HO_TC[spin][Mc_AN] = ip;
6357
6358 /***********************************************
6359 store residues of poles
6360 ***********************************************/
6361
6362 wanA = WhatSpecies[Gc_AN];
6363 tno1 = Spe_Total_CNO[wanA];
6364
6365 for (i=0; i<tno1; i++){
6366 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
6367
6368 Gh_AN = natn[Gc_AN][h_AN];
6369 wanB = WhatSpecies[Gh_AN];
6370 tno2 = Spe_Total_CNO[wanB];
6371 Bnum = MP[h_AN] - 1;
6372 for (j=0; j<tno2; j++){
6373
6374 for (i1=0; i1<LO_TC[spin][Mc_AN]; i1++){
6375 tmp1 = C[i*Msize3[Mc_AN]+i1]*C[(Bnum+j)*Msize3[Mc_AN]+i1];
6376 CDM[spin][Mc_AN][h_AN][i][j] += tmp1;
6377 EDM[spin][Mc_AN][h_AN][i][j] += tmp1*EVal[spin][Mc_AN][i1];
6378 }
6379
6380 /* <allocation of Residues */
6381 n2 = HO_TC[spin][Mc_AN] - LO_TC[spin][Mc_AN] + 3;
6382 Residues[spin][Mc_AN][h_AN][i][j] = (double*)malloc(sizeof(double)*n2);
6383 /* allocation of Residues> */
6384 for (i1=LO_TC[spin][Mc_AN]; i1<=HO_TC[spin][Mc_AN]; i1++){
6385 Residues[spin][Mc_AN][h_AN][i][j][i1-LO_TC[spin][Mc_AN]]
6386 = C[i*Msize3[Mc_AN]+i1]*C[(Bnum+j)*Msize3[Mc_AN]+i1];
6387
6388 }
6389 }
6390 }
6391 }
6392
6393 if (measure_time==1){
6394 dtime(&Etime1);
6395 time11 += Etime1 - Stime1;
6396 }
6397
6398 } /* spin */
6399
6400 /***********************************************
6401 freeing of arrays:
6402 ***********************************************/
6403
6404 free(H_DC);
6405 free(ko);
6406 free(C);
6407 free(KU);
6408
6409 if (SCF_iter==1 && Msize3[Mc_AN]<Msize2[Mc_AN] && EKC_Exact_invS_flag==1){
6410
6411 free(LoS);
6412
6413 for (i=0; i<(Msize2[Mc_AN]+3); i++){
6414 free(invS[i]);
6415 }
6416 free(invS);
6417 }
6418
6419 else if (SCF_iter==1 && Msize3[Mc_AN]<Msize2[Mc_AN] && EKC_invS_flag==1){
6420
6421 for (i=0; i<rlmax_EC2[Mc_AN]; i++){
6422 for (j=0; j<EKC_core_size[Mc_AN]; j++){
6423 free(Krylov_U_OLP[i][j]);
6424 }
6425 free(Krylov_U_OLP[i]);
6426 }
6427 free(Krylov_U_OLP);
6428
6429 for (i=0; i<(rlmax_EC2[Mc_AN]+1)*EKC_core_size[Mc_AN]; i++){
6430 free(inv_RS[i]);
6431 }
6432 free(inv_RS);
6433 }
6434
6435 dtime(&Etime_atom);
6436 time_per_atom[Gc_AN] += Etime_atom - Stime_atom;
6437
6438 } /* Mc_AN */
6439
6440 /* freeing of array */
6441
6442 free(MP);
6443
6444 for (i=0; i<EKC_core_size_max; i++){
6445 free(tmpvec0[i]);
6446 }
6447 free(tmpvec0);
6448
6449 for (i=0; i<EKC_core_size_max; i++){
6450 free(tmpvec1[i]);
6451 }
6452 free(tmpvec1);
6453
6454 for (i=0; i<EKC_core_size_max; i++){
6455 free(tmpvec2[i]);
6456 }
6457 free(tmpvec2);
6458
6459 } /* #pragma omp parallel */
6460
6461 if (measure_time==1){
6462 dtime(&Etime2);
6463 time16 = Etime2 - Stime2;
6464 }
6465
6466 if (firsttime){
6467
6468 Residues_size = 1;
6469
6470 for (spin=0; spin<=SpinP_switch; spin++){
6471 for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
6472 Gc_AN = M2G[Mc_AN];
6473 wan = WhatSpecies[Gc_AN];
6474 tno1 = Spe_Total_CNO[wan];
6475 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
6476 Gh_AN = natn[Gc_AN][h_AN];
6477 wanB = WhatSpecies[Gh_AN];
6478 tno2 = Spe_Total_CNO[wanB];
6479 n2 = HO_TC[spin][Mc_AN] - LO_TC[spin][Mc_AN] + 3;
6480 Residues_size += tno1*tno2*n2;
6481 }
6482 }
6483 }
6484
6485 PrintMemory("Krylov: Residues",sizeof(double)*Residues_size,NULL);
6486 }
6487
6488 /****************************************************
6489 calculate the projected DOS
6490 ****************************************************/
6491
6492 if (measure_time==1) dtime(&Stime1);
6493
6494 /*
6495 #pragma omp parallel shared(time_per_atom,Residues,LO_TC,HO_TC,EDM,CDM,OLP0,natn,FNAN,PDOS_DC,Msize3,Spe_Total_CNO,WhatSpecies,M2G,SpinP_switch,Matomnum) private(OMPID,Nthrds,Nprocs,Mc_AN,Stime_atom,spin,Gc_AN,wanA,tno1,i1,i,h_AN,Gh_AN,wanB,tno2,j,tmp1,Etime_atom)
6496 */
6497
6498 {
6499
6500 /* get info. on OpenMP */
6501
6502 /*
6503 OMPID = omp_get_thread_num();
6504 Nthrds = omp_get_num_threads();
6505 Nprocs = omp_get_num_procs();
6506 */
6507
6508 for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
6509
6510 dtime(&Stime_atom);
6511
6512 for (spin=0; spin<=SpinP_switch; spin++){
6513
6514 Gc_AN = M2G[Mc_AN];
6515 wanA = WhatSpecies[Gc_AN];
6516 tno1 = Spe_Total_CNO[wanA];
6517
6518 for (i1=0; i1<=(Msize3[Mc_AN]+1); i1++){
6519 PDOS_DC[spin][Mc_AN][i1] = 0.0;
6520 }
6521
6522 for (i=0; i<tno1; i++){
6523
6524 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
6525 Gh_AN = natn[Gc_AN][h_AN];
6526 wanB = WhatSpecies[Gh_AN];
6527 tno2 = Spe_Total_CNO[wanB];
6528 for (j=0; j<tno2; j++){
6529
6530 tmp1 = OLP0[Mc_AN][h_AN][i][j];
6531
6532 PDOS_DC[spin][Mc_AN][0] += tmp1*CDM[spin][Mc_AN][h_AN][i][j];
6533 PDOS_DC[spin][Mc_AN][1] += tmp1*EDM[spin][Mc_AN][h_AN][i][j];
6534
6535 for (i1=0; i1<(HO_TC[spin][Mc_AN]-LO_TC[spin][Mc_AN]+1); i1++){
6536 PDOS_DC[spin][Mc_AN][i1+2] += Residues[spin][Mc_AN][h_AN][i][j][i1]*tmp1;
6537 }
6538
6539 }
6540 }
6541 }
6542
6543 } /* spin */
6544
6545 dtime(&Etime_atom);
6546 time_per_atom[Gc_AN] += Etime_atom - Stime_atom;
6547
6548 } /* Mc_AN */
6549
6550 } /* #pragma omp parallel */
6551
6552 if (measure_time==1){
6553 dtime(&Etime1);
6554 time12 += Etime1 - Stime1;
6555 }
6556
6557 /****************************************************
6558 find the total number of electrons
6559 ****************************************************/
6560
6561 if (measure_time==1) dtime(&Stime1);
6562
6563 My_TZ = 0.0;
6564 for (i=1; i<=Matomnum; i++){
6565 Gc_AN = M2G[i];
6566 wan = WhatSpecies[Gc_AN];
6567 My_TZ += Spe_Core_Charge[wan];
6568 }
6569
6570 /* MPI, My_TZ */
6571
6572 MPI_Barrier(mpi_comm_level1);
6573 MPI_Allreduce(&My_TZ, &TZ, 1, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
6574
6575 /****************************************************
6576 find the chemical potential
6577 ****************************************************/
6578
6579 po = 0;
6580 loopN = 0;
6581
6582 ChemP_MAX = 10.0;
6583 ChemP_MIN =-10.0;
6584 if (SpinP_switch==0) spin_degeneracy = 2.0;
6585 else if (SpinP_switch==1) spin_degeneracy = 1.0;
6586
6587 do {
6588 ChemP = 0.50*(ChemP_MAX + ChemP_MIN);
6589
6590 My_Num_State = 0.0;
6591
6592 for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
6593 for (spin=0; spin<=SpinP_switch; spin++){
6594
6595 dtime(&Stime_atom);
6596
6597 Gc_AN = M2G[Mc_AN];
6598
6599 My_Num_State += spin_degeneracy*PDOS_DC[spin][Mc_AN][0];
6600
6601 for (i=0; i<(HO_TC[spin][Mc_AN]-LO_TC[spin][Mc_AN]+1); i++){
6602
6603 x = (EVal[spin][Mc_AN][i+LO_TC[spin][Mc_AN]] - ChemP)*Beta;
6604 if (x<=-max_x) x = -max_x;
6605 if (max_x<=x) x = max_x;
6606 FermiF = 1.0/(1.0 + exp(x));
6607 My_Num_State += spin_degeneracy*FermiF*PDOS_DC[spin][Mc_AN][i+2];
6608 }
6609
6610 dtime(&Etime_atom);
6611 time_per_atom[Gc_AN] += Etime_atom - Stime_atom;
6612 }
6613 }
6614
6615 /* MPI, My_Num_State */
6616
6617 MPI_Barrier(mpi_comm_level1);
6618 MPI_Allreduce(&My_Num_State, &Num_State, 1, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
6619
6620 Dnum = (TZ - Num_State) - system_charge;
6621 if (0.0<=Dnum) ChemP_MIN = ChemP;
6622 else ChemP_MAX = ChemP;
6623 if (fabs(Dnum)<1.0e-11) po = 1;
6624
6625
6626 if (myid==Host_ID && 3<=level_stdout){
6627 printf(" ChemP=%15.12f TZ=%15.12f Num_state=%15.12f\n",ChemP,TZ,Num_State);
6628 }
6629
6630 loopN++;
6631 }
6632 while (po==0 && loopN<1000);
6633
6634 if (measure_time==1){
6635 dtime(&Etime1);
6636 time13 += Etime1 - Stime1;
6637 }
6638
6639 /****************************************************
6640 eigenenergy by summing up eigenvalues
6641 ****************************************************/
6642
6643 if (measure_time==1) dtime(&Stime1);
6644
6645 My_Eele0[0] = 0.0;
6646 My_Eele0[1] = 0.0;
6647 for (spin=0; spin<=SpinP_switch; spin++){
6648 for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
6649
6650 dtime(&Stime_atom);
6651
6652 Gc_AN = M2G[Mc_AN];
6653 My_Eele0[spin] += PDOS_DC[spin][Mc_AN][1];
6654
6655 for (i=0; i<(HO_TC[spin][Mc_AN]-LO_TC[spin][Mc_AN]+1); i++){
6656
6657 x = (EVal[spin][Mc_AN][i+LO_TC[spin][Mc_AN]] - ChemP)*Beta;
6658
6659 if (x<=-max_x) x = -max_x;
6660 if (max_x<=x) x = max_x;
6661 FermiF = 1.0/(1.0 + exp(x));
6662 My_Eele0[spin] += FermiF*EVal[spin][Mc_AN][i+LO_TC[spin][Mc_AN]]*PDOS_DC[spin][Mc_AN][i+2];
6663 }
6664
6665 dtime(&Etime_atom);
6666 time_per_atom[Gc_AN] += Etime_atom - Stime_atom;
6667 }
6668 }
6669
6670 /* MPI, My_Eele0 */
6671 for (spin=0; spin<=SpinP_switch; spin++){
6672 MPI_Barrier(mpi_comm_level1);
6673 MPI_Allreduce(&My_Eele0[spin], &Eele0[spin], 1, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
6674 }
6675
6676 if (SpinP_switch==0){
6677 Eele0[1] = Eele0[0];
6678 }
6679
6680 if (measure_time==1){
6681 dtime(&Etime1);
6682 time14 += Etime1 - Stime1;
6683 }
6684
6685 if (measure_time==1) dtime(&Stime1);
6686
6687 /*
6688 #pragma omp parallel shared(time_per_atom,EDM,CDM,Residues,natn,max_x,Beta,ChemP,EVal,LO_TC,HO_TC,Spe_Total_CNO,WhatSpecies,M2G,SpinP_switch,Matomnum) private(OMPID,Nthrds,Nprocs,Mc_AN,spin,Stime_atom,Gc_AN,wanA,tno1,i1,x,FermiF,h_AN,Gh_AN,wanB,tno2,i,j,tmp1,Etime_atom)
6689 */
6690
6691 {
6692
6693 /* get info. on OpenMP */
6694
6695 /*
6696 OMPID = omp_get_thread_num();
6697 Nthrds = omp_get_num_threads();
6698 Nprocs = omp_get_num_procs();
6699 */
6700
6701 for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
6702 for (spin=0; spin<=SpinP_switch; spin++){
6703
6704 dtime(&Stime_atom);
6705
6706 Gc_AN = M2G[Mc_AN];
6707 wanA = WhatSpecies[Gc_AN];
6708 tno1 = Spe_Total_CNO[wanA];
6709
6710 for (i1=0; i1<(HO_TC[spin][Mc_AN]-LO_TC[spin][Mc_AN]+1); i1++){
6711
6712 x = (EVal[spin][Mc_AN][i1+LO_TC[spin][Mc_AN]] - ChemP)*Beta;
6713 if (x<=-max_x) x = -max_x;
6714 if (max_x<=x) x = max_x;
6715 FermiF = 1.0/(1.0 + exp(x));
6716
6717 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
6718 Gh_AN = natn[Gc_AN][h_AN];
6719 wanB = WhatSpecies[Gh_AN];
6720 tno2 = Spe_Total_CNO[wanB];
6721 for (i=0; i<tno1; i++){
6722 for (j=0; j<tno2; j++){
6723 tmp1 = FermiF*Residues[spin][Mc_AN][h_AN][i][j][i1];
6724 CDM[spin][Mc_AN][h_AN][i][j] += tmp1;
6725 EDM[spin][Mc_AN][h_AN][i][j] += tmp1*EVal[spin][Mc_AN][i1+LO_TC[spin][Mc_AN]];
6726 }
6727 }
6728 }
6729 }
6730
6731 dtime(&Etime_atom);
6732 time_per_atom[Gc_AN] += Etime_atom - Stime_atom;
6733 }
6734 }
6735
6736 } /* #pragma omp parallel */
6737
6738 /****************************************************
6739 bond energies
6740 ****************************************************/
6741
6742 My_Eele1[0] = 0.0;
6743 My_Eele1[1] = 0.0;
6744 for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
6745 GA_AN = M2G[MA_AN];
6746 wanA = WhatSpecies[GA_AN];
6747 tnoA = Spe_Total_CNO[wanA];
6748
6749 for (j=0; j<=FNAN[GA_AN]; j++){
6750 GB_AN = natn[GA_AN][j];
6751 wanB = WhatSpecies[GB_AN];
6752 tnoB = Spe_Total_CNO[wanB];
6753
6754 for (k=0; k<tnoA; k++){
6755 for (l=0; l<tnoB; l++){
6756 for (spin=0; spin<=SpinP_switch; spin++){
6757 My_Eele1[spin] += CDM[spin][MA_AN][j][k][l]*Hks[spin][MA_AN][j][k][l];
6758 }
6759 }
6760 }
6761
6762 }
6763 }
6764
6765 /* MPI, My_Eele1 */
6766 MPI_Barrier(mpi_comm_level1);
6767 for (spin=0; spin<=SpinP_switch; spin++){
6768 MPI_Allreduce(&My_Eele1[spin], &Eele1[spin], 1, MPI_DOUBLE,
6769 MPI_SUM, mpi_comm_level1);
6770 }
6771
6772 if (SpinP_switch==0){
6773 Eele1[1] = Eele1[0];
6774 }
6775
6776 if (3<=level_stdout && myid==Host_ID){
6777 printf(" Eele00=%15.12f Eele01=%15.12f\n",Eele0[0],Eele0[1]);
6778 printf(" Eele10=%15.12f Eele11=%15.12f\n",Eele1[0],Eele1[1]);
6779 }
6780
6781 if (measure_time==1){
6782 dtime(&Etime1);
6783 time15 += Etime1 - Stime1;
6784 }
6785
6786 if ( strcasecmp(mode,"dos")==0 ){
6787 Save_DOS_Col(Residues,OLP0,EVal,LO_TC,HO_TC);
6788 }
6789
6790 if (measure_time==1){
6791 printf("myid=%2d time1 =%5.3f time2 =%5.3f time3 =%5.3f time4 =%5.3f time5 =%5.3f\n",
6792 myid,time1,time2,time3,time4,time5);
6793 printf("myid=%2d time6 =%5.3f time7 =%5.3f time8 =%5.3f time9 =%5.3f time10=%5.3f\n",
6794 myid,time6,time7,time8,time9,time10);
6795 printf("myid=%2d time11=%5.3f time12=%5.3f time13=%5.3f time14=%5.3f time15=%5.3f\n",
6796 myid,time11,time12,time13,time14,time15);
6797 printf("myid=%2d time16=%5.3f\n",myid,time16);
6798 }
6799
6800 /****************************************************
6801 freeing of arrays:
6802
6803 ****************************************************/
6804
6805 free(Msize);
6806 free(Msize2);
6807 free(Msize3);
6808 free(Msize4);
6809
6810 for (spin=0; spin<(SpinP_switch+1); spin++){
6811 free(LO_TC[spin]);
6812 }
6813 free(LO_TC);
6814
6815 for (spin=0; spin<(SpinP_switch+1); spin++){
6816 free(HO_TC[spin]);
6817 }
6818 free(HO_TC);
6819
6820 for (spin=0; spin<=SpinP_switch; spin++){
6821 for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
6822 free(EVal[spin][Mc_AN]);
6823 }
6824 free(EVal[spin]);
6825 }
6826 free(EVal);
6827
6828 for (spin=0; spin<=SpinP_switch; spin++){
6829 for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
6830
6831 if (Mc_AN==0){
6832 Gc_AN = 0;
6833 FNAN[0] = 0;
6834 tno1 = 1;
6835 }
6836 else{
6837 Gc_AN = M2G[Mc_AN];
6838 wanA = WhatSpecies[Gc_AN];
6839 tno1 = Spe_Total_CNO[wanA];
6840 }
6841
6842 for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
6843
6844 if (Mc_AN==0){
6845 tno2 = 1;
6846 }
6847 else {
6848 Gh_AN = natn[Gc_AN][h_AN];
6849 wanB = WhatSpecies[Gh_AN];
6850 tno2 = Spe_Total_CNO[wanB];
6851 }
6852
6853 for (i=0; i<tno1; i++){
6854 for (j=0; j<tno2; j++){
6855 free(Residues[spin][Mc_AN][h_AN][i][j]);
6856 }
6857 free(Residues[spin][Mc_AN][h_AN][i]);
6858 }
6859 free(Residues[spin][Mc_AN][h_AN]);
6860 }
6861 free(Residues[spin][Mc_AN]);
6862 }
6863 free(Residues[spin]);
6864 }
6865 free(Residues);
6866
6867 for (spin=0; spin<=SpinP_switch; spin++){
6868 for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
6869 free(PDOS_DC[spin][Mc_AN]);
6870 }
6871 free(PDOS_DC[spin]);
6872 }
6873 free(PDOS_DC);
6874
6875 /* for time */
6876 dtime(&TEtime);
6877 time0 = TEtime - TStime;
6878
6879 if (measure_time==1){
6880 printf("total time=%15.12f\n",time0);
6881 }
6882
6883 /* for PrintMemory */
6884 firsttime=0;
6885
6886 return time0;
6887 }
6888
6889
6890
6891
6892
6893
6894
6895
6896
Generate_pMatrix_trd(int myid,int spin,int Mc_AN,double ***** Hks,double **** OLP0,double ** invS,double *** Krylov_U,double *** Krylov_U_OLP,double ** inv_RS,int * MP,int * Msize,int * Msize2,int * Msize3,int * Msize4,int Msize2_max,double ** tmpvec0,double ** tmpvec1,double ** tmpvec2)6897 void Generate_pMatrix_trd( int myid, int spin, int Mc_AN, double *****Hks,
6898 double ****OLP0, double **invS,
6899 double ***Krylov_U, double ***Krylov_U_OLP, double **inv_RS, int *MP,
6900 int *Msize, int *Msize2, int *Msize3, int *Msize4, int Msize2_max,
6901 double **tmpvec0, double **tmpvec1, double **tmpvec2 )
6902
6903 /* This subroutine is exactly the same as the original Generate_pMatrix,
6904 except for the OMP parallelized loops in orthogonalization by diagonalization,
6905 and Eigen_lapack is replaced with Eigen_lapack_d, _x, or _r
6906 */
6907
6908 {
6909 int rl,rl0,rl1,ct_AN,fan,san,can,wan,ct_on,i,j;
6910 int n,Anum,Bnum,k,ian,ih,kl,jg,ig,jan,m,m1,n1;
6911 int ZeroNum,Gh_AN,wanB,m1s,is,info;
6912 int rl00,rl01,rl02,rl03,rl04,rl05,rl06,rl07;
6913 int mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7;
6914
6915 int KU_d1, KU_d2, csize;
6916 __m128d mmSum00,mmSum01,mmSum10,mmSum11,mmSum20,mmSum21,mmSum30,mmSum31, mmTmp0, mmTmp1, mmTmp2, mmTmp3, mmTmp4, mmTmp5;
6917
6918 double mmArr[8];
6919 double time1,time2,time3,time4,time5;
6920 double time6,time7,time8,time9,time10;
6921 double Stime1,Etime1;
6922 double sum0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
6923 double sum,dum,tmp0,tmp1,tmp2,tmp3,rcutA,r0;
6924 double **Utmp,**matRS0,**matRS1;
6925 double **tmpmat0;
6926 double *ko,*iko;
6927 double **FS;
6928 double ***U0;
6929
6930 /* for OpenMP */
6931 int OMPID,Nthrds,Nprocs;
6932
6933 ct_AN = M2G[Mc_AN];
6934 fan = FNAN[ct_AN];
6935 san = SNAN[ct_AN];
6936 can = fan + san;
6937 wan = WhatSpecies[ct_AN];
6938 ct_on = Spe_Total_CNO[wan];
6939 rcutA = Spe_Atom_Cut1[wan];
6940
6941 if (Msize[Mc_AN]<Msize3[Mc_AN])
6942 csize = Msize3[Mc_AN] + 40;
6943 else
6944 csize = Msize[Mc_AN] + 40;
6945
6946 KU_d1 = EKC_core_size[Mc_AN]*Msize2[Mc_AN];
6947 KU_d2 = Msize2[Mc_AN];
6948
6949 /* allocation of arrays */
6950
6951 Utmp = (double**)malloc(sizeof(double*)*rlmax_EC[Mc_AN]);
6952 for (i=0; i<rlmax_EC[Mc_AN]; i++){
6953 Utmp[i] = (double*)malloc(sizeof(double)*EKC_core_size[Mc_AN]);
6954 }
6955
6956 U0 = (double***)malloc(sizeof(double**)*rlmax_EC[Mc_AN]);
6957 for (i=0; i<rlmax_EC[Mc_AN]; i++){
6958 U0[i] = (double**)malloc(sizeof(double*)*EKC_core_size[Mc_AN]);
6959 for (j=0; j<EKC_core_size[Mc_AN]; j++){
6960 U0[i][j] = (double*)malloc(sizeof(double)*(Msize2[Mc_AN]+3));
6961 for (k=0; k<(Msize2[Mc_AN]+3); k++) U0[i][j][k] = 0.0;
6962 }
6963 }
6964
6965 tmpmat0 = (double**)malloc(sizeof(double*)*(EKC_core_size[Mc_AN]+4));
6966 for (i=0; i<(EKC_core_size[Mc_AN]+4); i++){
6967 tmpmat0[i] = (double*)malloc(sizeof(double)*(EKC_core_size[Mc_AN]+4));
6968 }
6969
6970 FS = (double**)malloc(sizeof(double*)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
6971 for (i=0; i<(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]; i++){
6972 FS[i] = (double*)malloc(sizeof(double)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
6973 }
6974
6975 ko = (double*)malloc(sizeof(double)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
6976 iko = (double*)malloc(sizeof(double)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
6977
6978 matRS0 = (double**)malloc(sizeof(double*)*(EKC_core_size[Mc_AN]+2));
6979 for (i=0; i<(EKC_core_size[Mc_AN]+2); i++){
6980 matRS0[i] = (double*)malloc(sizeof(double)*(Msize4[Mc_AN]+3));
6981 }
6982
6983 matRS1 = (double**)malloc(sizeof(double*)*(Msize4[Mc_AN]+3));
6984 for (i=0; i<(Msize4[Mc_AN]+3); i++){
6985 matRS1[i] = (double*)malloc(sizeof(double)*(EKC_core_size[Mc_AN]+2));
6986 }
6987
6988 /****************************************************
6989 initialize
6990 ****************************************************/
6991
6992 if (measure_time==1){
6993 time1 = 0.0;
6994 time2 = 0.0;
6995 time3 = 0.0;
6996 time4 = 0.0;
6997 time5 = 0.0;
6998 time6 = 0.0;
6999 time7 = 0.0;
7000 time8 = 0.0;
7001 time9 = 0.0;
7002 time10 = 0.0;
7003 }
7004
7005 if (measure_time==1) dtime(&Stime1);
7006
7007 for (i=0; i<EKC_core_size_max; i++){
7008 for (j=0; j<Msize2_max; j++){
7009 tmpvec0[i][j] = 0.0;
7010 tmpvec1[i][j] = 0.0;
7011 }
7012 }
7013
7014 /* find the nearest atom with distance of r0 */
7015
7016 r0 = 1.0e+10;
7017 for (k=1; k<=FNAN[ct_AN]; k++){
7018 Gh_AN = natn[ct_AN][k];
7019 wanB = WhatSpecies[Gh_AN];
7020 if (Dis[ct_AN][k]<r0) r0 = Dis[ct_AN][k];
7021 }
7022
7023 /* starting vector */
7024
7025 m = 0;
7026 for (k=0; k<=FNAN[ct_AN]; k++){
7027
7028 Gh_AN = natn[ct_AN][k];
7029 wanB = WhatSpecies[Gh_AN];
7030
7031 if ( Dis[ct_AN][k]<(scale_rc_EKC[Mc_AN]*r0) ){
7032
7033 Anum = MP[k] - 1;
7034
7035 for (i=0; i<Spe_Total_CNO[wanB]; i++){
7036
7037 tmpvec0[m][Anum+i] = 1.0;
7038
7039 m++;
7040 }
7041 }
7042 }
7043
7044 S_orthonormalize_vec_trd( Mc_AN, ct_on, tmpvec0, tmpvec1, OLP0, tmpmat0, ko, iko, MP, Msize2 );
7045
7046 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7047 for (i=0; i<Msize2[Mc_AN]; i++){
7048 U0[0][n][i] = tmpvec0[n][i];
7049 }
7050 }
7051
7052 if (measure_time==1){
7053 dtime(&Etime1);
7054 time1 = Etime1 - Stime1;
7055 }
7056
7057 /****************************************************
7058 generate Krylov subspace vectors
7059 ****************************************************/
7060
7061 for (rl=0; rl<(rlmax_EC[Mc_AN]-1); rl++){
7062
7063 if (measure_time==1) dtime(&Stime1);
7064
7065 /*******************************************************
7066 H * |Wn)
7067 *******************************************************/
7068
7069 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7070 for (i=0; i<Msize2[Mc_AN]; i++){
7071 tmpvec1[n][i] = 0.0;
7072 }
7073 }
7074
7075 for (i=0; i<=can; i++){
7076
7077 ig = natn[ct_AN][i];
7078 ian = Spe_Total_CNO[WhatSpecies[ig]];
7079 Anum = MP[i] - 1;
7080 ih = S_G2M[ig];
7081
7082 for (j=0; j<=can; j++){
7083
7084 kl = RMI1[Mc_AN][i][j];
7085 jg = natn[ct_AN][j];
7086 jan = Spe_Total_CNO[WhatSpecies[jg]];
7087 Bnum = MP[j] - 1;
7088
7089 if (0<=kl){
7090
7091 #ifdef nosse
7092
7093 /* original version */
7094
7095 for (m=0; m<ian; m++){
7096 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7097
7098 sum = 0.0;
7099 for (k=0; k<jan; k++){
7100 sum += Hks[spin][ih][kl][m][k]*tmpvec0[n][Bnum+k];
7101 }
7102
7103 tmpvec1[n][Anum+m] += sum;
7104 }
7105 }
7106
7107 #else
7108 /* Loop Unrolling + SSE version */
7109
7110 for (m=0; m<(ian-3); m+=4){
7111 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7112
7113 mmSum00 = _mm_setzero_pd();
7114 mmSum01 = _mm_setzero_pd();
7115 mmSum10 = _mm_setzero_pd();
7116 mmSum11 = _mm_setzero_pd();
7117 mmSum20 = _mm_setzero_pd();
7118 mmSum21 = _mm_setzero_pd();
7119 mmSum30 = _mm_setzero_pd();
7120 mmSum31 = _mm_setzero_pd();
7121
7122 for (k=0; k<(jan-3); k+=4){
7123 mmTmp0 = _mm_loadu_pd(&tmpvec0[n][Bnum+k+0]);
7124 mmTmp1 = _mm_loadu_pd(&tmpvec0[n][Bnum+k+2]);
7125
7126 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+0][k+0]),mmTmp0));
7127 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+0][k+2]),mmTmp1));
7128
7129 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+1][k+0]),mmTmp0));
7130 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+1][k+2]),mmTmp1));
7131
7132 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+2][k+0]),mmTmp0));
7133 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+2][k+2]),mmTmp1));
7134
7135 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+3][k+0]),mmTmp0));
7136 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+3][k+2]),mmTmp1));
7137 }
7138
7139 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
7140 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
7141 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
7142 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
7143
7144 _mm_storeu_pd(&mmArr[0], mmSum00);
7145 _mm_storeu_pd(&mmArr[2], mmSum10);
7146 _mm_storeu_pd(&mmArr[4], mmSum20);
7147 _mm_storeu_pd(&mmArr[6], mmSum30);
7148
7149 sum0 = mmArr[0] + mmArr[1];
7150 sum1 = mmArr[2] + mmArr[3];
7151 sum2 = mmArr[4] + mmArr[5];
7152 sum3 = mmArr[6] + mmArr[7];
7153
7154 for (; k<jan; k++){
7155 sum0 += Hks[spin][ih][kl][m+0][k]*tmpvec0[n][Bnum+k];
7156 sum1 += Hks[spin][ih][kl][m+1][k]*tmpvec0[n][Bnum+k];
7157 sum2 += Hks[spin][ih][kl][m+2][k]*tmpvec0[n][Bnum+k];
7158 sum3 += Hks[spin][ih][kl][m+3][k]*tmpvec0[n][Bnum+k];
7159 }
7160
7161 tmpvec1[n][Anum+m+0] += sum0;
7162 tmpvec1[n][Anum+m+1] += sum1;
7163 tmpvec1[n][Anum+m+2] += sum2;
7164 tmpvec1[n][Anum+m+3] += sum3;
7165 }
7166 }
7167
7168 for (; m<ian; m++){
7169 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7170
7171 sum = 0.0;
7172 for (k=0; k<jan; k++){
7173 sum += Hks[spin][ih][kl][m][k]*tmpvec0[n][Bnum+k];
7174 }
7175
7176 tmpvec1[n][Anum+m] += sum;
7177 }
7178 }
7179 #endif
7180 }
7181 }
7182 }
7183
7184 if (measure_time==1){
7185 dtime(&Etime1);
7186 time2 += Etime1 - Stime1;
7187 }
7188
7189 /*******************************************************
7190 S^{-1} * H * |Wn)
7191 *******************************************************/
7192
7193 if (EKC_Exact_invS_flag==1){
7194
7195 if (measure_time==1) dtime(&Stime1);
7196
7197 #ifdef nosse
7198
7199 /* original version */
7200 /*
7201 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7202 for (i=0; i<Msize2[Mc_AN]; i++){
7203 sum = 0.0;
7204 for (j=0; j<Msize2[Mc_AN]; j++){
7205 sum += invS[i][j]*tmpvec1[n][j];
7206 }
7207 tmpvec0[n][i] = sum;
7208 }
7209 }
7210 */
7211 /* unrolling version */
7212
7213 for (i=0; i<(Msize2[Mc_AN]-3); i+=4){
7214 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7215
7216 sum0 = 0.0;
7217 sum1 = 0.0;
7218 sum2 = 0.0;
7219 sum3 = 0.0;
7220
7221 for (j=0; j<Msize2[Mc_AN]; j++){
7222 sum0 += invS[i+0][j]*tmpvec1[n][j];
7223 sum1 += invS[i+1][j]*tmpvec1[n][j];
7224 sum2 += invS[i+2][j]*tmpvec1[n][j];
7225 sum3 += invS[i+3][j]*tmpvec1[n][j];
7226 }
7227
7228 tmpvec0[n][i+0] = sum0;
7229 tmpvec0[n][i+1] = sum1;
7230 tmpvec0[n][i+2] = sum2;
7231 tmpvec0[n][i+3] = sum3;
7232 }
7233 }
7234
7235 is = Msize2[Mc_AN] - Msize2[Mc_AN]%4;
7236
7237 for (i=is; i<Msize2[Mc_AN]; i++){
7238 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7239 sum = 0.0;
7240 for (j=0; j<Msize2[Mc_AN]; j++){
7241 sum += invS[i][j]*tmpvec1[n][j];
7242 }
7243 tmpvec0[n][i] = sum;
7244 }
7245 }
7246
7247 #else
7248 /* unrolling + SSE version */
7249
7250 for (i=0; i<(Msize2[Mc_AN]-3); i+=4){
7251 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7252
7253 mmSum00 = _mm_setzero_pd();
7254 mmSum01 = _mm_setzero_pd();
7255 mmSum10 = _mm_setzero_pd();
7256 mmSum11 = _mm_setzero_pd();
7257 mmSum20 = _mm_setzero_pd();
7258 mmSum21 = _mm_setzero_pd();
7259 mmSum30 = _mm_setzero_pd();
7260 mmSum31 = _mm_setzero_pd();
7261
7262 for (j=0; j<(Msize2[Mc_AN]-3); j+=4){
7263 mmTmp0 = _mm_loadu_pd(&tmpvec1[n][j+0]);
7264 mmTmp1 = _mm_loadu_pd(&tmpvec1[n][j+2]);
7265
7266 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&invS[i+0][j+0]),mmTmp0));
7267 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&invS[i+0][j+2]),mmTmp1));
7268
7269 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&invS[i+1][j+0]),mmTmp0));
7270 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&invS[i+1][j+2]),mmTmp1));
7271
7272 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&invS[i+2][j+0]),mmTmp0));
7273 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&invS[i+2][j+2]),mmTmp1));
7274
7275 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&invS[i+3][j+0]),mmTmp0));
7276 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&invS[i+3][j+2]),mmTmp1));
7277 }
7278
7279 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
7280 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
7281 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
7282 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
7283
7284 _mm_storeu_pd(&mmArr[0], mmSum00);
7285 _mm_storeu_pd(&mmArr[2], mmSum10);
7286 _mm_storeu_pd(&mmArr[4], mmSum20);
7287 _mm_storeu_pd(&mmArr[6], mmSum30);
7288
7289 sum0 = mmArr[0] + mmArr[1];
7290 sum1 = mmArr[2] + mmArr[3];
7291 sum2 = mmArr[4] + mmArr[5];
7292 sum3 = mmArr[6] + mmArr[7];
7293
7294 for (; j<Msize2[Mc_AN]; j++){
7295 sum0 += invS[i+0][j]*tmpvec1[n][j];
7296 sum1 += invS[i+1][j]*tmpvec1[n][j];
7297 sum2 += invS[i+2][j]*tmpvec1[n][j];
7298 sum3 += invS[i+3][j]*tmpvec1[n][j];
7299 }
7300
7301 tmpvec0[n][i+0] = sum0;
7302 tmpvec0[n][i+1] = sum1;
7303 tmpvec0[n][i+2] = sum2;
7304 tmpvec0[n][i+3] = sum3;
7305 }
7306 }
7307
7308 is = Msize2[Mc_AN] - Msize2[Mc_AN]%4;
7309
7310 for (i=is; i<Msize2[Mc_AN]; i++){
7311 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7312 sum = 0.0;
7313 for (j=0; j<Msize2[Mc_AN]; j++){
7314 sum += invS[i][j]*tmpvec1[n][j];
7315 }
7316 tmpvec0[n][i] = sum;
7317 }
7318 }
7319
7320 #endif
7321
7322 if (measure_time==1){
7323 dtime(&Etime1);
7324 time3 += Etime1 - Stime1;
7325 }
7326 }
7327
7328 /*******************************************************
7329 U * RS^-1 * U^+ * H * |Wn)
7330 *******************************************************/
7331
7332 else if (EKC_invS_flag==1){
7333
7334 /* U^+ * H * |Wn) */
7335
7336 for (rl0=0; rl0<rlmax_EC2[Mc_AN]; rl0++){
7337 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7338 for (m=0; m<EKC_core_size[Mc_AN]; m++){
7339
7340 sum = 0.0;
7341 for (i=0; i<Msize2[Mc_AN]; i++){
7342 sum += Krylov_U_OLP[rl0][n][i]*tmpvec1[m][i];
7343 }
7344
7345 /* transpose the later calcualtion */
7346 matRS0[m][rl0*EKC_core_size[Mc_AN]+n] = sum;
7347 }
7348 }
7349 }
7350
7351 /* RS^-1 * U^+ * H * |Wn) */
7352
7353 for (rl0=0; rl0<rlmax_EC2[Mc_AN]; rl0++){
7354 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7355
7356 for (m=0; m<EKC_core_size[Mc_AN]; m++){
7357
7358 sum = 0.0;
7359 for (i=0; i<Msize4[Mc_AN]; i++){
7360 sum += inv_RS[rl0*EKC_core_size[Mc_AN]+n][i]*matRS0[m][i];
7361 }
7362
7363 matRS1[rl0*EKC_core_size[Mc_AN]+n][m] = sum;
7364 }
7365 }
7366 }
7367
7368 /* U * RS^-1 * U^+ * H * |Wn) */
7369
7370 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7371 for (i=0; i<Msize2[Mc_AN]; i++){
7372 tmpvec0[n][i] = 0.0;
7373 }
7374 }
7375
7376 for (rl0=0; rl0<rlmax_EC2[Mc_AN]; rl0++){
7377 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7378 for (m=0; m<EKC_core_size[Mc_AN]; m++){
7379 tmp0 = matRS1[rl0*EKC_core_size[Mc_AN]+n][m];
7380 for (i=0; i<Msize2[Mc_AN]; i++){
7381 tmpvec0[m][i] += Krylov_U_OLP[rl0][n][i]*tmp0;
7382 }
7383 }
7384 }
7385 }
7386 }
7387
7388 else {
7389 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7390 for (i=0; i<Msize2[Mc_AN]; i++){
7391 tmpvec0[n][i] = tmpvec1[n][i];
7392 }
7393 }
7394 }
7395
7396 if (measure_time==1) dtime(&Stime1);
7397
7398 /*************************************************************
7399 S-orthogonalization by a classical block Gram-Schmidt method
7400 *************************************************************/
7401
7402 /* |tmpvec2) = S * |tmpvec0) */
7403
7404 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7405 for (i=0; i<Msize2[Mc_AN]; i++){
7406 tmpvec2[n][i] = 0.0;
7407 }
7408 }
7409
7410 for (i=0; i<=can; i++){
7411
7412 ig = natn[ct_AN][i];
7413 ian = Spe_Total_CNO[WhatSpecies[ig]];
7414 Anum = MP[i] - 1;
7415 ih = S_G2M[ig];
7416
7417 for (j=0; j<=can; j++){
7418
7419 kl = RMI1[Mc_AN][i][j];
7420 jg = natn[ct_AN][j];
7421 jan = Spe_Total_CNO[WhatSpecies[jg]];
7422 Bnum = MP[j] - 1;
7423
7424 if (0<=kl){
7425
7426 #ifdef nosse
7427
7428 /* Original version */
7429
7430 for (m=0; m<ian; m++){
7431 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7432
7433 sum = 0.0;
7434 for (k=0; k<jan; k++){
7435 sum += OLP0[ih][kl][m][k]*tmpvec0[n][Bnum+k];
7436 }
7437
7438 tmpvec2[n][Anum+m] += sum;
7439 }
7440 }
7441
7442 #else
7443
7444 /* Unrolling + SSE version */
7445
7446 for (m=0; m<(ian-3); m+=4){
7447 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7448
7449 mmSum00 = _mm_setzero_pd();
7450 mmSum01 = _mm_setzero_pd();
7451 mmSum10 = _mm_setzero_pd();
7452 mmSum11 = _mm_setzero_pd();
7453 mmSum20 = _mm_setzero_pd();
7454 mmSum21 = _mm_setzero_pd();
7455 mmSum30 = _mm_setzero_pd();
7456 mmSum31 = _mm_setzero_pd();
7457
7458 for (k=0; k<(jan-3); k+=4){
7459 mmTmp0 = _mm_loadu_pd(&tmpvec0[n][Bnum+k+0]);
7460 mmTmp1 = _mm_loadu_pd(&tmpvec0[n][Bnum+k+2]);
7461
7462 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+0]),mmTmp0));
7463 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+2]),mmTmp1));
7464
7465 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+0]),mmTmp0));
7466 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+2]),mmTmp1));
7467
7468 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+0]),mmTmp0));
7469 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+2]),mmTmp1));
7470
7471 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+0]),mmTmp0));
7472 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+2]),mmTmp1));
7473 }
7474
7475 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
7476 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
7477 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
7478 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
7479
7480 _mm_storeu_pd(&mmArr[0], mmSum00);
7481 _mm_storeu_pd(&mmArr[2], mmSum10);
7482 _mm_storeu_pd(&mmArr[4], mmSum20);
7483 _mm_storeu_pd(&mmArr[6], mmSum30);
7484
7485 sum0 = mmArr[0] + mmArr[1];
7486 sum1 = mmArr[2] + mmArr[3];
7487 sum2 = mmArr[4] + mmArr[5];
7488 sum3 = mmArr[6] + mmArr[7];
7489
7490 for (; k<jan; k++){
7491 sum0 += OLP0[ih][kl][m+0][k]*tmpvec0[n][Bnum+k];
7492 sum1 += OLP0[ih][kl][m+1][k]*tmpvec0[n][Bnum+k];
7493 sum2 += OLP0[ih][kl][m+2][k]*tmpvec0[n][Bnum+k];
7494 sum3 += OLP0[ih][kl][m+3][k]*tmpvec0[n][Bnum+k];
7495 }
7496
7497 tmpvec2[n][Anum+m+0] += sum0;
7498 tmpvec2[n][Anum+m+1] += sum1;
7499 tmpvec2[n][Anum+m+2] += sum2;
7500 tmpvec2[n][Anum+m+3] += sum3;
7501 }
7502 }
7503
7504 for (; m<ian; m++){
7505 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7506
7507 sum = 0.0;
7508 for (k=0; k<jan; k++){
7509 sum += OLP0[ih][kl][m][k]*tmpvec0[n][Bnum+k];
7510 }
7511
7512 tmpvec2[n][Anum+m] += sum;
7513 }
7514 }
7515
7516 #endif
7517
7518 }
7519 }
7520 }
7521
7522 if (measure_time==1){
7523 dtime(&Etime1);
7524 time4 += Etime1 - Stime1;
7525 }
7526
7527 if (measure_time==1) dtime(&Stime1);
7528
7529 #ifdef nosse
7530
7531 /* Original version */
7532
7533 for (rl0=0; rl0<=rl; rl0++){
7534
7535 /* (U_rl0|tmpvec2) */
7536
7537 for (m=0; m<EKC_core_size[Mc_AN]; m++){
7538 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7539 sum = 0.0;
7540 for (i=0; i<Msize2[Mc_AN]; i++){
7541 sum += U0[rl0][m][i]*tmpvec2[n][i];
7542 }
7543 tmpmat0[m][n] = sum;
7544 }
7545 }
7546
7547 /* |tmpvec0) - |U_rl0) * (U_rl0|tmpvec2) */
7548
7549 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7550 for (k=0; k<EKC_core_size[Mc_AN]; k++){
7551 dum = tmpmat0[k][n];
7552 for (i=0; i<Msize2[Mc_AN]; i++) tmpvec0[n][i] -= U0[rl0][k][i]*dum;
7553 }
7554 }
7555
7556 }
7557
7558 #else
7559
7560 /* Unrolling + SSE version */
7561
7562 for (rl0=0; rl0<=rl; rl0++){
7563
7564 /* (U_rl0|tmpvec2) */
7565
7566 for (m=0; m<(EKC_core_size[Mc_AN]-3); m+=4){
7567 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7568
7569 mmSum00 = _mm_setzero_pd();
7570 mmSum01 = _mm_setzero_pd();
7571 mmSum10 = _mm_setzero_pd();
7572 mmSum11 = _mm_setzero_pd();
7573 mmSum20 = _mm_setzero_pd();
7574 mmSum21 = _mm_setzero_pd();
7575 mmSum30 = _mm_setzero_pd();
7576 mmSum31 = _mm_setzero_pd();
7577
7578 for (i=0; i<(Msize2[Mc_AN]-3); i+=4){
7579 mmTmp0 = _mm_loadu_pd(&tmpvec2[n][i+0]);
7580 mmTmp1 = _mm_loadu_pd(&tmpvec2[n][i+2]);
7581
7582 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+0][i+0]),mmTmp0));
7583 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+0][i+2]),mmTmp1));
7584
7585 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+1][i+0]),mmTmp0));
7586 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+1][i+2]),mmTmp1));
7587
7588 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+2][i+0]),mmTmp0));
7589 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+2][i+2]),mmTmp1));
7590
7591 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+3][i+0]),mmTmp0));
7592 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+3][i+2]),mmTmp1));
7593 }
7594
7595 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
7596 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
7597 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
7598 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
7599
7600 _mm_storeu_pd(&mmArr[0], mmSum00);
7601 _mm_storeu_pd(&mmArr[2], mmSum10);
7602 _mm_storeu_pd(&mmArr[4], mmSum20);
7603 _mm_storeu_pd(&mmArr[6], mmSum30);
7604
7605 sum0 = mmArr[0] + mmArr[1];
7606 sum1 = mmArr[2] + mmArr[3];
7607 sum2 = mmArr[4] + mmArr[5];
7608 sum3 = mmArr[6] + mmArr[7];
7609
7610 for (; i<Msize2[Mc_AN]; i++){
7611 sum0 += U0[rl0][m+0][i]*tmpvec2[n][i];
7612 sum1 += U0[rl0][m+1][i]*tmpvec2[n][i];
7613 sum2 += U0[rl0][m+2][i]*tmpvec2[n][i];
7614 sum3 += U0[rl0][m+3][i]*tmpvec2[n][i];
7615 }
7616
7617 tmpmat0[m+0][n] = sum0;
7618 tmpmat0[m+1][n] = sum1;
7619 tmpmat0[m+2][n] = sum2;
7620 tmpmat0[m+3][n] = sum3;
7621 }
7622 }
7623
7624 for (; m<EKC_core_size[Mc_AN]; m++){
7625 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7626 sum = 0.0;
7627 for (i=0; i<Msize2[Mc_AN]; i++){
7628 sum += U0[rl0][m][i]*tmpvec2[n][i];
7629 }
7630 tmpmat0[m][n] = sum;
7631 }
7632 }
7633
7634 /* |tmpvec0) - |U_rl0) * (U_rl0|tmpvec2) */
7635
7636 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7637 for (k=0; k<EKC_core_size[Mc_AN]; k++){
7638 dum = tmpmat0[k][n];
7639 for (i=0; i<Msize2[Mc_AN]; i++) tmpvec0[n][i] -= U0[rl0][k][i]*dum;
7640 }
7641 }
7642
7643 }
7644
7645 #endif
7646
7647 if (measure_time==1){
7648 dtime(&Etime1);
7649 time5 += Etime1 - Stime1;
7650 }
7651
7652 /*************************************************************
7653 S-orthonormalization of tmpvec0
7654 *************************************************************/
7655
7656 if (measure_time==1) dtime(&Stime1);
7657
7658 S_orthonormalize_vec_trd( Mc_AN, ct_on, tmpvec0, tmpvec1, OLP0, tmpmat0, ko, iko, MP, Msize2 );
7659
7660 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7661 for (i=0; i<Msize2[Mc_AN]; i++){
7662 U0[rl+1][n][i] = tmpvec0[n][i];
7663 }
7664 }
7665
7666 if (measure_time==1){
7667 dtime(&Etime1);
7668 time6 += Etime1 - Stime1;
7669 }
7670
7671 } /* rl */
7672
7673 /************************************************************
7674 orthogonalization by diagonalization
7675 ************************************************************/
7676
7677 if (measure_time==1) dtime(&Stime1);
7678
7679 #pragma omp parallel shared(EKC_core_size_max,Msize2_max,rlmax_EC,Mc_AN,EKC_core_size,Msize2,can,natn,ct_AN,ct_on,Spe_Total_CNO,WhatSpecies,MP,S_G2M,RMI1,Hks,spin,Msize,Msize4,OLP0,U0,rlmax_EC2,FS) private(OMPID,Nthrds,Nprocs,rl,n,i,ig,ian,Anum,ih,j,kl,jg,jan,Bnum,m,sum,k,mmSum00,mmSum01,mmSum10,mmSum11,mmSum20,mmSum21,mmSum30,mmSum31,mmTmp0,mmTmp1,mmArr,sum0,sum1,sum2,sum3,rl0)
7680
7681 {
7682
7683 double **tmpvec1;
7684
7685 tmpvec1 = (double**)malloc(sizeof(double*)*EKC_core_size_max);
7686 for (i=0; i<EKC_core_size_max; i++){
7687 tmpvec1[i] = (double*)malloc(sizeof(double)*Msize2_max);
7688 }
7689
7690
7691 OMPID = omp_get_thread_num();
7692 Nthrds = omp_get_num_threads();
7693 Nprocs = omp_get_num_procs();
7694
7695 for (rl=0+OMPID; rl<rlmax_EC[Mc_AN]; rl+=Nthrds){
7696
7697 /* S * |Vn) */
7698
7699 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7700 for (i=0; i<Msize2[Mc_AN]; i++){
7701 tmpvec1[n][i] = 0.0;
7702 }
7703 }
7704
7705 for (i=0; i<=can; i++){
7706
7707 ig = natn[ct_AN][i];
7708 ian = Spe_Total_CNO[WhatSpecies[ig]];
7709 Anum = MP[i] - 1;
7710 ih = S_G2M[ig];
7711
7712 for (j=0; j<=can; j++){
7713
7714 kl = RMI1[Mc_AN][i][j];
7715 jg = natn[ct_AN][j];
7716 jan = Spe_Total_CNO[WhatSpecies[jg]];
7717 Bnum = MP[j] - 1;
7718
7719 if (0<=kl){
7720
7721 #ifdef nosse
7722
7723 /* Original version */
7724 /**/
7725 for (m=0; m<ian; m++){
7726 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7727
7728 sum = 0.0;
7729 for (k=0; k<jan; k++){
7730 sum += OLP0[ih][kl][m][k]*U0[rl][n][Bnum+k];
7731 }
7732 tmpvec1[n][Anum+m] += sum;
7733 }
7734 }
7735 /**/
7736
7737 #else
7738
7739 /* Unrolling + SSE version */
7740 /**/
7741 for (m=0; m<(ian-3); m+=4){
7742 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7743
7744 mmSum00 = _mm_setzero_pd();
7745 mmSum01 = _mm_setzero_pd();
7746 mmSum10 = _mm_setzero_pd();
7747 mmSum11 = _mm_setzero_pd();
7748 mmSum20 = _mm_setzero_pd();
7749 mmSum21 = _mm_setzero_pd();
7750 mmSum30 = _mm_setzero_pd();
7751 mmSum31 = _mm_setzero_pd();
7752
7753 for (k=0; k<(jan-3); k+=4){
7754 mmTmp0 = _mm_loadu_pd(&U0[rl][n][Bnum+k+0]);
7755 mmTmp1 = _mm_loadu_pd(&U0[rl][n][Bnum+k+2]);
7756
7757 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+0]),mmTmp0));
7758 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+2]),mmTmp1));
7759
7760 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+0]),mmTmp0));
7761 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+2]),mmTmp1));
7762
7763 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+0]),mmTmp0));
7764 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+2]),mmTmp1));
7765
7766 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+0]),mmTmp0));
7767 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+2]),mmTmp1));
7768 }
7769
7770 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
7771 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
7772 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
7773 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
7774
7775 _mm_storeu_pd(&mmArr[0], mmSum00);
7776 _mm_storeu_pd(&mmArr[2], mmSum10);
7777 _mm_storeu_pd(&mmArr[4], mmSum20);
7778 _mm_storeu_pd(&mmArr[6], mmSum30);
7779
7780 sum0 = mmArr[0] + mmArr[1];
7781 sum1 = mmArr[2] + mmArr[3];
7782 sum2 = mmArr[4] + mmArr[5];
7783 sum3 = mmArr[6] + mmArr[7];
7784
7785 for (; k<jan; k++){
7786 sum0 += OLP0[ih][kl][m+0][k]*U0[rl][n][Bnum+k];
7787 sum1 += OLP0[ih][kl][m+1][k]*U0[rl][n][Bnum+k];
7788 sum2 += OLP0[ih][kl][m+2][k]*U0[rl][n][Bnum+k];
7789 sum3 += OLP0[ih][kl][m+3][k]*U0[rl][n][Bnum+k];
7790 }
7791
7792 tmpvec1[n][Anum+m+0] += sum0;
7793 tmpvec1[n][Anum+m+1] += sum1;
7794 tmpvec1[n][Anum+m+2] += sum2;
7795 tmpvec1[n][Anum+m+3] += sum3;
7796 }
7797 }
7798
7799 for (; m<ian; m++){
7800 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7801
7802 sum = 0.0;
7803 for (k=0; k<jan; k++){
7804 sum += OLP0[ih][kl][m][k]*U0[rl][n][Bnum+k];
7805 }
7806 tmpvec1[n][Anum+m] += sum;
7807 }
7808 }
7809 /**/
7810 #endif
7811
7812 }
7813 }
7814 }
7815
7816 #ifdef nosse
7817
7818 /* Original version */
7819 /**/
7820 for (rl0=rl; rl0<rlmax_EC[Mc_AN]; rl0++){
7821 for (m=0; m<EKC_core_size[Mc_AN]; m++){
7822 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7823 sum = 0.0;
7824 for (i=0; i<Msize2[Mc_AN]; i++){
7825 sum += U0[rl0][m][i]*tmpvec1[n][i];
7826 }
7827 FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
7828 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum;
7829 }
7830 }
7831 }
7832 /**/
7833
7834 #else
7835
7836 /* Unrolling + SSE version */
7837 /**/
7838 for (rl0=rl; rl0<rlmax_EC[Mc_AN]; rl0++){
7839 for (m=0; m<(EKC_core_size[Mc_AN]-3); m+=4){
7840 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7841
7842 mmSum00 = _mm_setzero_pd();
7843 mmSum01 = _mm_setzero_pd();
7844 mmSum10 = _mm_setzero_pd();
7845 mmSum11 = _mm_setzero_pd();
7846 mmSum20 = _mm_setzero_pd();
7847 mmSum21 = _mm_setzero_pd();
7848 mmSum30 = _mm_setzero_pd();
7849 mmSum31 = _mm_setzero_pd();
7850
7851 for (i=0; i<(Msize2[Mc_AN]-3); i+=4){
7852 mmTmp0 = _mm_loadu_pd(&tmpvec1[n][i+0]);
7853 mmTmp1 = _mm_loadu_pd(&tmpvec1[n][i+2]);
7854
7855 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+0][i+0]),mmTmp0));
7856 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+0][i+2]),mmTmp1));
7857
7858 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+1][i+0]),mmTmp0));
7859 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+1][i+2]),mmTmp1));
7860
7861 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+2][i+0]),mmTmp0));
7862 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+2][i+2]),mmTmp1));
7863
7864 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+3][i+0]),mmTmp0));
7865 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+3][i+2]),mmTmp1));
7866 }
7867
7868 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
7869 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
7870 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
7871 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
7872
7873 _mm_storeu_pd(&mmArr[0], mmSum00);
7874 _mm_storeu_pd(&mmArr[2], mmSum10);
7875 _mm_storeu_pd(&mmArr[4], mmSum20);
7876 _mm_storeu_pd(&mmArr[6], mmSum30);
7877
7878 sum0 = mmArr[0] + mmArr[1];
7879 sum1 = mmArr[2] + mmArr[3];
7880 sum2 = mmArr[4] + mmArr[5];
7881 sum3 = mmArr[6] + mmArr[7];
7882
7883 for (; i<Msize2[Mc_AN]; i++){
7884 sum0 += U0[rl0][m+0][i]*tmpvec1[n][i];
7885 sum1 += U0[rl0][m+1][i]*tmpvec1[n][i];
7886 sum2 += U0[rl0][m+2][i]*tmpvec1[n][i];
7887 sum3 += U0[rl0][m+3][i]*tmpvec1[n][i];
7888 }
7889
7890 FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum0;
7891 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum0;
7892
7893 FS[rl0*EKC_core_size[Mc_AN]+m+2][rl*EKC_core_size[Mc_AN]+n+1] = sum1;
7894 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+2] = sum1;
7895
7896 FS[rl0*EKC_core_size[Mc_AN]+m+3][rl*EKC_core_size[Mc_AN]+n+1] = sum2;
7897 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+3] = sum2;
7898
7899 FS[rl0*EKC_core_size[Mc_AN]+m+4][rl*EKC_core_size[Mc_AN]+n+1] = sum3;
7900 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+4] = sum3;
7901 }
7902 }
7903
7904 for (; m<EKC_core_size[Mc_AN]; m++){
7905 for (n=0; n<EKC_core_size[Mc_AN]; n++){
7906 sum = 0.0;
7907 for (i=0; i<Msize2[Mc_AN]; i++){
7908 sum += U0[rl0][m][i]*tmpvec1[n][i];
7909 }
7910 FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
7911 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum;
7912 }
7913 }
7914
7915 }
7916 /**/
7917
7918 #endif
7919
7920 }
7921
7922 for (i=0; i<EKC_core_size_max; i++){
7923 free(tmpvec1[i]);
7924 }
7925 free(tmpvec1);
7926
7927 } /* #pragma omp parallel */
7928
7929 if (measure_time==1){
7930 dtime(&Etime1);
7931 time7 += Etime1 - Stime1;
7932 }
7933
7934 if (measure_time==1) dtime(&Stime1);
7935
7936 info = Eigen_lapack_d(FS,ko,Msize3[Mc_AN],Msize3[Mc_AN]);
7937 if (info!=0){
7938 info = Eigen_lapack_x(FS,ko,Msize3[Mc_AN],Msize3[Mc_AN]);
7939 if (info!=0){
7940 Eigen_lapack_r(FS,ko,Msize3[Mc_AN],Msize3[Mc_AN]);
7941 }
7942 }
7943
7944 if (measure_time==1){
7945 dtime(&Etime1);
7946 time8 += Etime1 - Stime1;
7947 }
7948
7949 ZeroNum = 0;
7950
7951 for (i=1; i<=Msize3[Mc_AN]; i++){
7952
7953 if (error_check==1){
7954 printf("spin=%2d Mc_AN=%2d i=%3d ko[i]=%18.15f\n",spin,Mc_AN,i,ko[i]);
7955 }
7956
7957 if (cutoff_value<ko[i]){
7958 ko[i] = sqrt(fabs(ko[i]));
7959 iko[i] = 1.0/ko[i];
7960 }
7961 else{
7962 ZeroNum++;
7963 ko[i] = 0.0;
7964 iko[i] = 0.0;
7965 }
7966 }
7967
7968 if (error_check==1){
7969 printf("spin=%2d Mc_AN=%2d ZeroNum=%2d\n",spin,Mc_AN,ZeroNum);
7970 }
7971
7972 for (i=1; i<=Msize3[Mc_AN]; i++){
7973 for (j=1; j<=Msize3[Mc_AN]; j++){
7974 FS[i][j] = FS[i][j]*iko[j];
7975 }
7976 }
7977
7978 /* transpose for later calculation */
7979 for (i=1; i<=Msize3[Mc_AN]; i++){
7980 for (j=i+1; j<=Msize3[Mc_AN]; j++){
7981 tmp1 = FS[i][j];
7982 tmp2 = FS[j][i];
7983 FS[i][j] = tmp2;
7984 FS[j][i] = tmp1;
7985 }
7986 }
7987
7988 if (measure_time==1) dtime(&Stime1);
7989
7990 /* U0 * U * lamda^{-1/2} */
7991
7992 #ifdef nosse
7993
7994 /* original version */
7995 /*
7996 for (i=0; i<Msize2[Mc_AN]; i++){
7997 for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
7998 for (m=0; m<EKC_core_size[Mc_AN]; m++){
7999
8000 m1 = rl0*EKC_core_size[Mc_AN] + m + 1;
8001
8002 sum = 0.0;
8003 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8004
8005 n1 = rl*EKC_core_size[Mc_AN] + 1;
8006
8007 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8008 sum += U0[rl][n][i]*FS[m1][n1+n];
8009 }
8010 }
8011
8012 Utmp[rl0][m] = sum;
8013 }
8014 }
8015
8016 for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
8017 for (m=0; m<EKC_core_size[Mc_AN]; m++){
8018 U0[rl0][m][i] = Utmp[rl0][m];
8019 }
8020 }
8021 }
8022 */
8023
8024 /* unrolling version */
8025
8026 for (i=0; i<Msize2[Mc_AN]; i++){
8027
8028 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8029 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8030 Utmp[rl][n] = U0[rl][n][i];
8031 }
8032 }
8033
8034 for (m1=1; m1<=(rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]-3); m1+=4){
8035
8036 rl00 = (m1+0-1)/EKC_core_size[Mc_AN];
8037 rl01 = (m1+1-1)/EKC_core_size[Mc_AN];
8038 rl02 = (m1+2-1)/EKC_core_size[Mc_AN];
8039 rl03 = (m1+3-1)/EKC_core_size[Mc_AN];
8040
8041 mm0 = (m1+0-1)%EKC_core_size[Mc_AN];
8042 mm1 = (m1+1-1)%EKC_core_size[Mc_AN];
8043 mm2 = (m1+2-1)%EKC_core_size[Mc_AN];
8044 mm3 = (m1+3-1)%EKC_core_size[Mc_AN];
8045
8046 sum0 = 0.0;
8047 sum1 = 0.0;
8048 sum2 = 0.0;
8049 sum3 = 0.0;
8050
8051 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8052
8053 n1 = rl*EKC_core_size[Mc_AN] + 1;
8054
8055 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8056 sum0 += Utmp[rl][n]*FS[m1+0][n1+n];
8057 sum1 += Utmp[rl][n]*FS[m1+1][n1+n];
8058 sum2 += Utmp[rl][n]*FS[m1+2][n1+n];
8059 sum3 += Utmp[rl][n]*FS[m1+3][n1+n];
8060 }
8061 }
8062
8063 U0[rl00][mm0][i] = sum0;
8064 U0[rl01][mm1][i] = sum1;
8065 U0[rl02][mm2][i] = sum2;
8066 U0[rl03][mm3][i] = sum3;
8067 }
8068
8069 m1s = rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN] - (rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN])%4 + 1;
8070
8071 for (m1=m1s; m1<=rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]; m1++){
8072
8073 rl0 = (m1-1)/EKC_core_size[Mc_AN];
8074 m = (m1-1)%EKC_core_size[Mc_AN];
8075
8076 sum = 0.0;
8077
8078 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8079
8080 n1 = rl*EKC_core_size[Mc_AN] + 1;
8081
8082 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8083 sum += Utmp[rl][n]*FS[m1][n1+n];
8084 }
8085 }
8086
8087 U0[rl0][m][i] = sum;
8088 }
8089 } /* i */
8090
8091 #else
8092
8093 /* Unrolling + SSE version */
8094 /**/
8095
8096 #pragma omp parallel shared(rlmax_EC,Mc_AN,EKC_core_size,Msize2,U0,FS) private(OMPID,Nthrds,Nprocs,rl,n,i,j,m,sum,k,mmSum00,mmSum01,mmSum10,mmSum11,mmSum20,mmSum21,mmSum30,mmSum31,mmTmp0,mmTmp1,mmArr,sum0,sum1,sum2,sum3,rl0,rl00,rl01,rl02,rl03,mm0,mm1,mm2,mm3,n1,m1,m1s,Utmp)
8097
8098 {
8099
8100 double ** Utmp;
8101 Utmp = (double**)malloc(sizeof(double*)*rlmax_EC[Mc_AN]);
8102 for (i=0; i<rlmax_EC[Mc_AN]; i++){
8103 Utmp[i] = (double*)malloc(sizeof(double)*EKC_core_size[Mc_AN]);
8104 }
8105
8106 OMPID = omp_get_thread_num();
8107 Nthrds = omp_get_num_threads();
8108 Nprocs = omp_get_num_procs();
8109
8110 for (i=0+OMPID; i<Msize2[Mc_AN]; i+=Nthrds){
8111
8112 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8113 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8114 Utmp[rl][n] = U0[rl][n][i];
8115 }
8116 }
8117
8118 for (m1=1; m1<=(rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]-3); m1+=4){
8119
8120 rl00 = (m1+0-1)/EKC_core_size[Mc_AN];
8121 rl01 = (m1+1-1)/EKC_core_size[Mc_AN];
8122 rl02 = (m1+2-1)/EKC_core_size[Mc_AN];
8123 rl03 = (m1+3-1)/EKC_core_size[Mc_AN];
8124
8125 mm0 = (m1+0-1)%EKC_core_size[Mc_AN];
8126 mm1 = (m1+1-1)%EKC_core_size[Mc_AN];
8127 mm2 = (m1+2-1)%EKC_core_size[Mc_AN];
8128 mm3 = (m1+3-1)%EKC_core_size[Mc_AN];
8129
8130 sum0 = 0.0;
8131 sum1 = 0.0;
8132 sum2 = 0.0;
8133 sum3 = 0.0;
8134
8135 mmSum00 = _mm_setzero_pd();
8136 mmSum01 = _mm_setzero_pd();
8137 mmSum10 = _mm_setzero_pd();
8138 mmSum11 = _mm_setzero_pd();
8139 mmSum20 = _mm_setzero_pd();
8140 mmSum21 = _mm_setzero_pd();
8141 mmSum30 = _mm_setzero_pd();
8142 mmSum31 = _mm_setzero_pd();
8143
8144 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8145
8146 n1 = rl*EKC_core_size[Mc_AN] + 1;
8147
8148 if (0){
8149 mmTmp0 = _mm_loadu_pd(&Utmp[rl][n+0]);
8150 mmTmp1 = _mm_loadu_pd(&Utmp[rl][n+2]);
8151 }
8152
8153 for (n=0; n<(EKC_core_size[Mc_AN]-3); n+=4){
8154 mmTmp0 = _mm_loadu_pd(&Utmp[rl][n+0]);
8155 mmTmp1 = _mm_loadu_pd(&Utmp[rl][n+2]);
8156
8157 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&FS[m1+0][n1+n+0]),mmTmp0));
8158 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&FS[m1+0][n1+n+2]),mmTmp1));
8159
8160 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&FS[m1+1][n1+n+0]),mmTmp0));
8161 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&FS[m1+1][n1+n+2]),mmTmp1));
8162
8163 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&FS[m1+2][n1+n+0]),mmTmp0));
8164 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&FS[m1+2][n1+n+2]),mmTmp1));
8165
8166 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&FS[m1+3][n1+n+0]),mmTmp0));
8167 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&FS[m1+3][n1+n+2]),mmTmp1));
8168 }
8169
8170 for (; n<EKC_core_size[Mc_AN]; n++){
8171 sum0 += Utmp[rl][n]*FS[m1+0][n1+n];
8172 sum1 += Utmp[rl][n]*FS[m1+1][n1+n];
8173 sum2 += Utmp[rl][n]*FS[m1+2][n1+n];
8174 sum3 += Utmp[rl][n]*FS[m1+3][n1+n];
8175 }
8176
8177 }
8178
8179 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
8180 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
8181 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
8182 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
8183
8184 _mm_storeu_pd(&mmArr[0], mmSum00);
8185 _mm_storeu_pd(&mmArr[2], mmSum10);
8186 _mm_storeu_pd(&mmArr[4], mmSum20);
8187 _mm_storeu_pd(&mmArr[6], mmSum30);
8188
8189 sum0 += mmArr[0] + mmArr[1];
8190 sum1 += mmArr[2] + mmArr[3];
8191 sum2 += mmArr[4] + mmArr[5];
8192 sum3 += mmArr[6] + mmArr[7];
8193
8194 U0[rl00][mm0][i] = sum0;
8195 U0[rl01][mm1][i] = sum1;
8196 U0[rl02][mm2][i] = sum2;
8197 U0[rl03][mm3][i] = sum3;
8198 }
8199
8200 m1s = rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN] - (rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN])%4 + 1;
8201
8202 for (m1=m1s; m1<=rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]; m1++){
8203
8204 rl0 = (m1-1)/EKC_core_size[Mc_AN];
8205 m = (m1-1)%EKC_core_size[Mc_AN];
8206
8207 sum = 0.0;
8208
8209 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8210
8211 n1 = rl*EKC_core_size[Mc_AN] + 1;
8212
8213 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8214 sum += Utmp[rl][n]*FS[m1][n1+n];
8215 }
8216 }
8217
8218 U0[rl0][m][i] = sum;
8219 }
8220 } /* i */
8221
8222 for (i=0; i<rlmax_EC[Mc_AN]; i++){
8223 free(Utmp[i]);
8224 }
8225 free(Utmp);
8226
8227
8228 } /* #pragma omp parallel */
8229
8230 #endif
8231
8232 Krylov_U[spin][Mc_AN][0] = ZeroNum;
8233
8234 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8235 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8236 for (i=0; i<Msize2[Mc_AN]; i++){
8237 Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+i+1] = U0[rl][n][i];
8238 }
8239 }
8240 }
8241
8242 if (measure_time==1){
8243 dtime(&Etime1);
8244 time9 += Etime1 - Stime1;
8245 }
8246
8247 /************************************************************
8248 check the orthonormality of Krylov vectors
8249 ************************************************************/
8250
8251 if (error_check==1){
8252
8253 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8254
8255 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8256 for (i=0; i<Msize2[Mc_AN]; i++){
8257 tmpvec1[n][i] = 0.0;
8258 }
8259 }
8260
8261 for (i=0; i<=can; i++){
8262
8263 ig = natn[ct_AN][i];
8264 ian = Spe_Total_CNO[WhatSpecies[ig]];
8265 Anum = MP[i] - 1;
8266 ih = S_G2M[ig];
8267
8268 for (j=0; j<=can; j++){
8269
8270 kl = RMI1[Mc_AN][i][j];
8271 jg = natn[ct_AN][j];
8272 jan = Spe_Total_CNO[WhatSpecies[jg]];
8273 Bnum = MP[j] - 1;
8274
8275 if (0<=kl){
8276
8277 for (m=0; m<ian; m++){
8278 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8279
8280 sum = 0.0;
8281 for (k=0; k<jan; k++){
8282 sum += OLP0[ih][kl][m][k]*U0[rl][n][Bnum+k];
8283 }
8284
8285 tmpvec1[n][Anum+m] += sum;
8286 }
8287 }
8288 }
8289 }
8290 }
8291
8292 for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
8293 for (m=0; m<EKC_core_size[Mc_AN]; m++){
8294 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8295 sum = 0.0;
8296 for (i=0; i<Msize2[Mc_AN]; i++){
8297 sum += U0[rl0][m][i]*tmpvec1[n][i];
8298 }
8299
8300 if (rl==rl0 && m==n){
8301 if ( 1.0e-10<fabs(sum-1.0) ) {
8302 printf("A spin=%2d Mc_AN=%2d rl=%2d rl0=%2d m=%2d n=%2d sum=%18.15f\n",
8303 spin,Mc_AN,rl,rl0,m,n,sum);
8304 }
8305 }
8306 else{
8307 if ( 1.0e-10<fabs(sum) ) {
8308 printf("B spin=%2d Mc_AN=%2d rl=%2d rl0=%2d m=%2d n=%2d sum=%18.15f\n",
8309 spin,Mc_AN,rl,rl0,m,n,sum);
8310 }
8311 }
8312
8313 }
8314 }
8315 }
8316 }
8317 }
8318
8319 if (measure_time==1){
8320 printf("pMatrix myid=%2d time1 =%5.3f time2 =%5.3f time3 =%5.3f time4 =%5.3f\n",
8321 myid,time1,time2,time3,time4);
8322 printf("pMatrix myid=%2d time5 =%5.3f time6 =%5.3f time7 =%5.3f time8 =%5.3f\n",
8323 myid,time5,time6,time7,time8);
8324 printf("pMatrix myid=%2d time9 =%5.3f\n",myid,time9);
8325 }
8326
8327 /* freeing of arrays */
8328
8329 for (i=0; i<rlmax_EC[Mc_AN]; i++){
8330 free(Utmp[i]);
8331 }
8332 free(Utmp);
8333
8334 for (i=0; i<rlmax_EC[Mc_AN]; i++){
8335 for (j=0; j<EKC_core_size[Mc_AN]; j++){
8336 free(U0[i][j]);
8337 }
8338 free(U0[i]);
8339 }
8340 free(U0);
8341
8342 for (i=0; i<(EKC_core_size[Mc_AN]+4); i++){
8343 free(tmpmat0[i]);
8344 }
8345 free(tmpmat0);
8346
8347 for (i=0; i<(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]; i++){
8348 free(FS[i]);
8349 }
8350 free(FS);
8351
8352 free(ko);
8353 free(iko);
8354
8355 for (i=0; i<(EKC_core_size[Mc_AN]+2); i++){
8356 free(matRS0[i]);
8357 }
8358 free(matRS0);
8359
8360 for (i=0; i<(Msize4[Mc_AN]+3); i++){
8361 free(matRS1[i]);
8362 }
8363 free(matRS1);
8364 }
8365
8366
8367
8368
8369
8370
8371
8372
Generate_pMatrix2_trd(int myid,int spin,int Mc_AN,double ***** Hks,double **** OLP0,double *** Krylov_U,int * MP,int * Msize,int * Msize2,int * Msize3,double ** tmpvec1)8373 void Generate_pMatrix2_trd( int myid, int spin, int Mc_AN, double *****Hks, double ****OLP0,
8374 double ***Krylov_U, int *MP, int *Msize, int *Msize2, int *Msize3,
8375 double **tmpvec1)
8376
8377 /* This subroutine is exactly the same as the original Generate_pMatrix2,
8378 except that Eigen_lapack is replaced with Eigen_lapack_d, _x, or _r
8379 */
8380
8381 {
8382 int rl,rl0,rl1,ct_AN,fan,san,can,wan,ct_on,i,j;
8383 int n,Anum,Bnum,k,ian,ih,kl,jg,ig,jan,m,m1,n1,info;
8384 int ZeroNum,rl_half;
8385 int KU_d1,KU_d2,csize;
8386 double sum,dum,tmp0,tmp1,tmp2,tmp3;
8387 double **Utmp;
8388 double *ko,*iko;
8389 double **FS;
8390 double ***U0;
8391
8392 int rl00,rl01,rl02,rl03,rl04,rl05,rl06,rl07;
8393 int mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7,m1s;
8394 __m128d mmSum00,mmSum01,mmSum10,mmSum11,mmSum20,mmSum21,mmSum30,mmSum31, mmTmp0, mmTmp1, mmTmp2, mmTmp3, mmTmp4, mmTmp5;
8395
8396 double mmArr[8];
8397 double sum0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
8398
8399 ct_AN = M2G[Mc_AN];
8400 fan = FNAN[ct_AN];
8401 san = SNAN[ct_AN];
8402 can = fan + san;
8403 wan = WhatSpecies[ct_AN];
8404 ct_on = Spe_Total_CNO[wan];
8405
8406 if (Msize[Mc_AN]<Msize3[Mc_AN])
8407 csize = Msize3[Mc_AN] + 40;
8408 else
8409 csize = Msize[Mc_AN] + 40;
8410
8411 KU_d1 = EKC_core_size[Mc_AN]*Msize2[Mc_AN];
8412 KU_d2 = Msize2[Mc_AN];
8413
8414 /* allocation of arrays */
8415
8416 Utmp = (double**)malloc(sizeof(double*)*rlmax_EC[Mc_AN]);
8417 for (i=0; i<rlmax_EC[Mc_AN]; i++){
8418 Utmp[i] = (double*)malloc(sizeof(double)*EKC_core_size[Mc_AN]);
8419 }
8420
8421 U0 = (double***)malloc(sizeof(double**)*rlmax_EC[Mc_AN]);
8422 for (i=0; i<rlmax_EC[Mc_AN]; i++){
8423 U0[i] = (double**)malloc(sizeof(double*)*EKC_core_size[Mc_AN]);
8424 for (j=0; j<EKC_core_size[Mc_AN]; j++){
8425 U0[i][j] = (double*)malloc(sizeof(double)*(Msize2[Mc_AN]+3));
8426 for (k=0; k<(Msize2[Mc_AN]+3); k++) U0[i][j][k] = 0.0;
8427 }
8428 }
8429
8430 FS = (double**)malloc(sizeof(double*)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
8431 for (i=0; i<(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]; i++){
8432 FS[i] = (double*)malloc(sizeof(double)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
8433 }
8434
8435 ko = (double*)malloc(sizeof(double)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
8436 iko = (double*)malloc(sizeof(double)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
8437
8438 /****************************************************
8439 initialize
8440 ****************************************************/
8441
8442 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8443 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8444 for (i=0; i<Msize2[Mc_AN]; i++){
8445 U0[rl][n][i] = 0.0;
8446 }
8447 }
8448 }
8449
8450 i = 0;
8451 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8452 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8453 if (i<Msize2[Mc_AN]) U0[rl][n][i] = 1.0;
8454 i++;
8455 }
8456 }
8457
8458 /************************************************************
8459 orthogonalization by diagonalization
8460 ************************************************************/
8461
8462 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8463
8464 /* S * |Vn) */
8465
8466 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8467 for (i=0; i<Msize2[Mc_AN]; i++){
8468 tmpvec1[n][i] = 0.0;
8469 }
8470 }
8471
8472 for (i=0; i<=can; i++){
8473
8474 ig = natn[ct_AN][i];
8475 ian = Spe_Total_CNO[WhatSpecies[ig]];
8476 Anum = MP[i] - 1;
8477 ih = S_G2M[ig];
8478
8479 for (j=0; j<=can; j++){
8480
8481 kl = RMI1[Mc_AN][i][j];
8482 jg = natn[ct_AN][j];
8483 jan = Spe_Total_CNO[WhatSpecies[jg]];
8484 Bnum = MP[j] - 1;
8485
8486 if (0<=kl){
8487
8488 #ifdef nosse
8489
8490 /* Original version */
8491 /**/
8492 for (m=0; m<ian; m++){
8493 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8494
8495 sum = 0.0;
8496 for (k=0; k<jan; k++){
8497 sum += OLP0[ih][kl][m][k]*U0[rl][n][Bnum+k];
8498 }
8499 tmpvec1[n][Anum+m] += sum;
8500 }
8501 }
8502 /**/
8503
8504 #else
8505
8506 /* Unrolling + SSE version */
8507
8508 for (m=0; m<(ian-3); m+=4){
8509 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8510
8511 mmSum00 = _mm_setzero_pd();
8512 mmSum01 = _mm_setzero_pd();
8513 mmSum10 = _mm_setzero_pd();
8514 mmSum11 = _mm_setzero_pd();
8515 mmSum20 = _mm_setzero_pd();
8516 mmSum21 = _mm_setzero_pd();
8517 mmSum30 = _mm_setzero_pd();
8518 mmSum31 = _mm_setzero_pd();
8519
8520 for (k=0; k<(jan-3); k+=4){
8521 mmTmp0 = _mm_loadu_pd(&U0[rl][n][Bnum+k+0]);
8522 mmTmp1 = _mm_loadu_pd(&U0[rl][n][Bnum+k+2]);
8523
8524 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+0]),mmTmp0));
8525 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+2]),mmTmp1));
8526
8527 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+0]),mmTmp0));
8528 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+2]),mmTmp1));
8529
8530 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+0]),mmTmp0));
8531 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+2]),mmTmp1));
8532
8533 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+0]),mmTmp0));
8534 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+2]),mmTmp1));
8535 }
8536
8537 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
8538 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
8539 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
8540 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
8541
8542 _mm_storeu_pd(&mmArr[0], mmSum00);
8543 _mm_storeu_pd(&mmArr[2], mmSum10);
8544 _mm_storeu_pd(&mmArr[4], mmSum20);
8545 _mm_storeu_pd(&mmArr[6], mmSum30);
8546
8547 sum0 = mmArr[0] + mmArr[1];
8548 sum1 = mmArr[2] + mmArr[3];
8549 sum2 = mmArr[4] + mmArr[5];
8550 sum3 = mmArr[6] + mmArr[7];
8551
8552 for (; k<jan; k++){
8553 sum0 += OLP0[ih][kl][m+0][k]*U0[rl][n][Bnum+k];
8554 sum1 += OLP0[ih][kl][m+1][k]*U0[rl][n][Bnum+k];
8555 sum2 += OLP0[ih][kl][m+2][k]*U0[rl][n][Bnum+k];
8556 sum3 += OLP0[ih][kl][m+3][k]*U0[rl][n][Bnum+k];
8557 }
8558
8559 tmpvec1[n][Anum+m+0] += sum0;
8560 tmpvec1[n][Anum+m+1] += sum1;
8561 tmpvec1[n][Anum+m+2] += sum2;
8562 tmpvec1[n][Anum+m+3] += sum3;
8563 }
8564 }
8565
8566 for (; m<ian; m++){
8567 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8568
8569 sum = 0.0;
8570 for (k=0; k<jan; k++){
8571 sum += OLP0[ih][kl][m][k]*U0[rl][n][Bnum+k];
8572 }
8573 tmpvec1[n][Anum+m] += sum;
8574 }
8575 }
8576
8577 #endif
8578
8579 }
8580 }
8581 }
8582
8583 #ifdef nosse
8584
8585 /* Original version */
8586 /**/
8587 for (rl0=rl; rl0<rlmax_EC[Mc_AN]; rl0++){
8588 for (m=0; m<EKC_core_size[Mc_AN]; m++){
8589 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8590 sum = 0.0;
8591 for (i=0; i<Msize2[Mc_AN]; i++){
8592 sum += U0[rl0][m][i]*tmpvec1[n][i];
8593 }
8594 FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
8595 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum;
8596 }
8597 }
8598 }
8599 /**/
8600
8601 #else
8602
8603 /* Unrolling + SSE version */
8604
8605 for (rl0=rl; rl0<rlmax_EC[Mc_AN]; rl0++){
8606 for (m=0; m<(EKC_core_size[Mc_AN]-3); m+=4){
8607 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8608
8609 mmSum00 = _mm_setzero_pd();
8610 mmSum01 = _mm_setzero_pd();
8611 mmSum10 = _mm_setzero_pd();
8612 mmSum11 = _mm_setzero_pd();
8613 mmSum20 = _mm_setzero_pd();
8614 mmSum21 = _mm_setzero_pd();
8615 mmSum30 = _mm_setzero_pd();
8616 mmSum31 = _mm_setzero_pd();
8617
8618 for (i=0; i<(Msize2[Mc_AN]-3); i+=4){
8619 mmTmp0 = _mm_loadu_pd(&tmpvec1[n][i+0]);
8620 mmTmp1 = _mm_loadu_pd(&tmpvec1[n][i+2]);
8621
8622 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+0][i+0]),mmTmp0));
8623 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+0][i+2]),mmTmp1));
8624
8625 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+1][i+0]),mmTmp0));
8626 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+1][i+2]),mmTmp1));
8627
8628 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+2][i+0]),mmTmp0));
8629 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+2][i+2]),mmTmp1));
8630
8631 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+3][i+0]),mmTmp0));
8632 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+3][i+2]),mmTmp1));
8633 }
8634
8635 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
8636 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
8637 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
8638 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
8639
8640 _mm_storeu_pd(&mmArr[0], mmSum00);
8641 _mm_storeu_pd(&mmArr[2], mmSum10);
8642 _mm_storeu_pd(&mmArr[4], mmSum20);
8643 _mm_storeu_pd(&mmArr[6], mmSum30);
8644
8645 sum0 = mmArr[0] + mmArr[1];
8646 sum1 = mmArr[2] + mmArr[3];
8647 sum2 = mmArr[4] + mmArr[5];
8648 sum3 = mmArr[6] + mmArr[7];
8649
8650 for (; i<Msize2[Mc_AN]; i++){
8651 sum0 += U0[rl0][m+0][i]*tmpvec1[n][i];
8652 sum1 += U0[rl0][m+1][i]*tmpvec1[n][i];
8653 sum2 += U0[rl0][m+2][i]*tmpvec1[n][i];
8654 sum3 += U0[rl0][m+3][i]*tmpvec1[n][i];
8655 }
8656
8657 FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum0;
8658 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum0;
8659
8660 FS[rl0*EKC_core_size[Mc_AN]+m+2][rl*EKC_core_size[Mc_AN]+n+1] = sum1;
8661 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+2] = sum1;
8662
8663 FS[rl0*EKC_core_size[Mc_AN]+m+3][rl*EKC_core_size[Mc_AN]+n+1] = sum2;
8664 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+3] = sum2;
8665
8666 FS[rl0*EKC_core_size[Mc_AN]+m+4][rl*EKC_core_size[Mc_AN]+n+1] = sum3;
8667 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+4] = sum3;
8668 }
8669 }
8670
8671 for (; m<EKC_core_size[Mc_AN]; m++){
8672 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8673 sum = 0.0;
8674 for (i=0; i<Msize2[Mc_AN]; i++){
8675 sum += U0[rl0][m][i]*tmpvec1[n][i];
8676 }
8677 FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
8678 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum;
8679 }
8680 }
8681 }
8682
8683 #endif
8684
8685 }
8686
8687
8688 info = Eigen_lapack_d(FS,ko,Msize3[Mc_AN],Msize3[Mc_AN]);
8689 if (info!=0){
8690 info = Eigen_lapack_x(FS,ko,Msize3[Mc_AN],Msize3[Mc_AN]);
8691 if (info!=0){
8692 Eigen_lapack_r(FS,ko,Msize3[Mc_AN],Msize3[Mc_AN]);
8693 }
8694 }
8695
8696 ZeroNum = 0;
8697
8698 for (i=1; i<=Msize3[Mc_AN]; i++){
8699
8700 if (error_check==1){
8701 printf("spin=%2d Mc_AN=%2d i=%3d ko[i]=%18.15f\n",spin,Mc_AN,i,ko[i]);
8702 }
8703
8704 if (cutoff_value<ko[i]){
8705 ko[i] = sqrt(fabs(ko[i]));
8706 iko[i] = 1.0/ko[i];
8707 }
8708 else{
8709 ZeroNum++;
8710 ko[i] = 0.0;
8711 iko[i] = 0.0;
8712 }
8713 }
8714
8715 if (error_check==1){
8716 printf("spin=%2d Mc_AN=%2d ZeroNum=%2d\n",spin,Mc_AN,ZeroNum);
8717 }
8718
8719 for (i=1; i<=Msize3[Mc_AN]; i++){
8720 for (j=1; j<=Msize3[Mc_AN]; j++){
8721 FS[i][j] = FS[i][j]*iko[j];
8722 }
8723 }
8724
8725 /* transpose for later calculation */
8726 for (i=1; i<=Msize3[Mc_AN]; i++){
8727 for (j=i+1; j<=Msize3[Mc_AN]; j++){
8728 tmp1 = FS[i][j];
8729 tmp2 = FS[j][i];
8730 FS[i][j] = tmp2;
8731 FS[j][i] = tmp1;
8732 }
8733 }
8734
8735 /* U0 * U * lamda^{-1/2} */
8736
8737 #ifdef nosse
8738
8739 /* original version */
8740
8741 /**/
8742 for (i=0; i<Msize2[Mc_AN]; i++){
8743 for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
8744 for (m=0; m<EKC_core_size[Mc_AN]; m++){
8745
8746 m1 = rl0*EKC_core_size[Mc_AN] + m + 1;
8747
8748 sum = 0.0;
8749 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8750
8751 n1 = rl*EKC_core_size[Mc_AN] + 1;
8752
8753 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8754 sum += U0[rl][n][i]*FS[m1][n1+n];
8755 }
8756 }
8757
8758 Utmp[rl0][m] = sum;
8759 }
8760 }
8761
8762 for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
8763 for (m=0; m<EKC_core_size[Mc_AN]; m++){
8764 U0[rl0][m][i] = Utmp[rl0][m];
8765 }
8766 }
8767 }
8768 /**/
8769
8770 #else
8771
8772 /* Unrolling + SSE version */
8773 /**/
8774 for (i=0; i<Msize2[Mc_AN]; i++){
8775
8776 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8777 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8778 Utmp[rl][n] = U0[rl][n][i];
8779 }
8780 }
8781
8782 for (m1=1; m1<=(rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]-3); m1+=4){
8783
8784 rl00 = (m1+0-1)/EKC_core_size[Mc_AN];
8785 rl01 = (m1+1-1)/EKC_core_size[Mc_AN];
8786 rl02 = (m1+2-1)/EKC_core_size[Mc_AN];
8787 rl03 = (m1+3-1)/EKC_core_size[Mc_AN];
8788
8789 mm0 = (m1+0-1)%EKC_core_size[Mc_AN];
8790 mm1 = (m1+1-1)%EKC_core_size[Mc_AN];
8791 mm2 = (m1+2-1)%EKC_core_size[Mc_AN];
8792 mm3 = (m1+3-1)%EKC_core_size[Mc_AN];
8793
8794 sum0 = 0.0;
8795 sum1 = 0.0;
8796 sum2 = 0.0;
8797 sum3 = 0.0;
8798
8799 mmSum00 = _mm_setzero_pd();
8800 mmSum01 = _mm_setzero_pd();
8801 mmSum10 = _mm_setzero_pd();
8802 mmSum11 = _mm_setzero_pd();
8803 mmSum20 = _mm_setzero_pd();
8804 mmSum21 = _mm_setzero_pd();
8805 mmSum30 = _mm_setzero_pd();
8806 mmSum31 = _mm_setzero_pd();
8807
8808 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8809
8810 n1 = rl*EKC_core_size[Mc_AN] + 1;
8811
8812 if (0){
8813 mmTmp0 = _mm_loadu_pd(&Utmp[rl][n+0]); /*???????????*/
8814 mmTmp1 = _mm_loadu_pd(&Utmp[rl][n+2]); /*???????????*/
8815 }
8816
8817 for (n=0; n<(EKC_core_size[Mc_AN]-3); n+=4){
8818 mmTmp0 = _mm_loadu_pd(&Utmp[rl][n+0]);
8819 mmTmp1 = _mm_loadu_pd(&Utmp[rl][n+2]);
8820 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&FS[m1+0][n1+n+0]),mmTmp0));
8821 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&FS[m1+0][n1+n+2]),mmTmp1));
8822
8823 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&FS[m1+1][n1+n+0]),mmTmp0));
8824 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&FS[m1+1][n1+n+2]),mmTmp1));
8825
8826 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&FS[m1+2][n1+n+0]),mmTmp0));
8827 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&FS[m1+2][n1+n+2]),mmTmp1));
8828
8829 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&FS[m1+3][n1+n+0]),mmTmp0));
8830 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&FS[m1+3][n1+n+2]),mmTmp1));
8831 }
8832
8833 for (; n<EKC_core_size[Mc_AN]; n++){
8834 sum0 += Utmp[rl][n]*FS[m1+0][n1+n];
8835 sum1 += Utmp[rl][n]*FS[m1+1][n1+n];
8836 sum2 += Utmp[rl][n]*FS[m1+2][n1+n];
8837 sum3 += Utmp[rl][n]*FS[m1+3][n1+n];
8838 }
8839 }
8840
8841 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
8842 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
8843 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
8844 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
8845
8846 _mm_storeu_pd(&mmArr[0], mmSum00);
8847 _mm_storeu_pd(&mmArr[2], mmSum10);
8848 _mm_storeu_pd(&mmArr[4], mmSum20);
8849 _mm_storeu_pd(&mmArr[6], mmSum30);
8850
8851 sum0 += mmArr[0] + mmArr[1];
8852 sum1 += mmArr[2] + mmArr[3];
8853 sum2 += mmArr[4] + mmArr[5];
8854 sum3 += mmArr[6] + mmArr[7];
8855
8856 U0[rl00][mm0][i] = sum0;
8857 U0[rl01][mm1][i] = sum1;
8858 U0[rl02][mm2][i] = sum2;
8859 U0[rl03][mm3][i] = sum3;
8860 }
8861
8862 m1s = rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN] - (rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN])%4 + 1;
8863
8864 for (m1=m1s; m1<=rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]; m1++){
8865
8866 rl0 = (m1-1)/EKC_core_size[Mc_AN];
8867 m = (m1-1)%EKC_core_size[Mc_AN];
8868
8869 sum = 0.0;
8870
8871 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8872
8873 n1 = rl*EKC_core_size[Mc_AN] + 1;
8874
8875 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8876 sum += Utmp[rl][n]*FS[m1][n1+n];
8877 }
8878 }
8879
8880 U0[rl0][m][i] = sum;
8881 }
8882 } /* i */
8883
8884 #endif
8885
8886 Krylov_U[spin][Mc_AN][0] = ZeroNum;
8887
8888 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8889 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8890 for (i=0; i<Msize2[Mc_AN]; i++){
8891 Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+i+1] = U0[rl][n][i];
8892 }
8893 }
8894 }
8895
8896 /************************************************************
8897 check the orthonormality of Krylov vectors
8898 ************************************************************/
8899
8900 if (error_check==1){
8901
8902 for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8903
8904 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8905 for (i=0; i<Msize2[Mc_AN]; i++){
8906 tmpvec1[n][i] = 0.0;
8907 }
8908 }
8909
8910 for (i=0; i<=can; i++){
8911
8912 ig = natn[ct_AN][i];
8913 ian = Spe_Total_CNO[WhatSpecies[ig]];
8914 Anum = MP[i] - 1;
8915 ih = S_G2M[ig];
8916
8917 for (j=0; j<=can; j++){
8918
8919 kl = RMI1[Mc_AN][i][j];
8920 jg = natn[ct_AN][j];
8921 jan = Spe_Total_CNO[WhatSpecies[jg]];
8922 Bnum = MP[j] - 1;
8923
8924 if (0<=kl){
8925
8926 for (m=0; m<ian; m++){
8927 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8928
8929 sum = 0.0;
8930 for (k=0; k<jan; k++){
8931 sum += OLP0[ih][kl][m][k]*U0[rl][n][Bnum+k];
8932 }
8933
8934 tmpvec1[n][Anum+m] += sum;
8935 }
8936 }
8937 }
8938 }
8939 }
8940
8941 for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
8942 for (m=0; m<EKC_core_size[Mc_AN]; m++){
8943 for (n=0; n<EKC_core_size[Mc_AN]; n++){
8944 sum = 0.0;
8945 for (i=0; i<Msize2[Mc_AN]; i++){
8946 sum += U0[rl0][m][i]*tmpvec1[n][i];
8947 }
8948
8949 if (rl==rl0 && m==n){
8950 if ( 1.0e-10<fabs(sum-1.0) ) {
8951 printf("A spin=%2d Mc_AN=%2d rl=%2d rl0=%2d m=%2d n=%2d sum=%18.15f\n",
8952 spin,Mc_AN,rl,rl0,m,n,sum);
8953 }
8954 }
8955 else{
8956 if ( 1.0e-10<fabs(sum) ) {
8957 printf("B spin=%2d Mc_AN=%2d rl=%2d rl0=%2d m=%2d n=%2d sum=%18.15f\n",
8958 spin,Mc_AN,rl,rl0,m,n,sum);
8959 }
8960 }
8961
8962 }
8963 }
8964 }
8965 }
8966 }
8967
8968 /* freeing of arrays */
8969
8970 for (i=0; i<rlmax_EC[Mc_AN]; i++){
8971 free(Utmp[i]);
8972 }
8973 free(Utmp);
8974
8975 for (i=0; i<rlmax_EC[Mc_AN]; i++){
8976 for (j=0; j<EKC_core_size[Mc_AN]; j++){
8977 free(U0[i][j]);
8978 }
8979 free(U0[i]);
8980 }
8981 free(U0);
8982
8983 for (i=0; i<(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]; i++){
8984 free(FS[i]);
8985 }
8986 free(FS);
8987
8988 free(ko);
8989 free(iko);
8990 }
8991
8992
8993
8994
8995
8996
8997
Embedding_Matrix_trd(int spin,int Mc_AN,double ***** Hks,double *** Krylov_U,double **** EC_matrix,int * MP,int * Msize,int * Msize2,int * Msize3,double ** tmpvec,int EKC_core_size_max,int Msize2_max)8998 void Embedding_Matrix_trd(int spin, int Mc_AN, double *****Hks,
8999 double ***Krylov_U, double ****EC_matrix,
9000 int *MP, int *Msize, int *Msize2, int *Msize3,
9001 double **tmpvec, int EKC_core_size_max, int Msize2_max)
9002
9003 /* This subroutine is exactly the same as the original Embedding_Matrix,
9004 except for the OMP parallelized loops
9005 */
9006
9007 {
9008 int ct_AN,fan,san,can,wan,ct_on;
9009 int rl,rl0,m,n,i,j,k,kl,jg,jan,ih,ian;
9010 int Anum,Bnum,ig;
9011 int KU_d1,KU_d2,csize;
9012 double sum,tmp1,tmp2,tmp3;
9013
9014 __m128d mmSum00,mmSum01,mmSum10,mmSum11,mmSum20,mmSum21,mmSum30,mmSum31, mmTmp0, mmTmp1;
9015 double mmArr[8];
9016 double sum0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
9017
9018 /* for OpenMP */
9019 int OMPID,Nthrds,Nprocs;
9020
9021 ct_AN = M2G[Mc_AN];
9022 fan = FNAN[ct_AN];
9023 san = SNAN[ct_AN];
9024 can = fan + san;
9025 wan = WhatSpecies[ct_AN];
9026 ct_on = Spe_Total_CNO[wan];
9027
9028 if (Msize[Mc_AN]<Msize3[Mc_AN])
9029 csize = Msize3[Mc_AN] + 40;
9030 else
9031 csize = Msize[Mc_AN] + 40;
9032
9033 KU_d1 = EKC_core_size[Mc_AN]*Msize2[Mc_AN];
9034 KU_d2 = Msize2[Mc_AN];
9035
9036 /*******************************
9037 u1^+ C^+ u2
9038 *******************************/
9039
9040 #pragma omp parallel shared(EKC_core_size_max,Msize2_max,rlmax_EC,Mc_AN,EKC_core_size,Msize2,fan,can,natn,ct_AN,Spe_Total_CNO,WhatSpecies,MP,S_G2M,RMI1,Hks,Krylov_U,spin,KU_d1,KU_d2,Msize,EC_matrix) private(OMPID,Nthrds,Nprocs,rl,n,i,ig,ian,Anum,ih,j,kl,jg,jan,Bnum,m,sum,k,mmSum00,mmSum01,mmSum10,mmSum11,mmSum20,mmSum21,mmSum30,mmSum31,mmTmp0,mmTmp1,mmArr,sum0,sum1,sum2,sum3,rl0)
9041
9042 {
9043
9044 double **tmpvec1;
9045 tmpvec1 = (double**)malloc(sizeof(double*)*EKC_core_size_max);
9046 for (i=0; i<EKC_core_size_max; i++){
9047 tmpvec1[i] = (double*)malloc(sizeof(double)*Msize2_max);
9048 }
9049
9050 /* get info. on OpenMP */
9051
9052 OMPID = omp_get_thread_num();
9053 Nthrds = omp_get_num_threads();
9054 Nprocs = omp_get_num_procs();
9055
9056 for (rl=0+OMPID; rl<rlmax_EC[Mc_AN]; rl+=Nthrds){
9057
9058 /* C^+ u2 */
9059
9060 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9061 for (i=0; i<Msize2[Mc_AN]; i++){
9062 tmpvec1[n][i] = 0.0;
9063 }
9064 }
9065
9066 for (i=0; i<=fan; i++){
9067
9068 ig = natn[ct_AN][i];
9069 ian = Spe_Total_CNO[WhatSpecies[ig]];
9070 Anum = MP[i] - 1;
9071 ih = S_G2M[ig];
9072
9073 for (j=fan+1; j<=can; j++){
9074
9075 kl = RMI1[Mc_AN][i][j];
9076 jg = natn[ct_AN][j];
9077 jan = Spe_Total_CNO[WhatSpecies[jg]];
9078 Bnum = MP[j];
9079
9080 if (0<=kl){
9081
9082 #ifdef nosse
9083
9084 /* Original version */
9085 /**/
9086 for (m=0; m<ian; m++){
9087 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9088
9089 sum = 0.0;
9090 for (k=0; k<jan; k++){
9091 sum += Hks[spin][ih][kl][m][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
9092 }
9093
9094 tmpvec1[n][Anum+m] += sum;
9095 }
9096 }
9097 /**/
9098
9099 #else
9100
9101 /* Unrolling + SSE version */
9102 /**/
9103 for (m=0; m<(ian-3); m+=4){
9104 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9105
9106 mmSum00 = _mm_setzero_pd();
9107 mmSum01 = _mm_setzero_pd();
9108 mmSum10 = _mm_setzero_pd();
9109 mmSum11 = _mm_setzero_pd();
9110 mmSum20 = _mm_setzero_pd();
9111 mmSum21 = _mm_setzero_pd();
9112 mmSum30 = _mm_setzero_pd();
9113 mmSum31 = _mm_setzero_pd();
9114
9115 for (k=0; k<(jan-3); k+=4){
9116 mmTmp0 = _mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+0]);
9117 mmTmp1 = _mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+2]);
9118
9119 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+0][k+0]),mmTmp0));
9120 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+0][k+2]),mmTmp1));
9121
9122 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+1][k+0]),mmTmp0));
9123 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+1][k+2]),mmTmp1));
9124
9125 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+2][k+0]),mmTmp0));
9126 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+2][k+2]),mmTmp1));
9127
9128 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+3][k+0]),mmTmp0));
9129 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+3][k+2]),mmTmp1));
9130 }
9131
9132 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
9133 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
9134 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
9135 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
9136
9137 _mm_storeu_pd(&mmArr[0], mmSum00);
9138 _mm_storeu_pd(&mmArr[2], mmSum10);
9139 _mm_storeu_pd(&mmArr[4], mmSum20);
9140 _mm_storeu_pd(&mmArr[6], mmSum30);
9141
9142 sum0 = mmArr[0] + mmArr[1];
9143 sum1 = mmArr[2] + mmArr[3];
9144 sum2 = mmArr[4] + mmArr[5];
9145 sum3 = mmArr[6] + mmArr[7];
9146
9147 for (; k<jan; k++){
9148 sum0 += Hks[spin][ih][kl][m+0][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
9149 sum1 += Hks[spin][ih][kl][m+1][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
9150 sum2 += Hks[spin][ih][kl][m+2][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
9151 sum3 += Hks[spin][ih][kl][m+3][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
9152 }
9153
9154 tmpvec1[n][Anum+m+0] += sum0;
9155 tmpvec1[n][Anum+m+1] += sum1;
9156 tmpvec1[n][Anum+m+2] += sum2;
9157 tmpvec1[n][Anum+m+3] += sum3;
9158 }
9159 }
9160
9161 for (; m<ian; m++){
9162 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9163
9164 sum = 0.0;
9165 for (k=0; k<jan; k++){
9166 sum += Hks[spin][ih][kl][m][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
9167 }
9168
9169 tmpvec1[n][Anum+m] += sum;
9170 }
9171 }
9172 /**/
9173 #endif
9174
9175 }
9176 }
9177 }
9178
9179 /* u1^+ C^+ u2 */
9180
9181 #ifdef nosse
9182
9183 /* Original version */
9184 /**/
9185 for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
9186 for (m=0; m<EKC_core_size[Mc_AN]; m++){
9187 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9188 sum = 0.0;
9189 for (i=0; i<Msize[Mc_AN]; i++){
9190 sum += Krylov_U[spin][Mc_AN][rl0*KU_d1+m*KU_d2+i+1]*tmpvec1[n][i];
9191 }
9192 EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
9193 }
9194 }
9195 }
9196 /**/
9197
9198 #else
9199
9200 /* Unrolling + SSE version */
9201 /**/
9202 for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
9203 for (m=0; m<(EKC_core_size[Mc_AN]-3); m+=4){
9204 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9205
9206 mmSum00 = _mm_setzero_pd();
9207 mmSum01 = _mm_setzero_pd();
9208 mmSum10 = _mm_setzero_pd();
9209 mmSum11 = _mm_setzero_pd();
9210 mmSum20 = _mm_setzero_pd();
9211 mmSum21 = _mm_setzero_pd();
9212 mmSum30 = _mm_setzero_pd();
9213 mmSum31 = _mm_setzero_pd();
9214
9215 for (i=0; i<(Msize[Mc_AN]-3); i+=4){
9216 mmTmp0 = _mm_loadu_pd(&tmpvec1[n][i+0]);
9217 mmTmp1 = _mm_loadu_pd(&tmpvec1[n][i+2]);
9218
9219 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+1]),mmTmp0));
9220 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+3]),mmTmp1));
9221
9222 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+1)*KU_d2+i+1]),mmTmp0));
9223 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+1)*KU_d2+i+3]),mmTmp1));
9224
9225 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+2)*KU_d2+i+1]),mmTmp0));
9226 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+2)*KU_d2+i+3]),mmTmp1));
9227
9228 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+3)*KU_d2+i+1]),mmTmp0));
9229 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+3)*KU_d2+i+3]),mmTmp1));
9230 }
9231
9232 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
9233 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
9234 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
9235 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
9236
9237 _mm_storeu_pd(&mmArr[0], mmSum00);
9238 _mm_storeu_pd(&mmArr[2], mmSum10);
9239 _mm_storeu_pd(&mmArr[4], mmSum20);
9240 _mm_storeu_pd(&mmArr[6], mmSum30);
9241
9242 sum0 = mmArr[0] + mmArr[1];
9243 sum1 = mmArr[2] + mmArr[3];
9244 sum2 = mmArr[4] + mmArr[5];
9245 sum3 = mmArr[6] + mmArr[7];
9246
9247 for (; i<Msize[Mc_AN]; i++){
9248 sum0 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+1]*tmpvec1[n][i];
9249 sum1 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+1)*KU_d2+i+1]*tmpvec1[n][i];
9250 sum2 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+2)*KU_d2+i+1]*tmpvec1[n][i];
9251 sum3 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+3)*KU_d2+i+1]*tmpvec1[n][i];
9252 }
9253
9254 EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum0;
9255 EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+2][rl*EKC_core_size[Mc_AN]+n+1] = sum1;
9256 EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+3][rl*EKC_core_size[Mc_AN]+n+1] = sum2;
9257 EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+4][rl*EKC_core_size[Mc_AN]+n+1] = sum3;
9258 }
9259 }
9260
9261 for (; m<EKC_core_size[Mc_AN]; m++){
9262 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9263 sum = 0.0;
9264 for (i=0; i<Msize[Mc_AN]; i++){
9265 sum += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+1]*tmpvec1[n][i];
9266 }
9267 EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
9268 }
9269 }
9270
9271 }
9272 /**/
9273
9274 #endif
9275
9276 } /* rl */
9277
9278 for (i=0; i<EKC_core_size_max; i++){
9279 free(tmpvec1[i]);
9280 }
9281 free(tmpvec1);
9282
9283
9284 } /* #pragma omp parallel */
9285
9286 /*******************************
9287 u2^+ C u1
9288 *******************************/
9289
9290 for (i=1; i<=Msize3[Mc_AN]; i++){
9291 for (j=i; j<=Msize3[Mc_AN]; j++){
9292
9293 tmp1 = EC_matrix[spin][Mc_AN][i][j];
9294 tmp2 = EC_matrix[spin][Mc_AN][j][i];
9295 tmp3 = tmp1 + tmp2;
9296
9297 EC_matrix[spin][Mc_AN][i][j] = tmp3;
9298 EC_matrix[spin][Mc_AN][j][i] = tmp3;
9299 }
9300 }
9301
9302 /*******************************
9303 u2^+ B u2
9304 *******************************/
9305
9306 #pragma omp parallel shared(EKC_core_size_max,Msize2_max,rlmax_EC,Mc_AN,EKC_core_size,Msize2,fan,can,natn,ct_AN,Spe_Total_CNO,WhatSpecies,MP,S_G2M,RMI1,Hks,Krylov_U,spin,KU_d1,KU_d2,Msize,EC_matrix) private(OMPID,Nthrds,Nprocs,rl,n,i,ig,ian,Anum,ih,j,kl,jg,jan,Bnum,m,sum,k,mmSum00,mmSum01,mmSum10,mmSum11,mmSum20,mmSum21,mmSum30,mmSum31,mmTmp0,mmTmp1,mmArr,sum0,sum1,sum2,sum3,rl0)
9307
9308 {
9309
9310 double **tmpvec1;
9311 tmpvec1 = (double**)malloc(sizeof(double*)*EKC_core_size_max);
9312 for (i=0; i<EKC_core_size_max; i++){
9313 tmpvec1[i] = (double*)malloc(sizeof(double)*Msize2_max);
9314 }
9315
9316 /* get info. on OpenMP */
9317
9318 OMPID = omp_get_thread_num();
9319 Nthrds = omp_get_num_threads();
9320 Nprocs = omp_get_num_procs();
9321
9322 for (rl=0+OMPID; rl<rlmax_EC[Mc_AN]; rl+=Nthrds){
9323
9324 /* B u2 */
9325
9326 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9327 for (i=0; i<Msize2[Mc_AN]; i++){
9328 tmpvec1[n][i] = 0.0;
9329 }
9330 }
9331
9332 for (i=fan+1; i<=can; i++){
9333
9334 ig = natn[ct_AN][i];
9335 ian = Spe_Total_CNO[WhatSpecies[ig]];
9336 Anum = MP[i] - 1;
9337 ih = S_G2M[ig];
9338
9339 for (j=fan+1; j<=can; j++){
9340
9341 kl = RMI1[Mc_AN][i][j];
9342 jg = natn[ct_AN][j];
9343 jan = Spe_Total_CNO[WhatSpecies[jg]];
9344 Bnum = MP[j];
9345
9346 if (0<=kl){
9347
9348 #ifdef nosse
9349
9350 /* Original version */
9351 /**/
9352 for (m=0; m<ian; m++){
9353 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9354
9355 sum = 0.0;
9356 for (k=0; k<jan; k++){
9357 sum += Hks[spin][ih][kl][m][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
9358 }
9359
9360 tmpvec1[n][Anum+m] += sum;
9361 }
9362 }
9363 /**/
9364
9365 #else
9366
9367 /* Unrolling + SSE version */
9368 /**/
9369 for (m=0; m<(ian-3); m+=4){
9370 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9371
9372 mmSum00 = _mm_setzero_pd();
9373 mmSum01 = _mm_setzero_pd();
9374 mmSum10 = _mm_setzero_pd();
9375 mmSum11 = _mm_setzero_pd();
9376 mmSum20 = _mm_setzero_pd();
9377 mmSum21 = _mm_setzero_pd();
9378 mmSum30 = _mm_setzero_pd();
9379 mmSum31 = _mm_setzero_pd();
9380
9381 for (k=0; k<(jan-3); k+=4){
9382 mmTmp0 = _mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+0]);
9383 mmTmp1 = _mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+2]);
9384
9385 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+0][k+0]),mmTmp0));
9386 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+0][k+2]),mmTmp1));
9387
9388 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+1][k+0]),mmTmp0));
9389 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+1][k+2]),mmTmp1));
9390
9391 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+2][k+0]),mmTmp0));
9392 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+2][k+2]),mmTmp1));
9393
9394 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+3][k+0]),mmTmp0));
9395 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+3][k+2]),mmTmp1));
9396 }
9397
9398 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
9399 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
9400 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
9401 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
9402
9403 _mm_storeu_pd(&mmArr[0], mmSum00);
9404 _mm_storeu_pd(&mmArr[2], mmSum10);
9405 _mm_storeu_pd(&mmArr[4], mmSum20);
9406 _mm_storeu_pd(&mmArr[6], mmSum30);
9407
9408 sum0 = mmArr[0] + mmArr[1];
9409 sum1 = mmArr[2] + mmArr[3];
9410 sum2 = mmArr[4] + mmArr[5];
9411 sum3 = mmArr[6] + mmArr[7];
9412
9413 for (; k<jan; k++){
9414 sum0 += Hks[spin][ih][kl][m+0][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+0];
9415 sum1 += Hks[spin][ih][kl][m+1][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+0];
9416 sum2 += Hks[spin][ih][kl][m+2][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+0];
9417 sum3 += Hks[spin][ih][kl][m+3][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+0];
9418 }
9419
9420 tmpvec1[n][Anum+m+0] += sum0;
9421 tmpvec1[n][Anum+m+1] += sum1;
9422 tmpvec1[n][Anum+m+2] += sum2;
9423 tmpvec1[n][Anum+m+3] += sum3;
9424 }
9425 }
9426
9427 for (; m<ian; m++){
9428 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9429
9430 sum = 0.0;
9431 for (k=0; k<jan; k++){
9432 sum += Hks[spin][ih][kl][m][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
9433 }
9434
9435 tmpvec1[n][Anum+m] += sum;
9436 }
9437 }
9438 /**/
9439
9440 #endif
9441
9442 }
9443 }
9444 }
9445
9446 /* u2^+ B u2 */
9447
9448 #ifdef nosse
9449
9450 /* Original version */
9451 /**/
9452 for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
9453 for (m=0; m<EKC_core_size[Mc_AN]; m++){
9454 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9455 sum = 0.0;
9456 for (i=Msize[Mc_AN]; i<Msize2[Mc_AN]; i++){
9457 sum += Krylov_U[spin][Mc_AN][rl0*KU_d1+m*KU_d2+i+1]*tmpvec1[n][i];
9458 }
9459 EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] += sum;
9460 }
9461 }
9462 }
9463 /**/
9464
9465 #else
9466
9467 /* Unrolling + SSE version */
9468 /**/
9469 for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
9470 for (m=0; m<(EKC_core_size[Mc_AN]-3); m+=4){
9471 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9472
9473 mmSum00 = _mm_setzero_pd();
9474 mmSum01 = _mm_setzero_pd();
9475 mmSum10 = _mm_setzero_pd();
9476 mmSum11 = _mm_setzero_pd();
9477 mmSum20 = _mm_setzero_pd();
9478 mmSum21 = _mm_setzero_pd();
9479 mmSum30 = _mm_setzero_pd();
9480 mmSum31 = _mm_setzero_pd();
9481
9482 for (i=Msize[Mc_AN]; i<(Msize2[Mc_AN]-3); i+=4){
9483 mmTmp0 = _mm_loadu_pd(&tmpvec1[n][i+0]);
9484 mmTmp1 = _mm_loadu_pd(&tmpvec1[n][i+2]);
9485
9486 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+1]),mmTmp0));
9487 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+3]),mmTmp1));
9488
9489 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+1)*KU_d2+i+1]),mmTmp0));
9490 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+1)*KU_d2+i+3]),mmTmp1));
9491
9492 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+2)*KU_d2+i+1]),mmTmp0));
9493 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+2)*KU_d2+i+3]),mmTmp1));
9494
9495 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+3)*KU_d2+i+1]),mmTmp0));
9496 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+3)*KU_d2+i+3]),mmTmp1));
9497 }
9498
9499 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
9500 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
9501 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
9502 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
9503
9504 _mm_storeu_pd(&mmArr[0], mmSum00);
9505 _mm_storeu_pd(&mmArr[2], mmSum10);
9506 _mm_storeu_pd(&mmArr[4], mmSum20);
9507 _mm_storeu_pd(&mmArr[6], mmSum30);
9508
9509 sum0 = mmArr[0] + mmArr[1];
9510 sum1 = mmArr[2] + mmArr[3];
9511 sum2 = mmArr[4] + mmArr[5];
9512 sum3 = mmArr[6] + mmArr[7];
9513
9514 for (; i<Msize2[Mc_AN]; i++){
9515 sum0 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+1]*tmpvec1[n][i];
9516 sum1 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+1)*KU_d2+i+1]*tmpvec1[n][i];
9517 sum2 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+2)*KU_d2+i+1]*tmpvec1[n][i];
9518 sum3 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+3)*KU_d2+i+1]*tmpvec1[n][i];
9519 }
9520
9521 EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] += sum0;
9522 EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+2][rl*EKC_core_size[Mc_AN]+n+1] += sum1;
9523 EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+3][rl*EKC_core_size[Mc_AN]+n+1] += sum2;
9524 EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+4][rl*EKC_core_size[Mc_AN]+n+1] += sum3;
9525 }
9526 }
9527
9528 for (; m<EKC_core_size[Mc_AN]; m++){
9529 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9530 sum = 0.0;
9531 for (i=Msize[Mc_AN]; i<Msize2[Mc_AN]; i++){
9532 sum += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+1]*tmpvec1[n][i];
9533 }
9534 EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] += sum;
9535 }
9536 }
9537
9538 }
9539 /**/
9540
9541 #endif
9542
9543 } /* rl */
9544
9545 for (i=0; i<EKC_core_size_max; i++){
9546 free(tmpvec1[i]);
9547 }
9548 free(tmpvec1);
9549
9550 } /* #pragma omp parallel */
9551
9552 }
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
Krylov_IOLP_trd(int Mc_AN,double **** OLP0,double *** Krylov_U_OLP,double ** inv_RS,int * MP,int * Msize2,int * Msize4,int Msize2_max,double ** tmpvec0,double ** tmpvec1)9563 void Krylov_IOLP_trd( int Mc_AN, double ****OLP0, double ***Krylov_U_OLP, double **inv_RS, int *MP,
9564 int *Msize2, int *Msize4, int Msize2_max, double **tmpvec0, double **tmpvec1 )
9565
9566 /* This subroutine is exactly the same as the original Krylov_IOLP,
9567 except that Eigen_lapack is replaced with Eigen_lapack_d, _x, or _r
9568 */
9569
9570 {
9571 int rl,ct_AN,fan,san,can,wan,ct_on,i,j;
9572 int n,Anum,Bnum,k,ian,ih,kl,jg,ig,jan,m,m1,n1,info;
9573 int rl0,ZeroNum,Neumann_series,ns,Gh_AN,wanB;
9574 double sum,dum,tmp0,tmp1,tmp2,tmp3,rcutA,r0;
9575 double **tmpmat0;
9576 double *ko,*iko;
9577 double **FS;
9578
9579 ct_AN = M2G[Mc_AN];
9580 fan = FNAN[ct_AN];
9581 san = SNAN[ct_AN];
9582 can = fan + san;
9583 wan = WhatSpecies[ct_AN];
9584 ct_on = Spe_Total_CNO[wan];
9585 rcutA = Spe_Atom_Cut1[wan];
9586
9587 /* allocation of arrays */
9588
9589 tmpmat0 = (double**)malloc(sizeof(double*)*(EKC_core_size[Mc_AN]+4));
9590 for (i=0; i<(EKC_core_size[Mc_AN]+4); i++){
9591 tmpmat0[i] = (double*)malloc(sizeof(double)*(EKC_core_size[Mc_AN]+4));
9592 }
9593
9594 FS = (double**)malloc(sizeof(double*)*(rlmax_EC2[Mc_AN]+2)*EKC_core_size[Mc_AN]);
9595 for (i=0; i<(rlmax_EC2[Mc_AN]+2)*EKC_core_size[Mc_AN]; i++){
9596 FS[i] = (double*)malloc(sizeof(double)*(rlmax_EC2[Mc_AN]+2)*EKC_core_size[Mc_AN]);
9597 }
9598
9599 ko = (double*)malloc(sizeof(double)*(rlmax_EC2[Mc_AN]+2)*EKC_core_size[Mc_AN]);
9600 iko = (double*)malloc(sizeof(double)*(rlmax_EC2[Mc_AN]+2)*EKC_core_size[Mc_AN]);
9601
9602 /****************************************************
9603 initialize
9604 ****************************************************/
9605
9606 for (i=0; i<EKC_core_size_max; i++){
9607 for (j=0; j<Msize2_max; j++){
9608 tmpvec0[i][j] = 0.0;
9609 tmpvec1[i][j] = 0.0;
9610 }
9611 }
9612
9613 /* find the nearest atom with distance of r0 */
9614
9615 r0 = 1.0e+10;
9616 for (k=1; k<=FNAN[ct_AN]; k++){
9617 Gh_AN = natn[ct_AN][k];
9618 wanB = WhatSpecies[Gh_AN];
9619 if (Dis[ct_AN][k]<r0) r0 = Dis[ct_AN][k];
9620 }
9621
9622 /* starting vector */
9623
9624 m = 0;
9625 for (k=0; k<=FNAN[ct_AN]; k++){
9626
9627 Gh_AN = natn[ct_AN][k];
9628 wanB = WhatSpecies[Gh_AN];
9629
9630 if ( Dis[ct_AN][k]<(scale_rc_EKC[Mc_AN]*r0) ){
9631
9632 Anum = MP[k] - 1;
9633
9634 for (i=0; i<Spe_Total_CNO[wanB]; i++){
9635
9636 tmpvec0[m][Anum+i] = 1.0;
9637 Krylov_U_OLP[0][m][Anum+i] = 1.0;
9638
9639 m++;
9640 }
9641 }
9642 }
9643
9644 /*
9645 for (i=0; i<EKC_core_size[Mc_AN]; i++){
9646 tmpvec0[i][i] = 1.0;
9647 Krylov_U_OLP[0][i][i] = 1.0;
9648 }
9649 */
9650
9651 /****************************************************
9652 generate Krylov subspace vectors
9653 ****************************************************/
9654
9655 for (rl=0; rl<(rlmax_EC2[Mc_AN]-1); rl++){
9656
9657 /*******************************************************
9658 S * |Wn)
9659 *******************************************************/
9660
9661 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9662 for (i=0; i<Msize2[Mc_AN]; i++){
9663 tmpvec1[n][i] = 0.0;
9664 }
9665 }
9666
9667 for (i=0; i<=can; i++){
9668
9669 ig = natn[ct_AN][i];
9670 ian = Spe_Total_CNO[WhatSpecies[ig]];
9671 Anum = MP[i] - 1;
9672 ih = S_G2M[ig];
9673
9674 for (j=0; j<=can; j++){
9675
9676 kl = RMI1[Mc_AN][i][j];
9677 jg = natn[ct_AN][j];
9678 jan = Spe_Total_CNO[WhatSpecies[jg]];
9679 Bnum = MP[j] - 1;
9680
9681 if (0<=kl){
9682
9683 for (m=0; m<ian; m++){
9684 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9685
9686 sum = 0.0;
9687 for (k=0; k<jan; k++){
9688 sum += OLP0[ih][kl][m][k]*tmpvec0[n][Bnum+k];
9689 }
9690
9691 tmpvec1[n][Anum+m] += sum;
9692 }
9693 }
9694 }
9695 }
9696 }
9697
9698 /*************************************************************
9699 orthogonalization by a modified block Gram-Schmidt method
9700 *************************************************************/
9701
9702 /* |tmpvec1) := (I - \sum_{rl0} |U_rl0)(U_rl0|)|tmpvec1) */
9703
9704 for (rl0=0; rl0<=rl; rl0++){
9705
9706 /* (U_rl0|tmpvec1) */
9707
9708 for (m=0; m<EKC_core_size[Mc_AN]; m++){
9709 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9710 sum = 0.0;
9711 for (i=0; i<Msize2[Mc_AN]; i++){
9712 sum += Krylov_U_OLP[rl0][m][i]*tmpvec1[n][i];
9713 }
9714 tmpmat0[m][n] = sum;
9715 }
9716 }
9717
9718 /* |tmpvec1) := |tmpvec1) - |U_rl0)(U_rl0|tmpvec1) */
9719
9720 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9721 for (k=0; k<EKC_core_size[Mc_AN]; k++){
9722 dum = tmpmat0[k][n];
9723 for (i=0; i<Msize2[Mc_AN]; i++) tmpvec1[n][i] -= Krylov_U_OLP[rl0][k][i]*dum;
9724 }
9725 }
9726 }
9727
9728 /*************************************************************
9729 normalization of tmpvec1
9730 *************************************************************/
9731
9732 for (m=0; m<EKC_core_size[Mc_AN]; m++){
9733 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9734
9735 sum = 0.0;
9736 for (i=0; i<Msize2[Mc_AN]; i++){
9737 sum += tmpvec1[m][i]*tmpvec1[n][i];
9738 }
9739
9740 tmpmat0[m+1][n+1] = sum;
9741 }
9742 }
9743
9744 /* diagonalize tmpmat0 */
9745
9746 if ( EKC_core_size[Mc_AN]==1){
9747 ko[1] = tmpmat0[1][1];
9748 tmpmat0[1][1] = 1.0;
9749 }
9750 else{
9751
9752 info = Eigen_lapack_d(tmpmat0, ko, EKC_core_size[Mc_AN], EKC_core_size[Mc_AN]);
9753 if (info!=0){
9754 info = Eigen_lapack_x(tmpmat0, ko, EKC_core_size[Mc_AN], EKC_core_size[Mc_AN]);
9755 if (info!=0){
9756 Eigen_lapack_r(tmpmat0, ko, EKC_core_size[Mc_AN], EKC_core_size[Mc_AN]);
9757 }
9758 }
9759 }
9760
9761 ZeroNum = 0;
9762
9763 for (n=1; n<=EKC_core_size[Mc_AN]; n++){
9764 if (cutoff_value<ko[n]){
9765 ko[n] = sqrt(fabs(ko[n]));
9766 iko[n] = 1.0/ko[n];
9767 }
9768 else{
9769 ZeroNum++;
9770 ko[n] = 0.0;
9771 iko[n] = 0.0;
9772 }
9773
9774 if (error_check==1){
9775 printf("rl=%3d ko=%18.15f\n",rl,ko[n]);
9776 }
9777 }
9778
9779 if (error_check==1){
9780 printf("rl=%3d ZeroNum=%3d\n",rl,ZeroNum);
9781 }
9782
9783 /* tmpvec0 = tmpvec1 * tmpmat0^{-1/2} */
9784
9785 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9786 for (i=0; i<Msize2[Mc_AN]; i++){
9787 tmpvec0[n][i] = 0.0;
9788 }
9789 }
9790
9791 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9792 for (k=0; k<EKC_core_size[Mc_AN]; k++){
9793 dum = tmpmat0[k+1][n+1]*iko[n+1];
9794 for (i=0; i<Msize2[Mc_AN]; i++) tmpvec0[n][i] += tmpvec1[k][i]*dum;
9795 }
9796 }
9797
9798 /*************************************************************
9799 store Krylov vectors
9800 *************************************************************/
9801
9802 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9803 for (i=0; i<Msize2[Mc_AN]; i++){
9804 Krylov_U_OLP[rl+1][n][i] = tmpvec0[n][i];
9805 }
9806 }
9807
9808 } /* rl */
9809
9810 /************************************************************
9811 calculate the inverse of the reduced overlap matrix
9812 ************************************************************/
9813
9814 /* construct the reduced overlap matrix */
9815
9816 for (rl=0; rl<rlmax_EC2[Mc_AN]; rl++){
9817
9818 /* S * |Vn) */
9819
9820 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9821 for (i=0; i<Msize2[Mc_AN]; i++){
9822 tmpvec1[n][i] = 0.0;
9823 }
9824 }
9825
9826 for (i=0; i<=can; i++){
9827
9828 ig = natn[ct_AN][i];
9829 ian = Spe_Total_CNO[WhatSpecies[ig]];
9830 Anum = MP[i] - 1;
9831 ih = S_G2M[ig];
9832
9833 for (j=0; j<=can; j++){
9834
9835 kl = RMI1[Mc_AN][i][j];
9836 jg = natn[ct_AN][j];
9837 jan = Spe_Total_CNO[WhatSpecies[jg]];
9838 Bnum = MP[j] - 1;
9839
9840 if (0<=kl){
9841
9842 for (m=0; m<ian; m++){
9843 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9844
9845 sum = 0.0;
9846 for (k=0; k<jan; k++){
9847 sum += OLP0[ih][kl][m][k]*Krylov_U_OLP[rl][n][Bnum+k];
9848 }
9849 tmpvec1[n][Anum+m] += sum;
9850 }
9851 }
9852 }
9853 }
9854 }
9855
9856 for (rl0=rl; rl0<rlmax_EC2[Mc_AN]; rl0++){
9857 for (m=0; m<EKC_core_size[Mc_AN]; m++){
9858 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9859 sum = 0.0;
9860 for (i=0; i<Msize2[Mc_AN]; i++){
9861 sum += Krylov_U_OLP[rl0][m][i]*tmpvec1[n][i];
9862 }
9863 FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
9864 FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum;
9865 }
9866 }
9867 }
9868 }
9869
9870 /* diagonalize FS */
9871
9872 info = Eigen_lapack_d(FS,ko,Msize4[Mc_AN],Msize4[Mc_AN]);
9873 if (info!=0){
9874 info = Eigen_lapack_x(FS,ko,Msize4[Mc_AN],Msize4[Mc_AN]);
9875 if (info!=0){
9876 Eigen_lapack_r(FS,ko,Msize4[Mc_AN],Msize4[Mc_AN]);
9877 }
9878 }
9879
9880 /* find ill-conditioned eigenvalues */
9881
9882 ZeroNum = 0;
9883 for (i=1; i<=Msize4[Mc_AN]; i++){
9884
9885 if (error_check==1){
9886 printf("Mc_AN=%2d i=%3d ko[i]=%18.15f\n",Mc_AN,i,ko[i]);
9887 }
9888
9889 if (cutoff_value<ko[i]){
9890 iko[i] = 1.0/ko[i];
9891 }
9892 else{
9893 ZeroNum++;
9894 ko[i] = 0.0;
9895 iko[i] = 0.0;
9896 }
9897 }
9898
9899 if (error_check==1){
9900 printf("Mc_AN=%2d ZeroNum=%2d\n",Mc_AN,ZeroNum);
9901 }
9902
9903 /* construct the inverse */
9904
9905 for (i=1; i<=Msize4[Mc_AN]; i++){
9906 for (j=1; j<=Msize4[Mc_AN]; j++){
9907 sum = 0.0;
9908 for (k=1; k<=Msize4[Mc_AN]; k++){
9909 sum += FS[i][k]*iko[k]*FS[j][k];
9910 }
9911 inv_RS[i-1][j-1] = sum;
9912 }
9913 }
9914
9915 /* symmetrization of inv_RS */
9916
9917 for (i=1; i<=Msize4[Mc_AN]; i++){
9918 for (j=i+1; j<=Msize4[Mc_AN]; j++){
9919 tmp0 = inv_RS[i-1][j-1];
9920 tmp1 = inv_RS[j-1][i-1];
9921 tmp2 = 0.5*(tmp0 + tmp1);
9922 inv_RS[i-1][j-1] = tmp2;
9923 inv_RS[j-1][i-1] = tmp2;
9924 }
9925 }
9926
9927 /*
9928 {
9929
9930
9931 double mat1[1000][1000];
9932 double tsum;
9933 int myid;
9934
9935 MPI_Comm_rank(mpi_comm_level1,&myid);
9936
9937 if (myid==0){
9938
9939 printf("check normalization\n");
9940
9941 for (rl=0; rl<rlmax_EC2[Mc_AN]; rl++){
9942 for (m=0; m<EKC_core_size[Mc_AN]; m++){
9943 for (rl0=0; rl0<rlmax_EC2[Mc_AN]; rl0++){
9944 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9945
9946 sum = 0.0;
9947 for (i=0; i<Msize2[Mc_AN]; i++){
9948 sum += Krylov_U_OLP[rl][m][i]*Krylov_U_OLP[rl0][n][i];
9949 }
9950 printf("rl=%3d rl0=%3d m=%3d n=%3d <|>=%18.15f\n",rl,rl0,m,n,sum);
9951 }
9952 }
9953 }
9954 }
9955
9956
9957 printf("\n\ninvS\n");
9958
9959
9960 for (rl=0; rl<rlmax_EC2[Mc_AN]; rl++){
9961 for (m=0; m<EKC_core_size[Mc_AN]; m++){
9962
9963 for (i=0; i<Msize2[Mc_AN]; i++){
9964
9965 sum = 0.0;
9966
9967 for (rl0=0; rl0<rlmax_EC2[Mc_AN]; rl0++){
9968 for (n=0; n<EKC_core_size[Mc_AN]; n++){
9969 sum += inv_RS[rl*EKC_core_size[Mc_AN]+m][rl0*EKC_core_size[Mc_AN]+n]*Krylov_U_OLP[rl0][n][i];
9970 }
9971 }
9972
9973 mat1[rl*EKC_core_size[Mc_AN]+m][i] = sum;
9974 }
9975 }
9976 }
9977
9978
9979 tsum = 0.0;
9980
9981 for (i=0; i<Msize2[Mc_AN]; i++){
9982 for (j=0; j<Msize2[Mc_AN]; j++){
9983
9984 sum = 0.0;
9985 for (rl=0; rl<rlmax_EC2[Mc_AN]; rl++){
9986 for (m=0; m<EKC_core_size[Mc_AN]; m++){
9987 sum += Krylov_U_OLP[rl][m][i]*mat1[rl*EKC_core_size[Mc_AN]+m][j];
9988 }
9989 }
9990
9991 printf("i=%4d j=%4d %18.15f\n",i,j,sum);
9992
9993 tsum += fabs(sum);
9994
9995 }
9996 }
9997
9998
9999 printf("\n\ntsum=%18.15f\n",tsum);
10000
10001 }
10002
10003
10004 MPI_Finalize();
10005 exit(0);
10006 }
10007 */
10008
10009 /* freeing of arrays */
10010
10011 for (i=0; i<(EKC_core_size[Mc_AN]+4); i++){
10012 free(tmpmat0[i]);
10013 }
10014 free(tmpmat0);
10015
10016 for (i=0; i<(rlmax_EC2[Mc_AN]+2)*EKC_core_size[Mc_AN]; i++){
10017 free(FS[i]);
10018 }
10019 free(FS);
10020
10021 free(ko);
10022 free(iko);
10023 }
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
S_orthonormalize_vec_trd(int Mc_AN,int ct_on,double ** vec,double ** workvec,double **** OLP0,double ** tmpmat0,double * ko,double * iko,int * MP,int * Msize2)10036 void S_orthonormalize_vec_trd( int Mc_AN, int ct_on, double **vec,
10037 double **workvec, double ****OLP0,
10038 double **tmpmat0, double *ko, double *iko,
10039 int *MP, int *Msize2 )
10040
10041 /* This subroutine is exactly the same as the original S_orthonormalize_vec,
10042 except that Eigen_lapack is replaced with Eigen_lapack_d, _x, or _r
10043 */
10044
10045 {
10046 int n,i,j,can,san,fan,ct_AN;
10047 int k,m,ZeroNum,Anum,Bnum,ih;
10048 int kl,jg,jan,ig,ian,info;
10049 double dum,sum;
10050 double tmp0,tmp1,tmp2,tmp3;
10051 __m128d mmSum00,mmSum01,mmSum10,mmSum11,mmSum20,mmSum21,mmSum30,mmSum31, mmTmp0, mmTmp1, mmTmp2, mmTmp3, mmTmp4, mmTmp5;
10052 double mmArr[8];
10053 double sum0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
10054
10055 ct_AN = M2G[Mc_AN];
10056 fan = FNAN[ct_AN];
10057 san = SNAN[ct_AN];
10058 can = fan + san;
10059
10060 /* S|Vn) */
10061
10062 for (n=0; n<EKC_core_size[Mc_AN]; n++){
10063 for (i=0; i<Msize2[Mc_AN]; i++){
10064 workvec[n][i] = 0.0;
10065 }
10066 }
10067
10068 for (i=0; i<=can; i++){
10069
10070 ig = natn[ct_AN][i];
10071 ian = Spe_Total_CNO[WhatSpecies[ig]];
10072 Anum = MP[i] - 1;
10073 ih = S_G2M[ig];
10074
10075 for (j=0; j<=can; j++){
10076
10077 kl = RMI1[Mc_AN][i][j];
10078 jg = natn[ct_AN][j];
10079 jan = Spe_Total_CNO[WhatSpecies[jg]];
10080 Bnum = MP[j] - 1;
10081
10082 if (0<=kl){
10083
10084 #ifdef nosse
10085
10086 /* Original version */
10087 /**/
10088 for (m=0; m<ian; m++){
10089 for (n=0; n<EKC_core_size[Mc_AN]; n++){
10090
10091 sum = 0.0;
10092 for (k=0; k<jan; k++){
10093 sum += OLP0[ih][kl][m][k]*vec[n][Bnum+k];
10094 }
10095
10096 workvec[n][Anum+m] += sum;
10097 }
10098 }
10099 /**/
10100
10101 #else
10102
10103 /* Unrolling + SSE version */
10104 /**/
10105 for (m=0; m<(ian-3); m+=4){
10106 for (n=0; n<EKC_core_size[Mc_AN]; n++){
10107
10108 mmSum00 = _mm_setzero_pd();
10109 mmSum01 = _mm_setzero_pd();
10110 mmSum10 = _mm_setzero_pd();
10111 mmSum11 = _mm_setzero_pd();
10112 mmSum20 = _mm_setzero_pd();
10113 mmSum21 = _mm_setzero_pd();
10114 mmSum30 = _mm_setzero_pd();
10115 mmSum31 = _mm_setzero_pd();
10116
10117 for (k=0; k<(jan-3); k+=4){
10118 mmTmp0 = _mm_loadu_pd(&vec[n][Bnum+k+0]);
10119 mmTmp1 = _mm_loadu_pd(&vec[n][Bnum+k+2]);
10120
10121 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+0]),mmTmp0));
10122 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+2]),mmTmp1));
10123
10124 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+0]),mmTmp0));
10125 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+2]),mmTmp1));
10126
10127 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+0]),mmTmp0));
10128 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+2]),mmTmp1));
10129
10130 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+0]),mmTmp0));
10131 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+2]),mmTmp1));
10132 }
10133
10134 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
10135 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
10136 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
10137 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
10138
10139 _mm_storeu_pd(&mmArr[0], mmSum00);
10140 _mm_storeu_pd(&mmArr[2], mmSum10);
10141 _mm_storeu_pd(&mmArr[4], mmSum20);
10142 _mm_storeu_pd(&mmArr[6], mmSum30);
10143
10144 sum0 = mmArr[0] + mmArr[1];
10145 sum1 = mmArr[2] + mmArr[3];
10146 sum2 = mmArr[4] + mmArr[5];
10147 sum3 = mmArr[6] + mmArr[7];
10148
10149 for (; k<jan; k++){
10150 sum0 += OLP0[ih][kl][m+0][k]*vec[n][Bnum+k];
10151 sum1 += OLP0[ih][kl][m+1][k]*vec[n][Bnum+k];
10152 sum2 += OLP0[ih][kl][m+2][k]*vec[n][Bnum+k];
10153 sum3 += OLP0[ih][kl][m+3][k]*vec[n][Bnum+k];
10154 }
10155
10156 workvec[n][Anum+m+0] += sum0;
10157 workvec[n][Anum+m+1] += sum1;
10158 workvec[n][Anum+m+2] += sum2;
10159 workvec[n][Anum+m+3] += sum3;
10160 }
10161 }
10162
10163 for (; m<ian; m++){
10164 for (n=0; n<EKC_core_size[Mc_AN]; n++){
10165
10166 sum = 0.0;
10167 for (k=0; k<jan; k++){
10168 sum += OLP0[ih][kl][m][k]*vec[n][Bnum+k];
10169 }
10170
10171 workvec[n][Anum+m] += sum;
10172 }
10173 }
10174 /**/
10175
10176 #endif
10177
10178 }
10179 }
10180 }
10181
10182 /* (Vn|S|Vn) */
10183
10184 #ifdef nosse
10185
10186 /* Original version */
10187 /**/
10188 for (m=0; m<EKC_core_size[Mc_AN]; m++){
10189 for (n=m; n<EKC_core_size[Mc_AN]; n++){
10190 sum = 0.0;
10191 for (i=0; i<Msize2[Mc_AN]; i++){
10192 sum += vec[m][i]*workvec[n][i];
10193 }
10194 tmpmat0[m+1][n+1] = sum;
10195 tmpmat0[n+1][m+1] = sum;
10196 }
10197 }
10198 /**/
10199
10200 #else
10201
10202 /* Unrolling + SSE version */
10203 /**/
10204 for (m=0; m<(EKC_core_size[Mc_AN]-3); m+=4){
10205 for (n=m; n<EKC_core_size[Mc_AN]; n++){
10206 mmSum00 = _mm_setzero_pd();
10207 mmSum01 = _mm_setzero_pd();
10208 mmSum10 = _mm_setzero_pd();
10209 mmSum11 = _mm_setzero_pd();
10210 mmSum20 = _mm_setzero_pd();
10211 mmSum21 = _mm_setzero_pd();
10212 mmSum30 = _mm_setzero_pd();
10213 mmSum31 = _mm_setzero_pd();
10214
10215 for (i=0; i<(Msize2[Mc_AN]-3); i+=4){
10216 mmTmp0 = _mm_loadu_pd(&workvec[n][i+0]);
10217 mmTmp1 = _mm_loadu_pd(&workvec[n][i+2]);
10218
10219 mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&vec[m+0][i+0]),mmTmp0));
10220 mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&vec[m+0][i+2]),mmTmp1));
10221
10222 mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&vec[m+1][i+0]),mmTmp0));
10223 mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&vec[m+1][i+2]),mmTmp1));
10224
10225 mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&vec[m+2][i+0]),mmTmp0));
10226 mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&vec[m+2][i+2]),mmTmp1));
10227
10228 mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&vec[m+3][i+0]),mmTmp0));
10229 mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&vec[m+3][i+2]),mmTmp1));
10230 }
10231
10232 mmSum00 = _mm_add_pd(mmSum00, mmSum01);
10233 mmSum10 = _mm_add_pd(mmSum10, mmSum11);
10234 mmSum20 = _mm_add_pd(mmSum20, mmSum21);
10235 mmSum30 = _mm_add_pd(mmSum30, mmSum31);
10236
10237 _mm_storeu_pd(&mmArr[0], mmSum00);
10238 _mm_storeu_pd(&mmArr[2], mmSum10);
10239 _mm_storeu_pd(&mmArr[4], mmSum20);
10240 _mm_storeu_pd(&mmArr[6], mmSum30);
10241
10242 sum0 = mmArr[0] + mmArr[1];
10243 sum1 = mmArr[2] + mmArr[3];
10244 sum2 = mmArr[4] + mmArr[5];
10245 sum3 = mmArr[6] + mmArr[7];
10246
10247 for (; i<Msize2[Mc_AN]; i++){
10248 sum0 += vec[m+0][i]*workvec[n][i];
10249 sum1 += vec[m+1][i]*workvec[n][i];
10250 sum2 += vec[m+2][i]*workvec[n][i];
10251 sum3 += vec[m+3][i]*workvec[n][i];
10252 }
10253
10254 tmpmat0[m+1][n+1] = sum0;
10255 tmpmat0[n+1][m+1] = sum0;
10256
10257 tmpmat0[m+2][n+1] = sum1;
10258 tmpmat0[n+1][m+2] = sum1;
10259
10260 tmpmat0[m+3][n+1] = sum2;
10261 tmpmat0[n+1][m+3] = sum2;
10262
10263 tmpmat0[m+4][n+1] = sum3;
10264 tmpmat0[n+1][m+4] = sum3;
10265 }
10266 }
10267
10268 for (; m<EKC_core_size[Mc_AN]; m++){
10269 for (n=m; n<EKC_core_size[Mc_AN]; n++){
10270 sum = 0.0;
10271 for (i=0; i<Msize2[Mc_AN]; i++){
10272 sum += vec[m][i]*workvec[n][i];
10273 }
10274 tmpmat0[m+1][n+1] = sum;
10275 tmpmat0[n+1][m+1] = sum;
10276 }
10277 }
10278 /**/
10279
10280 #endif
10281
10282 /* diagonalize tmpmat0 */
10283
10284 if ( EKC_core_size[Mc_AN]==1){
10285 ko[1] = tmpmat0[1][1];
10286 tmpmat0[1][1] = 1.0;
10287 }
10288 else{
10289
10290 info = Eigen_lapack_d(tmpmat0, ko, EKC_core_size[Mc_AN], EKC_core_size[Mc_AN]);
10291 if (info!=0){
10292 info = Eigen_lapack_x(tmpmat0, ko, EKC_core_size[Mc_AN], EKC_core_size[Mc_AN]);
10293 if (info!=0){
10294 Eigen_lapack_r(tmpmat0, ko, EKC_core_size[Mc_AN], EKC_core_size[Mc_AN]);
10295 }
10296 }
10297 }
10298
10299 ZeroNum = 0;
10300
10301 for (n=1; n<=EKC_core_size[Mc_AN]; n++){
10302 if (cutoff_value<ko[n]){
10303 ko[n] = sqrt(fabs(ko[n]));
10304 iko[n] = 1.0/ko[n];
10305 }
10306 else{
10307 ZeroNum++;
10308 ko[n] = 0.0;
10309 iko[n] = 0.0;
10310 }
10311 }
10312
10313 /* U0 = vec * tmpmat0^{-1/2} */
10314
10315 for (n=0; n<EKC_core_size[Mc_AN]; n++){
10316 for (i=0; i<Msize2[Mc_AN]; i++){
10317 workvec[n][i] = 0.0;
10318 }
10319 }
10320
10321 for (n=0; n<EKC_core_size[Mc_AN]; n++){
10322 for (k=0; k<EKC_core_size[Mc_AN]; k++){
10323 dum = tmpmat0[k+1][n+1]*iko[n+1];
10324 for (i=0; i<Msize2[Mc_AN]; i++) workvec[n][i] += vec[k][i]*dum;
10325 }
10326 }
10327
10328 for (n=0; n<EKC_core_size[Mc_AN]; n++){
10329 for (i=0; i<Msize2[Mc_AN]; i++){
10330 vec[n][i] = workvec[n][i];
10331 }
10332 }
10333 }
10334
10335
Eigen_lapack_x(double ** a,double * ko,int n0,int EVmax)10336 int Eigen_lapack_x(double **a, double *ko, int n0, int EVmax)
10337 {
10338
10339 /*
10340 F77_NAME(dsyevx,DSYEVX)()
10341
10342 input: n;
10343 input: a[n][n]; matrix A
10344 output: a[n][n]; eigevectors
10345 output: ko[n]; eigenvalues
10346 */
10347
10348 char *name="Eigen_lapack";
10349
10350 char *JOBZ="V";
10351 char *RANGE="I";
10352 char *UPLO="L";
10353
10354 INTEGER n=n0;
10355 INTEGER LDA=n0;
10356 double VL,VU; /* dummy */
10357 INTEGER IL,IU;
10358 double ABSTOL=1.0e-13;
10359 INTEGER M;
10360
10361 double *A,*Z;
10362 INTEGER LDZ=n;
10363 INTEGER LWORK;
10364 double *WORK;
10365 INTEGER *IWORK;
10366 INTEGER *IFAIL, INFO;
10367
10368 int i,j;
10369
10370 A=(double*)malloc(sizeof(double)*n*n);
10371 Z=(double*)malloc(sizeof(double)*n*n);
10372
10373 LWORK=n*8;
10374 WORK=(double*)malloc(sizeof(double)*LWORK);
10375 IWORK=(INTEGER*)malloc(sizeof(INTEGER)*n*5);
10376 IFAIL=(INTEGER*)malloc(sizeof(INTEGER)*n);
10377
10378 IL = 1;
10379 IU = EVmax;
10380
10381 for (i=0; i<n; i++) {
10382 for (j=0; j<n; j++) {
10383 A[i*n+j] = a[i+1][j+1];
10384 }
10385 }
10386
10387 #if 0
10388 printf("A=\n");
10389 for (i=0;i<n;i++) {
10390 for (j=0;j<n;j++) {
10391 printf("%f ",A[i*n+j]);
10392 }
10393 printf("\n");
10394 }
10395 fflush(stdout);
10396 #endif
10397
10398 F77_NAME(dsyevx,DSYEVX)( JOBZ, RANGE, UPLO, &n, A, &LDA, &VL, &VU, &IL, &IU,
10399 &ABSTOL, &M, ko, Z, &LDZ, WORK, &LWORK, IWORK,
10400 IFAIL, &INFO );
10401
10402 if (INFO==0){
10403
10404 /* store eigenvectors */
10405 for (i=0;i<EVmax;i++) {
10406 for (j=0;j<n;j++) {
10407 /* a[i+1][j+1]= Z[i*n+j]; */
10408 a[j+1][i+1]= Z[i*n+j];
10409 }
10410 }
10411
10412 /* shift ko by 1 */
10413 for (i=EVmax; i>=1; i--){
10414 ko[i]= ko[i-1];
10415 }
10416 }
10417
10418 if (INFO!=0) {
10419 /* printf("\n%s: error in dsyevx_, info=%d\n\n",name,INFO); */
10420 }
10421
10422 free(IFAIL); free(IWORK); free(WORK); free(Z); free(A);
10423 return INFO;
10424
10425 }
10426
10427
Eigen_lapack_d(double ** a,double * ko,int n0,int EVmax)10428 int Eigen_lapack_d(double **a, double *ko, int n0, int EVmax)
10429 {
10430
10431 /*
10432 F77_NAME(dsyevd,DSYEVD)()
10433
10434 input: n;
10435 input: a[n][n]; matrix A
10436 output: a[n][n]; eigevectors
10437 output: ko[n]; eigenvalues
10438 */
10439
10440 static char *name="Eigen_lapack";
10441
10442 char *JOBZ="V";
10443 char *UPLO="L";
10444
10445 INTEGER n=n0;
10446 INTEGER LDA=n;
10447 double VL,VU; /* dummy */
10448 INTEGER IL,IU;
10449 double ABSTOL=1.0e-13;
10450 INTEGER M;
10451
10452 double *A;
10453 INTEGER LDZ=n;
10454 INTEGER LWORK,LIWORK;
10455 double *WORK;
10456 INTEGER *IWORK;
10457 INTEGER INFO;
10458
10459 int i,j;
10460
10461 A=(double*)malloc(sizeof(double)*n*n);
10462
10463 LWORK= 1 + 6*n + 2*n*n;
10464 WORK=(double*)malloc(sizeof(double)*LWORK);
10465
10466 LIWORK = 3 + 5*n;
10467 IWORK=(INTEGER*)malloc(sizeof(INTEGER)*LIWORK);
10468
10469
10470 IL = 1;
10471 IU = EVmax;
10472
10473 for (i=0;i<n;i++) {
10474 for (j=0;j<n;j++) {
10475 A[i*n+j]= a[i+1][j+1];
10476 }
10477 }
10478
10479 #if 0
10480 printf("A=\n");
10481 for (i=0;i<n;i++) {
10482 for (j=0;j<n;j++) {
10483 printf("%f ",A[i*n+j]);
10484 }
10485 printf("\n");
10486 }
10487 fflush(stdout);
10488 #endif
10489
10490 F77_NAME(dsyevd,DSYEVD)( JOBZ, UPLO, &n, A, &LDA, ko, WORK, &LWORK, IWORK, &LIWORK, &INFO );
10491
10492 if (INFO==0){
10493
10494 /* store eigenvectors */
10495 for (i=0;i<EVmax;i++) {
10496 for (j=0;j<n;j++) {
10497 /* a[i+1][j+1]= Z[i*n+j]; */
10498 a[j+1][i+1]= A[i*n+j];
10499 }
10500 }
10501
10502 /* shift ko by 1 */
10503 for (i=EVmax; i>=1; i--){
10504 ko[i]= ko[i-1];
10505 }
10506 }
10507
10508 if (INFO!=0) {
10509 /* printf("\n%s: error in dsyevd_, info=%d\n\n",name,INFO); */
10510 }
10511
10512 free(IWORK); free(WORK); free(A);
10513 return INFO;
10514 }
10515
10516
Eigen_lapack_r(double ** a,double * ko,int n0,int EVmax)10517 int Eigen_lapack_r(double **a, double *ko, int n0, int EVmax)
10518 {
10519
10520 /*
10521 F77_NAME(dsyevr,DSYEVR)()
10522
10523 input: n;
10524 input: a[n][n]; matrix A
10525 output: a[n][n]; eigevectors
10526 output: ko[n]; eigenvalues
10527 */
10528
10529 static char *name="Eigen_lapack";
10530
10531 char *JOBZ="V";
10532 char *RANGE="I";
10533 char *UPLO="L";
10534
10535 INTEGER n=n0;
10536 INTEGER LDA=n;
10537 double VL,VU; /* dummy */
10538 INTEGER IL,IU;
10539 double ABSTOL=1.0e-13;
10540 INTEGER M;
10541
10542 double *A,*Z;
10543 INTEGER LDZ=n;
10544 INTEGER LWORK,LIWORK;
10545 double *WORK;
10546 INTEGER *IWORK;
10547 INTEGER *ISUPPZ;
10548 INTEGER INFO;
10549
10550 int i,j;
10551
10552 A=(double*)malloc(sizeof(double)*n*n);
10553 Z=(double*)malloc(sizeof(double)*n*n);
10554
10555 LWORK= (n+16)*n; /* n*26 of (n+6)*n */
10556 WORK=(double*)malloc(sizeof(double)*LWORK);
10557
10558 LIWORK = (n+1)*n; /* n*10 or ??? */
10559 IWORK=(INTEGER*)malloc(sizeof(INTEGER)*LIWORK);
10560
10561 ISUPPZ =(INTEGER*)malloc(sizeof(INTEGER)*n*2);
10562
10563 IL = 1;
10564 IU = EVmax;
10565
10566 for (i=0;i<n;i++) {
10567 for (j=0;j<n;j++) {
10568 A[i*n+j]= a[i+1][j+1];
10569 }
10570 }
10571
10572 #if 0
10573 printf("A=\n");
10574 for (i=0;i<n;i++) {
10575 for (j=0;j<n;j++) {
10576 printf("%f ",A[i*n+j]);
10577 }
10578 printf("\n");
10579 }
10580 fflush(stdout);
10581 #endif
10582
10583 F77_NAME(dsyevr,DSYEVR)( JOBZ, RANGE, UPLO, &n, A, &LDA, &VL, &VU, &IL, &IU,
10584 &ABSTOL, &M, ko, Z, &LDZ, ISUPPZ, WORK, &LWORK,
10585 IWORK, &LIWORK, &INFO );
10586
10587 if (INFO==0){
10588
10589 /* store eigenvectors */
10590 for (i=0;i<EVmax;i++) {
10591 for (j=0;j<n;j++) {
10592 /* a[i+1][j+1]= Z[i*n+j]; */
10593 a[j+1][i+1]= Z[i*n+j];
10594 }
10595 }
10596
10597 /* shift ko by 1 */
10598 for (i=EVmax; i>=1; i--){
10599 ko[i]= ko[i-1];
10600 }
10601 }
10602
10603 if (INFO!=0) {
10604 printf("\n%s: error in dsyevr_, info=%d\n\n",name,INFO);
10605 MPI_Finalize();
10606 exit(10);
10607 }
10608
10609 free(ISUPPZ); free(IWORK); free(WORK); free(Z); free(A);
10610
10611 return INFO;
10612 }
10613