1 /**********************************************************************
2   Krylov.c:
3 
4      Krylov.c is a subroutine to perform a Krylov subspace
5      method developed by T.Ozaki.
6 
7   Log of Krylov.c
8 
9      10/June/2005  Released by T.Ozaki
10 
11 ***********************************************************************/
12 
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #include <math.h>
17 #include <time.h>
18 #include "openmx_common.h"
19 #include "lapack_prototypes.h"
20 #include "mpi.h"
21 #include <omp.h>
22 
23 #define  measure_time   0
24 #define  error_check    0
25 #define  cutoff_value   Threshold_OLP_Eigen
26 
27 #include "tran_prototypes.h"
28 
29 #ifdef nosse
30 #include "mimic_sse.h"
31 #else
32 #include <emmintrin.h>
33 #endif
34 
35 #ifdef kcomp
36 #define _mm_loadu_pd  _mm_load_pd
37 #define _mm_storeu_pd _mm_store_pd
38 #endif
39 
40 static void Generate_pMatrix( int myid, int spin, int Mc_AN, double *****Hks, double ****OLP0, double **invS,
41                               double ***Krylov_U, double ***Krylov_U_OLP, double **inv_RS, int *MP,
42                               int *Msize, int *Msize2, int *Msize3, int *Msize4, int Msize2_max,
43                               double **tmpvec0, double **tmpvec1, double **tmpvec2 );
44 
45 static void Generate_pMatrix2( int myid, int spin, int Mc_AN, double *****Hks, double ****OLP0, double ***Krylov_U,
46                                int *MP, int *Msize, int *Msize2, int *Msize3,
47                                double **tmpvec1 );
48 static void Krylov_IOLP( int Mc_AN, double ****OLP0, double ***Krylov_U_OLP, double **inv_RS,
49                          int *MP, int *Msize2, int *Msize4, int Msize2_max,
50                          double **tmpvec0, double **tmpvec1 );
51 static void S_orthonormalize_vec( int Mc_AN, int ct_on, double **vec,
52                                   double **workvec, double ****OLP0,
53                                   double **tmpmat0, double *ko, double *iko, int *MP, int *Msize2 );
54 static void Embedding_Matrix(int spin, int Mc_AN, double *****Hks,
55                              double ***Krylov_U, double ****EC_matrix,
56 			     int *MP, int *Msize, int *Msize2, int *Msize3,
57                              double **tmpvec1);
58 
59 static void Inverse_S_by_Cholesky(int Mc_AN, double ****OLP0, double **invS, int *MP, int NUM, double *LoS);
60 
61 static void Save_DOS_Col(double ******Residues, double ****OLP0, double ***EVal, int **LO_TC, int **HO_TC);
62 
63 static double Krylov_Col(char *mode,
64 			 int SCF_iter,
65 			 double *****Hks,
66 			 double ****OLP0,
67 			 double *****CDM,
68 			 double *****EDM,
69 			 double Eele0[2],
70 			 double Eele1[2]);
71 
72 
73 /*******************************************************
74   The following subroutines are called only when
75   Matomnum==1 && openmp_threads_num>1
76 *******************************************************/
77 
78 static double Krylov_Col_trd(char *mode,
79 			 int SCF_iter,
80 			 double *****Hks,
81 			 double ****OLP0,
82 			 double *****CDM,
83 			 double *****EDM,
84 			 double Eele0[2],
85 			 double Eele1[2]);
86 
87 static void Generate_pMatrix_trd( int myid, int spin, int Mc_AN, double *****Hks, double ****OLP0, double **invS,
88                               double ***Krylov_U, double ***Krylov_U_OLP, double **inv_RS, int *MP,
89                               int *Msize, int *Msize2, int *Msize3, int *Msize4, int Msize2_max,
90                               double **tmpvec0, double **tmpvec1, double **tmpvec2 );
91 
92 static void Generate_pMatrix2_trd( int myid, int spin, int Mc_AN, double *****Hks, double ****OLP0, double ***Krylov_U,
93                                int *MP, int *Msize, int *Msize2, int *Msize3,
94                                double **tmpvec1 );
95 static void Krylov_IOLP_trd( int Mc_AN, double ****OLP0, double ***Krylov_U_OLP, double **inv_RS,
96                          int *MP, int *Msize2, int *Msize4, int Msize2_max,
97                          double **tmpvec0, double **tmpvec1 );
98 static void S_orthonormalize_vec_trd( int Mc_AN, int ct_on, double **vec,
99                                   double **workvec, double ****OLP0,
100                                   double **tmpmat0, double *ko, double *iko, int *MP, int *Msize2 );
101 static void Embedding_Matrix_trd(int spin, int Mc_AN, double *****Hks,
102                              double ***Krylov_U, double ****EC_matrix,
103 			     int *MP, int *Msize, int *Msize2, int *Msize3,
104                              double **tmpvec1, int EKC_core_size_max, int Msize2_max);
105 
106 
107 static int Eigen_lapack_x(double **a, double *ko, int n, int EVmax);
108 static int Eigen_lapack_d(double **a, double *ko, int n, int EVmax);
109 static int Eigen_lapack_r(double **a, double *ko, int n, int EVmax);
110 
111 
112 
113 
Krylov(char * mode,int SCF_iter,double ***** Hks,double ***** ImNL,double **** OLP0,double ***** CDM,double ***** EDM,double Eele0[2],double Eele1[2])114 double Krylov(char *mode,
115               int SCF_iter,
116               double *****Hks,
117               double *****ImNL,
118               double ****OLP0,
119               double *****CDM,
120               double *****EDM,
121               double Eele0[2], double Eele1[2])
122 {
123   double time0;
124 
125   /****************************************************
126          collinear without spin-orbit coupling
127   ****************************************************/
128 
129   if ( (SpinP_switch==0 || SpinP_switch==1) && SO_switch==0 ){
130 
131     if(atomnum<=NUMPROCS_MPI_COMM_WORLD && 1<openmp_threads_num)
132       time0 = Krylov_Col_trd(mode,SCF_iter, Hks, OLP0, CDM, EDM, Eele0, Eele1);
133     else
134       time0 = Krylov_Col(mode,SCF_iter, Hks, OLP0, CDM, EDM, Eele0, Eele1);
135   }
136 
137   /****************************************************
138          collinear with spin-orbit coupling
139   ****************************************************/
140 
141   else if ( (SpinP_switch==0 || SpinP_switch==1) && SO_switch==1 ){
142     printf("Spin-orbit coupling is not supported for collinear DFT calculations.\n");
143     MPI_Finalize();
144     exit(1);
145   }
146 
147   /****************************************************
148    non-collinear with and without spin-orbit coupling
149   ****************************************************/
150 
151   else if (SpinP_switch==3){
152     printf("The O(N) Krylov subspace method is not supported for non-collinear DFT calculations.\n");
153     MPI_Finalize();
154     exit(1);
155   }
156 
157   return time0;
158 }
159 
160 
161 
162 
163 
164 
165 
Krylov_Col(char * mode,int SCF_iter,double ***** Hks,double **** OLP0,double ***** CDM,double ***** EDM,double Eele0[2],double Eele1[2])166 static double Krylov_Col(char *mode,
167 			 int SCF_iter,
168 			 double *****Hks,
169 			 double ****OLP0,
170 			 double *****CDM,
171 			 double *****EDM,
172 			 double Eele0[2],
173 			 double Eele1[2])
174 {
175   static int firsttime=1,recalc_firsttime=1,recalc_flag;
176   int Mc_AN,Gc_AN,i,is,js,Gi,wan,wanA,wanB,Anum;
177   int num,NUM0,NUM,NUM1,n2,Cwan,Hwan,Rn2;
178   int size1,size2,max_size1,max_size2;
179   int ih,ig,ian,j,kl,jg,jan,Bnum,m,n,spin,i2,ip;
180   int k,l,i1,j1,P_min,m_size,q1,q2,csize,Residues_size;
181   int h_AN1,Mh_AN1,h_AN2,Gh_AN1,Gh_AN2,wan1,wan2;
182   int po,po1,loopN,tno1,tno2,h_AN,Gh_AN,rl1,rl2,rl;
183   int MA_AN,GA_AN,tnoA,GB_AN,tnoB,ct_on;
184   int Msize2_max;
185   static double TZ;
186   double My_TZ,sum,FermiF,time0,srt;
187   double sum00,sum10,sum20,sum30;
188   double sum01,sum11,sum21,sum31;
189   double sum02,sum12,sum22,sum32;
190   double sum03,sum13,sum23,sum33;
191   double tmp0,tmp1,tmp2,tmp3,b2,co,x0,Sx,Dx,xmin,xmax;
192   double My_Num_State,Num_State,x,Dnum;
193   double emin,emax,de;
194   double TStime,TEtime,Stime1,Etime1;
195   double Stime2,Etime2;
196   double time1,time2,time3,time4,time5;
197   double time6,time7,time8,time9,time10;
198   double time11,time12,time13,time14,time15,time16;
199   double Erange;
200   double My_Eele0[2],My_Eele1[2];
201   double max_x=30.0;
202   double ChemP_MAX,ChemP_MIN,spin_degeneracy;
203   double spetrum_radius;
204   double ***EVal;
205   double ******Residues;
206   double ***PDOS_DC;
207   double *tmp_array;
208   double *tmp_array2;
209 
210   int *MP;
211 
212   /*****************************************************
213   Msize:   \sum_FNAN Spe_Total_CNO
214   Msize2:  \sum_FNAN+SNAN Spe_Total_CNO
215   Msize3:  rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]
216   Msize4:  rlmax_EC2[Mc_AN]*EKC_core_size[Mc_AN]
217   Msize5:  dimension for the last column of Residues
218   *****************************************************/
219 
220   int *Msize;
221   int *Msize2;
222   int *Msize3;
223   int *Msize4;
224   int *Msize5;
225   int **LO_TC;
226   int **HO_TC;
227   int numprocs,myid,ID,IDS,IDR,tag=999;
228   double Stime_atom, Etime_atom;
229 
230   MPI_Status stat;
231   MPI_Request request;
232 
233   /* for OpenMP */
234   int OMPID,Nthrds,Nprocs;
235 
236   /* MPI */
237   MPI_Barrier(mpi_comm_level1);
238   MPI_Comm_size(mpi_comm_level1,&numprocs);
239   MPI_Comm_rank(mpi_comm_level1,&myid);
240 
241   dtime(&TStime);
242 
243   if (measure_time==1){
244     time1 = 0.0;
245     time2 = 0.0;
246     time3 = 0.0;
247     time4 = 0.0;
248     time5 = 0.0;
249     time6 = 0.0;
250     time7 = 0.0;
251     time8 = 0.0;
252     time9 = 0.0;
253     time10 = 0.0;
254     time11 = 0.0;
255     time12 = 0.0;
256     time13 = 0.0;
257     time14 = 0.0;
258     time15 = 0.0;
259   }
260 
261   /****************************************************
262                    allocation of arrays:
263   ****************************************************/
264 
265   Msize = (int*)malloc(sizeof(int)*(Matomnum+1));
266   Msize2 = (int*)malloc(sizeof(int)*(Matomnum+1));
267   Msize3 = (int*)malloc(sizeof(int)*(Matomnum+1));
268   Msize4 = (int*)malloc(sizeof(int)*(Matomnum+1));
269   Msize2_max = 0;
270 
271   /* find Msize */
272 
273   for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
274 
275     if (Mc_AN==0){
276       Msize[Mc_AN] = 1;
277     }
278     else{
279 
280       Gc_AN = M2G[Mc_AN];
281 
282       NUM = 0;
283       for (i=0; i<=FNAN[Gc_AN]; i++){
284 	Gi = natn[Gc_AN][i];
285 	wanA = WhatSpecies[Gi];
286 	NUM += Spe_Total_CNO[wanA];
287       }
288       Msize[Mc_AN] = NUM;
289     }
290   }
291 
292   /* find Msize2 and Msize2_max */
293 
294   for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
295 
296     if (Mc_AN==0){
297       Msize2[Mc_AN] = 1;
298     }
299     else{
300 
301       Gc_AN = M2G[Mc_AN];
302 
303       NUM = 0;
304       for (i=0; i<=(FNAN[Gc_AN]+SNAN[Gc_AN]); i++){
305 	Gi = natn[Gc_AN][i];
306 	wanA = WhatSpecies[Gi];
307 	NUM += Spe_Total_CNO[wanA];
308       }
309       Msize2[Mc_AN] = NUM;
310     }
311 
312     if (Msize2_max<Msize2[Mc_AN]) Msize2_max = Msize2[Mc_AN] + 4;
313   }
314 
315   /* find Msize3 and Msize4 */
316 
317   Msize3[0] = 1;
318   Msize4[0] = 1;
319 
320   for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
321     Gc_AN = M2G[Mc_AN];
322     wan = WhatSpecies[Gc_AN];
323     ct_on = Spe_Total_CNO[wan];
324     Msize3[Mc_AN] = rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN];
325     Msize4[Mc_AN] = rlmax_EC2[Mc_AN]*EKC_core_size[Mc_AN];
326   }
327 
328   m_size = 0;
329 
330   EVal = (double***)malloc(sizeof(double**)*(SpinP_switch+1));
331   for (spin=0; spin<=SpinP_switch; spin++){
332     EVal[spin] = (double**)malloc(sizeof(double*)*(Matomnum+1));
333 
334     for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
335       n2 = Msize3[Mc_AN] + 2;
336       m_size += n2;
337       EVal[spin][Mc_AN] = (double*)malloc(sizeof(double)*n2);
338     }
339   }
340 
341   if (firsttime){
342   PrintMemory("Krylov: EVal",  sizeof(double)*m_size,NULL);
343   }
344 
345   if (2<=level_stdout){
346     for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
347         printf("<Krylov> myid=%4d Mc_AN=%4d Gc_AN=%4d Msize=%4d\n",
348         myid,Mc_AN,M2G[Mc_AN],Msize[Mc_AN]);
349     }
350   }
351 
352   /****************************************************
353     allocation of arrays:
354 
355     double PDOS[SpinP_switch+1]
356                [Matomnum+1]
357                [n2]
358   ****************************************************/
359 
360   m_size = 0;
361   PDOS_DC = (double***)malloc(sizeof(double**)*(SpinP_switch+1));
362   for (spin=0; spin<=SpinP_switch; spin++){
363     PDOS_DC[spin] = (double**)malloc(sizeof(double*)*(Matomnum+1));
364     for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
365       n2 = Msize3[Mc_AN] + 4;
366       m_size += n2;
367       PDOS_DC[spin][Mc_AN] = (double*)malloc(sizeof(double)*n2);
368     }
369   }
370 
371   if (firsttime){
372   PrintMemory("Krylov: PDOS_DC",sizeof(double)*m_size,NULL);
373   }
374 
375   /****************************************************
376     allocation of arrays:
377 
378     int LO_TC[SpinP_switch+1][Matomnum+1]
379     int HO_TC[SpinP_switch+1][Matomnum+1]
380   ****************************************************/
381 
382   LO_TC = (int**)malloc(sizeof(int*)*(SpinP_switch+1));
383   for (spin=0; spin<(SpinP_switch+1); spin++){
384     LO_TC[spin] = (int*)malloc(sizeof(int)*(Matomnum+1));
385   }
386 
387   HO_TC = (int**)malloc(sizeof(int*)*(SpinP_switch+1));
388   for (spin=0; spin<(SpinP_switch+1); spin++){
389     HO_TC[spin] = (int*)malloc(sizeof(int)*(Matomnum+1));
390   }
391 
392   /****************************************************
393     allocation of array:
394 
395     double Residues[SpinP_switch+1]
396                    [Matomnum+1]
397                    [FNAN[Gc_AN]+1]
398                    [Spe_Total_CNO[Gc_AN]]
399                    [Spe_Total_CNO[Gh_AN]]
400                    [HO_TC-LO_TC+3]
401   ****************************************************/
402 
403   Residues = (double******)malloc(sizeof(double*****)*(SpinP_switch+1));
404   for (spin=0; spin<=SpinP_switch; spin++){
405     Residues[spin] = (double*****)malloc(sizeof(double****)*(Matomnum+1));
406     for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
407 
408       if (Mc_AN==0){
409         Gc_AN = 0;
410         FNAN[0] = 0;
411         tno1 = 1;
412       }
413       else{
414         Gc_AN = M2G[Mc_AN];
415         wanA = WhatSpecies[Gc_AN];
416         tno1 = Spe_Total_CNO[wanA];
417       }
418 
419       Residues[spin][Mc_AN] = (double****)malloc(sizeof(double***)*(FNAN[Gc_AN]+1));
420 
421       for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
422 
423         if (Mc_AN==0){
424           tno2 = 1;
425         }
426         else {
427           Gh_AN = natn[Gc_AN][h_AN];
428           wanB = WhatSpecies[Gh_AN];
429           tno2 = Spe_Total_CNO[wanB];
430         }
431 
432         Residues[spin][Mc_AN][h_AN] = (double***)malloc(sizeof(double**)*tno1);
433         for (i=0; i<tno1; i++){
434           Residues[spin][Mc_AN][h_AN][i] = (double**)malloc(sizeof(double*)*tno2);
435           /* note that the array is allocated once more in the loop */
436 
437 	  /*
438           for (j=0; j<tno2; j++){
439             Residues[spin][Mc_AN][h_AN][i][j] = (double*)malloc(sizeof(double)*4100);
440 	  }
441 	  */
442 
443         }
444       }
445     }
446   }
447 
448   for (spin=0; spin<=SpinP_switch; spin++){
449     Residues[spin][0][0][0][0] = (double*)malloc(sizeof(double)*1);
450   }
451 
452   /****************************************************
453      initialize density and energy density matrices
454   ****************************************************/
455 
456   for (spin=0; spin<=SpinP_switch; spin++){
457     for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
458       Gc_AN = M2G[Mc_AN];
459       wanA = WhatSpecies[Gc_AN];
460       tno1 = Spe_Total_CNO[wanA];
461       for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
462 	Gh_AN = natn[Gc_AN][h_AN];
463 	wanB = WhatSpecies[Gh_AN];
464 	tno2 = Spe_Total_CNO[wanB];
465 	for (i=0; i<tno1; i++){
466 	  for (j=0; j<tno2; j++){
467 	    CDM[spin][Mc_AN][h_AN][i][j] = 0.0;
468 	    EDM[spin][Mc_AN][h_AN][i][j] = 0.0;
469 	  }
470 	}
471       }
472     }
473   }
474 
475   /****************************************************
476    MPI
477 
478    Hks
479   ****************************************************/
480 
481   if (measure_time==1) dtime(&Stime1);
482 
483   if (SCF_iter==1){
484 
485     /***********************************
486                set data size
487     ************************************/
488 
489     for (ID=0; ID<numprocs; ID++){
490 
491       IDS = (myid + ID) % numprocs;
492       IDR = (myid - ID + numprocs) % numprocs;
493 
494       if (ID!=0){
495 	tag = 999;
496 
497 	/* find data size to send block data */
498 	if ((F_Snd_Num[IDS]+S_Snd_Num[IDS])!=0){
499 
500 	  size1 = 0;
501 	  for (spin=0; spin<=SpinP_switch; spin++){
502 	    for (n=0; n<(F_Snd_Num[IDS]+S_Snd_Num[IDS]); n++){
503 	      Mc_AN = Snd_MAN[IDS][n];
504 	      Gc_AN = Snd_GAN[IDS][n];
505 	      Cwan = WhatSpecies[Gc_AN];
506 	      tno1 = Spe_Total_CNO[Cwan];
507 	      for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
508 		Gh_AN = natn[Gc_AN][h_AN];
509 		Hwan = WhatSpecies[Gh_AN];
510 		tno2 = Spe_Total_CNO[Hwan];
511 		size1 += tno1*tno2;
512 	      }
513 	    }
514 	  }
515 
516 	  Snd_HFS_Size[IDS] = size1;
517 	  MPI_Isend(&size1, 1, MPI_INT, IDS, tag, mpi_comm_level1, &request);
518 	}
519 	else{
520 	  Snd_HFS_Size[IDS] = 0;
521 	}
522 
523 	/* receiving of size of data */
524 
525 	if ((F_Rcv_Num[IDR]+S_Rcv_Num[IDR])!=0){
526 
527 	  MPI_Recv(&size2, 1, MPI_INT, IDR, tag, mpi_comm_level1, &stat);
528 	  Rcv_HFS_Size[IDR] = size2;
529 	}
530 	else{
531 	  Rcv_HFS_Size[IDR] = 0;
532 	}
533 
534 	if ((F_Snd_Num[IDS]+S_Snd_Num[IDS])!=0) MPI_Wait(&request,&stat);
535       }
536       else{
537 	Snd_HFS_Size[IDS] = 0;
538 	Rcv_HFS_Size[IDR] = 0;
539       }
540     }
541   }
542 
543   /***********************************
544              data transfer
545   ************************************/
546 
547   /* find maximum size of size1 */
548 
549   max_size1 = 0;
550   max_size2 = 0;
551   for (ID=0; ID<numprocs; ID++){
552     size1 = Snd_HFS_Size[ID];
553     if (max_size1<size1) max_size1 = size1;
554     size2 = Rcv_HFS_Size[ID];
555     if (max_size2<size2) max_size2 = size2;
556   }
557 
558   /* allocation of arrays */
559 
560   tmp_array  = (double*)malloc(sizeof(double)*max_size1);
561   tmp_array2 = (double*)malloc(sizeof(double)*max_size2);
562 
563   /* MPI communication */
564 
565   tag = 999;
566   for (ID=0; ID<numprocs; ID++){
567 
568     IDS = (myid + ID) % numprocs;
569     IDR = (myid - ID + numprocs) % numprocs;
570 
571     if (ID!=0){
572 
573       /*****************************
574               sending of data
575       *****************************/
576 
577       if ((F_Snd_Num[IDS]+S_Snd_Num[IDS])!=0){
578 
579         size1 = Snd_HFS_Size[IDS];
580 
581         /* multidimentional array to vector array */
582 
583         num = 0;
584         for (spin=0; spin<=SpinP_switch; spin++){
585           for (n=0; n<(F_Snd_Num[IDS]+S_Snd_Num[IDS]); n++){
586             Mc_AN = Snd_MAN[IDS][n];
587             Gc_AN = Snd_GAN[IDS][n];
588             Cwan = WhatSpecies[Gc_AN];
589             tno1 = Spe_Total_CNO[Cwan];
590             for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
591               Gh_AN = natn[Gc_AN][h_AN];
592               Hwan = WhatSpecies[Gh_AN];
593               tno2 = Spe_Total_CNO[Hwan];
594               for (i=0; i<tno1; i++){
595                 for (j=0; j<tno2; j++){
596                   tmp_array[num] = Hks[spin][Mc_AN][h_AN][i][j];
597                   num++;
598                 }
599               }
600 	    }
601           }
602 	}
603 
604         MPI_Isend(&tmp_array[0], size1, MPI_DOUBLE, IDS, tag, mpi_comm_level1, &request);
605       }
606 
607       /*****************************
608          receiving of block data
609       *****************************/
610 
611       if ((F_Rcv_Num[IDR]+S_Rcv_Num[IDR])!=0){
612 
613         size2 = Rcv_HFS_Size[IDR];
614 
615         MPI_Recv(&tmp_array2[0], size2, MPI_DOUBLE, IDR, tag, mpi_comm_level1, &stat);
616 
617         num = 0;
618         for (spin=0; spin<=SpinP_switch; spin++){
619           Mc_AN = S_TopMAN[IDR] - 1;  /* S_TopMAN should be used. */
620           for (n=0; n<(F_Rcv_Num[IDR]+S_Rcv_Num[IDR]); n++){
621             Mc_AN++;
622             Gc_AN = Rcv_GAN[IDR][n];
623             Cwan = WhatSpecies[Gc_AN];
624             tno1 = Spe_Total_CNO[Cwan];
625 
626             for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
627               Gh_AN = natn[Gc_AN][h_AN];
628               Hwan = WhatSpecies[Gh_AN];
629               tno2 = Spe_Total_CNO[Hwan];
630               for (i=0; i<tno1; i++){
631                 for (j=0; j<tno2; j++){
632                   Hks[spin][Mc_AN][h_AN][i][j] = tmp_array2[num];
633                   num++;
634 		}
635 	      }
636 	    }
637 	  }
638 	}
639       }
640 
641       if ((F_Snd_Num[IDS]+S_Snd_Num[IDS])!=0){
642         MPI_Wait(&request,&stat);
643       }
644     }
645   }
646 
647   /****************************************************
648    MPI
649 
650    OLP0
651   ****************************************************/
652 
653   /***********************************
654              set data size
655   ************************************/
656 
657   if (SCF_iter==1){
658 
659     /***********************************
660                data transfer
661     ************************************/
662 
663     tag = 999;
664     for (ID=0; ID<numprocs; ID++){
665 
666       IDS = (myid + ID) % numprocs;
667       IDR = (myid - ID + numprocs) % numprocs;
668 
669       if (ID!=0){
670 
671         /*****************************
672                 sending of data
673         *****************************/
674 
675         if ((F_Snd_Num[IDS]+S_Snd_Num[IDS])!=0){
676 
677           size1 = Snd_HFS_Size[IDS]/(1+SpinP_switch);
678 
679           /* multidimentional array to vector array */
680 
681           num = 0;
682 
683           for (n=0; n<(F_Snd_Num[IDS]+S_Snd_Num[IDS]); n++){
684             Mc_AN = Snd_MAN[IDS][n];
685             Gc_AN = Snd_GAN[IDS][n];
686             Cwan = WhatSpecies[Gc_AN];
687             tno1 = Spe_Total_CNO[Cwan];
688             for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
689               Gh_AN = natn[Gc_AN][h_AN];
690               Hwan = WhatSpecies[Gh_AN];
691               tno2 = Spe_Total_CNO[Hwan];
692               for (i=0; i<tno1; i++){
693                 for (j=0; j<tno2; j++){
694                   tmp_array[num] = OLP0[Mc_AN][h_AN][i][j];
695                   num++;
696                 }
697               }
698 	    }
699           }
700 
701           MPI_Isend(&tmp_array[0], size1, MPI_DOUBLE, IDS, tag, mpi_comm_level1, &request);
702         }
703 
704         /*****************************
705            receiving of block data
706         *****************************/
707 
708         if ((F_Rcv_Num[IDR]+S_Rcv_Num[IDR])!=0){
709 
710           size2 = Rcv_HFS_Size[IDR]/(1+SpinP_switch);
711 
712           MPI_Recv(&tmp_array2[0], size2, MPI_DOUBLE, IDR, tag, mpi_comm_level1, &stat);
713 
714           num = 0;
715           Mc_AN = S_TopMAN[IDR] - 1; /* S_TopMAN should be used. */
716           for (n=0; n<(F_Rcv_Num[IDR]+S_Rcv_Num[IDR]); n++){
717             Mc_AN++;
718             Gc_AN = Rcv_GAN[IDR][n];
719             Cwan = WhatSpecies[Gc_AN];
720             tno1 = Spe_Total_CNO[Cwan];
721 
722             for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
723               Gh_AN = natn[Gc_AN][h_AN];
724               Hwan = WhatSpecies[Gh_AN];
725               tno2 = Spe_Total_CNO[Hwan];
726               for (i=0; i<tno1; i++){
727                 for (j=0; j<tno2; j++){
728                   OLP0[Mc_AN][h_AN][i][j] = tmp_array2[num];
729                   num++;
730    	        }
731 	      }
732 	    }
733 	  }
734         }
735 
736         if ((F_Snd_Num[IDS]+S_Snd_Num[IDS])!=0){
737           MPI_Wait(&request,&stat);
738 	}
739       }
740     }
741   }
742 
743   /* freeing of arrays */
744   free(tmp_array);
745   free(tmp_array2);
746 
747   if (measure_time==1){
748     dtime(&Etime1);
749     time1 = Etime1 - Stime1;
750   }
751 
752   /***********************************************
753     for regeneration of the buffer matrix
754   ***********************************************/
755 
756   if (sqrt(fabs(NormRD[0]))<(0.2+0.10*(double)atomnum) && recalc_firsttime){
757     recalc_flag = 1;
758     recalc_firsttime = 0;
759   }
760   else{
761     recalc_flag = 0;
762   }
763 
764   if (SCF_iter==1) recalc_firsttime = 1;
765 
766   if (error_check==1){
767     printf("SCF_iter=%2d recalc_firsttime=%2d\n",SCF_iter,recalc_firsttime);
768   }
769 
770   if (measure_time==1) dtime(&Stime2);
771 
772 #pragma omp parallel shared(EKC_invS_flag,List_YOUSO,Residues,EDM,CDM,HO_TC,LO_TC,ChemP,EVal,RMI1,S_G2M,EC_matrix,recalc_flag,recalc_EM,Krylov_U,SpinP_switch,EKC_core_size,EKC_core_size_max,rlmax_EC,rlmax_EC2,time11,time10,time9,time8,time7,time6,time5,time4,time3,time2,Hks,OLP0,EKC_Exact_invS_flag,SCF_iter,Msize4,Msize3,Msize2,Msize,Msize2_max,natn,FNAN,SNAN,Spe_Total_CNO,WhatSpecies,M2G,Matomnum,myid,time_per_atom,firsttime)
773   {
774     int OMPID,Nthrds,Nprocs;
775     int Mc_AN,Gc_AN,wan,spin;
776     int ig,ian,ih,kl,jg,jan,Bnum,m,n,rl;
777     int Anum,i,j,k,Gi,wanA,NUM,n2,csize,is,i2;
778     int i1,rl1,js,ip,po1,tno1,h_AN,Gh_AN,wanB,tno2;
779     int *MP;
780     int KU_d1, KU_d2,lda,ldb,ldc,M,N,K;
781     double alpha, beta;
782     double **invS;
783     double *LoS;
784     double *C,*KU;
785     double *H_DC,*ko;
786     double ***Krylov_U_OLP;
787     double **inv_RS;
788     double sum00,sum10,sum20,sum30,sum;
789     double tmp0,tmp1,tmp2,tmp3;
790     double Erange;
791     double Stime_atom,Etime_atom;
792     double Stime1,Etime1;
793     double **tmpvec0;
794     double **tmpvec1;
795     double **tmpvec2;
796 
797     /* get info. on OpenMP */
798 
799     OMPID = omp_get_thread_num();
800     Nthrds = omp_get_num_threads();
801     Nprocs = omp_get_num_procs();
802 
803     /* allocation of arrays */
804 
805     MP = (int*)malloc(sizeof(int)*List_YOUSO[2]);
806 
807     tmpvec0 = (double**)malloc(sizeof(double*)*EKC_core_size_max);
808     for (i=0; i<EKC_core_size_max; i++){
809       tmpvec0[i] = (double*)malloc(sizeof(double)*Msize2_max);
810     }
811 
812     tmpvec1 = (double**)malloc(sizeof(double*)*EKC_core_size_max);
813     for (i=0; i<EKC_core_size_max; i++){
814       tmpvec1[i] = (double*)malloc(sizeof(double)*Msize2_max);
815     }
816 
817     tmpvec2 = (double**)malloc(sizeof(double*)*EKC_core_size_max);
818     for (i=0; i<EKC_core_size_max; i++){
819       tmpvec2[i] = (double*)malloc(sizeof(double)*Msize2_max);
820     }
821 
822     if (firsttime && OMPID==0){
823       PrintMemory("Krylov: tmpvec0",sizeof(double)*EKC_core_size_max*Msize2_max,NULL);
824       PrintMemory("Krylov: tmpvec1",sizeof(double)*EKC_core_size_max*Msize2_max,NULL);
825       PrintMemory("Krylov: tmpvec2",sizeof(double)*EKC_core_size_max*Msize2_max,NULL);
826     }
827 
828     /***********************************************
829      main loop of calculation
830     ***********************************************/
831 
832     for (Mc_AN=1+OMPID; Mc_AN<=Matomnum; Mc_AN+=Nthrds){
833 
834       dtime(&Stime_atom);
835 
836       Gc_AN = M2G[Mc_AN];
837       wan = WhatSpecies[Gc_AN];
838 
839       /* MP array */
840 
841       Anum = 1;
842       for (i=0; i<=(FNAN[Gc_AN]+SNAN[Gc_AN]); i++){
843 	MP[i] = Anum;
844 	Gi = natn[Gc_AN][i];
845 	wanA = WhatSpecies[Gi];
846 	Anum += Spe_Total_CNO[wanA];
847       }
848       NUM = Anum - 1;
849       n2 = NUM + 40;
850 
851       /***********************************************
852                  allocation of arrays:
853       ***********************************************/
854 
855       if (Msize[Mc_AN]<Msize3[Mc_AN])
856 	csize = Msize3[Mc_AN] + 40;
857       else
858 	csize = Msize[Mc_AN] + 40;
859 
860       KU_d1 = EKC_core_size[Mc_AN]*Msize2[Mc_AN];
861       KU_d2 = Msize2[Mc_AN];
862 
863       H_DC = (double*)malloc(sizeof(double)*csize*csize);
864       ko = (double*)malloc(sizeof(double)*csize);
865       C = (double*)malloc(sizeof(double)*csize*csize);
866       KU = (double*)malloc(sizeof(double)*(Msize2[Mc_AN]+2)*Msize3[Mc_AN]);
867 
868       /***********************************************
869           calculate the inverse of overlap matrix
870       ***********************************************/
871 
872       if (SCF_iter==1 && Msize3[Mc_AN]<Msize2[Mc_AN] && EKC_Exact_invS_flag==1){
873 
874 	LoS = (double*)malloc(sizeof(double)*(Msize2[Mc_AN]+3)*(Msize2[Mc_AN]+3));
875 
876 	invS = (double**)malloc(sizeof(double*)*(Msize2[Mc_AN]+3));
877 	for (i=0; i<(Msize2[Mc_AN]+3); i++){
878 	  invS[i] = (double*)malloc(sizeof(double)*(Msize2[Mc_AN]+3));
879 	}
880 
881 	if (measure_time==1) dtime(&Stime1);
882 
883 	Inverse_S_by_Cholesky(Mc_AN, OLP0, invS, MP, NUM, LoS);
884 
885 	if (measure_time==1 && OMPID==0){
886 	  dtime(&Etime1);
887 	  time2 += Etime1 - Stime1;
888 	}
889       }
890 
891       else if (SCF_iter==1 && Msize3[Mc_AN]<Msize2[Mc_AN] && EKC_invS_flag==1){
892 
893 	Krylov_U_OLP = (double***)malloc(sizeof(double**)*rlmax_EC2[Mc_AN]);
894 	for (i=0; i<rlmax_EC2[Mc_AN]; i++){
895 	  Krylov_U_OLP[i] = (double**)malloc(sizeof(double*)*EKC_core_size[Mc_AN]);
896 	  for (j=0; j<EKC_core_size[Mc_AN]; j++){
897 	    Krylov_U_OLP[i][j] = (double*)malloc(sizeof(double)*(Msize2[Mc_AN]+3));
898 	    for (k=0; k<(Msize2[Mc_AN]+3); k++)  Krylov_U_OLP[i][j][k] = 0.0;
899 	  }
900 	}
901 
902 	inv_RS = (double**)malloc(sizeof(double*)*(rlmax_EC2[Mc_AN]+1)*EKC_core_size[Mc_AN]);
903 	for (i=0; i<(rlmax_EC2[Mc_AN]+1)*EKC_core_size[Mc_AN]; i++){
904 	  inv_RS[i] = (double*)malloc(sizeof(double)*(rlmax_EC2[Mc_AN]+1)*EKC_core_size[Mc_AN]);
905 	}
906 
907 	if (measure_time==1) dtime(&Stime1);
908 
909 	Krylov_IOLP( Mc_AN, OLP0, Krylov_U_OLP, inv_RS, MP, Msize2, Msize4, Msize2_max, tmpvec0, tmpvec1 );
910 
911 	if (measure_time==1 && OMPID==0){
912 	  dtime(&Etime1);
913 	  time2 += Etime1 - Stime1;
914 	}
915       }
916 
917       for (spin=0; spin<=SpinP_switch; spin++){
918 
919 	/****************************************************
920                 generate a preconditioning matrix
921 	****************************************************/
922 
923 	if (measure_time==1) dtime(&Stime1);
924 
925 	if (SCF_iter==1 && Msize3[Mc_AN]<Msize2[Mc_AN]){
926 
927 	  Generate_pMatrix( myid, spin, Mc_AN, Hks, OLP0, invS, Krylov_U, Krylov_U_OLP, inv_RS, MP,
928                             Msize, Msize2, Msize3, Msize4, Msize2_max, tmpvec0, tmpvec1, tmpvec2 );
929 	}
930 	else if (SCF_iter==1){
931 
932 	  Generate_pMatrix2( myid, spin, Mc_AN, Hks, OLP0, Krylov_U, MP, Msize, Msize2, Msize3, tmpvec1 );
933 	}
934 
935 	if (measure_time==1 && OMPID==0){
936 	  dtime(&Etime1);
937 	  time3 += Etime1 - Stime1;
938 	}
939 
940 	if (measure_time==1) dtime(&Stime1);
941 
942 	if (recalc_EM==1 || SCF_iter==1 || recalc_flag==1){
943 
944 	  Embedding_Matrix( spin, Mc_AN, Hks, Krylov_U, EC_matrix, MP, Msize, Msize2, Msize3, tmpvec1);
945 	}
946 
947 	if (measure_time==1 && OMPID==0){
948 	  dtime(&Etime1);
949 	  time4 += Etime1 - Stime1;
950 	}
951 
952 	/****************************************************
953                 construct the Hamiltonian matrix
954 	****************************************************/
955 
956 	if (measure_time==1) dtime(&Stime1);
957 
958 	for (i=0; i<=FNAN[Gc_AN]; i++){
959 	  ig = natn[Gc_AN][i];
960 	  ian = Spe_Total_CNO[WhatSpecies[ig]];
961 	  Anum = MP[i] - 1;
962 	  ih = S_G2M[ig]; /* S_G2M should be used */
963 
964 	  for (j=0; j<=FNAN[Gc_AN]; j++){
965 
966 	    kl = RMI1[Mc_AN][i][j];
967 	    jg = natn[Gc_AN][j];
968 	    jan = Spe_Total_CNO[WhatSpecies[jg]];
969 	    Bnum = MP[j] - 1;
970 
971 	    if (0<=kl){
972 	      for (m=0; m<ian; m++){
973 		for (n=0; n<jan; n++){
974 		  H_DC[(Anum+m)*Msize[Mc_AN]+Bnum+n] = Hks[spin][ih][kl][m][n];
975 		}
976 	      }
977 	    }
978 
979 	    else{
980 	      for (m=0; m<ian; m++){
981 		for (n=0; n<jan; n++){
982 		  H_DC[(Anum+m)*Msize[Mc_AN]+Bnum+n] = 0.0;
983 		}
984 	      }
985 	    }
986 	  }
987 	}
988 
989 	if (measure_time==1 && OMPID==0){
990 	  dtime(&Etime1);
991 	  time5 += Etime1 - Stime1;
992 	}
993 
994 	/****************************************************
995                    transform u1^+ * H_DC * u1
996 	****************************************************/
997 
998 	/* H_DC * u1 */
999 
1000 	if (measure_time==1) dtime(&Stime1);
1001 
1002 	/* original version
1003 
1004 	for (i=1; i<=Msize[Mc_AN]; i++){
1005 	  for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
1006 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
1007 
1008 	      sum = 0.0;
1009 	      for (j=1; j<=Msize[Mc_AN]; j++){
1010 		sum += H_DC[i][j]*Krylov_U[spin][Mc_AN][rl][n][j];
1011 	      }
1012 
1013 	      C[rl*EKC_core_size[Mc_AN]+n+1][i] = sum;
1014 	    }
1015 	  }
1016 	}
1017 	*/
1018 
1019 	/* BLAS3 version */
1020 
1021 	for (i=0; i<Msize[Mc_AN]; i++){
1022 	  for (j=0; j<Msize3[Mc_AN]; j++){
1023 	    KU[j*Msize[Mc_AN]+i] = Krylov_U[spin][Mc_AN][j*Msize2[Mc_AN]+i+1];
1024 	  }
1025 	}
1026 
1027 	alpha = 1.0; beta = 0.0;
1028 	M = Msize[Mc_AN]; N = Msize3[Mc_AN]; K = Msize[Mc_AN];
1029 	lda = M; ldb = K; ldc = Msize[Mc_AN];
1030 
1031 	F77_NAME(dgemm,DGEMM)( "N", "N", &M, &N, &K, &alpha,
1032 			       H_DC, &lda, KU, &ldb, &beta, C, &ldc);
1033 
1034 	if (measure_time==1 && OMPID==0){
1035 	  dtime(&Etime1);
1036 	  time6 += Etime1 - Stime1;
1037 	}
1038 
1039 	/* u1^+ * H_DC * u1 */
1040 
1041 	if (measure_time==1) dtime(&Stime1);
1042 
1043 	/* original version */
1044 
1045 	/*
1046 	for (rl1=0; rl1<rlmax_EC[Mc_AN]; rl1++){
1047           for (m=0; m<EKC_core_size[Mc_AN]; m++){
1048 	    for (rl2=rl1; rl2<rlmax_EC[Mc_AN]; rl2++){
1049 	      for (n=0; n<EKC_core_size[Mc_AN]; n++){
1050 		sum = 0.0;
1051 		i2 = rl2*EKC_core_size[Mc_AN] + n + 1;
1052 		for (i=1; i<=Msize[Mc_AN]; i++){
1053 		  sum += Krylov_U[spin][Mc_AN][rl1][m][i]*C[i2][i];
1054 		}
1055 
1056 		H_DC[rl1*EKC_core_size[Mc_AN]+m+1][rl2*EKC_core_size[Mc_AN]+n+1] = sum;
1057 		H_DC[rl2*EKC_core_size[Mc_AN]+n+1][rl1*EKC_core_size[Mc_AN]+m+1] = sum;
1058 	      }
1059 	    }
1060           }
1061 	}
1062 	*/
1063 
1064 	/* BLAS3 version */
1065 
1066 	alpha = 1.0; beta = 0.0;
1067 	M = Msize3[Mc_AN]; N = Msize3[Mc_AN]; K = Msize[Mc_AN];
1068 	lda = K; ldb = K; ldc = Msize3[Mc_AN];
1069 
1070 	F77_NAME(dgemm,DGEMM)("T", "N", &M, &N, &K, &alpha,
1071 			      KU, &lda, C, &ldb, &beta, H_DC, &ldc);
1072 
1073 	if (measure_time==1 && OMPID==0){
1074 	  dtime(&Etime1);
1075 	  time7 += Etime1 - Stime1;
1076 	}
1077 
1078 	/* correction for ZeroNum */
1079 
1080 	m = (int)Krylov_U[spin][Mc_AN][0];
1081 	for (i=0; i<m; i++){
1082 	  H_DC[i*Msize3[Mc_AN]+i] = 1.0e+3;
1083 	}
1084 
1085 	/****************************************************
1086             H0 = u1^+ * H_DC * u1 + D
1087 	****************************************************/
1088 
1089 	if (measure_time==1) dtime(&Stime1);
1090 
1091 	for (i=(Msize3[Mc_AN]-1); 0<=i; i--){
1092 	  for (j=0; j<Msize3[Mc_AN]; j++){
1093 	    H_DC[(i+1)*(Msize3[Mc_AN]+1)+(j+1)] = H_DC[i*Msize3[Mc_AN]+j] + EC_matrix[spin][Mc_AN][i+1][j+1];
1094 	  }
1095 	}
1096 
1097 	if (measure_time==1 && OMPID==0){
1098 	  dtime(&Etime1);
1099 	  time8 += Etime1 - Stime1;
1100 	}
1101 
1102 	/****************************************************
1103            diagonalize
1104 	****************************************************/
1105 
1106 	if (measure_time==1) dtime(&Stime1);
1107 
1108 	Eigen_lapack2(H_DC,Msize3[Mc_AN]+1,ko,Msize3[Mc_AN],Msize3[Mc_AN]);
1109 
1110 	if (measure_time==1 && OMPID==0){
1111 	  dtime(&Etime1);
1112 	  time9 += Etime1 - Stime1;
1113 	}
1114 
1115 	/********************************************
1116            back transformation of eigenvectors
1117                       c = u1 * b
1118 	*********************************************/
1119 
1120 	if (measure_time==1) dtime(&Stime1);
1121 
1122 	/* original version */
1123 
1124 	/*
1125 	for (i=1; i<=Msize[Mc_AN]; i++){
1126           for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
1127 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
1128 
1129 	      tmp1 = Krylov_U[spin][Mc_AN][rl][n][i];
1130 	      i1 = rl*EKC_core_size[Mc_AN] + n + 1;
1131 
1132 	      for (j=1; j<=Msize3[Mc_AN]; j++){
1133 		C[i][j] += tmp1*H_DC[i1][j];
1134 	      }
1135 	    }
1136           }
1137 	}
1138 	*/
1139 
1140 	/* BLAS3 version */
1141 
1142 	alpha = 1.0; beta = 0.0;
1143         M = Msize3[Mc_AN]; N = Msize[Mc_AN]; K = Msize3[Mc_AN];
1144         lda = K; ldb = N; ldc = Msize3[Mc_AN];
1145 
1146 	F77_NAME(dgemm,DGEMM)("T", "T", &M, &N, &K, &alpha,
1147                                H_DC, &lda, KU, &ldb, &beta, C, &ldc);
1148 
1149 	if (measure_time==1 && OMPID==0){
1150 	  dtime(&Etime1);
1151 	  time10 += Etime1 - Stime1;
1152 	}
1153 
1154 	if (measure_time==1) dtime(&Stime1);
1155 
1156 	/***********************************************
1157           store eigenvalues and residues of poles
1158 	***********************************************/
1159 
1160 	for (i=1; i<=Msize3[Mc_AN]; i++){
1161 	  EVal[spin][Mc_AN][i-1] = ko[i];
1162 	}
1163 
1164 	/******************************************************
1165         set an energy range (-Erange+ChemP to Erange+ChemP)
1166         of eigenvalues used to store the Residues.
1167 	******************************************************/
1168 
1169 	Erange = 0.367493245;  /* in hartree, corresponds to 10 eV */
1170 
1171 	/***********************************************
1172           find LO_TC and HO_TC
1173 	***********************************************/
1174 
1175 	/* LO_TC */
1176 	i = 0;
1177 	ip = 0;
1178 	po1 = 0;
1179 	do{
1180 	  if ( (ChemP-Erange)<EVal[spin][Mc_AN][i]){
1181 	    ip = i;
1182 	    po1 = 1;
1183 	  }
1184 	  i++;
1185 	} while (po1==0 && i<Msize3[Mc_AN]);
1186 
1187 	LO_TC[spin][Mc_AN] = ip;
1188 
1189 	/* HO_TC */
1190 	i = 0;
1191 	ip = Msize3[Mc_AN]-1;
1192 	po1 = 0;
1193 	do{
1194 	  if ( (ChemP+Erange)<EVal[spin][Mc_AN][i]){
1195 	    ip = i;
1196 	    po1 = 1;
1197 	  }
1198 	  i++;
1199 	} while (po1==0 && i<Msize3[Mc_AN]);
1200 
1201 	HO_TC[spin][Mc_AN] = ip;
1202 
1203 	/***********************************************
1204           store residues of poles
1205 	***********************************************/
1206 
1207 	n2 = HO_TC[spin][Mc_AN] - LO_TC[spin][Mc_AN] + 3;
1208 	if (n2<1) n2 = 1;
1209 
1210 	wanA = WhatSpecies[Gc_AN];
1211 	tno1 = Spe_Total_CNO[wanA];
1212 
1213 	for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
1214 
1215 	  Gh_AN = natn[Gc_AN][h_AN];
1216 	  wanB = WhatSpecies[Gh_AN];
1217 	  tno2 = Spe_Total_CNO[wanB];
1218 	  Bnum = MP[h_AN] - 1;
1219 
1220 	  for (i=0; i<tno1; i++){
1221 	    for (j=0; j<tno2; j++){
1222 
1223 	      for (i1=0; i1<LO_TC[spin][Mc_AN]; i1++){
1224 		tmp1 = C[i*Msize3[Mc_AN]+i1]*C[(Bnum+j)*Msize3[Mc_AN]+i1];
1225 		CDM[spin][Mc_AN][h_AN][i][j] += tmp1;
1226 		EDM[spin][Mc_AN][h_AN][i][j] += tmp1*EVal[spin][Mc_AN][i1];
1227 	      }
1228 
1229 	      /* <allocation of Residues */
1230 	      Residues[spin][Mc_AN][h_AN][i][j] = (double*)malloc(sizeof(double)*n2);
1231 	      /* allocation of Residues> */
1232 
1233 	      for (i1=LO_TC[spin][Mc_AN]; i1<=HO_TC[spin][Mc_AN]; i1++){
1234 		Residues[spin][Mc_AN][h_AN][i][j][i1-LO_TC[spin][Mc_AN]]
1235 		  = C[i*Msize3[Mc_AN]+i1]*C[(Bnum+j)*Msize3[Mc_AN]+i1];
1236 	      }
1237 	    }
1238 	  }
1239 	}
1240 
1241 	if (measure_time==1 && OMPID==0){
1242 	  dtime(&Etime1);
1243 	  time11 += Etime1 - Stime1;
1244 	}
1245 
1246       } /* spin */
1247 
1248       /***********************************************
1249                     freeing of arrays:
1250       ***********************************************/
1251 
1252       free(H_DC);
1253       free(ko);
1254       free(C);
1255       free(KU);
1256 
1257       if (SCF_iter==1 && Msize3[Mc_AN]<Msize2[Mc_AN] && EKC_Exact_invS_flag==1){
1258 
1259 	free(LoS);
1260 
1261 	for (i=0; i<(Msize2[Mc_AN]+3); i++){
1262 	  free(invS[i]);
1263 	}
1264 	free(invS);
1265       }
1266 
1267       else if (SCF_iter==1 && Msize3[Mc_AN]<Msize2[Mc_AN] && EKC_invS_flag==1){
1268 
1269 	for (i=0; i<rlmax_EC2[Mc_AN]; i++){
1270 	  for (j=0; j<EKC_core_size[Mc_AN]; j++){
1271 	    free(Krylov_U_OLP[i][j]);
1272 	  }
1273 	  free(Krylov_U_OLP[i]);
1274 	}
1275 	free(Krylov_U_OLP);
1276 
1277 	for (i=0; i<(rlmax_EC2[Mc_AN]+1)*EKC_core_size[Mc_AN]; i++){
1278 	  free(inv_RS[i]);
1279 	}
1280 	free(inv_RS);
1281       }
1282 
1283       dtime(&Etime_atom);
1284       time_per_atom[Gc_AN] += Etime_atom - Stime_atom;
1285 
1286     } /* Mc_AN */
1287 
1288     /* freeing of array */
1289 
1290     free(MP);
1291 
1292     for (i=0; i<EKC_core_size_max; i++){
1293       free(tmpvec0[i]);
1294     }
1295     free(tmpvec0);
1296 
1297     for (i=0; i<EKC_core_size_max; i++){
1298       free(tmpvec1[i]);
1299     }
1300     free(tmpvec1);
1301 
1302     for (i=0; i<EKC_core_size_max; i++){
1303       free(tmpvec2[i]);
1304     }
1305     free(tmpvec2);
1306 
1307   } /* #pragma omp parallel */
1308 
1309   if (measure_time==1){
1310     dtime(&Etime2);
1311     time16 = Etime2 - Stime2;
1312   }
1313 
1314   if (firsttime){
1315 
1316     Residues_size = 1;
1317 
1318     for (spin=0; spin<=SpinP_switch; spin++){
1319       for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
1320 	Gc_AN = M2G[Mc_AN];
1321 	wan = WhatSpecies[Gc_AN];
1322 	tno1 = Spe_Total_CNO[wan];
1323 	for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
1324 	  Gh_AN = natn[Gc_AN][h_AN];
1325 	  wanB = WhatSpecies[Gh_AN];
1326 	  tno2 = Spe_Total_CNO[wanB];
1327 	  n2 = HO_TC[spin][Mc_AN] - LO_TC[spin][Mc_AN] + 3;
1328 	  Residues_size += tno1*tno2*n2;
1329 	}
1330       }
1331     }
1332 
1333     PrintMemory("Krylov: Residues",sizeof(double)*Residues_size,NULL);
1334   }
1335 
1336   /****************************************************
1337                 calculate the projected DOS
1338   ****************************************************/
1339 
1340   if (measure_time==1) dtime(&Stime1);
1341 
1342 #pragma omp parallel shared(time_per_atom,Residues,LO_TC,HO_TC,EDM,CDM,OLP0,natn,FNAN,PDOS_DC,Msize3,Spe_Total_CNO,WhatSpecies,M2G,SpinP_switch,Matomnum) private(OMPID,Nthrds,Nprocs,Mc_AN,Stime_atom,spin,Gc_AN,wanA,tno1,i1,i,h_AN,Gh_AN,wanB,tno2,j,tmp1,Etime_atom)
1343   {
1344 
1345     /* get info. on OpenMP */
1346 
1347     OMPID = omp_get_thread_num();
1348     Nthrds = omp_get_num_threads();
1349     Nprocs = omp_get_num_procs();
1350 
1351     for (Mc_AN=1+OMPID; Mc_AN<=Matomnum; Mc_AN+=Nthrds){
1352 
1353       dtime(&Stime_atom);
1354 
1355       for (spin=0; spin<=SpinP_switch; spin++){
1356 
1357 	Gc_AN = M2G[Mc_AN];
1358 	wanA = WhatSpecies[Gc_AN];
1359 	tno1 = Spe_Total_CNO[wanA];
1360 
1361 	for (i1=0; i1<=(Msize3[Mc_AN]+1); i1++){
1362 	  PDOS_DC[spin][Mc_AN][i1] = 0.0;
1363 	}
1364 
1365 	for (i=0; i<tno1; i++){
1366 	  for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
1367 	    Gh_AN = natn[Gc_AN][h_AN];
1368 	    wanB = WhatSpecies[Gh_AN];
1369 	    tno2 = Spe_Total_CNO[wanB];
1370 	    for (j=0; j<tno2; j++){
1371 
1372 	      tmp1 = OLP0[Mc_AN][h_AN][i][j];
1373 
1374 	      PDOS_DC[spin][Mc_AN][0] += tmp1*CDM[spin][Mc_AN][h_AN][i][j];
1375 	      PDOS_DC[spin][Mc_AN][1] += tmp1*EDM[spin][Mc_AN][h_AN][i][j];
1376 
1377 	      for (i1=0; i1<(HO_TC[spin][Mc_AN]-LO_TC[spin][Mc_AN]+1); i1++){
1378 		PDOS_DC[spin][Mc_AN][i1+2] += Residues[spin][Mc_AN][h_AN][i][j][i1]*tmp1;
1379 	      }
1380 
1381 	    }
1382 	  }
1383 	}
1384 
1385       } /* spin */
1386 
1387       dtime(&Etime_atom);
1388       time_per_atom[Gc_AN] += Etime_atom - Stime_atom;
1389 
1390     } /* Mc_AN */
1391 
1392   } /* #pragma omp parallel */
1393 
1394   if (measure_time==1){
1395     dtime(&Etime1);
1396     time12 += Etime1 - Stime1;
1397   }
1398 
1399   /****************************************************
1400            find the total number of electrons
1401   ****************************************************/
1402 
1403   if (measure_time==1) dtime(&Stime1);
1404 
1405   My_TZ = 0.0;
1406   for (i=1; i<=Matomnum; i++){
1407     Gc_AN = M2G[i];
1408     wan = WhatSpecies[Gc_AN];
1409     My_TZ += Spe_Core_Charge[wan];
1410   }
1411 
1412   /* MPI, My_TZ */
1413 
1414   MPI_Barrier(mpi_comm_level1);
1415   MPI_Allreduce(&My_TZ, &TZ, 1, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
1416 
1417   /****************************************************
1418                 find the chemical potential
1419   ****************************************************/
1420 
1421   po = 0;
1422   loopN = 0;
1423 
1424   ChemP_MAX = 10.0;
1425   ChemP_MIN =-10.0;
1426   if      (SpinP_switch==0) spin_degeneracy = 2.0;
1427   else if (SpinP_switch==1) spin_degeneracy = 1.0;
1428 
1429   do {
1430     ChemP = 0.50*(ChemP_MAX + ChemP_MIN);
1431 
1432     My_Num_State = 0.0;
1433 
1434     for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
1435       for (spin=0; spin<=SpinP_switch; spin++){
1436 
1437         dtime(&Stime_atom);
1438 
1439         Gc_AN = M2G[Mc_AN];
1440 
1441         My_Num_State += spin_degeneracy*PDOS_DC[spin][Mc_AN][0];
1442 
1443         for (i=0; i<(HO_TC[spin][Mc_AN]-LO_TC[spin][Mc_AN]+1); i++){
1444 
1445           x = (EVal[spin][Mc_AN][i+LO_TC[spin][Mc_AN]] - ChemP)*Beta;
1446           if (x<=-max_x) x = -max_x;
1447           if (max_x<=x)  x = max_x;
1448           FermiF = 1.0/(1.0 + exp(x));
1449           My_Num_State += spin_degeneracy*FermiF*PDOS_DC[spin][Mc_AN][i+2];
1450 	}
1451 
1452         dtime(&Etime_atom);
1453         time_per_atom[Gc_AN] += Etime_atom - Stime_atom;
1454       }
1455     }
1456 
1457     /* MPI, My_Num_State */
1458 
1459     MPI_Barrier(mpi_comm_level1);
1460     MPI_Allreduce(&My_Num_State, &Num_State, 1, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
1461 
1462     Dnum = (TZ - Num_State) - system_charge;
1463     if (0.0<=Dnum) ChemP_MIN = ChemP;
1464     else           ChemP_MAX = ChemP;
1465     if (fabs(Dnum)<1.0e-11) po = 1;
1466 
1467 
1468     if (myid==Host_ID && 3<=level_stdout){
1469       printf("  ChemP=%15.12f TZ=%15.12f Num_state=%15.12f\n",ChemP,TZ,Num_State);
1470     }
1471 
1472     loopN++;
1473   }
1474   while (po==0 && loopN<1000);
1475 
1476   if (measure_time==1){
1477     dtime(&Etime1);
1478     time13 += Etime1 - Stime1;
1479   }
1480 
1481   /****************************************************
1482         eigenenergy by summing up eigenvalues
1483   ****************************************************/
1484 
1485   if (measure_time==1) dtime(&Stime1);
1486 
1487   My_Eele0[0] = 0.0;
1488   My_Eele0[1] = 0.0;
1489   for (spin=0; spin<=SpinP_switch; spin++){
1490     for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
1491 
1492       dtime(&Stime_atom);
1493 
1494       Gc_AN = M2G[Mc_AN];
1495       My_Eele0[spin] += PDOS_DC[spin][Mc_AN][1];
1496 
1497       for (i=0; i<(HO_TC[spin][Mc_AN]-LO_TC[spin][Mc_AN]+1); i++){
1498 
1499         x = (EVal[spin][Mc_AN][i+LO_TC[spin][Mc_AN]] - ChemP)*Beta;
1500 
1501         if (x<=-max_x) x = -max_x;
1502         if (max_x<=x)  x = max_x;
1503         FermiF = 1.0/(1.0 + exp(x));
1504         My_Eele0[spin] += FermiF*EVal[spin][Mc_AN][i+LO_TC[spin][Mc_AN]]*PDOS_DC[spin][Mc_AN][i+2];
1505       }
1506 
1507       dtime(&Etime_atom);
1508       time_per_atom[Gc_AN] += Etime_atom - Stime_atom;
1509     }
1510   }
1511 
1512   /* MPI, My_Eele0 */
1513   for (spin=0; spin<=SpinP_switch; spin++){
1514     MPI_Barrier(mpi_comm_level1);
1515     MPI_Allreduce(&My_Eele0[spin], &Eele0[spin], 1, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
1516   }
1517 
1518   if (SpinP_switch==0){
1519     Eele0[1] = Eele0[0];
1520   }
1521 
1522   if (measure_time==1){
1523     dtime(&Etime1);
1524     time14 += Etime1 - Stime1;
1525   }
1526 
1527   if (measure_time==1) dtime(&Stime1);
1528 
1529 #pragma omp parallel shared(FNAN,time_per_atom,EDM,CDM,Residues,natn,max_x,Beta,ChemP,EVal,LO_TC,HO_TC,Spe_Total_CNO,WhatSpecies,M2G,SpinP_switch,Matomnum) private(OMPID,Nthrds,Nprocs,Mc_AN,spin,Stime_atom,Gc_AN,wanA,tno1,i1,x,FermiF,h_AN,Gh_AN,wanB,tno2,i,j,tmp1,Etime_atom)
1530   {
1531 
1532     /* get info. on OpenMP */
1533 
1534     OMPID = omp_get_thread_num();
1535     Nthrds = omp_get_num_threads();
1536     Nprocs = omp_get_num_procs();
1537 
1538     for (Mc_AN=1+OMPID; Mc_AN<=Matomnum; Mc_AN+=Nthrds){
1539       for (spin=0; spin<=SpinP_switch; spin++){
1540 
1541 	dtime(&Stime_atom);
1542 
1543 	Gc_AN = M2G[Mc_AN];
1544 	wanA = WhatSpecies[Gc_AN];
1545 	tno1 = Spe_Total_CNO[wanA];
1546 
1547 	for (i1=0; i1<(HO_TC[spin][Mc_AN]-LO_TC[spin][Mc_AN]+1); i1++){
1548 
1549 	  x = (EVal[spin][Mc_AN][i1+LO_TC[spin][Mc_AN]] - ChemP)*Beta;
1550 	  if (x<=-max_x) x = -max_x;
1551 	  if (max_x<=x)  x = max_x;
1552 	  FermiF = 1.0/(1.0 + exp(x));
1553 
1554 	  for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
1555 	    Gh_AN = natn[Gc_AN][h_AN];
1556 	    wanB = WhatSpecies[Gh_AN];
1557 	    tno2 = Spe_Total_CNO[wanB];
1558 	    for (i=0; i<tno1; i++){
1559 	      for (j=0; j<tno2; j++){
1560 		tmp1 = FermiF*Residues[spin][Mc_AN][h_AN][i][j][i1];
1561 		CDM[spin][Mc_AN][h_AN][i][j] += tmp1;
1562 		EDM[spin][Mc_AN][h_AN][i][j] += tmp1*EVal[spin][Mc_AN][i1+LO_TC[spin][Mc_AN]];
1563 	      }
1564 	    }
1565 	  }
1566 	}
1567 
1568 	dtime(&Etime_atom);
1569 	time_per_atom[Gc_AN] += Etime_atom - Stime_atom;
1570       }
1571     }
1572 
1573   } /* #pragma omp parallel */
1574 
1575   /****************************************************
1576                       bond energies
1577   ****************************************************/
1578 
1579   My_Eele1[0] = 0.0;
1580   My_Eele1[1] = 0.0;
1581   for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
1582     GA_AN = M2G[MA_AN];
1583     wanA = WhatSpecies[GA_AN];
1584     tnoA = Spe_Total_CNO[wanA];
1585 
1586     for (j=0; j<=FNAN[GA_AN]; j++){
1587       GB_AN = natn[GA_AN][j];
1588       wanB = WhatSpecies[GB_AN];
1589       tnoB = Spe_Total_CNO[wanB];
1590 
1591       for (k=0; k<tnoA; k++){
1592 	for (l=0; l<tnoB; l++){
1593 	  for (spin=0; spin<=SpinP_switch; spin++){
1594 	    My_Eele1[spin] += CDM[spin][MA_AN][j][k][l]*Hks[spin][MA_AN][j][k][l];
1595 	  }
1596 	}
1597       }
1598 
1599     }
1600   }
1601 
1602   /* MPI, My_Eele1 */
1603   MPI_Barrier(mpi_comm_level1);
1604   for (spin=0; spin<=SpinP_switch; spin++){
1605     MPI_Allreduce(&My_Eele1[spin], &Eele1[spin], 1, MPI_DOUBLE,
1606                    MPI_SUM, mpi_comm_level1);
1607   }
1608 
1609   if (SpinP_switch==0){
1610     Eele1[1] = Eele1[0];
1611   }
1612 
1613   if (3<=level_stdout && myid==Host_ID){
1614     printf("  Eele00=%15.12f Eele01=%15.12f\n",Eele0[0],Eele0[1]);
1615     printf("  Eele10=%15.12f Eele11=%15.12f\n",Eele1[0],Eele1[1]);
1616   }
1617 
1618   if (measure_time==1){
1619     dtime(&Etime1);
1620     time15 += Etime1 - Stime1;
1621   }
1622 
1623   if ( strcasecmp(mode,"dos")==0 ){
1624     Save_DOS_Col(Residues,OLP0,EVal,LO_TC,HO_TC);
1625   }
1626 
1627   if (measure_time==1){
1628     printf("myid=%2d time1 =%5.3f time2 =%5.3f time3 =%5.3f time4 =%5.3f time5 =%5.3f\n",
1629             myid,time1,time2,time3,time4,time5);
1630     printf("myid=%2d time6 =%5.3f time7 =%5.3f time8 =%5.3f time9 =%5.3f time10=%5.3f\n",
1631             myid,time6,time7,time8,time9,time10);
1632     printf("myid=%2d time11=%5.3f time12=%5.3f time13=%5.3f time14=%5.3f time15=%5.3f\n",
1633             myid,time11,time12,time13,time14,time15);
1634     printf("myid=%2d time16=%5.3f\n",myid,time16);
1635   }
1636 
1637   /****************************************************
1638     freeing of arrays:
1639 
1640   ****************************************************/
1641 
1642   free(Msize);
1643   free(Msize2);
1644   free(Msize3);
1645   free(Msize4);
1646 
1647   for (spin=0; spin<(SpinP_switch+1); spin++){
1648     free(LO_TC[spin]);
1649   }
1650   free(LO_TC);
1651 
1652   for (spin=0; spin<(SpinP_switch+1); spin++){
1653     free(HO_TC[spin]);
1654   }
1655   free(HO_TC);
1656 
1657   for (spin=0; spin<=SpinP_switch; spin++){
1658     for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
1659       free(EVal[spin][Mc_AN]);
1660     }
1661     free(EVal[spin]);
1662   }
1663   free(EVal);
1664 
1665   for (spin=0; spin<=SpinP_switch; spin++){
1666     for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
1667 
1668       if (Mc_AN==0){
1669         Gc_AN = 0;
1670         FNAN[0] = 0;
1671         tno1 = 1;
1672       }
1673       else{
1674         Gc_AN = M2G[Mc_AN];
1675         wanA = WhatSpecies[Gc_AN];
1676         tno1 = Spe_Total_CNO[wanA];
1677       }
1678 
1679       for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
1680 
1681         if (Mc_AN==0){
1682           tno2 = 1;
1683         }
1684         else {
1685           Gh_AN = natn[Gc_AN][h_AN];
1686           wanB = WhatSpecies[Gh_AN];
1687           tno2 = Spe_Total_CNO[wanB];
1688         }
1689 
1690         for (i=0; i<tno1; i++){
1691           for (j=0; j<tno2; j++){
1692             free(Residues[spin][Mc_AN][h_AN][i][j]);
1693 	  }
1694           free(Residues[spin][Mc_AN][h_AN][i]);
1695         }
1696         free(Residues[spin][Mc_AN][h_AN]);
1697       }
1698       free(Residues[spin][Mc_AN]);
1699     }
1700     free(Residues[spin]);
1701   }
1702   free(Residues);
1703 
1704   for (spin=0; spin<=SpinP_switch; spin++){
1705     for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
1706       free(PDOS_DC[spin][Mc_AN]);
1707     }
1708     free(PDOS_DC[spin]);
1709   }
1710   free(PDOS_DC);
1711 
1712   /* for time */
1713   dtime(&TEtime);
1714   time0 = TEtime - TStime;
1715 
1716   if (measure_time==1){
1717   printf("total time=%15.12f\n",time0);
1718   }
1719 
1720   /* for PrintMemory */
1721   firsttime=0;
1722 
1723   return time0;
1724 }
1725 
1726 
1727 
1728 
1729 
1730 
1731 
1732 
1733 
Generate_pMatrix(int myid,int spin,int Mc_AN,double ***** Hks,double **** OLP0,double ** invS,double *** Krylov_U,double *** Krylov_U_OLP,double ** inv_RS,int * MP,int * Msize,int * Msize2,int * Msize3,int * Msize4,int Msize2_max,double ** tmpvec0,double ** tmpvec1,double ** tmpvec2)1734 void Generate_pMatrix( int myid, int spin, int Mc_AN, double *****Hks,
1735                        double ****OLP0, double **invS,
1736                        double ***Krylov_U, double ***Krylov_U_OLP, double **inv_RS, int *MP,
1737                        int *Msize, int *Msize2, int *Msize3, int *Msize4, int Msize2_max,
1738                        double **tmpvec0, double **tmpvec1, double **tmpvec2 )
1739 {
1740   int rl,rl0,rl1,ct_AN,fan,san,can,wan,ct_on,i,j;
1741   int n,Anum,Bnum,k,ian,ih,kl,jg,ig,jan,m,m1,n1;
1742   int ZeroNum,Gh_AN,wanB,m1s,is;
1743   int rl00,rl01,rl02,rl03,rl04,rl05,rl06,rl07;
1744   int mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7;
1745 
1746   int KU_d1, KU_d2, csize;
1747   __m128d mmSum00,mmSum01,mmSum10,mmSum11,mmSum20,mmSum21,mmSum30,mmSum31, mmTmp0, mmTmp1, mmTmp2, mmTmp3, mmTmp4, mmTmp5;
1748 
1749   double mmArr[8];
1750   double time1,time2,time3,time4,time5;
1751   double time6,time7,time8,time9,time10;
1752   double Stime1,Etime1;
1753   double sum0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1754   double sum,dum,tmp0,tmp1,tmp2,tmp3,rcutA,r0;
1755   double **Utmp,**matRS0,**matRS1;
1756   double **tmpmat0;
1757   double *ko,*iko;
1758   double **FS;
1759   double ***U0;
1760 
1761   ct_AN = M2G[Mc_AN];
1762   fan = FNAN[ct_AN];
1763   san = SNAN[ct_AN];
1764   can = fan + san;
1765   wan = WhatSpecies[ct_AN];
1766   ct_on = Spe_Total_CNO[wan];
1767   rcutA = Spe_Atom_Cut1[wan];
1768 
1769   if (Msize[Mc_AN]<Msize3[Mc_AN])
1770     csize = Msize3[Mc_AN] + 40;
1771   else
1772     csize = Msize[Mc_AN] + 40;
1773 
1774   KU_d1 = EKC_core_size[Mc_AN]*Msize2[Mc_AN];
1775   KU_d2 = Msize2[Mc_AN];
1776 
1777   /* allocation of arrays */
1778 
1779   Utmp = (double**)malloc(sizeof(double*)*rlmax_EC[Mc_AN]);
1780   for (i=0; i<rlmax_EC[Mc_AN]; i++){
1781     Utmp[i] = (double*)malloc(sizeof(double)*EKC_core_size[Mc_AN]);
1782   }
1783 
1784   U0 = (double***)malloc(sizeof(double**)*rlmax_EC[Mc_AN]);
1785   for (i=0; i<rlmax_EC[Mc_AN]; i++){
1786     U0[i] = (double**)malloc(sizeof(double*)*EKC_core_size[Mc_AN]);
1787     for (j=0; j<EKC_core_size[Mc_AN]; j++){
1788       U0[i][j] = (double*)malloc(sizeof(double)*(Msize2[Mc_AN]+3));
1789       for (k=0; k<(Msize2[Mc_AN]+3); k++)  U0[i][j][k] = 0.0;
1790     }
1791   }
1792 
1793   tmpmat0 = (double**)malloc(sizeof(double*)*(EKC_core_size[Mc_AN]+4));
1794   for (i=0; i<(EKC_core_size[Mc_AN]+4); i++){
1795     tmpmat0[i] = (double*)malloc(sizeof(double)*(EKC_core_size[Mc_AN]+4));
1796   }
1797 
1798   FS = (double**)malloc(sizeof(double*)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
1799   for (i=0; i<(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]; i++){
1800     FS[i] = (double*)malloc(sizeof(double)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
1801   }
1802 
1803   ko = (double*)malloc(sizeof(double)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
1804   iko = (double*)malloc(sizeof(double)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
1805 
1806   matRS0 = (double**)malloc(sizeof(double*)*(EKC_core_size[Mc_AN]+2));
1807   for (i=0; i<(EKC_core_size[Mc_AN]+2); i++){
1808     matRS0[i] = (double*)malloc(sizeof(double)*(Msize4[Mc_AN]+3));
1809   }
1810 
1811   matRS1 = (double**)malloc(sizeof(double*)*(Msize4[Mc_AN]+3));
1812   for (i=0; i<(Msize4[Mc_AN]+3); i++){
1813     matRS1[i] = (double*)malloc(sizeof(double)*(EKC_core_size[Mc_AN]+2));
1814   }
1815 
1816   /****************************************************
1817       initialize
1818   ****************************************************/
1819 
1820   if (measure_time==1){
1821     time1 = 0.0;
1822     time2 = 0.0;
1823     time3 = 0.0;
1824     time4 = 0.0;
1825     time5 = 0.0;
1826     time6 = 0.0;
1827     time7 = 0.0;
1828     time8 = 0.0;
1829     time9 = 0.0;
1830     time10 = 0.0;
1831   }
1832 
1833   if (measure_time==1) dtime(&Stime1);
1834 
1835   for (i=0; i<EKC_core_size_max; i++){
1836     for (j=0; j<Msize2_max; j++){
1837       tmpvec0[i][j] = 0.0;
1838       tmpvec1[i][j] = 0.0;
1839     }
1840   }
1841 
1842   /* find the nearest atom with distance of r0 */
1843 
1844   r0 = 1.0e+10;
1845   for (k=1; k<=FNAN[ct_AN]; k++){
1846     Gh_AN = natn[ct_AN][k];
1847     wanB = WhatSpecies[Gh_AN];
1848     if (Dis[ct_AN][k]<r0)  r0 = Dis[ct_AN][k];
1849   }
1850 
1851   /* starting vector */
1852 
1853   m = 0;
1854   for (k=0; k<=FNAN[ct_AN]; k++){
1855 
1856     Gh_AN = natn[ct_AN][k];
1857     wanB = WhatSpecies[Gh_AN];
1858 
1859     if ( Dis[ct_AN][k]<(scale_rc_EKC[Mc_AN]*r0) ){
1860 
1861       Anum = MP[k] - 1;
1862 
1863       for (i=0; i<Spe_Total_CNO[wanB]; i++){
1864 
1865         tmpvec0[m][Anum+i] = 1.0;
1866 
1867         m++;
1868       }
1869     }
1870   }
1871 
1872   S_orthonormalize_vec( Mc_AN, ct_on, tmpvec0, tmpvec1, OLP0, tmpmat0, ko, iko, MP, Msize2 );
1873 
1874   for (n=0; n<EKC_core_size[Mc_AN]; n++){
1875     for (i=0; i<Msize2[Mc_AN]; i++){
1876       U0[0][n][i] = tmpvec0[n][i];
1877     }
1878   }
1879 
1880   if (measure_time==1){
1881     dtime(&Etime1);
1882     time1 = Etime1 - Stime1;
1883   }
1884 
1885   /****************************************************
1886            generate Krylov subspace vectors
1887   ****************************************************/
1888 
1889   for (rl=0; rl<(rlmax_EC[Mc_AN]-1); rl++){
1890 
1891     if (measure_time==1) dtime(&Stime1);
1892 
1893     /*******************************************************
1894                             H * |Wn)
1895     *******************************************************/
1896 
1897     for (n=0; n<EKC_core_size[Mc_AN]; n++){
1898       for (i=0; i<Msize2[Mc_AN]; i++){
1899 	tmpvec1[n][i]  = 0.0;
1900       }
1901     }
1902 
1903     for (i=0; i<=can; i++){
1904 
1905       ig = natn[ct_AN][i];
1906       ian = Spe_Total_CNO[WhatSpecies[ig]];
1907       Anum = MP[i] - 1;
1908       ih = S_G2M[ig];
1909 
1910       for (j=0; j<=can; j++){
1911 
1912 	kl = RMI1[Mc_AN][i][j];
1913 	jg = natn[ct_AN][j];
1914 	jan = Spe_Total_CNO[WhatSpecies[jg]];
1915         Bnum = MP[j] - 1;
1916 
1917         if (0<=kl){
1918 
1919 #ifdef nosse
1920 
1921       /* original version */
1922 
1923       	  for (m=0; m<ian; m++){
1924 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
1925 
1926 	      sum = 0.0;
1927 	      for (k=0; k<jan; k++){
1928 		sum += Hks[spin][ih][kl][m][k]*tmpvec0[n][Bnum+k];
1929 	      }
1930 
1931               tmpvec1[n][Anum+m] += sum;
1932 	    }
1933 	  }
1934 
1935 #else
1936       /* Loop Unrolling + SSE version */
1937 
1938 	  for (m=0; m<(ian-3); m+=4){
1939 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
1940 
1941 	      mmSum00 = _mm_setzero_pd();
1942 	      mmSum01 = _mm_setzero_pd();
1943 	      mmSum10 = _mm_setzero_pd();
1944 	      mmSum11 = _mm_setzero_pd();
1945 	      mmSum20 = _mm_setzero_pd();
1946 	      mmSum21 = _mm_setzero_pd();
1947 	      mmSum30 = _mm_setzero_pd();
1948 	      mmSum31 = _mm_setzero_pd();
1949 
1950 	      for (k=0; k<(jan-3); k+=4){
1951 		mmTmp0 = _mm_loadu_pd(&tmpvec0[n][Bnum+k+0]);
1952 		mmTmp1 = _mm_loadu_pd(&tmpvec0[n][Bnum+k+2]);
1953 
1954 		mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+0][k+0]),mmTmp0));
1955 		mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+0][k+2]),mmTmp1));
1956 
1957 		mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+1][k+0]),mmTmp0));
1958 		mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+1][k+2]),mmTmp1));
1959 
1960 		mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+2][k+0]),mmTmp0));
1961 		mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+2][k+2]),mmTmp1));
1962 
1963 		mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+3][k+0]),mmTmp0));
1964 		mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+3][k+2]),mmTmp1));
1965 	      }
1966 
1967 	      mmSum00 = _mm_add_pd(mmSum00, mmSum01);
1968 	      mmSum10 = _mm_add_pd(mmSum10, mmSum11);
1969 	      mmSum20 = _mm_add_pd(mmSum20, mmSum21);
1970 	      mmSum30 = _mm_add_pd(mmSum30, mmSum31);
1971 
1972 	      _mm_storeu_pd(&mmArr[0], mmSum00);
1973 	      _mm_storeu_pd(&mmArr[2], mmSum10);
1974 	      _mm_storeu_pd(&mmArr[4], mmSum20);
1975 	      _mm_storeu_pd(&mmArr[6], mmSum30);
1976 
1977 	      sum0 = mmArr[0] + mmArr[1];
1978 	      sum1 = mmArr[2] + mmArr[3];
1979 	      sum2 = mmArr[4] + mmArr[5];
1980 	      sum3 = mmArr[6] + mmArr[7];
1981 
1982 	      for (; k<jan; k++){
1983 		sum0 += Hks[spin][ih][kl][m+0][k]*tmpvec0[n][Bnum+k];
1984 		sum1 += Hks[spin][ih][kl][m+1][k]*tmpvec0[n][Bnum+k];
1985 		sum2 += Hks[spin][ih][kl][m+2][k]*tmpvec0[n][Bnum+k];
1986 		sum3 += Hks[spin][ih][kl][m+3][k]*tmpvec0[n][Bnum+k];
1987 	      }
1988 
1989               tmpvec1[n][Anum+m+0] += sum0;
1990               tmpvec1[n][Anum+m+1] += sum1;
1991               tmpvec1[n][Anum+m+2] += sum2;
1992               tmpvec1[n][Anum+m+3] += sum3;
1993 	    }
1994 	  }
1995 
1996 	  for (; m<ian; m++){
1997 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
1998 
1999 	      sum = 0.0;
2000 	      for (k=0; k<jan; k++){
2001 		sum += Hks[spin][ih][kl][m][k]*tmpvec0[n][Bnum+k];
2002 	      }
2003 
2004               tmpvec1[n][Anum+m] += sum;
2005 	    }
2006 	  }
2007 #endif
2008 	}
2009       }
2010     }
2011 
2012     if (measure_time==1){
2013       dtime(&Etime1);
2014       time2 += Etime1 - Stime1;
2015     }
2016 
2017     /*******************************************************
2018            S^{-1} * H * |Wn)
2019     *******************************************************/
2020 
2021     if (EKC_Exact_invS_flag==1){
2022 
2023       if (measure_time==1) dtime(&Stime1);
2024 
2025 #ifdef nosse
2026 
2027       /* original version */
2028       /*
2029       for (n=0; n<EKC_core_size[Mc_AN]; n++){
2030         for (i=0; i<Msize2[Mc_AN]; i++){
2031 	  sum = 0.0;
2032 	  for (j=0; j<Msize2[Mc_AN]; j++){
2033 	    sum += invS[i][j]*tmpvec1[n][j];
2034 	  }
2035 	  tmpvec0[n][i] = sum;
2036         }
2037       }
2038       */
2039       /* unrolling version */
2040 
2041       for (i=0; i<(Msize2[Mc_AN]-3); i+=4){
2042         for (n=0; n<EKC_core_size[Mc_AN]; n++){
2043 
2044 	  sum0 = 0.0;
2045 	  sum1 = 0.0;
2046 	  sum2 = 0.0;
2047 	  sum3 = 0.0;
2048 
2049 	  for (j=0; j<Msize2[Mc_AN]; j++){
2050 	    sum0 += invS[i+0][j]*tmpvec1[n][j];
2051 	    sum1 += invS[i+1][j]*tmpvec1[n][j];
2052 	    sum2 += invS[i+2][j]*tmpvec1[n][j];
2053 	    sum3 += invS[i+3][j]*tmpvec1[n][j];
2054 	  }
2055 
2056 	  tmpvec0[n][i+0] = sum0;
2057 	  tmpvec0[n][i+1] = sum1;
2058 	  tmpvec0[n][i+2] = sum2;
2059 	  tmpvec0[n][i+3] = sum3;
2060         }
2061       }
2062 
2063       is = Msize2[Mc_AN] - Msize2[Mc_AN]%4;
2064 
2065       for (i=is; i<Msize2[Mc_AN]; i++){
2066         for (n=0; n<EKC_core_size[Mc_AN]; n++){
2067 	  sum = 0.0;
2068 	  for (j=0; j<Msize2[Mc_AN]; j++){
2069 	    sum += invS[i][j]*tmpvec1[n][j];
2070 	  }
2071 	  tmpvec0[n][i] = sum;
2072         }
2073       }
2074 
2075 #else
2076       /* unrolling + SSE version */
2077 
2078       for (i=0; i<(Msize2[Mc_AN]-3); i+=4){
2079         for (n=0; n<EKC_core_size[Mc_AN]; n++){
2080 
2081 	  mmSum00 = _mm_setzero_pd();
2082 	  mmSum01 = _mm_setzero_pd();
2083 	  mmSum10 = _mm_setzero_pd();
2084 	  mmSum11 = _mm_setzero_pd();
2085 	  mmSum20 = _mm_setzero_pd();
2086 	  mmSum21 = _mm_setzero_pd();
2087 	  mmSum30 = _mm_setzero_pd();
2088 	  mmSum31 = _mm_setzero_pd();
2089 
2090 	  for (j=0; j<(Msize2[Mc_AN]-3); j+=4){
2091 	    mmTmp0 = _mm_loadu_pd(&tmpvec1[n][j+0]);
2092 	    mmTmp1 = _mm_loadu_pd(&tmpvec1[n][j+2]);
2093 
2094 	    mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&invS[i+0][j+0]),mmTmp0));
2095 	    mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&invS[i+0][j+2]),mmTmp1));
2096 
2097 	    mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&invS[i+1][j+0]),mmTmp0));
2098 	    mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&invS[i+1][j+2]),mmTmp1));
2099 
2100 	    mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&invS[i+2][j+0]),mmTmp0));
2101 	    mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&invS[i+2][j+2]),mmTmp1));
2102 
2103 	    mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&invS[i+3][j+0]),mmTmp0));
2104 	    mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&invS[i+3][j+2]),mmTmp1));
2105 	  }
2106 
2107 	  mmSum00 = _mm_add_pd(mmSum00, mmSum01);
2108 	  mmSum10 = _mm_add_pd(mmSum10, mmSum11);
2109 	  mmSum20 = _mm_add_pd(mmSum20, mmSum21);
2110 	  mmSum30 = _mm_add_pd(mmSum30, mmSum31);
2111 
2112 	  _mm_storeu_pd(&mmArr[0], mmSum00);
2113 	  _mm_storeu_pd(&mmArr[2], mmSum10);
2114 	  _mm_storeu_pd(&mmArr[4], mmSum20);
2115 	  _mm_storeu_pd(&mmArr[6], mmSum30);
2116 
2117 	  sum0 = mmArr[0] + mmArr[1];
2118 	  sum1 = mmArr[2] + mmArr[3];
2119 	  sum2 = mmArr[4] + mmArr[5];
2120 	  sum3 = mmArr[6] + mmArr[7];
2121 
2122 	  for (; j<Msize2[Mc_AN]; j++){
2123 	    sum0 += invS[i+0][j]*tmpvec1[n][j];
2124 	    sum1 += invS[i+1][j]*tmpvec1[n][j];
2125 	    sum2 += invS[i+2][j]*tmpvec1[n][j];
2126 	    sum3 += invS[i+3][j]*tmpvec1[n][j];
2127 	  }
2128 
2129 	  tmpvec0[n][i+0] = sum0;
2130 	  tmpvec0[n][i+1] = sum1;
2131 	  tmpvec0[n][i+2] = sum2;
2132 	  tmpvec0[n][i+3] = sum3;
2133         }
2134       }
2135 
2136       is = Msize2[Mc_AN] - Msize2[Mc_AN]%4;
2137 
2138       for (i=is; i<Msize2[Mc_AN]; i++){
2139         for (n=0; n<EKC_core_size[Mc_AN]; n++){
2140 	  sum = 0.0;
2141 	  for (j=0; j<Msize2[Mc_AN]; j++){
2142 	    sum += invS[i][j]*tmpvec1[n][j];
2143 	  }
2144 	  tmpvec0[n][i] = sum;
2145         }
2146       }
2147 
2148 #endif
2149 
2150       if (measure_time==1){
2151 	dtime(&Etime1);
2152 	time3 += Etime1 - Stime1;
2153       }
2154     }
2155 
2156     /*******************************************************
2157           U * RS^-1 * U^+ * H * |Wn)
2158     *******************************************************/
2159 
2160     else if (EKC_invS_flag==1){
2161 
2162       /*  U^+ * H * |Wn) */
2163 
2164       for (rl0=0; rl0<rlmax_EC2[Mc_AN]; rl0++){
2165 	for (n=0; n<EKC_core_size[Mc_AN]; n++){
2166 	  for (m=0; m<EKC_core_size[Mc_AN]; m++){
2167 
2168 	    sum = 0.0;
2169 	    for (i=0; i<Msize2[Mc_AN]; i++){
2170 	      sum += Krylov_U_OLP[rl0][n][i]*tmpvec1[m][i];
2171 	    }
2172 
2173 	    /* transpose the later calcualtion */
2174 	    matRS0[m][rl0*EKC_core_size[Mc_AN]+n] = sum;
2175 	  }
2176 	}
2177       }
2178 
2179       /*  RS^-1 * U^+ * H * |Wn) */
2180 
2181       for (rl0=0; rl0<rlmax_EC2[Mc_AN]; rl0++){
2182 	for (n=0; n<EKC_core_size[Mc_AN]; n++){
2183 
2184 	  for (m=0; m<EKC_core_size[Mc_AN]; m++){
2185 
2186 	    sum = 0.0;
2187 	    for (i=0; i<Msize4[Mc_AN]; i++){
2188 	      sum += inv_RS[rl0*EKC_core_size[Mc_AN]+n][i]*matRS0[m][i];
2189 	    }
2190 
2191 	    matRS1[rl0*EKC_core_size[Mc_AN]+n][m] = sum;
2192 	  }
2193 	}
2194       }
2195 
2196       /*  U * RS^-1 * U^+ * H * |Wn) */
2197 
2198       for (n=0; n<EKC_core_size[Mc_AN]; n++){
2199 	for (i=0; i<Msize2[Mc_AN]; i++){
2200 	  tmpvec0[n][i] = 0.0;
2201 	}
2202       }
2203 
2204       for (rl0=0; rl0<rlmax_EC2[Mc_AN]; rl0++){
2205 	for (n=0; n<EKC_core_size[Mc_AN]; n++){
2206 	  for (m=0; m<EKC_core_size[Mc_AN]; m++){
2207 	    tmp0 = matRS1[rl0*EKC_core_size[Mc_AN]+n][m];
2208 	    for (i=0; i<Msize2[Mc_AN]; i++){
2209 	      tmpvec0[m][i] += Krylov_U_OLP[rl0][n][i]*tmp0;
2210 	    }
2211 	  }
2212 	}
2213       }
2214     }
2215 
2216     else {
2217       for (n=0; n<EKC_core_size[Mc_AN]; n++){
2218         for (i=0; i<Msize2[Mc_AN]; i++){
2219 	  tmpvec0[n][i]  = tmpvec1[n][i];
2220         }
2221       }
2222     }
2223 
2224     if (measure_time==1) dtime(&Stime1);
2225 
2226     /*************************************************************
2227      S-orthogonalization by a classical block Gram-Schmidt method
2228     *************************************************************/
2229 
2230     /* |tmpvec2) = S * |tmpvec0) */
2231 
2232     for (n=0; n<EKC_core_size[Mc_AN]; n++){
2233       for (i=0; i<Msize2[Mc_AN]; i++){
2234 	tmpvec2[n][i] = 0.0;
2235       }
2236     }
2237 
2238     for (i=0; i<=can; i++){
2239 
2240       ig = natn[ct_AN][i];
2241       ian = Spe_Total_CNO[WhatSpecies[ig]];
2242       Anum = MP[i] - 1;
2243       ih = S_G2M[ig];
2244 
2245       for (j=0; j<=can; j++){
2246 
2247 	kl = RMI1[Mc_AN][i][j];
2248 	jg = natn[ct_AN][j];
2249 	jan = Spe_Total_CNO[WhatSpecies[jg]];
2250 	Bnum = MP[j] - 1;
2251 
2252 	if (0<=kl){
2253 
2254 #ifdef nosse
2255 
2256 	  /* Original version */
2257 
2258 	  for (m=0; m<ian; m++){
2259 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
2260 
2261 	      sum = 0.0;
2262 	      for (k=0; k<jan; k++){
2263 		sum += OLP0[ih][kl][m][k]*tmpvec0[n][Bnum+k];
2264 	      }
2265 
2266 	      tmpvec2[n][Anum+m] += sum;
2267 	    }
2268 	  }
2269 
2270 #else
2271 
2272 	  /* Unrolling + SSE version */
2273 
2274 	  for (m=0; m<(ian-3); m+=4){
2275 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
2276 
2277 	      mmSum00 = _mm_setzero_pd();
2278 	      mmSum01 = _mm_setzero_pd();
2279 	      mmSum10 = _mm_setzero_pd();
2280 	      mmSum11 = _mm_setzero_pd();
2281 	      mmSum20 = _mm_setzero_pd();
2282 	      mmSum21 = _mm_setzero_pd();
2283 	      mmSum30 = _mm_setzero_pd();
2284 	      mmSum31 = _mm_setzero_pd();
2285 
2286 	      for (k=0; k<(jan-3); k+=4){
2287 		mmTmp0 = _mm_loadu_pd(&tmpvec0[n][Bnum+k+0]);
2288 		mmTmp1 = _mm_loadu_pd(&tmpvec0[n][Bnum+k+2]);
2289 
2290 		mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+0]),mmTmp0));
2291 		mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+2]),mmTmp1));
2292 
2293 		mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+0]),mmTmp0));
2294 		mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+2]),mmTmp1));
2295 
2296 		mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+0]),mmTmp0));
2297 		mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+2]),mmTmp1));
2298 
2299 		mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+0]),mmTmp0));
2300 		mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+2]),mmTmp1));
2301 	      }
2302 
2303 	      mmSum00 = _mm_add_pd(mmSum00, mmSum01);
2304 	      mmSum10 = _mm_add_pd(mmSum10, mmSum11);
2305 	      mmSum20 = _mm_add_pd(mmSum20, mmSum21);
2306 	      mmSum30 = _mm_add_pd(mmSum30, mmSum31);
2307 
2308 	      _mm_storeu_pd(&mmArr[0], mmSum00);
2309 	      _mm_storeu_pd(&mmArr[2], mmSum10);
2310 	      _mm_storeu_pd(&mmArr[4], mmSum20);
2311 	      _mm_storeu_pd(&mmArr[6], mmSum30);
2312 
2313 	      sum0 = mmArr[0] + mmArr[1];
2314 	      sum1 = mmArr[2] + mmArr[3];
2315 	      sum2 = mmArr[4] + mmArr[5];
2316 	      sum3 = mmArr[6] + mmArr[7];
2317 
2318 	      for (; k<jan; k++){
2319 		sum0 += OLP0[ih][kl][m+0][k]*tmpvec0[n][Bnum+k];
2320 		sum1 += OLP0[ih][kl][m+1][k]*tmpvec0[n][Bnum+k];
2321 		sum2 += OLP0[ih][kl][m+2][k]*tmpvec0[n][Bnum+k];
2322 		sum3 += OLP0[ih][kl][m+3][k]*tmpvec0[n][Bnum+k];
2323 	      }
2324 
2325 	      tmpvec2[n][Anum+m+0] += sum0;
2326 	      tmpvec2[n][Anum+m+1] += sum1;
2327 	      tmpvec2[n][Anum+m+2] += sum2;
2328 	      tmpvec2[n][Anum+m+3] += sum3;
2329 	    }
2330 	  }
2331 
2332 	  for (; m<ian; m++){
2333 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
2334 
2335 	      sum = 0.0;
2336 	      for (k=0; k<jan; k++){
2337 		sum += OLP0[ih][kl][m][k]*tmpvec0[n][Bnum+k];
2338 	      }
2339 
2340 	      tmpvec2[n][Anum+m] += sum;
2341 	    }
2342 	  }
2343 
2344 #endif
2345 
2346 	}
2347       }
2348     }
2349 
2350     if (measure_time==1){
2351       dtime(&Etime1);
2352       time4 += Etime1 - Stime1;
2353     }
2354 
2355     if (measure_time==1) dtime(&Stime1);
2356 
2357 #ifdef nosse
2358 
2359     /* Original version */
2360 
2361     for (rl0=0; rl0<=rl; rl0++){
2362 
2363       /* (U_rl0|tmpvec2) */
2364 
2365       for (m=0; m<EKC_core_size[Mc_AN]; m++){
2366         for (n=0; n<EKC_core_size[Mc_AN]; n++){
2367           sum = 0.0;
2368   	  for (i=0; i<Msize2[Mc_AN]; i++){
2369             sum += U0[rl0][m][i]*tmpvec2[n][i];
2370 	  }
2371           tmpmat0[m][n] = sum;
2372 	}
2373       }
2374 
2375       /* |tmpvec0) - |U_rl0) * (U_rl0|tmpvec2) */
2376 
2377       for (n=0; n<EKC_core_size[Mc_AN]; n++){
2378 	for (k=0; k<EKC_core_size[Mc_AN]; k++){
2379 	  dum = tmpmat0[k][n];
2380 	  for (i=0; i<Msize2[Mc_AN]; i++)  tmpvec0[n][i] -= U0[rl0][k][i]*dum;
2381         }
2382       }
2383 
2384     }
2385 
2386 #else
2387 
2388     /* Unrolling + SSE version */
2389 
2390     for (rl0=0; rl0<=rl; rl0++){
2391 
2392       /* (U_rl0|tmpvec2) */
2393 
2394       for (m=0; m<(EKC_core_size[Mc_AN]-3); m+=4){
2395         for (n=0; n<EKC_core_size[Mc_AN]; n++){
2396 
2397 	  mmSum00 = _mm_setzero_pd();
2398 	  mmSum01 = _mm_setzero_pd();
2399 	  mmSum10 = _mm_setzero_pd();
2400 	  mmSum11 = _mm_setzero_pd();
2401 	  mmSum20 = _mm_setzero_pd();
2402 	  mmSum21 = _mm_setzero_pd();
2403 	  mmSum30 = _mm_setzero_pd();
2404 	  mmSum31 = _mm_setzero_pd();
2405 
2406   	  for (i=0; i<(Msize2[Mc_AN]-3); i+=4){
2407 	    mmTmp0 = _mm_loadu_pd(&tmpvec2[n][i+0]);
2408 	    mmTmp1 = _mm_loadu_pd(&tmpvec2[n][i+2]);
2409 
2410 	    mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+0][i+0]),mmTmp0));
2411 	    mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+0][i+2]),mmTmp1));
2412 
2413 	    mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+1][i+0]),mmTmp0));
2414 	    mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+1][i+2]),mmTmp1));
2415 
2416 	    mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+2][i+0]),mmTmp0));
2417 	    mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+2][i+2]),mmTmp1));
2418 
2419 	    mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+3][i+0]),mmTmp0));
2420 	    mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+3][i+2]),mmTmp1));
2421 	  }
2422 
2423 	  mmSum00 = _mm_add_pd(mmSum00, mmSum01);
2424 	  mmSum10 = _mm_add_pd(mmSum10, mmSum11);
2425 	  mmSum20 = _mm_add_pd(mmSum20, mmSum21);
2426 	  mmSum30 = _mm_add_pd(mmSum30, mmSum31);
2427 
2428 	  _mm_storeu_pd(&mmArr[0], mmSum00);
2429 	  _mm_storeu_pd(&mmArr[2], mmSum10);
2430 	  _mm_storeu_pd(&mmArr[4], mmSum20);
2431 	  _mm_storeu_pd(&mmArr[6], mmSum30);
2432 
2433 	  sum0 = mmArr[0] + mmArr[1];
2434 	  sum1 = mmArr[2] + mmArr[3];
2435 	  sum2 = mmArr[4] + mmArr[5];
2436 	  sum3 = mmArr[6] + mmArr[7];
2437 
2438   	  for (; i<Msize2[Mc_AN]; i++){
2439             sum0 += U0[rl0][m+0][i]*tmpvec2[n][i];
2440             sum1 += U0[rl0][m+1][i]*tmpvec2[n][i];
2441             sum2 += U0[rl0][m+2][i]*tmpvec2[n][i];
2442             sum3 += U0[rl0][m+3][i]*tmpvec2[n][i];
2443 	  }
2444 
2445           tmpmat0[m+0][n] = sum0;
2446           tmpmat0[m+1][n] = sum1;
2447           tmpmat0[m+2][n] = sum2;
2448           tmpmat0[m+3][n] = sum3;
2449 	}
2450       }
2451 
2452       for (; m<EKC_core_size[Mc_AN]; m++){
2453         for (n=0; n<EKC_core_size[Mc_AN]; n++){
2454           sum = 0.0;
2455   	  for (i=0; i<Msize2[Mc_AN]; i++){
2456             sum += U0[rl0][m][i]*tmpvec2[n][i];
2457 	  }
2458           tmpmat0[m][n] = sum;
2459 	}
2460       }
2461 
2462       /* |tmpvec0) - |U_rl0) * (U_rl0|tmpvec2) */
2463 
2464       for (n=0; n<EKC_core_size[Mc_AN]; n++){
2465 	for (k=0; k<EKC_core_size[Mc_AN]; k++){
2466 	  dum = tmpmat0[k][n];
2467 	  for (i=0; i<Msize2[Mc_AN]; i++)  tmpvec0[n][i] -= U0[rl0][k][i]*dum;
2468         }
2469       }
2470 
2471     }
2472 
2473 #endif
2474 
2475     if (measure_time==1){
2476       dtime(&Etime1);
2477       time5 += Etime1 - Stime1;
2478     }
2479 
2480     /*************************************************************
2481                    S-orthonormalization of tmpvec0
2482     *************************************************************/
2483 
2484     if (measure_time==1) dtime(&Stime1);
2485 
2486     S_orthonormalize_vec( Mc_AN, ct_on, tmpvec0, tmpvec1, OLP0, tmpmat0, ko, iko, MP, Msize2 );
2487 
2488     for (n=0; n<EKC_core_size[Mc_AN]; n++){
2489       for (i=0; i<Msize2[Mc_AN]; i++){
2490         U0[rl+1][n][i] = tmpvec0[n][i];
2491       }
2492     }
2493 
2494     if (measure_time==1){
2495       dtime(&Etime1);
2496       time6 += Etime1 - Stime1;
2497     }
2498 
2499   } /* rl */
2500 
2501   /************************************************************
2502               orthogonalization by diagonalization
2503   ************************************************************/
2504 
2505   if (measure_time==1) dtime(&Stime1);
2506 
2507   for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
2508 
2509     /*  S * |Vn) */
2510 
2511     for (n=0; n<EKC_core_size[Mc_AN]; n++){
2512       for (i=0; i<Msize2[Mc_AN]; i++){
2513 	tmpvec1[n][i]  = 0.0;
2514       }
2515     }
2516 
2517     for (i=0; i<=can; i++){
2518 
2519       ig = natn[ct_AN][i];
2520       ian = Spe_Total_CNO[WhatSpecies[ig]];
2521       Anum = MP[i] - 1;
2522       ih = S_G2M[ig];
2523 
2524       for (j=0; j<=can; j++){
2525 
2526 	kl = RMI1[Mc_AN][i][j];
2527 	jg = natn[ct_AN][j];
2528 	jan = Spe_Total_CNO[WhatSpecies[jg]];
2529 	Bnum = MP[j] - 1;
2530 
2531 	if (0<=kl){
2532 
2533 #ifdef nosse
2534 
2535 	  /* Original version */
2536 	  /**/
2537 	  for (m=0; m<ian; m++){
2538 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
2539 
2540 	      sum = 0.0;
2541 	      for (k=0; k<jan; k++){
2542 		sum += OLP0[ih][kl][m][k]*U0[rl][n][Bnum+k];
2543 	      }
2544 	      tmpvec1[n][Anum+m] += sum;
2545 	    }
2546 	  }
2547 	  /**/
2548 
2549 #else
2550 
2551 	  /* Unrolling + SSE version */
2552 	  /**/
2553 	  for (m=0; m<(ian-3); m+=4){
2554 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
2555 
2556 	      mmSum00 = _mm_setzero_pd();
2557 	      mmSum01 = _mm_setzero_pd();
2558 	      mmSum10 = _mm_setzero_pd();
2559 	      mmSum11 = _mm_setzero_pd();
2560 	      mmSum20 = _mm_setzero_pd();
2561 	      mmSum21 = _mm_setzero_pd();
2562 	      mmSum30 = _mm_setzero_pd();
2563 	      mmSum31 = _mm_setzero_pd();
2564 
2565 	      for (k=0; k<(jan-3); k+=4){
2566 		mmTmp0 = _mm_loadu_pd(&U0[rl][n][Bnum+k+0]);
2567 		mmTmp1 = _mm_loadu_pd(&U0[rl][n][Bnum+k+2]);
2568 
2569 		mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+0]),mmTmp0));
2570 		mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+2]),mmTmp1));
2571 
2572 		mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+0]),mmTmp0));
2573 		mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+2]),mmTmp1));
2574 
2575 		mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+0]),mmTmp0));
2576 		mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+2]),mmTmp1));
2577 
2578 		mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+0]),mmTmp0));
2579 		mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+2]),mmTmp1));
2580 	      }
2581 
2582 	      mmSum00 = _mm_add_pd(mmSum00, mmSum01);
2583 	      mmSum10 = _mm_add_pd(mmSum10, mmSum11);
2584 	      mmSum20 = _mm_add_pd(mmSum20, mmSum21);
2585 	      mmSum30 = _mm_add_pd(mmSum30, mmSum31);
2586 
2587 	      _mm_storeu_pd(&mmArr[0], mmSum00);
2588 	      _mm_storeu_pd(&mmArr[2], mmSum10);
2589 	      _mm_storeu_pd(&mmArr[4], mmSum20);
2590 	      _mm_storeu_pd(&mmArr[6], mmSum30);
2591 
2592 	      sum0 = mmArr[0] + mmArr[1];
2593 	      sum1 = mmArr[2] + mmArr[3];
2594 	      sum2 = mmArr[4] + mmArr[5];
2595 	      sum3 = mmArr[6] + mmArr[7];
2596 
2597 	      for (; k<jan; k++){
2598 		sum0 += OLP0[ih][kl][m+0][k]*U0[rl][n][Bnum+k];
2599 		sum1 += OLP0[ih][kl][m+1][k]*U0[rl][n][Bnum+k];
2600 		sum2 += OLP0[ih][kl][m+2][k]*U0[rl][n][Bnum+k];
2601 		sum3 += OLP0[ih][kl][m+3][k]*U0[rl][n][Bnum+k];
2602 	      }
2603 
2604 	      tmpvec1[n][Anum+m+0] += sum0;
2605 	      tmpvec1[n][Anum+m+1] += sum1;
2606 	      tmpvec1[n][Anum+m+2] += sum2;
2607 	      tmpvec1[n][Anum+m+3] += sum3;
2608 	    }
2609 	  }
2610 
2611 	  for (; m<ian; m++){
2612 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
2613 
2614 	      sum = 0.0;
2615 	      for (k=0; k<jan; k++){
2616 		sum += OLP0[ih][kl][m][k]*U0[rl][n][Bnum+k];
2617 	      }
2618 	      tmpvec1[n][Anum+m] += sum;
2619 	    }
2620 	  }
2621 	  /**/
2622 #endif
2623 
2624 	}
2625       }
2626     }
2627 
2628 #ifdef nosse
2629 
2630     /* Original version */
2631     /**/
2632     for (rl0=rl; rl0<rlmax_EC[Mc_AN]; rl0++){
2633       for (m=0; m<EKC_core_size[Mc_AN]; m++){
2634         for (n=0; n<EKC_core_size[Mc_AN]; n++){
2635   	  sum = 0.0;
2636 	  for (i=0; i<Msize2[Mc_AN]; i++){
2637             sum += U0[rl0][m][i]*tmpvec1[n][i];
2638 	  }
2639           FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
2640           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum;
2641 	}
2642       }
2643     }
2644     /**/
2645 
2646 #else
2647 
2648     /* Unrolling + SSE version */
2649     /**/
2650     for (rl0=rl; rl0<rlmax_EC[Mc_AN]; rl0++){
2651       for (m=0; m<(EKC_core_size[Mc_AN]-3); m+=4){
2652         for (n=0; n<EKC_core_size[Mc_AN]; n++){
2653 
2654 	  mmSum00 = _mm_setzero_pd();
2655 	  mmSum01 = _mm_setzero_pd();
2656 	  mmSum10 = _mm_setzero_pd();
2657 	  mmSum11 = _mm_setzero_pd();
2658 	  mmSum20 = _mm_setzero_pd();
2659 	  mmSum21 = _mm_setzero_pd();
2660 	  mmSum30 = _mm_setzero_pd();
2661 	  mmSum31 = _mm_setzero_pd();
2662 
2663 	  for (i=0; i<(Msize2[Mc_AN]-3); i+=4){
2664 	    mmTmp0 = _mm_loadu_pd(&tmpvec1[n][i+0]);
2665 	    mmTmp1 = _mm_loadu_pd(&tmpvec1[n][i+2]);
2666 
2667 	    mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+0][i+0]),mmTmp0));
2668 	    mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+0][i+2]),mmTmp1));
2669 
2670 	    mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+1][i+0]),mmTmp0));
2671 	    mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+1][i+2]),mmTmp1));
2672 
2673 	    mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+2][i+0]),mmTmp0));
2674 	    mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+2][i+2]),mmTmp1));
2675 
2676 	    mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+3][i+0]),mmTmp0));
2677 	    mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+3][i+2]),mmTmp1));
2678 	  }
2679 
2680 	  mmSum00 = _mm_add_pd(mmSum00, mmSum01);
2681 	  mmSum10 = _mm_add_pd(mmSum10, mmSum11);
2682 	  mmSum20 = _mm_add_pd(mmSum20, mmSum21);
2683 	  mmSum30 = _mm_add_pd(mmSum30, mmSum31);
2684 
2685 	  _mm_storeu_pd(&mmArr[0], mmSum00);
2686 	  _mm_storeu_pd(&mmArr[2], mmSum10);
2687 	  _mm_storeu_pd(&mmArr[4], mmSum20);
2688 	  _mm_storeu_pd(&mmArr[6], mmSum30);
2689 
2690 	  sum0 = mmArr[0] + mmArr[1];
2691 	  sum1 = mmArr[2] + mmArr[3];
2692 	  sum2 = mmArr[4] + mmArr[5];
2693 	  sum3 = mmArr[6] + mmArr[7];
2694 
2695 	  for (; i<Msize2[Mc_AN]; i++){
2696             sum0 += U0[rl0][m+0][i]*tmpvec1[n][i];
2697             sum1 += U0[rl0][m+1][i]*tmpvec1[n][i];
2698             sum2 += U0[rl0][m+2][i]*tmpvec1[n][i];
2699             sum3 += U0[rl0][m+3][i]*tmpvec1[n][i];
2700 	  }
2701 
2702           FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum0;
2703           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum0;
2704 
2705           FS[rl0*EKC_core_size[Mc_AN]+m+2][rl*EKC_core_size[Mc_AN]+n+1] = sum1;
2706           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+2] = sum1;
2707 
2708           FS[rl0*EKC_core_size[Mc_AN]+m+3][rl*EKC_core_size[Mc_AN]+n+1] = sum2;
2709           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+3] = sum2;
2710 
2711           FS[rl0*EKC_core_size[Mc_AN]+m+4][rl*EKC_core_size[Mc_AN]+n+1] = sum3;
2712           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+4] = sum3;
2713 	}
2714       }
2715 
2716       for (; m<EKC_core_size[Mc_AN]; m++){
2717         for (n=0; n<EKC_core_size[Mc_AN]; n++){
2718   	  sum = 0.0;
2719 	  for (i=0; i<Msize2[Mc_AN]; i++){
2720             sum += U0[rl0][m][i]*tmpvec1[n][i];
2721 	  }
2722           FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
2723           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum;
2724 	}
2725       }
2726 
2727     }
2728     /**/
2729 
2730 #endif
2731 
2732   }
2733 
2734   if (measure_time==1){
2735     dtime(&Etime1);
2736     time7 += Etime1 - Stime1;
2737   }
2738 
2739   if (measure_time==1) dtime(&Stime1);
2740 
2741   Eigen_lapack(FS,ko,Msize3[Mc_AN],Msize3[Mc_AN]);
2742 
2743   if (measure_time==1){
2744     dtime(&Etime1);
2745     time8 += Etime1 - Stime1;
2746   }
2747 
2748   ZeroNum = 0;
2749 
2750   for (i=1; i<=Msize3[Mc_AN]; i++){
2751 
2752     if (error_check==1){
2753       printf("spin=%2d Mc_AN=%2d i=%3d ko[i]=%18.15f\n",spin,Mc_AN,i,ko[i]);
2754     }
2755 
2756     if (cutoff_value<ko[i]){
2757       ko[i]  = sqrt(fabs(ko[i]));
2758       iko[i] = 1.0/ko[i];
2759     }
2760     else{
2761       ZeroNum++;
2762       ko[i]  = 0.0;
2763       iko[i] = 0.0;
2764     }
2765   }
2766 
2767   if (error_check==1){
2768     printf("spin=%2d Mc_AN=%2d ZeroNum=%2d\n",spin,Mc_AN,ZeroNum);
2769   }
2770 
2771   for (i=1; i<=Msize3[Mc_AN]; i++){
2772     for (j=1; j<=Msize3[Mc_AN]; j++){
2773       FS[i][j] = FS[i][j]*iko[j];
2774     }
2775   }
2776 
2777   /* transpose for later calculation */
2778   for (i=1; i<=Msize3[Mc_AN]; i++){
2779     for (j=i+1; j<=Msize3[Mc_AN]; j++){
2780       tmp1 = FS[i][j];
2781       tmp2 = FS[j][i];
2782       FS[i][j] = tmp2;
2783       FS[j][i] = tmp1;
2784     }
2785   }
2786 
2787   if (measure_time==1) dtime(&Stime1);
2788 
2789   /* U0 * U * lamda^{-1/2} */
2790 
2791 #ifdef nosse
2792 
2793   /* original version */
2794   /*
2795   for (i=0; i<Msize2[Mc_AN]; i++){
2796     for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
2797       for (m=0; m<EKC_core_size[Mc_AN]; m++){
2798 
2799         m1 = rl0*EKC_core_size[Mc_AN] + m + 1;
2800 
2801 	sum = 0.0;
2802 	for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
2803 
2804           n1 = rl*EKC_core_size[Mc_AN] + 1;
2805 
2806 	  for (n=0; n<EKC_core_size[Mc_AN]; n++){
2807 	    sum += U0[rl][n][i]*FS[m1][n1+n];
2808 	  }
2809 	}
2810 
2811         Utmp[rl0][m] = sum;
2812       }
2813     }
2814 
2815     for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
2816       for (m=0; m<EKC_core_size[Mc_AN]; m++){
2817         U0[rl0][m][i] = Utmp[rl0][m];
2818       }
2819     }
2820   }
2821   */
2822 
2823   /* unrolling version */
2824 
2825   for (i=0; i<Msize2[Mc_AN]; i++){
2826 
2827     for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
2828       for (n=0; n<EKC_core_size[Mc_AN]; n++){
2829         Utmp[rl][n] = U0[rl][n][i];
2830       }
2831     }
2832 
2833     for (m1=1; m1<=(rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]-3); m1+=4){
2834 
2835       rl00 = (m1+0-1)/EKC_core_size[Mc_AN];
2836       rl01 = (m1+1-1)/EKC_core_size[Mc_AN];
2837       rl02 = (m1+2-1)/EKC_core_size[Mc_AN];
2838       rl03 = (m1+3-1)/EKC_core_size[Mc_AN];
2839 
2840       mm0 = (m1+0-1)%EKC_core_size[Mc_AN];
2841       mm1 = (m1+1-1)%EKC_core_size[Mc_AN];
2842       mm2 = (m1+2-1)%EKC_core_size[Mc_AN];
2843       mm3 = (m1+3-1)%EKC_core_size[Mc_AN];
2844 
2845       sum0 = 0.0;
2846       sum1 = 0.0;
2847       sum2 = 0.0;
2848       sum3 = 0.0;
2849 
2850       for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
2851 
2852 	n1 = rl*EKC_core_size[Mc_AN] + 1;
2853 
2854 	for (n=0; n<EKC_core_size[Mc_AN]; n++){
2855 	  sum0 += Utmp[rl][n]*FS[m1+0][n1+n];
2856 	  sum1 += Utmp[rl][n]*FS[m1+1][n1+n];
2857 	  sum2 += Utmp[rl][n]*FS[m1+2][n1+n];
2858 	  sum3 += Utmp[rl][n]*FS[m1+3][n1+n];
2859 	}
2860       }
2861 
2862       U0[rl00][mm0][i] = sum0;
2863       U0[rl01][mm1][i] = sum1;
2864       U0[rl02][mm2][i] = sum2;
2865       U0[rl03][mm3][i] = sum3;
2866     }
2867 
2868     m1s = rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN] - (rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN])%4 + 1;
2869 
2870     for (m1=m1s; m1<=rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]; m1++){
2871 
2872       rl0 = (m1-1)/EKC_core_size[Mc_AN];
2873       m = (m1-1)%EKC_core_size[Mc_AN];
2874 
2875       sum = 0.0;
2876 
2877       for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
2878 
2879 	n1 = rl*EKC_core_size[Mc_AN] + 1;
2880 
2881 	for (n=0; n<EKC_core_size[Mc_AN]; n++){
2882 	  sum += Utmp[rl][n]*FS[m1][n1+n];
2883 	}
2884       }
2885 
2886       U0[rl0][m][i] = sum;
2887     }
2888   } /* i */
2889 
2890 #else
2891 
2892   /* Unrolling + SSE version */
2893   /**/
2894 
2895   for (i=0; i<Msize2[Mc_AN]; i++){
2896 
2897     for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
2898       for (n=0; n<EKC_core_size[Mc_AN]; n++){
2899         Utmp[rl][n] = U0[rl][n][i];
2900       }
2901     }
2902 
2903     for (m1=1; m1<=(rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]-3); m1+=4){
2904 
2905       rl00 = (m1+0-1)/EKC_core_size[Mc_AN];
2906       rl01 = (m1+1-1)/EKC_core_size[Mc_AN];
2907       rl02 = (m1+2-1)/EKC_core_size[Mc_AN];
2908       rl03 = (m1+3-1)/EKC_core_size[Mc_AN];
2909 
2910       mm0 = (m1+0-1)%EKC_core_size[Mc_AN];
2911       mm1 = (m1+1-1)%EKC_core_size[Mc_AN];
2912       mm2 = (m1+2-1)%EKC_core_size[Mc_AN];
2913       mm3 = (m1+3-1)%EKC_core_size[Mc_AN];
2914 
2915       sum0 = 0.0;
2916       sum1 = 0.0;
2917       sum2 = 0.0;
2918       sum3 = 0.0;
2919 
2920       mmSum00 = _mm_setzero_pd();
2921       mmSum01 = _mm_setzero_pd();
2922       mmSum10 = _mm_setzero_pd();
2923       mmSum11 = _mm_setzero_pd();
2924       mmSum20 = _mm_setzero_pd();
2925       mmSum21 = _mm_setzero_pd();
2926       mmSum30 = _mm_setzero_pd();
2927       mmSum31 = _mm_setzero_pd();
2928 
2929       for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
2930 
2931 	n1 = rl*EKC_core_size[Mc_AN] + 1;
2932 
2933 	if (0){
2934 	mmTmp0 = _mm_loadu_pd(&Utmp[rl][n+0]);
2935 	mmTmp1 = _mm_loadu_pd(&Utmp[rl][n+2]);
2936 	}
2937 
2938 	for (n=0; n<(EKC_core_size[Mc_AN]-3); n+=4){
2939 	  mmTmp0 = _mm_loadu_pd(&Utmp[rl][n+0]);
2940 	  mmTmp1 = _mm_loadu_pd(&Utmp[rl][n+2]);
2941 
2942 	  mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&FS[m1+0][n1+n+0]),mmTmp0));
2943 	  mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&FS[m1+0][n1+n+2]),mmTmp1));
2944 
2945 	  mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&FS[m1+1][n1+n+0]),mmTmp0));
2946 	  mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&FS[m1+1][n1+n+2]),mmTmp1));
2947 
2948 	  mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&FS[m1+2][n1+n+0]),mmTmp0));
2949 	  mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&FS[m1+2][n1+n+2]),mmTmp1));
2950 
2951 	  mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&FS[m1+3][n1+n+0]),mmTmp0));
2952 	  mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&FS[m1+3][n1+n+2]),mmTmp1));
2953 	}
2954 
2955 	for (; n<EKC_core_size[Mc_AN]; n++){
2956 	  sum0 += Utmp[rl][n]*FS[m1+0][n1+n];
2957 	  sum1 += Utmp[rl][n]*FS[m1+1][n1+n];
2958 	  sum2 += Utmp[rl][n]*FS[m1+2][n1+n];
2959 	  sum3 += Utmp[rl][n]*FS[m1+3][n1+n];
2960 	}
2961 
2962       }
2963 
2964       mmSum00 = _mm_add_pd(mmSum00, mmSum01);
2965       mmSum10 = _mm_add_pd(mmSum10, mmSum11);
2966       mmSum20 = _mm_add_pd(mmSum20, mmSum21);
2967       mmSum30 = _mm_add_pd(mmSum30, mmSum31);
2968 
2969       _mm_storeu_pd(&mmArr[0], mmSum00);
2970       _mm_storeu_pd(&mmArr[2], mmSum10);
2971       _mm_storeu_pd(&mmArr[4], mmSum20);
2972       _mm_storeu_pd(&mmArr[6], mmSum30);
2973 
2974       sum0 += mmArr[0] + mmArr[1];
2975       sum1 += mmArr[2] + mmArr[3];
2976       sum2 += mmArr[4] + mmArr[5];
2977       sum3 += mmArr[6] + mmArr[7];
2978 
2979       U0[rl00][mm0][i] = sum0;
2980       U0[rl01][mm1][i] = sum1;
2981       U0[rl02][mm2][i] = sum2;
2982       U0[rl03][mm3][i] = sum3;
2983     }
2984 
2985     m1s = rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN] - (rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN])%4 + 1;
2986 
2987     for (m1=m1s; m1<=rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]; m1++){
2988 
2989       rl0 = (m1-1)/EKC_core_size[Mc_AN];
2990       m = (m1-1)%EKC_core_size[Mc_AN];
2991 
2992       sum = 0.0;
2993 
2994       for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
2995 
2996 	n1 = rl*EKC_core_size[Mc_AN] + 1;
2997 
2998 	for (n=0; n<EKC_core_size[Mc_AN]; n++){
2999 	  sum += Utmp[rl][n]*FS[m1][n1+n];
3000 	}
3001       }
3002 
3003       U0[rl0][m][i] = sum;
3004     }
3005   } /* i */
3006 
3007 #endif
3008 
3009   Krylov_U[spin][Mc_AN][0] = ZeroNum;
3010 
3011   for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
3012     for (n=0; n<EKC_core_size[Mc_AN]; n++){
3013       for (i=0; i<Msize2[Mc_AN]; i++){
3014         Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+i+1] = U0[rl][n][i];
3015       }
3016     }
3017   }
3018 
3019   if (measure_time==1){
3020     dtime(&Etime1);
3021     time9 += Etime1 - Stime1;
3022   }
3023 
3024   /************************************************************
3025          check the orthonormality of Krylov vectors
3026   ************************************************************/
3027 
3028   if (error_check==1){
3029 
3030     for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
3031 
3032       for (n=0; n<EKC_core_size[Mc_AN]; n++){
3033 	for (i=0; i<Msize2[Mc_AN]; i++){
3034 	  tmpvec1[n][i]  = 0.0;
3035 	}
3036       }
3037 
3038       for (i=0; i<=can; i++){
3039 
3040 	ig = natn[ct_AN][i];
3041 	ian = Spe_Total_CNO[WhatSpecies[ig]];
3042 	Anum = MP[i] - 1;
3043 	ih = S_G2M[ig];
3044 
3045 	for (j=0; j<=can; j++){
3046 
3047 	  kl = RMI1[Mc_AN][i][j];
3048 	  jg = natn[ct_AN][j];
3049 	  jan = Spe_Total_CNO[WhatSpecies[jg]];
3050 	  Bnum = MP[j] - 1;
3051 
3052 	  if (0<=kl){
3053 
3054 	    for (m=0; m<ian; m++){
3055 	      for (n=0; n<EKC_core_size[Mc_AN]; n++){
3056 
3057 		sum = 0.0;
3058 		for (k=0; k<jan; k++){
3059 		  sum += OLP0[ih][kl][m][k]*U0[rl][n][Bnum+k];
3060 		}
3061 
3062 		tmpvec1[n][Anum+m] += sum;
3063 	      }
3064 	    }
3065 	  }
3066 	}
3067       }
3068 
3069       for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
3070 	for (m=0; m<EKC_core_size[Mc_AN]; m++){
3071 	  for (n=0; n<EKC_core_size[Mc_AN]; n++){
3072 	    sum = 0.0;
3073 	    for (i=0; i<Msize2[Mc_AN]; i++){
3074 	      sum += U0[rl0][m][i]*tmpvec1[n][i];
3075 	    }
3076 
3077 	    if (rl==rl0 && m==n){
3078 	      if ( 1.0e-10<fabs(sum-1.0) ) {
3079 		printf("A spin=%2d Mc_AN=%2d rl=%2d rl0=%2d m=%2d n=%2d sum=%18.15f\n",
3080 		       spin,Mc_AN,rl,rl0,m,n,sum);
3081 	      }
3082 	    }
3083 	    else{
3084 	      if ( 1.0e-10<fabs(sum) ) {
3085 		printf("B spin=%2d Mc_AN=%2d rl=%2d rl0=%2d m=%2d n=%2d sum=%18.15f\n",
3086 		       spin,Mc_AN,rl,rl0,m,n,sum);
3087 	      }
3088 	    }
3089 
3090 	  }
3091 	}
3092       }
3093     }
3094   }
3095 
3096   if (measure_time==1){
3097     printf("pMatrix myid=%2d time1 =%5.3f time2 =%5.3f time3 =%5.3f time4 =%5.3f\n",
3098             myid,time1,time2,time3,time4);
3099     printf("pMatrix myid=%2d time5 =%5.3f time6 =%5.3f time7 =%5.3f time8 =%5.3f\n",
3100             myid,time5,time6,time7,time8);
3101     printf("pMatrix myid=%2d time9 =%5.3f\n",myid,time9);
3102   }
3103 
3104   /* freeing of arrays */
3105 
3106   for (i=0; i<rlmax_EC[Mc_AN]; i++){
3107     free(Utmp[i]);
3108   }
3109   free(Utmp);
3110 
3111   for (i=0; i<rlmax_EC[Mc_AN]; i++){
3112     for (j=0; j<EKC_core_size[Mc_AN]; j++){
3113       free(U0[i][j]);
3114     }
3115     free(U0[i]);
3116   }
3117   free(U0);
3118 
3119   for (i=0; i<(EKC_core_size[Mc_AN]+4); i++){
3120     free(tmpmat0[i]);
3121   }
3122   free(tmpmat0);
3123 
3124   for (i=0; i<(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]; i++){
3125     free(FS[i]);
3126   }
3127   free(FS);
3128 
3129   free(ko);
3130   free(iko);
3131 
3132   for (i=0; i<(EKC_core_size[Mc_AN]+2); i++){
3133     free(matRS0[i]);
3134   }
3135   free(matRS0);
3136 
3137   for (i=0; i<(Msize4[Mc_AN]+3); i++){
3138     free(matRS1[i]);
3139   }
3140   free(matRS1);
3141 }
3142 
3143 
3144 
3145 
3146 
3147 
3148 
3149 
Generate_pMatrix2(int myid,int spin,int Mc_AN,double ***** Hks,double **** OLP0,double *** Krylov_U,int * MP,int * Msize,int * Msize2,int * Msize3,double ** tmpvec1)3150 void Generate_pMatrix2( int myid, int spin, int Mc_AN, double *****Hks, double ****OLP0,
3151                         double ***Krylov_U, int *MP, int *Msize, int *Msize2, int *Msize3,
3152                         double **tmpvec1)
3153 {
3154   int rl,rl0,rl1,ct_AN,fan,san,can,wan,ct_on,i,j;
3155   int n,Anum,Bnum,k,ian,ih,kl,jg,ig,jan,m,m1,n1;
3156   int ZeroNum,rl_half;
3157   int KU_d1,KU_d2,csize;
3158   double sum,dum,tmp0,tmp1,tmp2,tmp3;
3159   double **Utmp;
3160   double *ko,*iko;
3161   double **FS;
3162   double ***U0;
3163 
3164   int rl00,rl01,rl02,rl03,rl04,rl05,rl06,rl07;
3165   int mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7,m1s;
3166   __m128d mmSum00,mmSum01,mmSum10,mmSum11,mmSum20,mmSum21,mmSum30,mmSum31, mmTmp0, mmTmp1, mmTmp2, mmTmp3, mmTmp4, mmTmp5;
3167 
3168   double mmArr[8];
3169   double sum0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
3170 
3171   ct_AN = M2G[Mc_AN];
3172   fan = FNAN[ct_AN];
3173   san = SNAN[ct_AN];
3174   can = fan + san;
3175   wan = WhatSpecies[ct_AN];
3176   ct_on = Spe_Total_CNO[wan];
3177 
3178   if (Msize[Mc_AN]<Msize3[Mc_AN])
3179     csize = Msize3[Mc_AN] + 40;
3180   else
3181     csize = Msize[Mc_AN] + 40;
3182 
3183   KU_d1 = EKC_core_size[Mc_AN]*Msize2[Mc_AN];
3184   KU_d2 = Msize2[Mc_AN];
3185 
3186   /* allocation of arrays */
3187 
3188   Utmp = (double**)malloc(sizeof(double*)*rlmax_EC[Mc_AN]);
3189   for (i=0; i<rlmax_EC[Mc_AN]; i++){
3190     Utmp[i] = (double*)malloc(sizeof(double)*EKC_core_size[Mc_AN]);
3191   }
3192 
3193   U0 = (double***)malloc(sizeof(double**)*rlmax_EC[Mc_AN]);
3194   for (i=0; i<rlmax_EC[Mc_AN]; i++){
3195     U0[i] = (double**)malloc(sizeof(double*)*EKC_core_size[Mc_AN]);
3196     for (j=0; j<EKC_core_size[Mc_AN]; j++){
3197       U0[i][j] = (double*)malloc(sizeof(double)*(Msize2[Mc_AN]+3));
3198       for (k=0; k<(Msize2[Mc_AN]+3); k++)  U0[i][j][k] = 0.0;
3199     }
3200   }
3201 
3202   FS = (double**)malloc(sizeof(double*)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
3203   for (i=0; i<(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]; i++){
3204     FS[i] = (double*)malloc(sizeof(double)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
3205   }
3206 
3207   ko = (double*)malloc(sizeof(double)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
3208   iko = (double*)malloc(sizeof(double)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
3209 
3210   /****************************************************
3211    initialize
3212   ****************************************************/
3213 
3214   for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
3215     for (n=0; n<EKC_core_size[Mc_AN]; n++){
3216       for (i=0; i<Msize2[Mc_AN]; i++){
3217         U0[rl][n][i] = 0.0;
3218       }
3219     }
3220   }
3221 
3222   i = 0;
3223   for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
3224     for (n=0; n<EKC_core_size[Mc_AN]; n++){
3225       if (i<Msize2[Mc_AN]) U0[rl][n][i] = 1.0;
3226       i++;
3227     }
3228   }
3229 
3230   /************************************************************
3231               orthogonalization by diagonalization
3232   ************************************************************/
3233 
3234   for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
3235 
3236     /*  S * |Vn) */
3237 
3238     for (n=0; n<EKC_core_size[Mc_AN]; n++){
3239       for (i=0; i<Msize2[Mc_AN]; i++){
3240 	tmpvec1[n][i]  = 0.0;
3241       }
3242     }
3243 
3244     for (i=0; i<=can; i++){
3245 
3246       ig = natn[ct_AN][i];
3247       ian = Spe_Total_CNO[WhatSpecies[ig]];
3248       Anum = MP[i] - 1;
3249       ih = S_G2M[ig];
3250 
3251       for (j=0; j<=can; j++){
3252 
3253 	kl = RMI1[Mc_AN][i][j];
3254 	jg = natn[ct_AN][j];
3255 	jan = Spe_Total_CNO[WhatSpecies[jg]];
3256 	Bnum = MP[j] - 1;
3257 
3258 	if (0<=kl){
3259 
3260 #ifdef nosse
3261 
3262 	  /* Original version */
3263 	  /**/
3264 	  for (m=0; m<ian; m++){
3265 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
3266 
3267 	      sum = 0.0;
3268 	      for (k=0; k<jan; k++){
3269 		sum += OLP0[ih][kl][m][k]*U0[rl][n][Bnum+k];
3270 	      }
3271 	      tmpvec1[n][Anum+m] += sum;
3272 	    }
3273 	  }
3274 	  /**/
3275 
3276 #else
3277 
3278 	  /* Unrolling + SSE version */
3279 
3280 	  for (m=0; m<(ian-3); m+=4){
3281 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
3282 
3283 	      mmSum00 = _mm_setzero_pd();
3284 	      mmSum01 = _mm_setzero_pd();
3285 	      mmSum10 = _mm_setzero_pd();
3286 	      mmSum11 = _mm_setzero_pd();
3287 	      mmSum20 = _mm_setzero_pd();
3288 	      mmSum21 = _mm_setzero_pd();
3289 	      mmSum30 = _mm_setzero_pd();
3290 	      mmSum31 = _mm_setzero_pd();
3291 
3292 	      for (k=0; k<(jan-3); k+=4){
3293 		mmTmp0 = _mm_loadu_pd(&U0[rl][n][Bnum+k+0]);
3294 		mmTmp1 = _mm_loadu_pd(&U0[rl][n][Bnum+k+2]);
3295 
3296 		mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+0]),mmTmp0));
3297 		mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+2]),mmTmp1));
3298 
3299 		mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+0]),mmTmp0));
3300 		mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+2]),mmTmp1));
3301 
3302 		mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+0]),mmTmp0));
3303 		mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+2]),mmTmp1));
3304 
3305 		mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+0]),mmTmp0));
3306 		mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+2]),mmTmp1));
3307 	      }
3308 
3309 	      mmSum00 = _mm_add_pd(mmSum00, mmSum01);
3310 	      mmSum10 = _mm_add_pd(mmSum10, mmSum11);
3311 	      mmSum20 = _mm_add_pd(mmSum20, mmSum21);
3312 	      mmSum30 = _mm_add_pd(mmSum30, mmSum31);
3313 
3314 	      _mm_storeu_pd(&mmArr[0], mmSum00);
3315 	      _mm_storeu_pd(&mmArr[2], mmSum10);
3316 	      _mm_storeu_pd(&mmArr[4], mmSum20);
3317 	      _mm_storeu_pd(&mmArr[6], mmSum30);
3318 
3319 	      sum0 = mmArr[0] + mmArr[1];
3320 	      sum1 = mmArr[2] + mmArr[3];
3321 	      sum2 = mmArr[4] + mmArr[5];
3322 	      sum3 = mmArr[6] + mmArr[7];
3323 
3324 	      for (; k<jan; k++){
3325 		sum0 += OLP0[ih][kl][m+0][k]*U0[rl][n][Bnum+k];
3326 		sum1 += OLP0[ih][kl][m+1][k]*U0[rl][n][Bnum+k];
3327 		sum2 += OLP0[ih][kl][m+2][k]*U0[rl][n][Bnum+k];
3328 		sum3 += OLP0[ih][kl][m+3][k]*U0[rl][n][Bnum+k];
3329 	      }
3330 
3331 	      tmpvec1[n][Anum+m+0] += sum0;
3332 	      tmpvec1[n][Anum+m+1] += sum1;
3333 	      tmpvec1[n][Anum+m+2] += sum2;
3334 	      tmpvec1[n][Anum+m+3] += sum3;
3335 	    }
3336 	  }
3337 
3338 	  for (; m<ian; m++){
3339 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
3340 
3341 	      sum = 0.0;
3342 	      for (k=0; k<jan; k++){
3343 		sum += OLP0[ih][kl][m][k]*U0[rl][n][Bnum+k];
3344 	      }
3345 	      tmpvec1[n][Anum+m] += sum;
3346 	    }
3347 	  }
3348 
3349 #endif
3350 
3351 	}
3352       }
3353     }
3354 
3355 #ifdef nosse
3356 
3357     /* Original version */
3358     /**/
3359     for (rl0=rl; rl0<rlmax_EC[Mc_AN]; rl0++){
3360       for (m=0; m<EKC_core_size[Mc_AN]; m++){
3361         for (n=0; n<EKC_core_size[Mc_AN]; n++){
3362   	  sum = 0.0;
3363 	  for (i=0; i<Msize2[Mc_AN]; i++){
3364             sum += U0[rl0][m][i]*tmpvec1[n][i];
3365 	  }
3366           FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
3367           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum;
3368 	}
3369       }
3370     }
3371     /**/
3372 
3373 #else
3374 
3375     /* Unrolling + SSE version */
3376 
3377     for (rl0=rl; rl0<rlmax_EC[Mc_AN]; rl0++){
3378       for (m=0; m<(EKC_core_size[Mc_AN]-3); m+=4){
3379         for (n=0; n<EKC_core_size[Mc_AN]; n++){
3380 
3381 	  mmSum00 = _mm_setzero_pd();
3382 	  mmSum01 = _mm_setzero_pd();
3383 	  mmSum10 = _mm_setzero_pd();
3384 	  mmSum11 = _mm_setzero_pd();
3385 	  mmSum20 = _mm_setzero_pd();
3386 	  mmSum21 = _mm_setzero_pd();
3387 	  mmSum30 = _mm_setzero_pd();
3388 	  mmSum31 = _mm_setzero_pd();
3389 
3390 	  for (i=0; i<(Msize2[Mc_AN]-3); i+=4){
3391 	    mmTmp0 = _mm_loadu_pd(&tmpvec1[n][i+0]);
3392 	    mmTmp1 = _mm_loadu_pd(&tmpvec1[n][i+2]);
3393 
3394 	    mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+0][i+0]),mmTmp0));
3395 	    mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+0][i+2]),mmTmp1));
3396 
3397 	    mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+1][i+0]),mmTmp0));
3398 	    mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+1][i+2]),mmTmp1));
3399 
3400 	    mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+2][i+0]),mmTmp0));
3401 	    mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+2][i+2]),mmTmp1));
3402 
3403 	    mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+3][i+0]),mmTmp0));
3404 	    mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+3][i+2]),mmTmp1));
3405 	  }
3406 
3407 	  mmSum00 = _mm_add_pd(mmSum00, mmSum01);
3408 	  mmSum10 = _mm_add_pd(mmSum10, mmSum11);
3409 	  mmSum20 = _mm_add_pd(mmSum20, mmSum21);
3410 	  mmSum30 = _mm_add_pd(mmSum30, mmSum31);
3411 
3412 	  _mm_storeu_pd(&mmArr[0], mmSum00);
3413 	  _mm_storeu_pd(&mmArr[2], mmSum10);
3414 	  _mm_storeu_pd(&mmArr[4], mmSum20);
3415 	  _mm_storeu_pd(&mmArr[6], mmSum30);
3416 
3417 	  sum0 = mmArr[0] + mmArr[1];
3418 	  sum1 = mmArr[2] + mmArr[3];
3419 	  sum2 = mmArr[4] + mmArr[5];
3420 	  sum3 = mmArr[6] + mmArr[7];
3421 
3422 	  for (; i<Msize2[Mc_AN]; i++){
3423             sum0 += U0[rl0][m+0][i]*tmpvec1[n][i];
3424             sum1 += U0[rl0][m+1][i]*tmpvec1[n][i];
3425             sum2 += U0[rl0][m+2][i]*tmpvec1[n][i];
3426             sum3 += U0[rl0][m+3][i]*tmpvec1[n][i];
3427 	  }
3428 
3429           FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum0;
3430           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum0;
3431 
3432           FS[rl0*EKC_core_size[Mc_AN]+m+2][rl*EKC_core_size[Mc_AN]+n+1] = sum1;
3433           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+2] = sum1;
3434 
3435           FS[rl0*EKC_core_size[Mc_AN]+m+3][rl*EKC_core_size[Mc_AN]+n+1] = sum2;
3436           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+3] = sum2;
3437 
3438           FS[rl0*EKC_core_size[Mc_AN]+m+4][rl*EKC_core_size[Mc_AN]+n+1] = sum3;
3439           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+4] = sum3;
3440 	}
3441       }
3442 
3443       for (; m<EKC_core_size[Mc_AN]; m++){
3444         for (n=0; n<EKC_core_size[Mc_AN]; n++){
3445   	  sum = 0.0;
3446 	  for (i=0; i<Msize2[Mc_AN]; i++){
3447             sum += U0[rl0][m][i]*tmpvec1[n][i];
3448 	  }
3449           FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
3450           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum;
3451 	}
3452       }
3453     }
3454 
3455 #endif
3456 
3457   }
3458 
3459   Eigen_lapack(FS,ko,Msize3[Mc_AN],Msize3[Mc_AN]);
3460   ZeroNum = 0;
3461 
3462   for (i=1; i<=Msize3[Mc_AN]; i++){
3463 
3464     if (error_check==1){
3465       printf("spin=%2d Mc_AN=%2d i=%3d ko[i]=%18.15f\n",spin,Mc_AN,i,ko[i]);
3466     }
3467 
3468     if (cutoff_value<ko[i]){
3469       ko[i]  = sqrt(fabs(ko[i]));
3470       iko[i] = 1.0/ko[i];
3471     }
3472     else{
3473       ZeroNum++;
3474       ko[i]  = 0.0;
3475       iko[i] = 0.0;
3476     }
3477   }
3478 
3479   if (error_check==1){
3480     printf("spin=%2d Mc_AN=%2d ZeroNum=%2d\n",spin,Mc_AN,ZeroNum);
3481   }
3482 
3483   for (i=1; i<=Msize3[Mc_AN]; i++){
3484     for (j=1; j<=Msize3[Mc_AN]; j++){
3485       FS[i][j] = FS[i][j]*iko[j];
3486     }
3487   }
3488 
3489   /* transpose for later calculation */
3490   for (i=1; i<=Msize3[Mc_AN]; i++){
3491     for (j=i+1; j<=Msize3[Mc_AN]; j++){
3492       tmp1 = FS[i][j];
3493       tmp2 = FS[j][i];
3494       FS[i][j] = tmp2;
3495       FS[j][i] = tmp1;
3496     }
3497   }
3498 
3499   /* U0 * U * lamda^{-1/2} */
3500 
3501 #ifdef nosse
3502 
3503   /* original version */
3504 
3505   /**/
3506   for (i=0; i<Msize2[Mc_AN]; i++){
3507     for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
3508       for (m=0; m<EKC_core_size[Mc_AN]; m++){
3509 
3510         m1 = rl0*EKC_core_size[Mc_AN] + m + 1;
3511 
3512 	sum = 0.0;
3513 	for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
3514 
3515           n1 = rl*EKC_core_size[Mc_AN] + 1;
3516 
3517 	  for (n=0; n<EKC_core_size[Mc_AN]; n++){
3518 	    sum += U0[rl][n][i]*FS[m1][n1+n];
3519 	  }
3520 	}
3521 
3522         Utmp[rl0][m] = sum;
3523       }
3524     }
3525 
3526     for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
3527       for (m=0; m<EKC_core_size[Mc_AN]; m++){
3528         U0[rl0][m][i] = Utmp[rl0][m];
3529       }
3530     }
3531   }
3532   /**/
3533 
3534 #else
3535 
3536   /* Unrolling + SSE version */
3537   /**/
3538   for (i=0; i<Msize2[Mc_AN]; i++){
3539 
3540     for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
3541       for (n=0; n<EKC_core_size[Mc_AN]; n++){
3542         Utmp[rl][n] = U0[rl][n][i];
3543       }
3544     }
3545 
3546     for (m1=1; m1<=(rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]-3); m1+=4){
3547 
3548       rl00 = (m1+0-1)/EKC_core_size[Mc_AN];
3549       rl01 = (m1+1-1)/EKC_core_size[Mc_AN];
3550       rl02 = (m1+2-1)/EKC_core_size[Mc_AN];
3551       rl03 = (m1+3-1)/EKC_core_size[Mc_AN];
3552 
3553       mm0 = (m1+0-1)%EKC_core_size[Mc_AN];
3554       mm1 = (m1+1-1)%EKC_core_size[Mc_AN];
3555       mm2 = (m1+2-1)%EKC_core_size[Mc_AN];
3556       mm3 = (m1+3-1)%EKC_core_size[Mc_AN];
3557 
3558       sum0 = 0.0;
3559       sum1 = 0.0;
3560       sum2 = 0.0;
3561       sum3 = 0.0;
3562 
3563       mmSum00 = _mm_setzero_pd();
3564       mmSum01 = _mm_setzero_pd();
3565       mmSum10 = _mm_setzero_pd();
3566       mmSum11 = _mm_setzero_pd();
3567       mmSum20 = _mm_setzero_pd();
3568       mmSum21 = _mm_setzero_pd();
3569       mmSum30 = _mm_setzero_pd();
3570       mmSum31 = _mm_setzero_pd();
3571 
3572       for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
3573 
3574 	n1 = rl*EKC_core_size[Mc_AN] + 1;
3575 
3576         if (0){
3577 	  mmTmp0 = _mm_loadu_pd(&Utmp[rl][n+0]); /*???????????*/
3578   	  mmTmp1 = _mm_loadu_pd(&Utmp[rl][n+2]); /*???????????*/
3579         }
3580 
3581 	for (n=0; n<(EKC_core_size[Mc_AN]-3); n+=4){
3582 	  mmTmp0 = _mm_loadu_pd(&Utmp[rl][n+0]);
3583 	  mmTmp1 = _mm_loadu_pd(&Utmp[rl][n+2]);
3584 	  mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&FS[m1+0][n1+n+0]),mmTmp0));
3585 	  mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&FS[m1+0][n1+n+2]),mmTmp1));
3586 
3587 	  mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&FS[m1+1][n1+n+0]),mmTmp0));
3588 	  mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&FS[m1+1][n1+n+2]),mmTmp1));
3589 
3590 	  mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&FS[m1+2][n1+n+0]),mmTmp0));
3591 	  mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&FS[m1+2][n1+n+2]),mmTmp1));
3592 
3593 	  mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&FS[m1+3][n1+n+0]),mmTmp0));
3594 	  mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&FS[m1+3][n1+n+2]),mmTmp1));
3595 	}
3596 
3597 	for (; n<EKC_core_size[Mc_AN]; n++){
3598 	  sum0 += Utmp[rl][n]*FS[m1+0][n1+n];
3599 	  sum1 += Utmp[rl][n]*FS[m1+1][n1+n];
3600 	  sum2 += Utmp[rl][n]*FS[m1+2][n1+n];
3601 	  sum3 += Utmp[rl][n]*FS[m1+3][n1+n];
3602 	}
3603       }
3604 
3605       mmSum00 = _mm_add_pd(mmSum00, mmSum01);
3606       mmSum10 = _mm_add_pd(mmSum10, mmSum11);
3607       mmSum20 = _mm_add_pd(mmSum20, mmSum21);
3608       mmSum30 = _mm_add_pd(mmSum30, mmSum31);
3609 
3610       _mm_storeu_pd(&mmArr[0], mmSum00);
3611       _mm_storeu_pd(&mmArr[2], mmSum10);
3612       _mm_storeu_pd(&mmArr[4], mmSum20);
3613       _mm_storeu_pd(&mmArr[6], mmSum30);
3614 
3615       sum0 += mmArr[0] + mmArr[1];
3616       sum1 += mmArr[2] + mmArr[3];
3617       sum2 += mmArr[4] + mmArr[5];
3618       sum3 += mmArr[6] + mmArr[7];
3619 
3620       U0[rl00][mm0][i] = sum0;
3621       U0[rl01][mm1][i] = sum1;
3622       U0[rl02][mm2][i] = sum2;
3623       U0[rl03][mm3][i] = sum3;
3624     }
3625 
3626     m1s = rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN] - (rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN])%4 + 1;
3627 
3628     for (m1=m1s; m1<=rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]; m1++){
3629 
3630       rl0 = (m1-1)/EKC_core_size[Mc_AN];
3631       m = (m1-1)%EKC_core_size[Mc_AN];
3632 
3633       sum = 0.0;
3634 
3635       for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
3636 
3637 	n1 = rl*EKC_core_size[Mc_AN] + 1;
3638 
3639 	for (n=0; n<EKC_core_size[Mc_AN]; n++){
3640 	  sum += Utmp[rl][n]*FS[m1][n1+n];
3641 	}
3642       }
3643 
3644       U0[rl0][m][i] = sum;
3645     }
3646   } /* i */
3647 
3648 #endif
3649 
3650   Krylov_U[spin][Mc_AN][0] = ZeroNum;
3651 
3652   for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
3653     for (n=0; n<EKC_core_size[Mc_AN]; n++){
3654       for (i=0; i<Msize2[Mc_AN]; i++){
3655         Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+i+1] = U0[rl][n][i];
3656       }
3657     }
3658   }
3659 
3660   /************************************************************
3661          check the orthonormality of Krylov vectors
3662   ************************************************************/
3663 
3664   if (error_check==1){
3665 
3666     for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
3667 
3668       for (n=0; n<EKC_core_size[Mc_AN]; n++){
3669 	for (i=0; i<Msize2[Mc_AN]; i++){
3670 	  tmpvec1[n][i]  = 0.0;
3671 	}
3672       }
3673 
3674       for (i=0; i<=can; i++){
3675 
3676 	ig = natn[ct_AN][i];
3677 	ian = Spe_Total_CNO[WhatSpecies[ig]];
3678 	Anum = MP[i] - 1;
3679 	ih = S_G2M[ig];
3680 
3681 	for (j=0; j<=can; j++){
3682 
3683 	  kl = RMI1[Mc_AN][i][j];
3684 	  jg = natn[ct_AN][j];
3685 	  jan = Spe_Total_CNO[WhatSpecies[jg]];
3686 	  Bnum = MP[j] - 1;
3687 
3688 	  if (0<=kl){
3689 
3690 	    for (m=0; m<ian; m++){
3691 	      for (n=0; n<EKC_core_size[Mc_AN]; n++){
3692 
3693 		sum = 0.0;
3694 		for (k=0; k<jan; k++){
3695 		  sum += OLP0[ih][kl][m][k]*U0[rl][n][Bnum+k];
3696 		}
3697 
3698 		tmpvec1[n][Anum+m] += sum;
3699 	      }
3700 	    }
3701 	  }
3702 	}
3703       }
3704 
3705       for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
3706 	for (m=0; m<EKC_core_size[Mc_AN]; m++){
3707 	  for (n=0; n<EKC_core_size[Mc_AN]; n++){
3708 	    sum = 0.0;
3709 	    for (i=0; i<Msize2[Mc_AN]; i++){
3710 	      sum += U0[rl0][m][i]*tmpvec1[n][i];
3711 	    }
3712 
3713 	    if (rl==rl0 && m==n){
3714 	      if ( 1.0e-10<fabs(sum-1.0) ) {
3715 		printf("A spin=%2d Mc_AN=%2d rl=%2d rl0=%2d m=%2d n=%2d sum=%18.15f\n",
3716 		       spin,Mc_AN,rl,rl0,m,n,sum);
3717 	      }
3718 	    }
3719 	    else{
3720 	      if ( 1.0e-10<fabs(sum) ) {
3721 		printf("B spin=%2d Mc_AN=%2d rl=%2d rl0=%2d m=%2d n=%2d sum=%18.15f\n",
3722 		       spin,Mc_AN,rl,rl0,m,n,sum);
3723 	      }
3724 	    }
3725 
3726 	  }
3727 	}
3728       }
3729     }
3730   }
3731 
3732   /* freeing of arrays */
3733 
3734   for (i=0; i<rlmax_EC[Mc_AN]; i++){
3735     free(Utmp[i]);
3736   }
3737   free(Utmp);
3738 
3739   for (i=0; i<rlmax_EC[Mc_AN]; i++){
3740     for (j=0; j<EKC_core_size[Mc_AN]; j++){
3741       free(U0[i][j]);
3742     }
3743     free(U0[i]);
3744   }
3745   free(U0);
3746 
3747   for (i=0; i<(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]; i++){
3748     free(FS[i]);
3749   }
3750   free(FS);
3751 
3752   free(ko);
3753   free(iko);
3754 }
3755 
3756 
3757 
3758 
3759 
3760 
3761 
Embedding_Matrix(int spin,int Mc_AN,double ***** Hks,double *** Krylov_U,double **** EC_matrix,int * MP,int * Msize,int * Msize2,int * Msize3,double ** tmpvec1)3762 void Embedding_Matrix(int spin, int Mc_AN, double *****Hks,
3763                       double ***Krylov_U, double ****EC_matrix,
3764                       int *MP, int *Msize, int *Msize2, int *Msize3,
3765                       double **tmpvec1 )
3766 {
3767   int ct_AN,fan,san,can,wan,ct_on;
3768   int rl,rl0,m,n,i,j,k,kl,jg,jan,ih,ian;
3769   int Anum,Bnum,ig;
3770   int KU_d1,KU_d2,csize;
3771   double sum,tmp1,tmp2,tmp3;
3772   __m128d mmSum00,mmSum01,mmSum10,mmSum11,mmSum20,mmSum21,mmSum30,mmSum31, mmTmp0, mmTmp1, mmTmp2, mmTmp3, mmTmp4, mmTmp5;
3773 
3774   double mmArr[8];
3775   double sum0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
3776   double stime,etime;
3777   double time1,time2,time3,time4,time5,time6,time7,time8;
3778 
3779   ct_AN = M2G[Mc_AN];
3780   fan = FNAN[ct_AN];
3781   san = SNAN[ct_AN];
3782   can = fan + san;
3783   wan = WhatSpecies[ct_AN];
3784   ct_on = Spe_Total_CNO[wan];
3785 
3786   if (Msize[Mc_AN]<Msize3[Mc_AN])
3787     csize = Msize3[Mc_AN] + 40;
3788   else
3789     csize = Msize[Mc_AN] + 40;
3790 
3791   KU_d1 = EKC_core_size[Mc_AN]*Msize2[Mc_AN];
3792   KU_d2 = Msize2[Mc_AN];
3793 
3794   if (measure_time==1){
3795     time1 = 0.0;
3796     time2 = 0.0;
3797     time3 = 0.0;
3798     time4 = 0.0;
3799     time5 = 0.0;
3800     time6 = 0.0;
3801     time7 = 0.0;
3802     time8 = 0.0;
3803   }
3804 
3805   /*******************************
3806             u1^+ C^+ u2
3807   *******************************/
3808 
3809   for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
3810 
3811     /* C^+ u2 */
3812 
3813     for (n=0; n<EKC_core_size[Mc_AN]; n++){
3814       for (i=0; i<Msize2[Mc_AN]; i++){
3815 	tmpvec1[n][i]  = 0.0;
3816       }
3817     }
3818 
3819     for (i=0; i<=fan; i++){
3820 
3821       ig = natn[ct_AN][i];
3822       ian = Spe_Total_CNO[WhatSpecies[ig]];
3823       Anum = MP[i] - 1;
3824       ih = S_G2M[ig];
3825 
3826       for (j=fan+1; j<=can; j++){
3827 
3828 	kl = RMI1[Mc_AN][i][j];
3829 	jg = natn[ct_AN][j];
3830 	jan = Spe_Total_CNO[WhatSpecies[jg]];
3831 	Bnum = MP[j];
3832 
3833 	if (0<=kl){
3834 
3835 #ifdef nosse
3836 
3837           if (measure_time==1) dtime(&stime);
3838 
3839 	  /* Original version */
3840 	  /**/
3841 	  for (m=0; m<ian; m++){
3842 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
3843 
3844 	      sum = 0.0;
3845 	      for (k=0; k<jan; k++){
3846 		sum += Hks[spin][ih][kl][m][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
3847 	      }
3848 
3849 	      tmpvec1[n][Anum+m] += sum;
3850 	    }
3851 	  }
3852 	  /**/
3853 
3854           if (measure_time==1){
3855             dtime(&etime);
3856             time1 += etime - stime;
3857 	  }
3858 
3859 #else
3860           if (measure_time==1) dtime(&stime);
3861 
3862 	  /* Unrolling + SSE version */
3863 	  /**/
3864 	  for (m=0; m<(ian-3); m+=4){
3865 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
3866 
3867 	      mmSum00 = _mm_setzero_pd();
3868 	      mmSum01 = _mm_setzero_pd();
3869 	      mmSum10 = _mm_setzero_pd();
3870 	      mmSum11 = _mm_setzero_pd();
3871 	      mmSum20 = _mm_setzero_pd();
3872 	      mmSum21 = _mm_setzero_pd();
3873 	      mmSum30 = _mm_setzero_pd();
3874 	      mmSum31 = _mm_setzero_pd();
3875 
3876 	      for (k=0; k<(jan-3); k+=4){
3877 		mmTmp0 = _mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+0]);
3878 		mmTmp1 = _mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+2]);
3879 
3880 		mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+0][k+0]),mmTmp0));
3881 		mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+0][k+2]),mmTmp1));
3882 
3883 		mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+1][k+0]),mmTmp0));
3884 		mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+1][k+2]),mmTmp1));
3885 
3886 		mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+2][k+0]),mmTmp0));
3887 		mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+2][k+2]),mmTmp1));
3888 
3889 		mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+3][k+0]),mmTmp0));
3890 		mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+3][k+2]),mmTmp1));
3891 	      }
3892 
3893 	      mmSum00 = _mm_add_pd(mmSum00, mmSum01);
3894 	      mmSum10 = _mm_add_pd(mmSum10, mmSum11);
3895 	      mmSum20 = _mm_add_pd(mmSum20, mmSum21);
3896 	      mmSum30 = _mm_add_pd(mmSum30, mmSum31);
3897 
3898 	      _mm_storeu_pd(&mmArr[0], mmSum00);
3899 	      _mm_storeu_pd(&mmArr[2], mmSum10);
3900 	      _mm_storeu_pd(&mmArr[4], mmSum20);
3901 	      _mm_storeu_pd(&mmArr[6], mmSum30);
3902 
3903 	      sum0 = mmArr[0] + mmArr[1];
3904 	      sum1 = mmArr[2] + mmArr[3];
3905 	      sum2 = mmArr[4] + mmArr[5];
3906 	      sum3 = mmArr[6] + mmArr[7];
3907 
3908 	      for (; k<jan; k++){
3909 		sum0 += Hks[spin][ih][kl][m+0][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
3910 		sum1 += Hks[spin][ih][kl][m+1][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
3911 		sum2 += Hks[spin][ih][kl][m+2][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
3912 		sum3 += Hks[spin][ih][kl][m+3][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
3913 	      }
3914 
3915 	      tmpvec1[n][Anum+m+0] += sum0;
3916 	      tmpvec1[n][Anum+m+1] += sum1;
3917 	      tmpvec1[n][Anum+m+2] += sum2;
3918 	      tmpvec1[n][Anum+m+3] += sum3;
3919 	    }
3920 	  }
3921 
3922 	  for (; m<ian; m++){
3923 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
3924 
3925 	      sum = 0.0;
3926 	      for (k=0; k<jan; k++){
3927 		sum += Hks[spin][ih][kl][m][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
3928 	      }
3929 
3930 	      tmpvec1[n][Anum+m] += sum;
3931 	    }
3932 	  }
3933 
3934           if (measure_time==1){
3935             dtime(&etime);
3936             time1 += etime - stime;
3937 	  }
3938 #endif
3939 
3940 	}
3941       }
3942     }
3943 
3944     /* u1^+ C^+ u2 */
3945 
3946 #ifdef nosse
3947 
3948     if (measure_time==1) dtime(&stime);
3949 
3950     /* Original version */
3951     /**/
3952     for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
3953       for (m=0; m<EKC_core_size[Mc_AN]; m++){
3954         for (n=0; n<EKC_core_size[Mc_AN]; n++){
3955   	  sum = 0.0;
3956 	  for (i=0; i<Msize[Mc_AN]; i++){
3957             sum += Krylov_U[spin][Mc_AN][rl0*KU_d1+m*KU_d2+i+1]*tmpvec1[n][i];
3958 	  }
3959           EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
3960 	}
3961       }
3962     }
3963     /**/
3964 
3965     if (measure_time==1){
3966       dtime(&etime);
3967       time2 += etime - stime;
3968     }
3969 
3970 #else
3971 
3972     /*
3973     printf("ABC1 Mc_AN=%2d %2d %2d %2d\n",Mc_AN,rlmax_EC[Mc_AN],EKC_core_size[Mc_AN],Msize[Mc_AN]);
3974     */
3975 
3976     if (measure_time==1) dtime(&stime);
3977 
3978     /* Unrolling + SSE version */
3979     /**/
3980     for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
3981       for (m=0; m<(EKC_core_size[Mc_AN]-3); m+=4){
3982         for (n=0; n<EKC_core_size[Mc_AN]; n++){
3983 
3984 	  mmSum00 = _mm_setzero_pd();
3985 	  mmSum01 = _mm_setzero_pd();
3986 	  mmSum10 = _mm_setzero_pd();
3987 	  mmSum11 = _mm_setzero_pd();
3988 	  mmSum20 = _mm_setzero_pd();
3989 	  mmSum21 = _mm_setzero_pd();
3990 	  mmSum30 = _mm_setzero_pd();
3991 	  mmSum31 = _mm_setzero_pd();
3992 
3993 	  for (i=0; i<(Msize[Mc_AN]-3); i+=4){
3994 	    mmTmp0 = _mm_loadu_pd(&tmpvec1[n][i+0]);
3995 	    mmTmp1 = _mm_loadu_pd(&tmpvec1[n][i+2]);
3996 
3997 	    mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+1]),mmTmp0));
3998 	    mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+3]),mmTmp1));
3999 
4000 	    mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+1)*KU_d2+i+1]),mmTmp0));
4001 	    mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+1)*KU_d2+i+3]),mmTmp1));
4002 
4003 	    mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+2)*KU_d2+i+1]),mmTmp0));
4004 	    mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+2)*KU_d2+i+3]),mmTmp1));
4005 
4006 	    mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+3)*KU_d2+i+1]),mmTmp0));
4007 	    mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+3)*KU_d2+i+3]),mmTmp1));
4008 	  }
4009 
4010 	  mmSum00 = _mm_add_pd(mmSum00, mmSum01);
4011 	  mmSum10 = _mm_add_pd(mmSum10, mmSum11);
4012 	  mmSum20 = _mm_add_pd(mmSum20, mmSum21);
4013 	  mmSum30 = _mm_add_pd(mmSum30, mmSum31);
4014 
4015 	  _mm_storeu_pd(&mmArr[0], mmSum00);
4016 	  _mm_storeu_pd(&mmArr[2], mmSum10);
4017 	  _mm_storeu_pd(&mmArr[4], mmSum20);
4018 	  _mm_storeu_pd(&mmArr[6], mmSum30);
4019 
4020 	  sum0 = mmArr[0] + mmArr[1];
4021 	  sum1 = mmArr[2] + mmArr[3];
4022 	  sum2 = mmArr[4] + mmArr[5];
4023 	  sum3 = mmArr[6] + mmArr[7];
4024 
4025 	  for (; i<Msize[Mc_AN]; i++){
4026             sum0 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+1]*tmpvec1[n][i];
4027             sum1 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+1)*KU_d2+i+1]*tmpvec1[n][i];
4028             sum2 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+2)*KU_d2+i+1]*tmpvec1[n][i];
4029             sum3 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+3)*KU_d2+i+1]*tmpvec1[n][i];
4030 	  }
4031 
4032           EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum0;
4033           EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+2][rl*EKC_core_size[Mc_AN]+n+1] = sum1;
4034           EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+3][rl*EKC_core_size[Mc_AN]+n+1] = sum2;
4035           EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+4][rl*EKC_core_size[Mc_AN]+n+1] = sum3;
4036 	}
4037       }
4038 
4039       for (; m<EKC_core_size[Mc_AN]; m++){
4040         for (n=0; n<EKC_core_size[Mc_AN]; n++){
4041   	  sum = 0.0;
4042 	  for (i=0; i<Msize[Mc_AN]; i++){
4043             sum += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+1]*tmpvec1[n][i];
4044 	  }
4045           EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
4046 	}
4047       }
4048 
4049     }
4050     /**/
4051 
4052     if (measure_time==1){
4053       dtime(&etime);
4054       time2 += etime - stime;
4055     }
4056 
4057 #endif
4058 
4059   } /* rl */
4060 
4061   /*******************************
4062             u2^+ C u1
4063   *******************************/
4064 
4065   for (i=1; i<=Msize3[Mc_AN]; i++){
4066     for (j=i; j<=Msize3[Mc_AN]; j++){
4067 
4068       tmp1 = EC_matrix[spin][Mc_AN][i][j];
4069       tmp2 = EC_matrix[spin][Mc_AN][j][i];
4070       tmp3 = tmp1 + tmp2;
4071 
4072       EC_matrix[spin][Mc_AN][i][j] = tmp3;
4073       EC_matrix[spin][Mc_AN][j][i] = tmp3;
4074     }
4075   }
4076 
4077   /*******************************
4078              u2^+ B u2
4079   *******************************/
4080 
4081   for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
4082 
4083     /* B u2 */
4084 
4085     for (n=0; n<EKC_core_size[Mc_AN]; n++){
4086       for (i=0; i<Msize2[Mc_AN]; i++){
4087 	tmpvec1[n][i]  = 0.0;
4088       }
4089     }
4090 
4091     for (i=fan+1; i<=can; i++){
4092 
4093       ig = natn[ct_AN][i];
4094       ian = Spe_Total_CNO[WhatSpecies[ig]];
4095       Anum = MP[i] - 1;
4096       ih = S_G2M[ig];
4097 
4098       for (j=fan+1; j<=can; j++){
4099 
4100 	kl = RMI1[Mc_AN][i][j];
4101 	jg = natn[ct_AN][j];
4102 	jan = Spe_Total_CNO[WhatSpecies[jg]];
4103 	Bnum = MP[j];
4104 
4105 	if (0<=kl){
4106 
4107 #ifdef nosse
4108 
4109           if (measure_time==1) dtime(&stime);
4110 
4111 	  /* Original version */
4112 	  /**/
4113 	  for (m=0; m<ian; m++){
4114 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
4115 
4116 	      sum = 0.0;
4117 	      for (k=0; k<jan; k++){
4118 		sum += Hks[spin][ih][kl][m][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
4119 	      }
4120 
4121 	      tmpvec1[n][Anum+m] += sum;
4122 	    }
4123 	  }
4124 	  /**/
4125 
4126 	  if (measure_time==1){
4127 	    dtime(&etime);
4128 	    time3 += etime - stime;
4129 	  }
4130 
4131 #else
4132 
4133           if (measure_time==1) dtime(&stime);
4134 
4135 	  /* Unrolling + SSE version */
4136 	  /**/
4137 	  for (m=0; m<(ian-3); m+=4){
4138 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
4139 
4140 	      mmSum00 = _mm_setzero_pd();
4141 	      mmSum01 = _mm_setzero_pd();
4142 	      mmSum10 = _mm_setzero_pd();
4143 	      mmSum11 = _mm_setzero_pd();
4144 	      mmSum20 = _mm_setzero_pd();
4145 	      mmSum21 = _mm_setzero_pd();
4146 	      mmSum30 = _mm_setzero_pd();
4147 	      mmSum31 = _mm_setzero_pd();
4148 
4149 	      for (k=0; k<(jan-3); k+=4){
4150 		mmTmp0 = _mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+0]);
4151 		mmTmp1 = _mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+2]);
4152 
4153 		mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+0][k+0]),mmTmp0));
4154 		mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+0][k+2]),mmTmp1));
4155 
4156 		mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+1][k+0]),mmTmp0));
4157 		mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+1][k+2]),mmTmp1));
4158 
4159 		mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+2][k+0]),mmTmp0));
4160 		mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+2][k+2]),mmTmp1));
4161 
4162 		mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+3][k+0]),mmTmp0));
4163 		mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+3][k+2]),mmTmp1));
4164 	      }
4165 
4166 	      mmSum00 = _mm_add_pd(mmSum00, mmSum01);
4167 	      mmSum10 = _mm_add_pd(mmSum10, mmSum11);
4168 	      mmSum20 = _mm_add_pd(mmSum20, mmSum21);
4169 	      mmSum30 = _mm_add_pd(mmSum30, mmSum31);
4170 
4171 	      _mm_storeu_pd(&mmArr[0], mmSum00);
4172 	      _mm_storeu_pd(&mmArr[2], mmSum10);
4173 	      _mm_storeu_pd(&mmArr[4], mmSum20);
4174 	      _mm_storeu_pd(&mmArr[6], mmSum30);
4175 
4176 	      sum0 = mmArr[0] + mmArr[1];
4177 	      sum1 = mmArr[2] + mmArr[3];
4178 	      sum2 = mmArr[4] + mmArr[5];
4179 	      sum3 = mmArr[6] + mmArr[7];
4180 
4181 	      for (; k<jan; k++){
4182 		sum0 += Hks[spin][ih][kl][m+0][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+0];
4183 		sum1 += Hks[spin][ih][kl][m+1][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+0];
4184 		sum2 += Hks[spin][ih][kl][m+2][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+0];
4185 		sum3 += Hks[spin][ih][kl][m+3][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+0];
4186 	      }
4187 
4188 	      tmpvec1[n][Anum+m+0] += sum0;
4189 	      tmpvec1[n][Anum+m+1] += sum1;
4190 	      tmpvec1[n][Anum+m+2] += sum2;
4191 	      tmpvec1[n][Anum+m+3] += sum3;
4192 	    }
4193 	  }
4194 
4195 	  for (; m<ian; m++){
4196 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
4197 
4198 	      sum = 0.0;
4199 	      for (k=0; k<jan; k++){
4200 		sum += Hks[spin][ih][kl][m][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
4201 	      }
4202 
4203 	      tmpvec1[n][Anum+m] += sum;
4204 	    }
4205 	  }
4206 	  /**/
4207 
4208 	  if (measure_time==1){
4209 	    dtime(&etime);
4210 	    time3 += etime - stime;
4211 	  }
4212 
4213 #endif
4214 
4215 	}
4216       }
4217     }
4218 
4219     /* u2^+ B u2 */
4220 
4221 #ifdef nosse
4222 
4223     if (measure_time==1) dtime(&stime);
4224 
4225     /* Original version */
4226     /**/
4227     for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
4228       for (m=0; m<EKC_core_size[Mc_AN]; m++){
4229         for (n=0; n<EKC_core_size[Mc_AN]; n++){
4230   	  sum = 0.0;
4231 	  for (i=Msize[Mc_AN]; i<Msize2[Mc_AN]; i++){
4232             sum += Krylov_U[spin][Mc_AN][rl0*KU_d1+m*KU_d2+i+1]*tmpvec1[n][i];
4233 	  }
4234           EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] += sum;
4235 	}
4236       }
4237     }
4238     /**/
4239 
4240     if (measure_time==1){
4241       dtime(&etime);
4242       time4 += etime - stime;
4243     }
4244 
4245 #else
4246 
4247     if (measure_time==1) dtime(&stime);
4248 
4249     /* Unrolling + SSE version */
4250     /**/
4251     for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
4252       for (m=0; m<(EKC_core_size[Mc_AN]-3); m+=4){
4253         for (n=0; n<EKC_core_size[Mc_AN]; n++){
4254 
4255 	  mmSum00 = _mm_setzero_pd();
4256 	  mmSum01 = _mm_setzero_pd();
4257 	  mmSum10 = _mm_setzero_pd();
4258 	  mmSum11 = _mm_setzero_pd();
4259 	  mmSum20 = _mm_setzero_pd();
4260 	  mmSum21 = _mm_setzero_pd();
4261 	  mmSum30 = _mm_setzero_pd();
4262 	  mmSum31 = _mm_setzero_pd();
4263 
4264 	  for (i=Msize[Mc_AN]; i<(Msize2[Mc_AN]-3); i+=4){
4265 	    mmTmp0 = _mm_loadu_pd(&tmpvec1[n][i+0]);
4266 	    mmTmp1 = _mm_loadu_pd(&tmpvec1[n][i+2]);
4267 
4268 	    mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+1]),mmTmp0));
4269 	    mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+3]),mmTmp1));
4270 
4271 	    mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+1)*KU_d2+i+1]),mmTmp0));
4272 	    mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+1)*KU_d2+i+3]),mmTmp1));
4273 
4274 	    mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+2)*KU_d2+i+1]),mmTmp0));
4275 	    mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+2)*KU_d2+i+3]),mmTmp1));
4276 
4277 	    mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+3)*KU_d2+i+1]),mmTmp0));
4278 	    mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+3)*KU_d2+i+3]),mmTmp1));
4279 	  }
4280 
4281 	  mmSum00 = _mm_add_pd(mmSum00, mmSum01);
4282 	  mmSum10 = _mm_add_pd(mmSum10, mmSum11);
4283 	  mmSum20 = _mm_add_pd(mmSum20, mmSum21);
4284 	  mmSum30 = _mm_add_pd(mmSum30, mmSum31);
4285 
4286 	  _mm_storeu_pd(&mmArr[0], mmSum00);
4287 	  _mm_storeu_pd(&mmArr[2], mmSum10);
4288 	  _mm_storeu_pd(&mmArr[4], mmSum20);
4289 	  _mm_storeu_pd(&mmArr[6], mmSum30);
4290 
4291 	  sum0 = mmArr[0] + mmArr[1];
4292 	  sum1 = mmArr[2] + mmArr[3];
4293 	  sum2 = mmArr[4] + mmArr[5];
4294 	  sum3 = mmArr[6] + mmArr[7];
4295 
4296 	  for (; i<Msize2[Mc_AN]; i++){
4297             sum0 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+1]*tmpvec1[n][i];
4298             sum1 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+1)*KU_d2+i+1]*tmpvec1[n][i];
4299             sum2 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+2)*KU_d2+i+1]*tmpvec1[n][i];
4300             sum3 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+3)*KU_d2+i+1]*tmpvec1[n][i];
4301 	  }
4302 
4303           EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] += sum0;
4304           EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+2][rl*EKC_core_size[Mc_AN]+n+1] += sum1;
4305           EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+3][rl*EKC_core_size[Mc_AN]+n+1] += sum2;
4306           EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+4][rl*EKC_core_size[Mc_AN]+n+1] += sum3;
4307 	}
4308       }
4309 
4310       for (; m<EKC_core_size[Mc_AN]; m++){
4311         for (n=0; n<EKC_core_size[Mc_AN]; n++){
4312   	  sum = 0.0;
4313 	  for (i=Msize[Mc_AN]; i<Msize2[Mc_AN]; i++){
4314             sum += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+1]*tmpvec1[n][i];
4315 	  }
4316           EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] += sum;
4317 	}
4318       }
4319 
4320     }
4321     /**/
4322 
4323     if (measure_time==1){
4324       dtime(&etime);
4325       time4 += etime - stime;
4326     }
4327 
4328 #endif
4329 
4330   } /* rl */
4331 
4332   if (measure_time==1){
4333     printf("Embedding_Matrix time1=%5.3f time2=%5.3f time3=%5.3f time4=%5.3f\n",
4334             time1,time2,time3,time4);
4335   }
4336 }
4337 
4338 
4339 
4340 
4341 
4342 
4343 
4344 
4345 
4346 
Krylov_IOLP(int Mc_AN,double **** OLP0,double *** Krylov_U_OLP,double ** inv_RS,int * MP,int * Msize2,int * Msize4,int Msize2_max,double ** tmpvec0,double ** tmpvec1)4347 void Krylov_IOLP( int Mc_AN, double ****OLP0, double ***Krylov_U_OLP, double **inv_RS, int *MP,
4348                   int *Msize2, int *Msize4, int Msize2_max, double **tmpvec0, double **tmpvec1 )
4349 {
4350   int rl,ct_AN,fan,san,can,wan,ct_on,i,j;
4351   int n,Anum,Bnum,k,ian,ih,kl,jg,ig,jan,m,m1,n1;
4352   int rl0,ZeroNum,Neumann_series,ns,Gh_AN,wanB;
4353   double sum,dum,tmp0,tmp1,tmp2,tmp3,rcutA,r0;
4354   double **tmpmat0;
4355   double *ko,*iko;
4356   double **FS;
4357 
4358   ct_AN = M2G[Mc_AN];
4359   fan = FNAN[ct_AN];
4360   san = SNAN[ct_AN];
4361   can = fan + san;
4362   wan = WhatSpecies[ct_AN];
4363   ct_on = Spe_Total_CNO[wan];
4364   rcutA = Spe_Atom_Cut1[wan];
4365 
4366   /* allocation of arrays */
4367 
4368   tmpmat0 = (double**)malloc(sizeof(double*)*(EKC_core_size[Mc_AN]+4));
4369   for (i=0; i<(EKC_core_size[Mc_AN]+4); i++){
4370     tmpmat0[i] = (double*)malloc(sizeof(double)*(EKC_core_size[Mc_AN]+4));
4371   }
4372 
4373   FS = (double**)malloc(sizeof(double*)*(rlmax_EC2[Mc_AN]+2)*EKC_core_size[Mc_AN]);
4374   for (i=0; i<(rlmax_EC2[Mc_AN]+2)*EKC_core_size[Mc_AN]; i++){
4375     FS[i] = (double*)malloc(sizeof(double)*(rlmax_EC2[Mc_AN]+2)*EKC_core_size[Mc_AN]);
4376   }
4377 
4378   ko = (double*)malloc(sizeof(double)*(rlmax_EC2[Mc_AN]+2)*EKC_core_size[Mc_AN]);
4379   iko = (double*)malloc(sizeof(double)*(rlmax_EC2[Mc_AN]+2)*EKC_core_size[Mc_AN]);
4380 
4381   /****************************************************
4382       initialize
4383   ****************************************************/
4384 
4385   for (i=0; i<EKC_core_size_max; i++){
4386     for (j=0; j<Msize2_max; j++){
4387       tmpvec0[i][j] = 0.0;
4388       tmpvec1[i][j] = 0.0;
4389     }
4390   }
4391 
4392   /* find the nearest atom with distance of r0 */
4393 
4394   r0 = 1.0e+10;
4395   for (k=1; k<=FNAN[ct_AN]; k++){
4396     Gh_AN = natn[ct_AN][k];
4397     wanB = WhatSpecies[Gh_AN];
4398     if (Dis[ct_AN][k]<r0)  r0 = Dis[ct_AN][k];
4399   }
4400 
4401   /* starting vector */
4402 
4403   m = 0;
4404   for (k=0; k<=FNAN[ct_AN]; k++){
4405 
4406     Gh_AN = natn[ct_AN][k];
4407     wanB = WhatSpecies[Gh_AN];
4408 
4409     if ( Dis[ct_AN][k]<(scale_rc_EKC[Mc_AN]*r0) ){
4410 
4411       Anum = MP[k] - 1;
4412 
4413       for (i=0; i<Spe_Total_CNO[wanB]; i++){
4414 
4415         tmpvec0[m][Anum+i]         = 1.0;
4416         Krylov_U_OLP[0][m][Anum+i] = 1.0;
4417 
4418         m++;
4419       }
4420     }
4421   }
4422 
4423   /*
4424   for (i=0; i<EKC_core_size[Mc_AN]; i++){
4425     tmpvec0[i][i]         = 1.0;
4426     Krylov_U_OLP[0][i][i] = 1.0;
4427   }
4428   */
4429 
4430   /****************************************************
4431            generate Krylov subspace vectors
4432   ****************************************************/
4433 
4434   for (rl=0; rl<(rlmax_EC2[Mc_AN]-1); rl++){
4435 
4436     /*******************************************************
4437                             S * |Wn)
4438     *******************************************************/
4439 
4440     for (n=0; n<EKC_core_size[Mc_AN]; n++){
4441       for (i=0; i<Msize2[Mc_AN]; i++){
4442 	tmpvec1[n][i]  = 0.0;
4443       }
4444     }
4445 
4446     for (i=0; i<=can; i++){
4447 
4448       ig = natn[ct_AN][i];
4449       ian = Spe_Total_CNO[WhatSpecies[ig]];
4450       Anum = MP[i] - 1;
4451       ih = S_G2M[ig];
4452 
4453       for (j=0; j<=can; j++){
4454 
4455 	kl = RMI1[Mc_AN][i][j];
4456 	jg = natn[ct_AN][j];
4457 	jan = Spe_Total_CNO[WhatSpecies[jg]];
4458         Bnum = MP[j] - 1;
4459 
4460         if (0<=kl){
4461 
4462 	  for (m=0; m<ian; m++){
4463 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
4464 
4465 	      sum = 0.0;
4466 	      for (k=0; k<jan; k++){
4467 		sum += OLP0[ih][kl][m][k]*tmpvec0[n][Bnum+k];
4468 	      }
4469 
4470               tmpvec1[n][Anum+m] += sum;
4471 	    }
4472 	  }
4473 	}
4474       }
4475     }
4476 
4477     /*************************************************************
4478       orthogonalization by a modified block Gram-Schmidt method
4479     *************************************************************/
4480 
4481     /* |tmpvec1) := (I - \sum_{rl0} |U_rl0)(U_rl0|)|tmpvec1) */
4482 
4483     for (rl0=0; rl0<=rl; rl0++){
4484 
4485       /* (U_rl0|tmpvec1) */
4486 
4487       for (m=0; m<EKC_core_size[Mc_AN]; m++){
4488         for (n=0; n<EKC_core_size[Mc_AN]; n++){
4489           sum = 0.0;
4490   	  for (i=0; i<Msize2[Mc_AN]; i++){
4491             sum += Krylov_U_OLP[rl0][m][i]*tmpvec1[n][i];
4492 	  }
4493           tmpmat0[m][n] = sum;
4494 	}
4495       }
4496 
4497       /* |tmpvec1) := |tmpvec1) - |U_rl0)(U_rl0|tmpvec1) */
4498 
4499       for (n=0; n<EKC_core_size[Mc_AN]; n++){
4500 	for (k=0; k<EKC_core_size[Mc_AN]; k++){
4501 	  dum = tmpmat0[k][n];
4502 	  for (i=0; i<Msize2[Mc_AN]; i++)  tmpvec1[n][i] -= Krylov_U_OLP[rl0][k][i]*dum;
4503         }
4504       }
4505     }
4506 
4507     /*************************************************************
4508                        normalization of tmpvec1
4509     *************************************************************/
4510 
4511     for (m=0; m<EKC_core_size[Mc_AN]; m++){
4512       for (n=0; n<EKC_core_size[Mc_AN]; n++){
4513 
4514         sum = 0.0;
4515         for (i=0; i<Msize2[Mc_AN]; i++){
4516           sum += tmpvec1[m][i]*tmpvec1[n][i];
4517 	}
4518 
4519         tmpmat0[m+1][n+1] = sum;
4520       }
4521     }
4522 
4523     /* diagonalize tmpmat0 */
4524 
4525     if ( EKC_core_size[Mc_AN]==1){
4526       ko[1] = tmpmat0[1][1];
4527       tmpmat0[1][1] = 1.0;
4528     }
4529     else{
4530       Eigen_lapack( tmpmat0, ko, EKC_core_size[Mc_AN], EKC_core_size[Mc_AN] );
4531     }
4532 
4533     ZeroNum = 0;
4534 
4535     for (n=1; n<=EKC_core_size[Mc_AN]; n++){
4536       if (cutoff_value<ko[n]){
4537 	ko[n]  = sqrt(fabs(ko[n]));
4538 	iko[n] = 1.0/ko[n];
4539       }
4540       else{
4541 	ZeroNum++;
4542 	ko[n]  = 0.0;
4543 	iko[n] = 0.0;
4544       }
4545 
4546       if (error_check==1){
4547         printf("rl=%3d ko=%18.15f\n",rl,ko[n]);
4548       }
4549     }
4550 
4551     if (error_check==1){
4552       printf("rl=%3d ZeroNum=%3d\n",rl,ZeroNum);
4553     }
4554 
4555     /* tmpvec0 = tmpvec1 * tmpmat0^{-1/2} */
4556 
4557     for (n=0; n<EKC_core_size[Mc_AN]; n++){
4558       for (i=0; i<Msize2[Mc_AN]; i++){
4559 	tmpvec0[n][i] = 0.0;
4560       }
4561     }
4562 
4563     for (n=0; n<EKC_core_size[Mc_AN]; n++){
4564       for (k=0; k<EKC_core_size[Mc_AN]; k++){
4565 	dum = tmpmat0[k+1][n+1]*iko[n+1];
4566 	for (i=0; i<Msize2[Mc_AN]; i++)  tmpvec0[n][i] += tmpvec1[k][i]*dum;
4567       }
4568     }
4569 
4570     /*************************************************************
4571                          store Krylov vectors
4572     *************************************************************/
4573 
4574     for (n=0; n<EKC_core_size[Mc_AN]; n++){
4575       for (i=0; i<Msize2[Mc_AN]; i++){
4576         Krylov_U_OLP[rl+1][n][i] = tmpvec0[n][i];
4577       }
4578     }
4579 
4580   } /* rl */
4581 
4582   /************************************************************
4583        calculate the inverse of the reduced overlap matrix
4584   ************************************************************/
4585 
4586   /*  construct the reduced overlap matrix */
4587 
4588   for (rl=0; rl<rlmax_EC2[Mc_AN]; rl++){
4589 
4590     /*  S * |Vn) */
4591 
4592     for (n=0; n<EKC_core_size[Mc_AN]; n++){
4593       for (i=0; i<Msize2[Mc_AN]; i++){
4594 	tmpvec1[n][i]  = 0.0;
4595       }
4596     }
4597 
4598     for (i=0; i<=can; i++){
4599 
4600       ig = natn[ct_AN][i];
4601       ian = Spe_Total_CNO[WhatSpecies[ig]];
4602       Anum = MP[i] - 1;
4603       ih = S_G2M[ig];
4604 
4605       for (j=0; j<=can; j++){
4606 
4607 	kl = RMI1[Mc_AN][i][j];
4608 	jg = natn[ct_AN][j];
4609 	jan = Spe_Total_CNO[WhatSpecies[jg]];
4610 	Bnum = MP[j] - 1;
4611 
4612 	if (0<=kl){
4613 
4614 	  for (m=0; m<ian; m++){
4615 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
4616 
4617 	      sum = 0.0;
4618 	      for (k=0; k<jan; k++){
4619 		sum += OLP0[ih][kl][m][k]*Krylov_U_OLP[rl][n][Bnum+k];
4620 	      }
4621 	      tmpvec1[n][Anum+m] += sum;
4622 	    }
4623 	  }
4624 	}
4625       }
4626     }
4627 
4628     for (rl0=rl; rl0<rlmax_EC2[Mc_AN]; rl0++){
4629       for (m=0; m<EKC_core_size[Mc_AN]; m++){
4630         for (n=0; n<EKC_core_size[Mc_AN]; n++){
4631   	  sum = 0.0;
4632 	  for (i=0; i<Msize2[Mc_AN]; i++){
4633             sum += Krylov_U_OLP[rl0][m][i]*tmpvec1[n][i];
4634 	  }
4635           FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
4636           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum;
4637 	}
4638       }
4639     }
4640   }
4641 
4642   /* diagonalize FS */
4643 
4644   Eigen_lapack(FS,ko,Msize4[Mc_AN],Msize4[Mc_AN]);
4645 
4646   /* find ill-conditioned eigenvalues */
4647 
4648   ZeroNum = 0;
4649   for (i=1; i<=Msize4[Mc_AN]; i++){
4650 
4651     if (error_check==1){
4652       printf("Mc_AN=%2d i=%3d ko[i]=%18.15f\n",Mc_AN,i,ko[i]);
4653     }
4654 
4655     if (cutoff_value<ko[i]){
4656       iko[i] = 1.0/ko[i];
4657     }
4658     else{
4659       ZeroNum++;
4660       ko[i]  = 0.0;
4661       iko[i] = 0.0;
4662     }
4663   }
4664 
4665   if (error_check==1){
4666     printf("Mc_AN=%2d ZeroNum=%2d\n",Mc_AN,ZeroNum);
4667   }
4668 
4669   /* construct the inverse */
4670 
4671   for (i=1; i<=Msize4[Mc_AN]; i++){
4672     for (j=1; j<=Msize4[Mc_AN]; j++){
4673       sum = 0.0;
4674       for (k=1; k<=Msize4[Mc_AN]; k++){
4675         sum += FS[i][k]*iko[k]*FS[j][k];
4676       }
4677       inv_RS[i-1][j-1] = sum;
4678     }
4679   }
4680 
4681   /* symmetrization of inv_RS */
4682 
4683   for (i=1; i<=Msize4[Mc_AN]; i++){
4684     for (j=i+1; j<=Msize4[Mc_AN]; j++){
4685       tmp0 = inv_RS[i-1][j-1];
4686       tmp1 = inv_RS[j-1][i-1];
4687       tmp2 = 0.5*(tmp0 + tmp1);
4688       inv_RS[i-1][j-1] = tmp2;
4689       inv_RS[j-1][i-1] = tmp2;
4690     }
4691   }
4692 
4693 
4694 
4695   /*
4696   {
4697 
4698 
4699     double mat1[1000][1000];
4700     double tsum;
4701     int myid;
4702 
4703     MPI_Comm_rank(mpi_comm_level1,&myid);
4704 
4705     if (myid==0){
4706 
4707       printf("check normalization\n");
4708 
4709       for (rl=0; rl<rlmax_EC2[Mc_AN]; rl++){
4710 	for (m=0; m<EKC_core_size[Mc_AN]; m++){
4711 	  for (rl0=0; rl0<rlmax_EC2[Mc_AN]; rl0++){
4712 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
4713 
4714               sum = 0.0;
4715    	      for (i=0; i<Msize2[Mc_AN]; i++){
4716                 sum += Krylov_U_OLP[rl][m][i]*Krylov_U_OLP[rl0][n][i];
4717 	      }
4718               printf("rl=%3d rl0=%3d m=%3d n=%3d  <|>=%18.15f\n",rl,rl0,m,n,sum);
4719 	    }
4720 	  }
4721 	}
4722       }
4723 
4724 
4725       printf("\n\ninvS\n");
4726 
4727 
4728       for (rl=0; rl<rlmax_EC2[Mc_AN]; rl++){
4729 	for (m=0; m<EKC_core_size[Mc_AN]; m++){
4730 
4731 	  for (i=0; i<Msize2[Mc_AN]; i++){
4732 
4733 	    sum = 0.0;
4734 
4735 	    for (rl0=0; rl0<rlmax_EC2[Mc_AN]; rl0++){
4736 	      for (n=0; n<EKC_core_size[Mc_AN]; n++){
4737 		sum += inv_RS[rl*EKC_core_size[Mc_AN]+m][rl0*EKC_core_size[Mc_AN]+n]*Krylov_U_OLP[rl0][n][i];
4738 	      }
4739 	    }
4740 
4741 	    mat1[rl*EKC_core_size[Mc_AN]+m][i] = sum;
4742 	  }
4743 	}
4744       }
4745 
4746 
4747       tsum = 0.0;
4748 
4749       for (i=0; i<Msize2[Mc_AN]; i++){
4750 	for (j=0; j<Msize2[Mc_AN]; j++){
4751 
4752 	  sum = 0.0;
4753 	  for (rl=0; rl<rlmax_EC2[Mc_AN]; rl++){
4754 	    for (m=0; m<EKC_core_size[Mc_AN]; m++){
4755 	      sum += Krylov_U_OLP[rl][m][i]*mat1[rl*EKC_core_size[Mc_AN]+m][j];
4756 	    }
4757 	  }
4758 
4759 	  printf("i=%4d j=%4d %18.15f\n",i,j,sum);
4760 
4761           tsum += fabs(sum);
4762 
4763 	}
4764       }
4765 
4766 
4767       printf("\n\ntsum=%18.15f\n",tsum);
4768 
4769     }
4770 
4771 
4772     MPI_Finalize();
4773     exit(0);
4774   }
4775   */
4776 
4777   /* freeing of arrays */
4778 
4779   for (i=0; i<(EKC_core_size[Mc_AN]+4); i++){
4780     free(tmpmat0[i]);
4781   }
4782   free(tmpmat0);
4783 
4784   for (i=0; i<(rlmax_EC2[Mc_AN]+2)*EKC_core_size[Mc_AN]; i++){
4785     free(FS[i]);
4786   }
4787   free(FS);
4788 
4789   free(ko);
4790   free(iko);
4791 }
4792 
4793 
4794 
4795 
4796 
4797 
4798 
4799 
4800 
4801 
4802 
4803 
S_orthonormalize_vec(int Mc_AN,int ct_on,double ** vec,double ** workvec,double **** OLP0,double ** tmpmat0,double * ko,double * iko,int * MP,int * Msize2)4804 void S_orthonormalize_vec( int Mc_AN, int ct_on, double **vec,
4805                            double **workvec, double ****OLP0,
4806                            double **tmpmat0, double *ko, double *iko,
4807                            int *MP, int *Msize2 )
4808 {
4809   int n,i,j,can,san,fan,ct_AN;
4810   int k,m,ZeroNum,Anum,Bnum,ih;
4811   int kl,jg,jan,ig,ian;
4812   double dum,sum;
4813   double tmp0,tmp1,tmp2,tmp3;
4814   __m128d mmSum00,mmSum01,mmSum10,mmSum11,mmSum20,mmSum21,mmSum30,mmSum31, mmTmp0, mmTmp1, mmTmp2, mmTmp3, mmTmp4, mmTmp5;
4815   double mmArr[8];
4816   double sum0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
4817 
4818   ct_AN = M2G[Mc_AN];
4819   fan = FNAN[ct_AN];
4820   san = SNAN[ct_AN];
4821   can = fan + san;
4822 
4823   /* S|Vn) */
4824 
4825   for (n=0; n<EKC_core_size[Mc_AN]; n++){
4826     for (i=0; i<Msize2[Mc_AN]; i++){
4827       workvec[n][i]  = 0.0;
4828     }
4829   }
4830 
4831   for (i=0; i<=can; i++){
4832 
4833     ig = natn[ct_AN][i];
4834     ian = Spe_Total_CNO[WhatSpecies[ig]];
4835     Anum = MP[i] - 1;
4836     ih = S_G2M[ig];
4837 
4838     for (j=0; j<=can; j++){
4839 
4840       kl = RMI1[Mc_AN][i][j];
4841       jg = natn[ct_AN][j];
4842       jan = Spe_Total_CNO[WhatSpecies[jg]];
4843       Bnum = MP[j] - 1;
4844 
4845       if (0<=kl){
4846 
4847 #ifdef nosse
4848 
4849 	/* Original version */
4850 	/**/
4851 	for (m=0; m<ian; m++){
4852 	  for (n=0; n<EKC_core_size[Mc_AN]; n++){
4853 
4854 	    sum = 0.0;
4855 	    for (k=0; k<jan; k++){
4856 	      sum += OLP0[ih][kl][m][k]*vec[n][Bnum+k];
4857 	    }
4858 
4859 	    workvec[n][Anum+m] += sum;
4860 	  }
4861 	}
4862 	/**/
4863 
4864 #else
4865 
4866 	/* Unrolling + SSE version */
4867 	/**/
4868 	for (m=0; m<(ian-3); m+=4){
4869 	  for (n=0; n<EKC_core_size[Mc_AN]; n++){
4870 
4871 	    mmSum00 = _mm_setzero_pd();
4872 	    mmSum01 = _mm_setzero_pd();
4873 	    mmSum10 = _mm_setzero_pd();
4874 	    mmSum11 = _mm_setzero_pd();
4875 	    mmSum20 = _mm_setzero_pd();
4876 	    mmSum21 = _mm_setzero_pd();
4877 	    mmSum30 = _mm_setzero_pd();
4878 	    mmSum31 = _mm_setzero_pd();
4879 
4880 	    for (k=0; k<(jan-3); k+=4){
4881 	      mmTmp0 = _mm_loadu_pd(&vec[n][Bnum+k+0]);
4882 	      mmTmp1 = _mm_loadu_pd(&vec[n][Bnum+k+2]);
4883 
4884 	      mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+0]),mmTmp0));
4885 	      mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+2]),mmTmp1));
4886 
4887 	      mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+0]),mmTmp0));
4888 	      mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+2]),mmTmp1));
4889 
4890 	      mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+0]),mmTmp0));
4891 	      mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+2]),mmTmp1));
4892 
4893 	      mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+0]),mmTmp0));
4894 	      mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+2]),mmTmp1));
4895 	    }
4896 
4897 	    mmSum00 = _mm_add_pd(mmSum00, mmSum01);
4898 	    mmSum10 = _mm_add_pd(mmSum10, mmSum11);
4899 	    mmSum20 = _mm_add_pd(mmSum20, mmSum21);
4900 	    mmSum30 = _mm_add_pd(mmSum30, mmSum31);
4901 
4902 	    _mm_storeu_pd(&mmArr[0], mmSum00);
4903 	    _mm_storeu_pd(&mmArr[2], mmSum10);
4904 	    _mm_storeu_pd(&mmArr[4], mmSum20);
4905 	    _mm_storeu_pd(&mmArr[6], mmSum30);
4906 
4907 	    sum0 = mmArr[0] + mmArr[1];
4908 	    sum1 = mmArr[2] + mmArr[3];
4909 	    sum2 = mmArr[4] + mmArr[5];
4910 	    sum3 = mmArr[6] + mmArr[7];
4911 
4912 	    for (; k<jan; k++){
4913 	      sum0 += OLP0[ih][kl][m+0][k]*vec[n][Bnum+k];
4914 	      sum1 += OLP0[ih][kl][m+1][k]*vec[n][Bnum+k];
4915 	      sum2 += OLP0[ih][kl][m+2][k]*vec[n][Bnum+k];
4916 	      sum3 += OLP0[ih][kl][m+3][k]*vec[n][Bnum+k];
4917 	    }
4918 
4919 	    workvec[n][Anum+m+0] += sum0;
4920 	    workvec[n][Anum+m+1] += sum1;
4921 	    workvec[n][Anum+m+2] += sum2;
4922 	    workvec[n][Anum+m+3] += sum3;
4923 	  }
4924 	}
4925 
4926 	for (; m<ian; m++){
4927 	  for (n=0; n<EKC_core_size[Mc_AN]; n++){
4928 
4929 	    sum = 0.0;
4930 	    for (k=0; k<jan; k++){
4931 	      sum += OLP0[ih][kl][m][k]*vec[n][Bnum+k];
4932 	    }
4933 
4934 	    workvec[n][Anum+m] += sum;
4935 	  }
4936 	}
4937 	/**/
4938 
4939 #endif
4940 
4941       }
4942     }
4943   }
4944 
4945   /*  (Vn|S|Vn) */
4946 
4947 #ifdef nosse
4948 
4949   /* Original version */
4950   /**/
4951   for (m=0; m<EKC_core_size[Mc_AN]; m++){
4952     for (n=m; n<EKC_core_size[Mc_AN]; n++){
4953       sum = 0.0;
4954       for (i=0; i<Msize2[Mc_AN]; i++){
4955 	sum += vec[m][i]*workvec[n][i];
4956       }
4957       tmpmat0[m+1][n+1] = sum;
4958       tmpmat0[n+1][m+1] = sum;
4959     }
4960   }
4961   /**/
4962 
4963 #else
4964 
4965   /* Unrolling + SSE version */
4966   /**/
4967   for (m=0; m<(EKC_core_size[Mc_AN]-3); m+=4){
4968     for (n=m; n<EKC_core_size[Mc_AN]; n++){
4969       mmSum00 = _mm_setzero_pd();
4970       mmSum01 = _mm_setzero_pd();
4971       mmSum10 = _mm_setzero_pd();
4972       mmSum11 = _mm_setzero_pd();
4973       mmSum20 = _mm_setzero_pd();
4974       mmSum21 = _mm_setzero_pd();
4975       mmSum30 = _mm_setzero_pd();
4976       mmSum31 = _mm_setzero_pd();
4977 
4978       for (i=0; i<(Msize2[Mc_AN]-3); i+=4){
4979 	mmTmp0 = _mm_loadu_pd(&workvec[n][i+0]);
4980 	mmTmp1 = _mm_loadu_pd(&workvec[n][i+2]);
4981 
4982 	mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&vec[m+0][i+0]),mmTmp0));
4983 	mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&vec[m+0][i+2]),mmTmp1));
4984 
4985 	mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&vec[m+1][i+0]),mmTmp0));
4986 	mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&vec[m+1][i+2]),mmTmp1));
4987 
4988 	mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&vec[m+2][i+0]),mmTmp0));
4989 	mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&vec[m+2][i+2]),mmTmp1));
4990 
4991 	mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&vec[m+3][i+0]),mmTmp0));
4992 	mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&vec[m+3][i+2]),mmTmp1));
4993       }
4994 
4995       mmSum00 = _mm_add_pd(mmSum00, mmSum01);
4996       mmSum10 = _mm_add_pd(mmSum10, mmSum11);
4997       mmSum20 = _mm_add_pd(mmSum20, mmSum21);
4998       mmSum30 = _mm_add_pd(mmSum30, mmSum31);
4999 
5000       _mm_storeu_pd(&mmArr[0], mmSum00);
5001       _mm_storeu_pd(&mmArr[2], mmSum10);
5002       _mm_storeu_pd(&mmArr[4], mmSum20);
5003       _mm_storeu_pd(&mmArr[6], mmSum30);
5004 
5005       sum0 = mmArr[0] + mmArr[1];
5006       sum1 = mmArr[2] + mmArr[3];
5007       sum2 = mmArr[4] + mmArr[5];
5008       sum3 = mmArr[6] + mmArr[7];
5009 
5010       for (; i<Msize2[Mc_AN]; i++){
5011 	sum0 += vec[m+0][i]*workvec[n][i];
5012 	sum1 += vec[m+1][i]*workvec[n][i];
5013 	sum2 += vec[m+2][i]*workvec[n][i];
5014 	sum3 += vec[m+3][i]*workvec[n][i];
5015       }
5016 
5017       tmpmat0[m+1][n+1] = sum0;
5018       tmpmat0[n+1][m+1] = sum0;
5019 
5020       tmpmat0[m+2][n+1] = sum1;
5021       tmpmat0[n+1][m+2] = sum1;
5022 
5023       tmpmat0[m+3][n+1] = sum2;
5024       tmpmat0[n+1][m+3] = sum2;
5025 
5026       tmpmat0[m+4][n+1] = sum3;
5027       tmpmat0[n+1][m+4] = sum3;
5028     }
5029   }
5030 
5031   for (; m<EKC_core_size[Mc_AN]; m++){
5032     for (n=m; n<EKC_core_size[Mc_AN]; n++){
5033       sum = 0.0;
5034       for (i=0; i<Msize2[Mc_AN]; i++){
5035 	sum += vec[m][i]*workvec[n][i];
5036       }
5037       tmpmat0[m+1][n+1] = sum;
5038       tmpmat0[n+1][m+1] = sum;
5039     }
5040   }
5041   /**/
5042 
5043 #endif
5044 
5045   /* diagonalize tmpmat0 */
5046 
5047   if ( EKC_core_size[Mc_AN]==1){
5048     ko[1] = tmpmat0[1][1];
5049     tmpmat0[1][1] = 1.0;
5050   }
5051   else{
5052     Eigen_lapack( tmpmat0, ko, EKC_core_size[Mc_AN], EKC_core_size[Mc_AN] );
5053   }
5054 
5055   ZeroNum = 0;
5056 
5057   for (n=1; n<=EKC_core_size[Mc_AN]; n++){
5058     if (cutoff_value<ko[n]){
5059       ko[n]  = sqrt(fabs(ko[n]));
5060       iko[n] = 1.0/ko[n];
5061     }
5062     else{
5063       ZeroNum++;
5064       ko[n]  = 0.0;
5065       iko[n] = 0.0;
5066     }
5067   }
5068 
5069   /* U0 = vec * tmpmat0^{-1/2} */
5070 
5071   for (n=0; n<EKC_core_size[Mc_AN]; n++){
5072     for (i=0; i<Msize2[Mc_AN]; i++){
5073       workvec[n][i] = 0.0;
5074     }
5075   }
5076 
5077   for (n=0; n<EKC_core_size[Mc_AN]; n++){
5078     for (k=0; k<EKC_core_size[Mc_AN]; k++){
5079       dum = tmpmat0[k+1][n+1]*iko[n+1];
5080       for (i=0; i<Msize2[Mc_AN]; i++)  workvec[n][i] += vec[k][i]*dum;
5081     }
5082   }
5083 
5084   for (n=0; n<EKC_core_size[Mc_AN]; n++){
5085     for (i=0; i<Msize2[Mc_AN]; i++){
5086       vec[n][i] = workvec[n][i];
5087     }
5088   }
5089 }
5090 
5091 
5092 
Inverse_S_by_Cholesky(int Mc_AN,double **** OLP0,double ** invS,int * MP,int NUM,double * LoS)5093 void Inverse_S_by_Cholesky(int Mc_AN, double ****OLP0, double **invS, int *MP, int NUM, double *LoS)
5094 {
5095   char  *UPLO="U";
5096   INTEGER N,lda,info,lwork;
5097   int Gc_AN,i,j,k,Anum,Gi,wanA,time1,time2;
5098   int ig,ian,ih,kl,jg,jan,Bnum,m,n;
5099   double tmp0,tmp1;
5100 
5101   N = NUM;
5102   Gc_AN = M2G[Mc_AN];
5103 
5104   /* OLP0 to invS */
5105 
5106   for (i=0; i<=(FNAN[Gc_AN]+SNAN[Gc_AN]); i++){
5107 
5108     ig = natn[Gc_AN][i];
5109     ian = Spe_Total_CNO[WhatSpecies[ig]];
5110     Anum = MP[i] - 1;
5111     ih = S_G2M[ig];
5112 
5113     for (j=0; j<=(FNAN[Gc_AN]+SNAN[Gc_AN]); j++){
5114 
5115       kl = RMI1[Mc_AN][i][j];
5116       jg = natn[Gc_AN][j];
5117       jan = Spe_Total_CNO[WhatSpecies[jg]];
5118       Bnum = MP[j] - 1;
5119 
5120       if (0<=kl){
5121 	for (m=0; m<ian; m++){
5122 	  for (n=0; n<jan; n++){
5123 	    invS[Anum+m][Bnum+n] = OLP0[ih][kl][m][n];
5124 	  }
5125 	}
5126       }
5127 
5128       else{
5129 	for (m=0; m<ian; m++){
5130 	  for (n=0; n<jan; n++){
5131 	    invS[Anum+m][Bnum+n] = 0.0;
5132 	  }
5133 	}
5134       }
5135     }
5136   }
5137 
5138   /* invS -> LoS */
5139 
5140   for (i=0;i<N;i++) {
5141     for (j=0;j<N;j++) {
5142        LoS[j*N+i]= invS[i][j];
5143     }
5144   }
5145 
5146   /* call dpotrf_() in clapack */
5147 
5148   lda = N;
5149   F77_NAME(dpotrf,DPOTRF)(UPLO, &N, LoS, &lda, &info);
5150 
5151   if (info!=0){
5152     printf("Error in dpotrf_() which is called from Embedding_Cluster.c  info=%2d\n",info);
5153   }
5154 
5155   /* call dpotri_() in clapack */
5156 
5157   lwork = N;
5158   F77_NAME(dpotri,DPOTRI)(UPLO, &N, LoS, &lda, &info);
5159 
5160   if (info!=0){
5161     printf("Error in dpotri_() which is called from Embedding_Cluster.c  info=%2d\n",info);
5162   }
5163 
5164   /* LoS -> invS */
5165 
5166   for (j=0; j<N; j++) {
5167     m = j*N;
5168     for (i=0; i<=j; i++) {
5169       invS[i][j] = LoS[m+i];
5170       invS[j][i] = LoS[m+i];
5171     }
5172   }
5173 
5174 }
5175 
5176 
Save_DOS_Col(double ****** Residues,double **** OLP0,double *** EVal,int ** LO_TC,int ** HO_TC)5177 void Save_DOS_Col(double ******Residues, double ****OLP0, double ***EVal, int **LO_TC, int **HO_TC)
5178 {
5179   int spin,Mc_AN,wanA,Gc_AN,tno1;
5180   int i1,i,j,MaxL,l,h_AN,Gh_AN,wanB,tno2;
5181   double Stime_atom,Etime_atom;
5182   double sum;
5183   int i_vec[10];
5184   char file_eig[YOUSO10],file_ev[YOUSO10];
5185   FILE *fp_eig, *fp_ev;
5186   int numprocs,myid,ID,tag;
5187 
5188   /* for OpenMP */
5189   int OMPID,Nthrds,Nprocs;
5190 
5191   /* MPI */
5192 
5193   MPI_Comm_size(mpi_comm_level1,&numprocs);
5194   MPI_Comm_rank(mpi_comm_level1,&myid);
5195 
5196   if (myid==Host_ID){
5197     printf("The DOS is supported for a range from -10 to 10 eV for the O(N) Krylov subspace method.\n");
5198   }
5199 
5200   /* open file pointers */
5201 
5202   if (myid==Host_ID){
5203 
5204     sprintf(file_eig,"%s%s.Dos.val",filepath,filename);
5205     if ( (fp_eig=fopen(file_eig,"w"))==NULL ) {
5206       printf("cannot open a file %s\n",file_eig);
5207     }
5208   }
5209 
5210   sprintf(file_ev, "%s%s.Dos.vec%i",filepath,filename,myid);
5211   if ( (fp_ev=fopen(file_ev,"w"))==NULL ) {
5212     printf("cannot open a file %s\n",file_ev);
5213   }
5214 
5215   /****************************************************
5216                    save *.Dos.vec
5217   ****************************************************/
5218 
5219   for (spin=0; spin<=SpinP_switch; spin++){
5220     for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
5221 
5222       dtime(&Stime_atom);
5223 
5224       Gc_AN = M2G[Mc_AN];
5225       wanA = WhatSpecies[Gc_AN];
5226       tno1 = Spe_Total_CNO[wanA];
5227 
5228       fprintf(fp_ev,"<AN%dAN%d\n",Gc_AN,spin);
5229       fprintf(fp_ev,"%d\n",(HO_TC[spin][Mc_AN]-LO_TC[spin][Mc_AN]+1));
5230 
5231       for (i1=0; i1<(HO_TC[spin][Mc_AN]-LO_TC[spin][Mc_AN]+1); i1++){
5232 
5233 	fprintf(fp_ev,"%4d  %10.6f  ",i1,EVal[spin][Mc_AN][i1-1+LO_TC[spin][Mc_AN]]);
5234 
5235 	for (i=0; i<tno1; i++){
5236 
5237 	  sum = 0.0;
5238 	  for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
5239 	    Gh_AN = natn[Gc_AN][h_AN];
5240 	    wanB = WhatSpecies[Gh_AN];
5241 	    tno2 = Spe_Total_CNO[wanB];
5242 	    for (j=0; j<tno2; j++){
5243 	      sum += Residues[spin][Mc_AN][h_AN][i][j][i1]*
5244 		               OLP0[Mc_AN][h_AN][i][j];
5245 	    }
5246 	  }
5247 
5248 	  fprintf(fp_ev,"%8.5f",sum);
5249 	}
5250 	fprintf(fp_ev,"\n");
5251       }
5252 
5253       fprintf(fp_ev,"AN%dAN%d>\n",Gc_AN,spin);
5254 
5255       dtime(&Etime_atom);
5256       time_per_atom[Gc_AN] += Etime_atom - Stime_atom;
5257     }
5258   }
5259 
5260   /****************************************************
5261                    save *.Dos.val
5262   ****************************************************/
5263 
5264   if ( (fp_eig=fopen(file_eig,"w")) != NULL ) {
5265 
5266     fprintf(fp_eig,"mode        5\n");
5267     fprintf(fp_eig,"NonCol      0\n");
5268     /*      fprintf(fp_eig,"N           %d\n",n); */
5269     fprintf(fp_eig,"Nspin       %d\n",SpinP_switch);
5270     fprintf(fp_eig,"Erange      %lf %lf\n",Dos_Erange[0],Dos_Erange[1]);
5271     fprintf(fp_eig,"atomnum     %d\n",atomnum);
5272 
5273     fprintf(fp_eig,"<WhatSpecies\n");
5274     for (i=1;i<=atomnum;i++) {
5275       fprintf(fp_eig,"%d ",WhatSpecies[i]);
5276     }
5277     fprintf(fp_eig,"\nWhatSpecies>\n");
5278 
5279     fprintf(fp_eig,"SpeciesNum     %d\n",SpeciesNum);
5280     fprintf(fp_eig,"<Spe_Total_CNO\n");
5281     for (i=0;i<SpeciesNum;i++) {
5282       fprintf(fp_eig,"%d ",Spe_Total_CNO[i]);
5283     }
5284     fprintf(fp_eig,"\nSpe_Total_CNO>\n");
5285 
5286     MaxL=Supported_MaxL;
5287     fprintf(fp_eig,"MaxL           %d\n",Supported_MaxL);
5288     fprintf(fp_eig,"<Spe_Num_CBasis\n");
5289     for (i=0;i<SpeciesNum;i++) {
5290       for (l=0;l<=MaxL;l++) {
5291 	fprintf(fp_eig,"%d ",Spe_Num_CBasis[i][l]);
5292       }
5293       fprintf(fp_eig,"\n");
5294     }
5295     fprintf(fp_eig,"Spe_Num_CBasis>\n");
5296     fprintf(fp_eig,"ChemP       %lf\n",ChemP);
5297 
5298   }
5299 
5300   /* close file pointers */
5301 
5302   if (myid==Host_ID){
5303     if (fp_eig) fclose(fp_eig);
5304   }
5305 
5306   if (fp_ev)  fclose(fp_ev);
5307 }
5308 
5309 
Krylov_Col_trd(char * mode,int SCF_iter,double ***** Hks,double **** OLP0,double ***** CDM,double ***** EDM,double Eele0[2],double Eele1[2])5310 static double Krylov_Col_trd(char *mode,
5311 			 int SCF_iter,
5312 			 double *****Hks,
5313 			 double ****OLP0,
5314 			 double *****CDM,
5315 			 double *****EDM,
5316 			 double Eele0[2],
5317 			 double Eele1[2])
5318 
5319 /**********************************************************************************
5320   The structure of this subroutine is exactly the same as the original Krylov_Col,
5321   except for the main loop where OMP parallelization at the atom level is removed.
5322 **********************************************************************************/
5323 
5324 {
5325   static int firsttime=1,recalc_firsttime=1,recalc_flag;
5326   int Mc_AN,Gc_AN,i,is,js,Gi,wan,wanA,wanB,Anum;
5327   int num,NUM0,NUM,NUM1,n2,Cwan,Hwan,Rn2;
5328   int size1,size2,max_size1,max_size2;
5329   int ih,ig,ian,j,kl,jg,jan,Bnum,m,n,spin,i2,ip;
5330   int k,l,i1,j1,P_min,m_size,q1,q2,csize,Residues_size;
5331   int h_AN1,Mh_AN1,h_AN2,Gh_AN1,Gh_AN2,wan1,wan2;
5332   int po,po1,loopN,tno1,tno2,h_AN,Gh_AN,rl1,rl2,rl;
5333   int MA_AN,GA_AN,tnoA,GB_AN,tnoB,ct_on;
5334   int Msize2_max;
5335   static double TZ;
5336   double My_TZ,sum,FermiF,time0,srt;
5337   double sum00,sum10,sum20,sum30;
5338   double sum01,sum11,sum21,sum31;
5339   double sum02,sum12,sum22,sum32;
5340   double sum03,sum13,sum23,sum33;
5341   double tmp0,tmp1,tmp2,tmp3,b2,co,x0,Sx,Dx,xmin,xmax;
5342   double My_Num_State,Num_State,x,Dnum;
5343   double emin,emax,de;
5344   double TStime,TEtime,Stime1,Etime1;
5345   double Stime2,Etime2;
5346   double time1,time2,time3,time4,time5;
5347   double time6,time7,time8,time9,time10;
5348   double time11,time12,time13,time14,time15,time16;
5349   double Erange;
5350   double My_Eele0[2],My_Eele1[2];
5351   double max_x=30.0;
5352   double ChemP_MAX,ChemP_MIN,spin_degeneracy;
5353   double spetrum_radius;
5354   double ***EVal;
5355   double ******Residues;
5356   double ***PDOS_DC;
5357   double *tmp_array;
5358   double *tmp_array2;
5359 
5360   int *MP;
5361 
5362   /*****************************************************
5363   Msize:   \sum_FNAN Spe_Total_CNO
5364   Msize2:  \sum_FNAN+SNAN Spe_Total_CNO
5365   Msize3:  rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]
5366   Msize4:  rlmax_EC2[Mc_AN]*EKC_core_size[Mc_AN]
5367   Msize5:  dimension for the last column of Residues
5368   *****************************************************/
5369 
5370   int *Msize;
5371   int *Msize2;
5372   int *Msize3;
5373   int *Msize4;
5374   int *Msize5;
5375   int **LO_TC;
5376   int **HO_TC;
5377   int numprocs,myid,ID,IDS,IDR,tag=999;
5378   double Stime_atom, Etime_atom;
5379 
5380   MPI_Status stat;
5381   MPI_Request request;
5382 
5383   /* for OpenMP */
5384   int OMPID,Nthrds,Nprocs;
5385 
5386   /* MPI */
5387   MPI_Barrier(mpi_comm_level1);
5388   MPI_Comm_size(mpi_comm_level1,&numprocs);
5389   MPI_Comm_rank(mpi_comm_level1,&myid);
5390 
5391   dtime(&TStime);
5392 
5393   if (measure_time==1){
5394     time1 = 0.0;
5395     time2 = 0.0;
5396     time3 = 0.0;
5397     time4 = 0.0;
5398     time5 = 0.0;
5399     time6 = 0.0;
5400     time7 = 0.0;
5401     time8 = 0.0;
5402     time9 = 0.0;
5403     time10 = 0.0;
5404     time11 = 0.0;
5405     time12 = 0.0;
5406     time13 = 0.0;
5407     time14 = 0.0;
5408     time15 = 0.0;
5409   }
5410 
5411   /****************************************************
5412                    allocation of arrays:
5413   ****************************************************/
5414 
5415   Msize = (int*)malloc(sizeof(int)*(Matomnum+1));
5416   Msize2 = (int*)malloc(sizeof(int)*(Matomnum+1));
5417   Msize3 = (int*)malloc(sizeof(int)*(Matomnum+1));
5418   Msize4 = (int*)malloc(sizeof(int)*(Matomnum+1));
5419   Msize2_max = 0;
5420 
5421   /* find Msize */
5422 
5423   for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
5424 
5425     if (Mc_AN==0){
5426       Msize[Mc_AN] = 1;
5427     }
5428     else{
5429 
5430       Gc_AN = M2G[Mc_AN];
5431 
5432       NUM = 0;
5433       for (i=0; i<=FNAN[Gc_AN]; i++){
5434 	Gi = natn[Gc_AN][i];
5435 	wanA = WhatSpecies[Gi];
5436 	NUM += Spe_Total_CNO[wanA];
5437       }
5438       Msize[Mc_AN] = NUM;
5439     }
5440   }
5441 
5442   /* find Msize2 and Msize2_max */
5443 
5444   for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
5445 
5446     if (Mc_AN==0){
5447       Msize2[Mc_AN] = 1;
5448     }
5449     else{
5450 
5451       Gc_AN = M2G[Mc_AN];
5452 
5453       NUM = 0;
5454       for (i=0; i<=(FNAN[Gc_AN]+SNAN[Gc_AN]); i++){
5455 	Gi = natn[Gc_AN][i];
5456 	wanA = WhatSpecies[Gi];
5457 	NUM += Spe_Total_CNO[wanA];
5458       }
5459       Msize2[Mc_AN] = NUM;
5460     }
5461 
5462     if (Msize2_max<Msize2[Mc_AN]) Msize2_max = Msize2[Mc_AN] + 4;
5463   }
5464 
5465   /* find Msize3 and Msize4 */
5466 
5467   Msize3[0] = 1;
5468   Msize4[0] = 1;
5469 
5470   for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
5471     Gc_AN = M2G[Mc_AN];
5472     wan = WhatSpecies[Gc_AN];
5473     ct_on = Spe_Total_CNO[wan];
5474     Msize3[Mc_AN] = rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN];
5475     Msize4[Mc_AN] = rlmax_EC2[Mc_AN]*EKC_core_size[Mc_AN];
5476   }
5477 
5478   m_size = 0;
5479 
5480   EVal = (double***)malloc(sizeof(double**)*(SpinP_switch+1));
5481   for (spin=0; spin<=SpinP_switch; spin++){
5482     EVal[spin] = (double**)malloc(sizeof(double*)*(Matomnum+1));
5483 
5484     for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
5485       n2 = Msize3[Mc_AN] + 2;
5486       m_size += n2;
5487       EVal[spin][Mc_AN] = (double*)malloc(sizeof(double)*n2);
5488     }
5489   }
5490 
5491   if (firsttime){
5492   PrintMemory("Krylov: EVal",  sizeof(double)*m_size,NULL);
5493   }
5494 
5495   if (2<=level_stdout){
5496     for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
5497         printf("<Krylov> myid=%4d Mc_AN=%4d Gc_AN=%4d Msize=%4d\n",
5498         myid,Mc_AN,M2G[Mc_AN],Msize[Mc_AN]);
5499     }
5500   }
5501 
5502   /****************************************************
5503     allocation of arrays:
5504 
5505     double PDOS[SpinP_switch+1]
5506                [Matomnum+1]
5507                [n2]
5508   ****************************************************/
5509 
5510   m_size = 0;
5511   PDOS_DC = (double***)malloc(sizeof(double**)*(SpinP_switch+1));
5512   for (spin=0; spin<=SpinP_switch; spin++){
5513     PDOS_DC[spin] = (double**)malloc(sizeof(double*)*(Matomnum+1));
5514     for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
5515       n2 = Msize3[Mc_AN] + 4;
5516       m_size += n2;
5517       PDOS_DC[spin][Mc_AN] = (double*)malloc(sizeof(double)*n2);
5518     }
5519   }
5520 
5521   if (firsttime){
5522   PrintMemory("Krylov: PDOS_DC",sizeof(double)*m_size,NULL);
5523   }
5524 
5525   /****************************************************
5526     allocation of arrays:
5527 
5528     int LO_TC[SpinP_switch+1][Matomnum+1]
5529     int HO_TC[SpinP_switch+1][Matomnum+1]
5530   ****************************************************/
5531 
5532   LO_TC = (int**)malloc(sizeof(int*)*(SpinP_switch+1));
5533   for (spin=0; spin<(SpinP_switch+1); spin++){
5534     LO_TC[spin] = (int*)malloc(sizeof(int)*(Matomnum+1));
5535   }
5536 
5537   HO_TC = (int**)malloc(sizeof(int*)*(SpinP_switch+1));
5538   for (spin=0; spin<(SpinP_switch+1); spin++){
5539     HO_TC[spin] = (int*)malloc(sizeof(int)*(Matomnum+1));
5540   }
5541 
5542   /****************************************************
5543     allocation of array:
5544 
5545     double Residues[SpinP_switch+1]
5546                    [Matomnum+1]
5547                    [FNAN[Gc_AN]+1]
5548                    [Spe_Total_CNO[Gc_AN]]
5549                    [Spe_Total_CNO[Gh_AN]]
5550                    [HO_TC-LO_TC+3]
5551   ****************************************************/
5552 
5553   Residues = (double******)malloc(sizeof(double*****)*(SpinP_switch+1));
5554   for (spin=0; spin<=SpinP_switch; spin++){
5555     Residues[spin] = (double*****)malloc(sizeof(double****)*(Matomnum+1));
5556     for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
5557 
5558       if (Mc_AN==0){
5559         Gc_AN = 0;
5560         FNAN[0] = 0;
5561         tno1 = 1;
5562       }
5563       else{
5564 
5565         Gc_AN = M2G[Mc_AN];
5566         wanA = WhatSpecies[Gc_AN];
5567         tno1 = Spe_Total_CNO[wanA];
5568       }
5569 
5570       Residues[spin][Mc_AN] = (double****)malloc(sizeof(double***)*(FNAN[Gc_AN]+1));
5571 
5572       for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
5573 
5574         if (Mc_AN==0){
5575           tno2 = 1;
5576         }
5577         else {
5578           Gh_AN = natn[Gc_AN][h_AN];
5579           wanB = WhatSpecies[Gh_AN];
5580           tno2 = Spe_Total_CNO[wanB];
5581         }
5582 
5583         Residues[spin][Mc_AN][h_AN] = (double***)malloc(sizeof(double**)*tno1);
5584         for (i=0; i<tno1; i++){
5585           Residues[spin][Mc_AN][h_AN][i] = (double**)malloc(sizeof(double*)*tno2);
5586           /* note that the array is allocated once more in the loop */
5587         }
5588       }
5589     }
5590   }
5591 
5592   for (spin=0; spin<=SpinP_switch; spin++){
5593     Residues[spin][0][0][0][0] = (double*)malloc(sizeof(double)*1);
5594   }
5595 
5596   /****************************************************
5597      initialize density and energy density matrices
5598   ****************************************************/
5599 
5600   for (spin=0; spin<=SpinP_switch; spin++){
5601     for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
5602       Gc_AN = M2G[Mc_AN];
5603       wanA = WhatSpecies[Gc_AN];
5604       tno1 = Spe_Total_CNO[wanA];
5605       for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
5606 	Gh_AN = natn[Gc_AN][h_AN];
5607 	wanB = WhatSpecies[Gh_AN];
5608 	tno2 = Spe_Total_CNO[wanB];
5609 	for (i=0; i<tno1; i++){
5610 	  for (j=0; j<tno2; j++){
5611 	    CDM[spin][Mc_AN][h_AN][i][j] = 0.0;
5612 	    EDM[spin][Mc_AN][h_AN][i][j] = 0.0;
5613 	  }
5614 	}
5615       }
5616     }
5617   }
5618 
5619   /****************************************************
5620    MPI
5621 
5622    Hks
5623   ****************************************************/
5624 
5625   if (measure_time==1) dtime(&Stime1);
5626 
5627   if (SCF_iter==1){
5628 
5629     /***********************************
5630                set data size
5631     ************************************/
5632 
5633     for (ID=0; ID<numprocs; ID++){
5634 
5635       IDS = (myid + ID) % numprocs;
5636       IDR = (myid - ID + numprocs) % numprocs;
5637 
5638       if (ID!=0){
5639 	tag = 999;
5640 
5641 	/* find data size to send block data */
5642 	if ((F_Snd_Num[IDS]+S_Snd_Num[IDS])!=0){
5643 
5644 	  size1 = 0;
5645 	  for (spin=0; spin<=SpinP_switch; spin++){
5646 	    for (n=0; n<(F_Snd_Num[IDS]+S_Snd_Num[IDS]); n++){
5647 	      Mc_AN = Snd_MAN[IDS][n];
5648 	      Gc_AN = Snd_GAN[IDS][n];
5649 	      Cwan = WhatSpecies[Gc_AN];
5650 	      tno1 = Spe_Total_CNO[Cwan];
5651 	      for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
5652 		Gh_AN = natn[Gc_AN][h_AN];
5653 		Hwan = WhatSpecies[Gh_AN];
5654 		tno2 = Spe_Total_CNO[Hwan];
5655 		size1 += tno1*tno2;
5656 	      }
5657 	    }
5658 	  }
5659 
5660 	  Snd_HFS_Size[IDS] = size1;
5661 	  MPI_Isend(&size1, 1, MPI_INT, IDS, tag, mpi_comm_level1, &request);
5662 	}
5663 	else{
5664 	  Snd_HFS_Size[IDS] = 0;
5665 	}
5666 
5667 	/* receiving of size of data */
5668 
5669 	if ((F_Rcv_Num[IDR]+S_Rcv_Num[IDR])!=0){
5670 
5671 	  MPI_Recv(&size2, 1, MPI_INT, IDR, tag, mpi_comm_level1, &stat);
5672 	  Rcv_HFS_Size[IDR] = size2;
5673 	}
5674 	else{
5675 	  Rcv_HFS_Size[IDR] = 0;
5676 	}
5677 
5678 	if ((F_Snd_Num[IDS]+S_Snd_Num[IDS])!=0) MPI_Wait(&request,&stat);
5679       }
5680       else{
5681 	Snd_HFS_Size[IDS] = 0;
5682 	Rcv_HFS_Size[IDR] = 0;
5683       }
5684     }
5685   }
5686 
5687   /***********************************
5688              data transfer
5689   ************************************/
5690 
5691   /* find maximum size of size1 */
5692 
5693   max_size1 = 0;
5694   max_size2 = 0;
5695   for (ID=0; ID<numprocs; ID++){
5696     size1 = Snd_HFS_Size[ID];
5697     if (max_size1<size1) max_size1 = size1;
5698     size2 = Rcv_HFS_Size[ID];
5699     if (max_size2<size2) max_size2 = size2;
5700   }
5701 
5702   /* allocation of arrays */
5703 
5704   tmp_array  = (double*)malloc(sizeof(double)*max_size1);
5705   tmp_array2 = (double*)malloc(sizeof(double)*max_size2);
5706 
5707   /* MPI communication */
5708 
5709   tag = 999;
5710   for (ID=0; ID<numprocs; ID++){
5711 
5712     IDS = (myid + ID) % numprocs;
5713     IDR = (myid - ID + numprocs) % numprocs;
5714 
5715     if (ID!=0){
5716 
5717       /*****************************
5718               sending of data
5719       *****************************/
5720 
5721       if ((F_Snd_Num[IDS]+S_Snd_Num[IDS])!=0){
5722 
5723         size1 = Snd_HFS_Size[IDS];
5724 
5725         /* multidimentional array to vector array */
5726 
5727         num = 0;
5728         for (spin=0; spin<=SpinP_switch; spin++){
5729           for (n=0; n<(F_Snd_Num[IDS]+S_Snd_Num[IDS]); n++){
5730             Mc_AN = Snd_MAN[IDS][n];
5731             Gc_AN = Snd_GAN[IDS][n];
5732             Cwan = WhatSpecies[Gc_AN];
5733             tno1 = Spe_Total_CNO[Cwan];
5734             for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
5735               Gh_AN = natn[Gc_AN][h_AN];
5736               Hwan = WhatSpecies[Gh_AN];
5737               tno2 = Spe_Total_CNO[Hwan];
5738               for (i=0; i<tno1; i++){
5739                 for (j=0; j<tno2; j++){
5740                   tmp_array[num] = Hks[spin][Mc_AN][h_AN][i][j];
5741                   num++;
5742                 }
5743               }
5744 	    }
5745           }
5746 	}
5747 
5748         MPI_Isend(&tmp_array[0], size1, MPI_DOUBLE, IDS, tag, mpi_comm_level1, &request);
5749       }
5750 
5751       /*****************************
5752          receiving of block data
5753       *****************************/
5754 
5755       if ((F_Rcv_Num[IDR]+S_Rcv_Num[IDR])!=0){
5756 
5757         size2 = Rcv_HFS_Size[IDR];
5758 
5759         MPI_Recv(&tmp_array2[0], size2, MPI_DOUBLE, IDR, tag, mpi_comm_level1, &stat);
5760 
5761         num = 0;
5762         for (spin=0; spin<=SpinP_switch; spin++){
5763           Mc_AN = S_TopMAN[IDR] - 1;  /* S_TopMAN should be used. */
5764           for (n=0; n<(F_Rcv_Num[IDR]+S_Rcv_Num[IDR]); n++){
5765             Mc_AN++;
5766             Gc_AN = Rcv_GAN[IDR][n];
5767             Cwan = WhatSpecies[Gc_AN];
5768             tno1 = Spe_Total_CNO[Cwan];
5769 
5770             for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
5771               Gh_AN = natn[Gc_AN][h_AN];
5772               Hwan = WhatSpecies[Gh_AN];
5773               tno2 = Spe_Total_CNO[Hwan];
5774               for (i=0; i<tno1; i++){
5775                 for (j=0; j<tno2; j++){
5776                   Hks[spin][Mc_AN][h_AN][i][j] = tmp_array2[num];
5777                   num++;
5778 		}
5779 	      }
5780 	    }
5781 	  }
5782 	}
5783       }
5784 
5785       if ((F_Snd_Num[IDS]+S_Snd_Num[IDS])!=0){
5786         MPI_Wait(&request,&stat);
5787       }
5788     }
5789   }
5790 
5791   /****************************************************
5792    MPI
5793 
5794    OLP0
5795   ****************************************************/
5796 
5797   /***********************************
5798              set data size
5799   ************************************/
5800 
5801   if (SCF_iter==1){
5802 
5803     /***********************************
5804                data transfer
5805     ************************************/
5806 
5807     tag = 999;
5808     for (ID=0; ID<numprocs; ID++){
5809 
5810       IDS = (myid + ID) % numprocs;
5811       IDR = (myid - ID + numprocs) % numprocs;
5812 
5813       if (ID!=0){
5814 
5815         /*****************************
5816                 sending of data
5817         *****************************/
5818 
5819         if ((F_Snd_Num[IDS]+S_Snd_Num[IDS])!=0){
5820 
5821           size1 = Snd_HFS_Size[IDS]/(1+SpinP_switch);
5822 
5823           /* multidimentional array to vector array */
5824 
5825           num = 0;
5826 
5827           for (n=0; n<(F_Snd_Num[IDS]+S_Snd_Num[IDS]); n++){
5828             Mc_AN = Snd_MAN[IDS][n];
5829             Gc_AN = Snd_GAN[IDS][n];
5830             Cwan = WhatSpecies[Gc_AN];
5831             tno1 = Spe_Total_CNO[Cwan];
5832             for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
5833               Gh_AN = natn[Gc_AN][h_AN];
5834               Hwan = WhatSpecies[Gh_AN];
5835               tno2 = Spe_Total_CNO[Hwan];
5836               for (i=0; i<tno1; i++){
5837                 for (j=0; j<tno2; j++){
5838                   tmp_array[num] = OLP0[Mc_AN][h_AN][i][j];
5839                   num++;
5840                 }
5841               }
5842 	    }
5843           }
5844 
5845           MPI_Isend(&tmp_array[0], size1, MPI_DOUBLE, IDS, tag, mpi_comm_level1, &request);
5846         }
5847 
5848         /*****************************
5849            receiving of block data
5850         *****************************/
5851 
5852         if ((F_Rcv_Num[IDR]+S_Rcv_Num[IDR])!=0){
5853 
5854           size2 = Rcv_HFS_Size[IDR]/(1+SpinP_switch);
5855 
5856           MPI_Recv(&tmp_array2[0], size2, MPI_DOUBLE, IDR, tag, mpi_comm_level1, &stat);
5857 
5858           num = 0;
5859           Mc_AN = S_TopMAN[IDR] - 1; /* S_TopMAN should be used. */
5860           for (n=0; n<(F_Rcv_Num[IDR]+S_Rcv_Num[IDR]); n++){
5861             Mc_AN++;
5862             Gc_AN = Rcv_GAN[IDR][n];
5863             Cwan = WhatSpecies[Gc_AN];
5864             tno1 = Spe_Total_CNO[Cwan];
5865 
5866             for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
5867               Gh_AN = natn[Gc_AN][h_AN];
5868               Hwan = WhatSpecies[Gh_AN];
5869               tno2 = Spe_Total_CNO[Hwan];
5870               for (i=0; i<tno1; i++){
5871                 for (j=0; j<tno2; j++){
5872                   OLP0[Mc_AN][h_AN][i][j] = tmp_array2[num];
5873                   num++;
5874    	        }
5875 	      }
5876 	    }
5877 	  }
5878         }
5879 
5880         if ((F_Snd_Num[IDS]+S_Snd_Num[IDS])!=0){
5881           MPI_Wait(&request,&stat);
5882 	}
5883       }
5884     }
5885   }
5886 
5887   /* freeing of arrays */
5888   free(tmp_array);
5889   free(tmp_array2);
5890 
5891   if (measure_time==1){
5892     dtime(&Etime1);
5893     time1 = Etime1 - Stime1;
5894   }
5895 
5896   /***********************************************
5897     for regeneration of the buffer matrix
5898   ***********************************************/
5899 
5900   if (sqrt(fabs(NormRD[0]))<(0.2+0.10*(double)atomnum) && recalc_firsttime){
5901     recalc_flag = 1;
5902     recalc_firsttime = 0;
5903   }
5904   else{
5905     recalc_flag = 0;
5906   }
5907 
5908   if (SCF_iter==1) recalc_firsttime = 1;
5909 
5910   if (error_check==1){
5911     printf("SCF_iter=%2d recalc_firsttime=%2d\n",SCF_iter,recalc_firsttime);
5912   }
5913 
5914   if (measure_time==1) dtime(&Stime2);
5915 
5916   /*
5917 #pragma omp parallel shared(List_YOUSO,Residues,EDM,CDM,HO_TC,LO_TC,ChemP,EVal,RMI1,S_G2M,EC_matrix,recalc_flag,recalc_EM,Krylov_U,SpinP_switch,EKC_core_size,rlmax_EC,rlmax_EC2,time11,time10,time9,time8,time7,time6,time5,time4,time3,time2,Hks,OLP0,EKC_Exact_invS_flag,SCF_iter,Msize3,Msize2,Msize,natn,FNAN,SNAN,Spe_Total_CNO,WhatSpecies,M2G,Matomnum,myid,time_per_atom,firsttime)
5918   */
5919 
5920   {
5921     int OMPID,Nthrds,Nprocs;
5922     int Mc_AN,Gc_AN,wan,spin;
5923     int ig,ian,ih,kl,jg,jan,Bnum,m,n,rl;
5924     int Anum,i,j,k,Gi,wanA,NUM,n2,csize,is,i2;
5925     int i1,rl1,js,ip,po1,tno1,h_AN,Gh_AN,wanB,tno2;
5926     int *MP;
5927     int KU_d1, KU_d2,lda,ldb,ldc,M,N,K;
5928     double alpha, beta;
5929     double **invS;
5930     double *LoS;
5931     double *C,*KU;
5932     double *H_DC,*ko;
5933     double ***Krylov_U_OLP;
5934     double **inv_RS;
5935     double sum00,sum10,sum20,sum30,sum;
5936     double tmp0,tmp1,tmp2,tmp3;
5937     double Erange;
5938     double Stime_atom,Etime_atom;
5939     double Stime1,Etime1;
5940     double **tmpvec0;
5941     double **tmpvec1;
5942     double **tmpvec2;
5943 
5944     __m128d mmSum00,mmSum01,mmSum10,mmSum11;
5945     __m128d mmSum20,mmSum21,mmSum30,mmSum31;
5946     __m128d mmTmp0, mmTmp1, mmTmp2, mmTmp3;
5947     __m128d mmTmp4, mmTmp5;
5948 
5949     /* get info. on OpenMP */
5950 
5951     OMPID = omp_get_thread_num();
5952 
5953     /*
5954     Nthrds = omp_get_num_threads();
5955     Nprocs = omp_get_num_procs();
5956     */
5957 
5958     /* allocation of arrays */
5959 
5960     MP = (int*)malloc(sizeof(int)*List_YOUSO[2]);
5961 
5962     tmpvec0 = (double**)malloc(sizeof(double*)*EKC_core_size_max);
5963     for (i=0; i<EKC_core_size_max; i++){
5964       tmpvec0[i] = (double*)malloc(sizeof(double)*Msize2_max);
5965     }
5966 
5967     tmpvec1 = (double**)malloc(sizeof(double*)*EKC_core_size_max);
5968     for (i=0; i<EKC_core_size_max; i++){
5969       tmpvec1[i] = (double*)malloc(sizeof(double)*Msize2_max);
5970     }
5971 
5972     tmpvec2 = (double**)malloc(sizeof(double*)*EKC_core_size_max);
5973     for (i=0; i<EKC_core_size_max; i++){
5974       tmpvec2[i] = (double*)malloc(sizeof(double)*Msize2_max);
5975     }
5976 
5977     if (firsttime && OMPID==0){
5978       PrintMemory("Krylov: tmpvec0",sizeof(double)*EKC_core_size_max*Msize2_max,NULL);
5979       PrintMemory("Krylov: tmpvec1",sizeof(double)*EKC_core_size_max*Msize2_max,NULL);
5980       PrintMemory("Krylov: tmpvec2",sizeof(double)*EKC_core_size_max*Msize2_max,NULL);
5981     }
5982 
5983     /***********************************************
5984      main loop of calculation
5985     ***********************************************/
5986 
5987     for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
5988 
5989       dtime(&Stime_atom);
5990 
5991       Gc_AN = M2G[Mc_AN];
5992       wan = WhatSpecies[Gc_AN];
5993 
5994       /* MP array */
5995 
5996       Anum = 1;
5997       for (i=0; i<=(FNAN[Gc_AN]+SNAN[Gc_AN]); i++){
5998 	MP[i] = Anum;
5999 	Gi = natn[Gc_AN][i];
6000 	wanA = WhatSpecies[Gi];
6001 	Anum += Spe_Total_CNO[wanA];
6002       }
6003       NUM = Anum - 1;
6004       n2 = NUM + 40;
6005 
6006       /***********************************************
6007                  allocation of arrays:
6008       ***********************************************/
6009 
6010       if (Msize[Mc_AN]<Msize3[Mc_AN])
6011 	csize = Msize3[Mc_AN] + 40;
6012       else
6013 	csize = Msize[Mc_AN] + 40;
6014 
6015       KU_d1 = EKC_core_size[Mc_AN]*Msize2[Mc_AN];
6016       KU_d2 = Msize2[Mc_AN];
6017 
6018       H_DC = (double*)malloc(sizeof(double)*csize*csize);
6019       ko = (double*)malloc(sizeof(double)*csize);
6020       C = (double*)malloc(sizeof(double)*csize*csize);
6021       KU = (double*)malloc(sizeof(double)*(Msize2[Mc_AN]+2)*Msize3[Mc_AN]);
6022 
6023       /***********************************************
6024         calculate the inverse of overlap matrix
6025       ***********************************************/
6026 
6027       if (SCF_iter==1 && Msize3[Mc_AN]<Msize2[Mc_AN] && EKC_Exact_invS_flag==1){
6028 
6029 	LoS = (double*)malloc(sizeof(double)*(Msize2[Mc_AN]+3)*(Msize2[Mc_AN]+3));
6030 
6031 	invS = (double**)malloc(sizeof(double*)*(Msize2[Mc_AN]+3));
6032 	for (i=0; i<(Msize2[Mc_AN]+3); i++){
6033 	  invS[i] = (double*)malloc(sizeof(double)*(Msize2[Mc_AN]+3));
6034 	}
6035 
6036 	if (measure_time==1) dtime(&Stime1);
6037 
6038 	Inverse_S_by_Cholesky(Mc_AN, OLP0, invS, MP, NUM, LoS);
6039 
6040 	if (measure_time==1){
6041 	  dtime(&Etime1);
6042 	  time2 += Etime1 - Stime1;
6043 	}
6044       }
6045 
6046       else if (SCF_iter==1 && Msize3[Mc_AN]<Msize2[Mc_AN] && EKC_invS_flag==1){
6047 
6048 	Krylov_U_OLP = (double***)malloc(sizeof(double**)*rlmax_EC2[Mc_AN]);
6049 	for (i=0; i<rlmax_EC2[Mc_AN]; i++){
6050 	  Krylov_U_OLP[i] = (double**)malloc(sizeof(double*)*EKC_core_size[Mc_AN]);
6051 	  for (j=0; j<EKC_core_size[Mc_AN]; j++){
6052 	    Krylov_U_OLP[i][j] = (double*)malloc(sizeof(double)*(Msize2[Mc_AN]+3));
6053 	    for (k=0; k<(Msize2[Mc_AN]+3); k++)  Krylov_U_OLP[i][j][k] = 0.0;
6054 	  }
6055 	}
6056 
6057 	inv_RS = (double**)malloc(sizeof(double*)*(rlmax_EC2[Mc_AN]+1)*EKC_core_size[Mc_AN]);
6058 	for (i=0; i<(rlmax_EC2[Mc_AN]+1)*EKC_core_size[Mc_AN]; i++){
6059 	  inv_RS[i] = (double*)malloc(sizeof(double)*(rlmax_EC2[Mc_AN]+1)*EKC_core_size[Mc_AN]);
6060 	}
6061 
6062 	if (measure_time==1) dtime(&Stime1);
6063 
6064 	Krylov_IOLP_trd( Mc_AN, OLP0, Krylov_U_OLP, inv_RS, MP, Msize2, Msize4, Msize2_max, tmpvec0, tmpvec1 );
6065 
6066 	if (measure_time==1){
6067 	  dtime(&Etime1);
6068 	  time2 += Etime1 - Stime1;
6069 	}
6070       }
6071 
6072       for (spin=0; spin<=SpinP_switch; spin++){
6073 
6074 	/****************************************************
6075                 generate a preconditioning matrix
6076 	****************************************************/
6077 
6078 	if (measure_time==1) dtime(&Stime1);
6079 
6080 	if (SCF_iter==1 && Msize3[Mc_AN]<Msize2[Mc_AN]){
6081 
6082 	  Generate_pMatrix_trd( myid, spin, Mc_AN, Hks, OLP0, invS, Krylov_U, Krylov_U_OLP, inv_RS, MP,
6083                             Msize, Msize2, Msize3, Msize4, Msize2_max, tmpvec0, tmpvec1, tmpvec2 );
6084 	}
6085 	else if (SCF_iter==1){
6086 
6087 	  Generate_pMatrix2_trd( myid, spin, Mc_AN, Hks, OLP0, Krylov_U, MP, Msize, Msize2, Msize3, tmpvec1 );
6088 	}
6089 
6090 	if (measure_time==1){
6091 	  dtime(&Etime1);
6092 	  time3 += Etime1 - Stime1;
6093 	}
6094 
6095 	if (measure_time==1) dtime(&Stime1);
6096 
6097 	if (recalc_EM==1 || SCF_iter<=3 || recalc_flag==1){
6098 
6099 	  Embedding_Matrix_trd( spin, Mc_AN, Hks, Krylov_U, EC_matrix, MP, Msize, Msize2, Msize3, tmpvec1, EKC_core_size_max, Msize2_max);
6100 	}
6101 
6102 	if (measure_time==1){
6103 	  dtime(&Etime1);
6104 	  time4 += Etime1 - Stime1;
6105 	}
6106 
6107 	/****************************************************
6108                 construct the Hamiltonian matrix
6109 	****************************************************/
6110 
6111 	if (measure_time==1) dtime(&Stime1);
6112 
6113 	for (i=0; i<=FNAN[Gc_AN]; i++){
6114 	  ig = natn[Gc_AN][i];
6115 	  ian = Spe_Total_CNO[WhatSpecies[ig]];
6116 	  Anum = MP[i] - 1;
6117 	  ih = S_G2M[ig]; /* S_G2M should be used */
6118 
6119 	  for (j=0; j<=FNAN[Gc_AN]; j++){
6120 
6121 	    kl = RMI1[Mc_AN][i][j];
6122 	    jg = natn[Gc_AN][j];
6123 	    jan = Spe_Total_CNO[WhatSpecies[jg]];
6124 	    Bnum = MP[j] - 1;
6125 
6126 	    if (0<=kl){
6127 	      for (m=0; m<ian; m++){
6128 		for (n=0; n<jan; n++){
6129 		  H_DC[(Anum+m)*Msize[Mc_AN]+Bnum+n] = Hks[spin][ih][kl][m][n];
6130 		}
6131 	      }
6132 	    }
6133 
6134 	    else{
6135 	      for (m=0; m<ian; m++){
6136 		for (n=0; n<jan; n++){
6137 		  H_DC[(Anum+m)*Msize[Mc_AN]+Bnum+n] = 0.0;
6138 		}
6139 	      }
6140 	    }
6141 	  }
6142 	}
6143 
6144 	if (measure_time==1){
6145 	  dtime(&Etime1);
6146 	  time5 += Etime1 - Stime1;
6147 	}
6148 
6149 	/****************************************************
6150                    transform u1^+ * H_DC * u1
6151 	****************************************************/
6152 
6153 	/* H_DC * u1 */
6154 
6155 	if (measure_time==1) dtime(&Stime1);
6156 
6157 	/* original version
6158 
6159 	for (i=1; i<=Msize[Mc_AN]; i++){
6160 	  for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
6161 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
6162 
6163 	      sum = 0.0;
6164 	      for (j=1; j<=Msize[Mc_AN]; j++){
6165 		sum += H_DC[i][j]*Krylov_U[spin][Mc_AN][rl][n][j];
6166 	      }
6167 
6168 	      C[rl*EKC_core_size[Mc_AN]+n+1][i] = sum;
6169 	    }
6170 	  }
6171 	}
6172 	*/
6173 
6174 	/* BLAS3 version */
6175 
6176 	for (i=0; i<Msize[Mc_AN]; i++){
6177 	  for (j=0; j<Msize3[Mc_AN]; j++){
6178 	    KU[j*Msize[Mc_AN]+i] = Krylov_U[spin][Mc_AN][j*Msize2[Mc_AN]+i+1];
6179 	  }
6180 	}
6181 
6182 	alpha = 1.0; beta = 0.0;
6183 	M = Msize[Mc_AN]; N = Msize3[Mc_AN]; K = Msize[Mc_AN];
6184 	lda = M; ldb = K; ldc = Msize[Mc_AN];
6185 
6186 	F77_NAME(dgemm,DGEMM)( "N", "N", &M, &N, &K, &alpha,
6187 			       H_DC, &lda, KU, &ldb, &beta, C, &ldc);
6188 
6189 	if (measure_time==1){
6190 	  dtime(&Etime1);
6191 	  time6 += Etime1 - Stime1;
6192 	}
6193 
6194 	/* u1^+ * H_DC * u1 */
6195 
6196 	if (measure_time==1) dtime(&Stime1);
6197 
6198 	/* original version */
6199 
6200 	/*
6201 	for (rl1=0; rl1<rlmax_EC[Mc_AN]; rl1++){
6202           for (m=0; m<EKC_core_size[Mc_AN]; m++){
6203 	    for (rl2=rl1; rl2<rlmax_EC[Mc_AN]; rl2++){
6204 	      for (n=0; n<EKC_core_size[Mc_AN]; n++){
6205 		sum = 0.0;
6206 		i2 = rl2*EKC_core_size[Mc_AN] + n + 1;
6207 		for (i=1; i<=Msize[Mc_AN]; i++){
6208 		  sum += Krylov_U[spin][Mc_AN][rl1][m][i]*C[i2][i];
6209 		}
6210 
6211 		H_DC[rl1*EKC_core_size[Mc_AN]+m+1][rl2*EKC_core_size[Mc_AN]+n+1] = sum;
6212 		H_DC[rl2*EKC_core_size[Mc_AN]+n+1][rl1*EKC_core_size[Mc_AN]+m+1] = sum;
6213 	      }
6214 	    }
6215           }
6216 	}
6217 	*/
6218 
6219 	/* BLAS3 version */
6220 
6221 	alpha = 1.0; beta = 0.0;
6222 	M = Msize3[Mc_AN]; N = Msize3[Mc_AN]; K = Msize[Mc_AN];
6223 	lda = K; ldb = K; ldc = Msize3[Mc_AN];
6224 
6225 	F77_NAME(dgemm,DGEMM)("T", "N", &M, &N, &K, &alpha,
6226 			      KU, &lda, C, &ldb, &beta, H_DC, &ldc);
6227 
6228 	if (measure_time==1){
6229 	  dtime(&Etime1);
6230 	  time7 += Etime1 - Stime1;
6231 	}
6232 
6233 	/* correction for ZeroNum */
6234 
6235 	m = (int)Krylov_U[spin][Mc_AN][0];
6236 	for (i=0; i<m; i++){
6237 	  H_DC[i*Msize3[Mc_AN]+i] = 1.0e+3;
6238 	}
6239 
6240 	/****************************************************
6241             H0 = u1^+ * H_DC * u1 + D
6242 	****************************************************/
6243 
6244 	if (measure_time==1) dtime(&Stime1);
6245 
6246 	for (i=(Msize3[Mc_AN]-1); 0<=i; i--){
6247 	  for (j=0; j<Msize3[Mc_AN]; j++){
6248 	    H_DC[(i+1)*(Msize3[Mc_AN]+1)+(j+1)] = H_DC[i*Msize3[Mc_AN]+j] + EC_matrix[spin][Mc_AN][i+1][j+1];
6249 	  }
6250 	}
6251 
6252 	if (measure_time==1){
6253 	  dtime(&Etime1);
6254 	  time8 += Etime1 - Stime1;
6255 	}
6256 
6257 	/****************************************************
6258            diagonalize
6259 	****************************************************/
6260 
6261 	if (measure_time==1) dtime(&Stime1);
6262 
6263 	Eigen_lapack2(H_DC,Msize3[Mc_AN]+1,ko,Msize3[Mc_AN],Msize3[Mc_AN]);
6264 
6265 	if (measure_time==1){
6266 	  dtime(&Etime1);
6267 	  time9 += Etime1 - Stime1;
6268 	}
6269 
6270 	/********************************************
6271            back transformation of eigenvectors
6272                       c = u1 * b
6273 	*********************************************/
6274 
6275 	if (measure_time==1) dtime(&Stime1);
6276 
6277 	/* original version */
6278 
6279 	/*
6280 	for (i=1; i<=Msize[Mc_AN]; i++){
6281           for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
6282 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
6283 
6284 	      tmp1 = Krylov_U[spin][Mc_AN][rl][n][i];
6285 	      i1 = rl*EKC_core_size[Mc_AN] + n + 1;
6286 
6287 	      for (j=1; j<=Msize3[Mc_AN]; j++){
6288 		C[i][j] += tmp1*H_DC[i1][j];
6289 	      }
6290 	    }
6291           }
6292 	}
6293 	*/
6294 
6295 	/* BLAS3 version */
6296 
6297 	alpha = 1.0; beta = 0.0;
6298         M = Msize3[Mc_AN]; N = Msize[Mc_AN]; K = Msize3[Mc_AN];
6299         lda = K; ldb = N; ldc = Msize3[Mc_AN];
6300 
6301 	F77_NAME(dgemm,DGEMM)("T", "T", &M, &N, &K, &alpha,
6302                                H_DC, &lda, KU, &ldb, &beta, C, &ldc);
6303 
6304 	if (measure_time==1){
6305 	  dtime(&Etime1);
6306 	  time10 += Etime1 - Stime1;
6307 	}
6308 
6309 	if (measure_time==1) dtime(&Stime1);
6310 
6311 	/***********************************************
6312           store eigenvalues and residues of poles
6313 	***********************************************/
6314 
6315 	for (i=1; i<=Msize3[Mc_AN]; i++){
6316 	  EVal[spin][Mc_AN][i-1] = ko[i];
6317 	}
6318 
6319 	/******************************************************
6320         set an energy range (-Erange+ChemP to Erange+ChemP)
6321         of eigenvalues used to store the Residues.
6322 	******************************************************/
6323 
6324 	Erange = 0.367493245;  /* in hartree, corresponds to 10 eV */
6325 
6326 	/***********************************************
6327           find LO_TC and HO_TC
6328 	***********************************************/
6329 
6330 	/* LO_TC */
6331 	i = 0;
6332 	ip = 0;
6333 	po1 = 0;
6334 	do{
6335 	  if ( (ChemP-Erange)<EVal[spin][Mc_AN][i]){
6336 	    ip = i;
6337 	    po1 = 1;
6338 	  }
6339 	  i++;
6340 	} while (po1==0 && i<Msize3[Mc_AN]);
6341 
6342 	LO_TC[spin][Mc_AN] = ip;
6343 
6344 	/* HO_TC */
6345 	i = 0;
6346 	ip = Msize3[Mc_AN]-1;
6347 	po1 = 0;
6348 	do{
6349 	  if ( (ChemP+Erange)<EVal[spin][Mc_AN][i]){
6350 	    ip = i;
6351 	    po1 = 1;
6352 	  }
6353 	  i++;
6354 	} while (po1==0 && i<Msize3[Mc_AN]);
6355 
6356 	HO_TC[spin][Mc_AN] = ip;
6357 
6358 	/***********************************************
6359           store residues of poles
6360 	***********************************************/
6361 
6362 	wanA = WhatSpecies[Gc_AN];
6363 	tno1 = Spe_Total_CNO[wanA];
6364 
6365 	for (i=0; i<tno1; i++){
6366 	  for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
6367 
6368 	    Gh_AN = natn[Gc_AN][h_AN];
6369 	    wanB = WhatSpecies[Gh_AN];
6370 	    tno2 = Spe_Total_CNO[wanB];
6371 	    Bnum = MP[h_AN] - 1;
6372 	    for (j=0; j<tno2; j++){
6373 
6374 	      for (i1=0; i1<LO_TC[spin][Mc_AN]; i1++){
6375 		tmp1 = C[i*Msize3[Mc_AN]+i1]*C[(Bnum+j)*Msize3[Mc_AN]+i1];
6376 		CDM[spin][Mc_AN][h_AN][i][j] += tmp1;
6377 		EDM[spin][Mc_AN][h_AN][i][j] += tmp1*EVal[spin][Mc_AN][i1];
6378 	      }
6379 
6380 	      /* <allocation of Residues */
6381 	      n2 = HO_TC[spin][Mc_AN] - LO_TC[spin][Mc_AN] + 3;
6382 	      Residues[spin][Mc_AN][h_AN][i][j] = (double*)malloc(sizeof(double)*n2);
6383 	      /* allocation of Residues> */
6384 	      for (i1=LO_TC[spin][Mc_AN]; i1<=HO_TC[spin][Mc_AN]; i1++){
6385 		Residues[spin][Mc_AN][h_AN][i][j][i1-LO_TC[spin][Mc_AN]]
6386                           = C[i*Msize3[Mc_AN]+i1]*C[(Bnum+j)*Msize3[Mc_AN]+i1];
6387 
6388 	      }
6389 	    }
6390 	  }
6391 	}
6392 
6393 	if (measure_time==1){
6394 	  dtime(&Etime1);
6395 	  time11 += Etime1 - Stime1;
6396 	}
6397 
6398       } /* spin */
6399 
6400       /***********************************************
6401                     freeing of arrays:
6402       ***********************************************/
6403 
6404       free(H_DC);
6405       free(ko);
6406       free(C);
6407       free(KU);
6408 
6409       if (SCF_iter==1 && Msize3[Mc_AN]<Msize2[Mc_AN] && EKC_Exact_invS_flag==1){
6410 
6411 	free(LoS);
6412 
6413 	for (i=0; i<(Msize2[Mc_AN]+3); i++){
6414 	  free(invS[i]);
6415 	}
6416 	free(invS);
6417       }
6418 
6419       else if (SCF_iter==1 && Msize3[Mc_AN]<Msize2[Mc_AN] && EKC_invS_flag==1){
6420 
6421 	for (i=0; i<rlmax_EC2[Mc_AN]; i++){
6422 	  for (j=0; j<EKC_core_size[Mc_AN]; j++){
6423 	    free(Krylov_U_OLP[i][j]);
6424 	  }
6425 	  free(Krylov_U_OLP[i]);
6426 	}
6427 	free(Krylov_U_OLP);
6428 
6429 	for (i=0; i<(rlmax_EC2[Mc_AN]+1)*EKC_core_size[Mc_AN]; i++){
6430 	  free(inv_RS[i]);
6431 	}
6432 	free(inv_RS);
6433       }
6434 
6435       dtime(&Etime_atom);
6436       time_per_atom[Gc_AN] += Etime_atom - Stime_atom;
6437 
6438     } /* Mc_AN */
6439 
6440     /* freeing of array */
6441 
6442     free(MP);
6443 
6444     for (i=0; i<EKC_core_size_max; i++){
6445       free(tmpvec0[i]);
6446     }
6447     free(tmpvec0);
6448 
6449     for (i=0; i<EKC_core_size_max; i++){
6450       free(tmpvec1[i]);
6451     }
6452     free(tmpvec1);
6453 
6454     for (i=0; i<EKC_core_size_max; i++){
6455       free(tmpvec2[i]);
6456     }
6457     free(tmpvec2);
6458 
6459   } /* #pragma omp parallel */
6460 
6461   if (measure_time==1){
6462     dtime(&Etime2);
6463     time16 = Etime2 - Stime2;
6464   }
6465 
6466   if (firsttime){
6467 
6468     Residues_size = 1;
6469 
6470     for (spin=0; spin<=SpinP_switch; spin++){
6471       for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
6472 	Gc_AN = M2G[Mc_AN];
6473 	wan = WhatSpecies[Gc_AN];
6474 	tno1 = Spe_Total_CNO[wan];
6475 	for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
6476 	  Gh_AN = natn[Gc_AN][h_AN];
6477 	  wanB = WhatSpecies[Gh_AN];
6478 	  tno2 = Spe_Total_CNO[wanB];
6479 	  n2 = HO_TC[spin][Mc_AN] - LO_TC[spin][Mc_AN] + 3;
6480 	  Residues_size += tno1*tno2*n2;
6481 	}
6482       }
6483     }
6484 
6485     PrintMemory("Krylov: Residues",sizeof(double)*Residues_size,NULL);
6486   }
6487 
6488   /****************************************************
6489                 calculate the projected DOS
6490   ****************************************************/
6491 
6492   if (measure_time==1) dtime(&Stime1);
6493 
6494   /*
6495 #pragma omp parallel shared(time_per_atom,Residues,LO_TC,HO_TC,EDM,CDM,OLP0,natn,FNAN,PDOS_DC,Msize3,Spe_Total_CNO,WhatSpecies,M2G,SpinP_switch,Matomnum) private(OMPID,Nthrds,Nprocs,Mc_AN,Stime_atom,spin,Gc_AN,wanA,tno1,i1,i,h_AN,Gh_AN,wanB,tno2,j,tmp1,Etime_atom)
6496   */
6497 
6498   {
6499 
6500     /* get info. on OpenMP */
6501 
6502     /*
6503     OMPID = omp_get_thread_num();
6504     Nthrds = omp_get_num_threads();
6505     Nprocs = omp_get_num_procs();
6506     */
6507 
6508     for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
6509 
6510       dtime(&Stime_atom);
6511 
6512       for (spin=0; spin<=SpinP_switch; spin++){
6513 
6514 	Gc_AN = M2G[Mc_AN];
6515 	wanA = WhatSpecies[Gc_AN];
6516 	tno1 = Spe_Total_CNO[wanA];
6517 
6518 	for (i1=0; i1<=(Msize3[Mc_AN]+1); i1++){
6519 	  PDOS_DC[spin][Mc_AN][i1] = 0.0;
6520 	}
6521 
6522 	for (i=0; i<tno1; i++){
6523 
6524 	  for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
6525 	    Gh_AN = natn[Gc_AN][h_AN];
6526 	    wanB = WhatSpecies[Gh_AN];
6527 	    tno2 = Spe_Total_CNO[wanB];
6528 	    for (j=0; j<tno2; j++){
6529 
6530 	      tmp1 = OLP0[Mc_AN][h_AN][i][j];
6531 
6532 	      PDOS_DC[spin][Mc_AN][0] += tmp1*CDM[spin][Mc_AN][h_AN][i][j];
6533 	      PDOS_DC[spin][Mc_AN][1] += tmp1*EDM[spin][Mc_AN][h_AN][i][j];
6534 
6535 	      for (i1=0; i1<(HO_TC[spin][Mc_AN]-LO_TC[spin][Mc_AN]+1); i1++){
6536 		PDOS_DC[spin][Mc_AN][i1+2] += Residues[spin][Mc_AN][h_AN][i][j][i1]*tmp1;
6537 	      }
6538 
6539 	    }
6540 	  }
6541 	}
6542 
6543       } /* spin */
6544 
6545       dtime(&Etime_atom);
6546       time_per_atom[Gc_AN] += Etime_atom - Stime_atom;
6547 
6548     } /* Mc_AN */
6549 
6550   } /* #pragma omp parallel */
6551 
6552   if (measure_time==1){
6553     dtime(&Etime1);
6554     time12 += Etime1 - Stime1;
6555   }
6556 
6557   /****************************************************
6558            find the total number of electrons
6559   ****************************************************/
6560 
6561   if (measure_time==1) dtime(&Stime1);
6562 
6563   My_TZ = 0.0;
6564   for (i=1; i<=Matomnum; i++){
6565     Gc_AN = M2G[i];
6566     wan = WhatSpecies[Gc_AN];
6567     My_TZ += Spe_Core_Charge[wan];
6568   }
6569 
6570   /* MPI, My_TZ */
6571 
6572   MPI_Barrier(mpi_comm_level1);
6573   MPI_Allreduce(&My_TZ, &TZ, 1, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
6574 
6575   /****************************************************
6576                 find the chemical potential
6577   ****************************************************/
6578 
6579   po = 0;
6580   loopN = 0;
6581 
6582   ChemP_MAX = 10.0;
6583   ChemP_MIN =-10.0;
6584   if      (SpinP_switch==0) spin_degeneracy = 2.0;
6585   else if (SpinP_switch==1) spin_degeneracy = 1.0;
6586 
6587   do {
6588     ChemP = 0.50*(ChemP_MAX + ChemP_MIN);
6589 
6590     My_Num_State = 0.0;
6591 
6592     for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
6593       for (spin=0; spin<=SpinP_switch; spin++){
6594 
6595         dtime(&Stime_atom);
6596 
6597         Gc_AN = M2G[Mc_AN];
6598 
6599         My_Num_State += spin_degeneracy*PDOS_DC[spin][Mc_AN][0];
6600 
6601         for (i=0; i<(HO_TC[spin][Mc_AN]-LO_TC[spin][Mc_AN]+1); i++){
6602 
6603           x = (EVal[spin][Mc_AN][i+LO_TC[spin][Mc_AN]] - ChemP)*Beta;
6604           if (x<=-max_x) x = -max_x;
6605           if (max_x<=x)  x = max_x;
6606           FermiF = 1.0/(1.0 + exp(x));
6607           My_Num_State += spin_degeneracy*FermiF*PDOS_DC[spin][Mc_AN][i+2];
6608 	}
6609 
6610         dtime(&Etime_atom);
6611         time_per_atom[Gc_AN] += Etime_atom - Stime_atom;
6612       }
6613     }
6614 
6615     /* MPI, My_Num_State */
6616 
6617     MPI_Barrier(mpi_comm_level1);
6618     MPI_Allreduce(&My_Num_State, &Num_State, 1, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
6619 
6620     Dnum = (TZ - Num_State) - system_charge;
6621     if (0.0<=Dnum) ChemP_MIN = ChemP;
6622     else           ChemP_MAX = ChemP;
6623     if (fabs(Dnum)<1.0e-11) po = 1;
6624 
6625 
6626     if (myid==Host_ID && 3<=level_stdout){
6627       printf("  ChemP=%15.12f TZ=%15.12f Num_state=%15.12f\n",ChemP,TZ,Num_State);
6628     }
6629 
6630     loopN++;
6631   }
6632   while (po==0 && loopN<1000);
6633 
6634   if (measure_time==1){
6635     dtime(&Etime1);
6636     time13 += Etime1 - Stime1;
6637   }
6638 
6639   /****************************************************
6640         eigenenergy by summing up eigenvalues
6641   ****************************************************/
6642 
6643   if (measure_time==1) dtime(&Stime1);
6644 
6645   My_Eele0[0] = 0.0;
6646   My_Eele0[1] = 0.0;
6647   for (spin=0; spin<=SpinP_switch; spin++){
6648     for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
6649 
6650       dtime(&Stime_atom);
6651 
6652       Gc_AN = M2G[Mc_AN];
6653       My_Eele0[spin] += PDOS_DC[spin][Mc_AN][1];
6654 
6655       for (i=0; i<(HO_TC[spin][Mc_AN]-LO_TC[spin][Mc_AN]+1); i++){
6656 
6657         x = (EVal[spin][Mc_AN][i+LO_TC[spin][Mc_AN]] - ChemP)*Beta;
6658 
6659         if (x<=-max_x) x = -max_x;
6660         if (max_x<=x)  x = max_x;
6661         FermiF = 1.0/(1.0 + exp(x));
6662         My_Eele0[spin] += FermiF*EVal[spin][Mc_AN][i+LO_TC[spin][Mc_AN]]*PDOS_DC[spin][Mc_AN][i+2];
6663       }
6664 
6665       dtime(&Etime_atom);
6666       time_per_atom[Gc_AN] += Etime_atom - Stime_atom;
6667     }
6668   }
6669 
6670   /* MPI, My_Eele0 */
6671   for (spin=0; spin<=SpinP_switch; spin++){
6672     MPI_Barrier(mpi_comm_level1);
6673     MPI_Allreduce(&My_Eele0[spin], &Eele0[spin], 1, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
6674   }
6675 
6676   if (SpinP_switch==0){
6677     Eele0[1] = Eele0[0];
6678   }
6679 
6680   if (measure_time==1){
6681     dtime(&Etime1);
6682     time14 += Etime1 - Stime1;
6683   }
6684 
6685   if (measure_time==1) dtime(&Stime1);
6686 
6687   /*
6688 #pragma omp parallel shared(time_per_atom,EDM,CDM,Residues,natn,max_x,Beta,ChemP,EVal,LO_TC,HO_TC,Spe_Total_CNO,WhatSpecies,M2G,SpinP_switch,Matomnum) private(OMPID,Nthrds,Nprocs,Mc_AN,spin,Stime_atom,Gc_AN,wanA,tno1,i1,x,FermiF,h_AN,Gh_AN,wanB,tno2,i,j,tmp1,Etime_atom)
6689   */
6690 
6691   {
6692 
6693     /* get info. on OpenMP */
6694 
6695     /*
6696     OMPID = omp_get_thread_num();
6697     Nthrds = omp_get_num_threads();
6698     Nprocs = omp_get_num_procs();
6699     */
6700 
6701     for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
6702       for (spin=0; spin<=SpinP_switch; spin++){
6703 
6704 	dtime(&Stime_atom);
6705 
6706 	Gc_AN = M2G[Mc_AN];
6707 	wanA = WhatSpecies[Gc_AN];
6708 	tno1 = Spe_Total_CNO[wanA];
6709 
6710 	for (i1=0; i1<(HO_TC[spin][Mc_AN]-LO_TC[spin][Mc_AN]+1); i1++){
6711 
6712 	  x = (EVal[spin][Mc_AN][i1+LO_TC[spin][Mc_AN]] - ChemP)*Beta;
6713 	  if (x<=-max_x) x = -max_x;
6714 	  if (max_x<=x)  x = max_x;
6715 	  FermiF = 1.0/(1.0 + exp(x));
6716 
6717 	  for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
6718 	    Gh_AN = natn[Gc_AN][h_AN];
6719 	    wanB = WhatSpecies[Gh_AN];
6720 	    tno2 = Spe_Total_CNO[wanB];
6721 	    for (i=0; i<tno1; i++){
6722 	      for (j=0; j<tno2; j++){
6723 		tmp1 = FermiF*Residues[spin][Mc_AN][h_AN][i][j][i1];
6724 		CDM[spin][Mc_AN][h_AN][i][j] += tmp1;
6725 		EDM[spin][Mc_AN][h_AN][i][j] += tmp1*EVal[spin][Mc_AN][i1+LO_TC[spin][Mc_AN]];
6726 	      }
6727 	    }
6728 	  }
6729 	}
6730 
6731 	dtime(&Etime_atom);
6732 	time_per_atom[Gc_AN] += Etime_atom - Stime_atom;
6733       }
6734     }
6735 
6736   } /* #pragma omp parallel */
6737 
6738   /****************************************************
6739                       bond energies
6740   ****************************************************/
6741 
6742   My_Eele1[0] = 0.0;
6743   My_Eele1[1] = 0.0;
6744   for (MA_AN=1; MA_AN<=Matomnum; MA_AN++){
6745     GA_AN = M2G[MA_AN];
6746     wanA = WhatSpecies[GA_AN];
6747     tnoA = Spe_Total_CNO[wanA];
6748 
6749     for (j=0; j<=FNAN[GA_AN]; j++){
6750       GB_AN = natn[GA_AN][j];
6751       wanB = WhatSpecies[GB_AN];
6752       tnoB = Spe_Total_CNO[wanB];
6753 
6754       for (k=0; k<tnoA; k++){
6755 	for (l=0; l<tnoB; l++){
6756 	  for (spin=0; spin<=SpinP_switch; spin++){
6757 	    My_Eele1[spin] += CDM[spin][MA_AN][j][k][l]*Hks[spin][MA_AN][j][k][l];
6758 	  }
6759 	}
6760       }
6761 
6762     }
6763   }
6764 
6765   /* MPI, My_Eele1 */
6766   MPI_Barrier(mpi_comm_level1);
6767   for (spin=0; spin<=SpinP_switch; spin++){
6768     MPI_Allreduce(&My_Eele1[spin], &Eele1[spin], 1, MPI_DOUBLE,
6769                    MPI_SUM, mpi_comm_level1);
6770   }
6771 
6772   if (SpinP_switch==0){
6773     Eele1[1] = Eele1[0];
6774   }
6775 
6776   if (3<=level_stdout && myid==Host_ID){
6777     printf("  Eele00=%15.12f Eele01=%15.12f\n",Eele0[0],Eele0[1]);
6778     printf("  Eele10=%15.12f Eele11=%15.12f\n",Eele1[0],Eele1[1]);
6779   }
6780 
6781   if (measure_time==1){
6782     dtime(&Etime1);
6783     time15 += Etime1 - Stime1;
6784   }
6785 
6786   if ( strcasecmp(mode,"dos")==0 ){
6787     Save_DOS_Col(Residues,OLP0,EVal,LO_TC,HO_TC);
6788   }
6789 
6790   if (measure_time==1){
6791     printf("myid=%2d time1 =%5.3f time2 =%5.3f time3 =%5.3f time4 =%5.3f time5 =%5.3f\n",
6792             myid,time1,time2,time3,time4,time5);
6793     printf("myid=%2d time6 =%5.3f time7 =%5.3f time8 =%5.3f time9 =%5.3f time10=%5.3f\n",
6794             myid,time6,time7,time8,time9,time10);
6795     printf("myid=%2d time11=%5.3f time12=%5.3f time13=%5.3f time14=%5.3f time15=%5.3f\n",
6796             myid,time11,time12,time13,time14,time15);
6797     printf("myid=%2d time16=%5.3f\n",myid,time16);
6798   }
6799 
6800   /****************************************************
6801     freeing of arrays:
6802 
6803   ****************************************************/
6804 
6805   free(Msize);
6806   free(Msize2);
6807   free(Msize3);
6808   free(Msize4);
6809 
6810   for (spin=0; spin<(SpinP_switch+1); spin++){
6811     free(LO_TC[spin]);
6812   }
6813   free(LO_TC);
6814 
6815   for (spin=0; spin<(SpinP_switch+1); spin++){
6816     free(HO_TC[spin]);
6817   }
6818   free(HO_TC);
6819 
6820   for (spin=0; spin<=SpinP_switch; spin++){
6821     for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
6822       free(EVal[spin][Mc_AN]);
6823     }
6824     free(EVal[spin]);
6825   }
6826   free(EVal);
6827 
6828   for (spin=0; spin<=SpinP_switch; spin++){
6829     for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
6830 
6831       if (Mc_AN==0){
6832         Gc_AN = 0;
6833         FNAN[0] = 0;
6834         tno1 = 1;
6835       }
6836       else{
6837         Gc_AN = M2G[Mc_AN];
6838         wanA = WhatSpecies[Gc_AN];
6839         tno1 = Spe_Total_CNO[wanA];
6840       }
6841 
6842       for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
6843 
6844         if (Mc_AN==0){
6845           tno2 = 1;
6846         }
6847         else {
6848           Gh_AN = natn[Gc_AN][h_AN];
6849           wanB = WhatSpecies[Gh_AN];
6850           tno2 = Spe_Total_CNO[wanB];
6851         }
6852 
6853         for (i=0; i<tno1; i++){
6854           for (j=0; j<tno2; j++){
6855             free(Residues[spin][Mc_AN][h_AN][i][j]);
6856 	  }
6857           free(Residues[spin][Mc_AN][h_AN][i]);
6858         }
6859         free(Residues[spin][Mc_AN][h_AN]);
6860       }
6861       free(Residues[spin][Mc_AN]);
6862     }
6863     free(Residues[spin]);
6864   }
6865   free(Residues);
6866 
6867   for (spin=0; spin<=SpinP_switch; spin++){
6868     for (Mc_AN=0; Mc_AN<=Matomnum; Mc_AN++){
6869       free(PDOS_DC[spin][Mc_AN]);
6870     }
6871     free(PDOS_DC[spin]);
6872   }
6873   free(PDOS_DC);
6874 
6875   /* for time */
6876   dtime(&TEtime);
6877   time0 = TEtime - TStime;
6878 
6879   if (measure_time==1){
6880   printf("total time=%15.12f\n",time0);
6881   }
6882 
6883   /* for PrintMemory */
6884   firsttime=0;
6885 
6886   return time0;
6887 }
6888 
6889 
6890 
6891 
6892 
6893 
6894 
6895 
6896 
Generate_pMatrix_trd(int myid,int spin,int Mc_AN,double ***** Hks,double **** OLP0,double ** invS,double *** Krylov_U,double *** Krylov_U_OLP,double ** inv_RS,int * MP,int * Msize,int * Msize2,int * Msize3,int * Msize4,int Msize2_max,double ** tmpvec0,double ** tmpvec1,double ** tmpvec2)6897 void Generate_pMatrix_trd( int myid, int spin, int Mc_AN, double *****Hks,
6898                        double ****OLP0, double **invS,
6899                        double ***Krylov_U, double ***Krylov_U_OLP, double **inv_RS, int *MP,
6900                        int *Msize, int *Msize2, int *Msize3, int *Msize4, int Msize2_max,
6901                        double **tmpvec0, double **tmpvec1, double **tmpvec2 )
6902 
6903 /* This subroutine is exactly the same as the original Generate_pMatrix,
6904 except for the OMP parallelized loops in orthogonalization by diagonalization,
6905 and Eigen_lapack is replaced with Eigen_lapack_d, _x, or _r
6906 */
6907 
6908 {
6909   int rl,rl0,rl1,ct_AN,fan,san,can,wan,ct_on,i,j;
6910   int n,Anum,Bnum,k,ian,ih,kl,jg,ig,jan,m,m1,n1;
6911   int ZeroNum,Gh_AN,wanB,m1s,is,info;
6912   int rl00,rl01,rl02,rl03,rl04,rl05,rl06,rl07;
6913   int mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7;
6914 
6915   int KU_d1, KU_d2, csize;
6916   __m128d mmSum00,mmSum01,mmSum10,mmSum11,mmSum20,mmSum21,mmSum30,mmSum31, mmTmp0, mmTmp1, mmTmp2, mmTmp3, mmTmp4, mmTmp5;
6917 
6918   double mmArr[8];
6919   double time1,time2,time3,time4,time5;
6920   double time6,time7,time8,time9,time10;
6921   double Stime1,Etime1;
6922   double sum0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
6923   double sum,dum,tmp0,tmp1,tmp2,tmp3,rcutA,r0;
6924   double **Utmp,**matRS0,**matRS1;
6925   double **tmpmat0;
6926   double *ko,*iko;
6927   double **FS;
6928   double ***U0;
6929 
6930   /* for OpenMP */
6931   int OMPID,Nthrds,Nprocs;
6932 
6933   ct_AN = M2G[Mc_AN];
6934   fan = FNAN[ct_AN];
6935   san = SNAN[ct_AN];
6936   can = fan + san;
6937   wan = WhatSpecies[ct_AN];
6938   ct_on = Spe_Total_CNO[wan];
6939   rcutA = Spe_Atom_Cut1[wan];
6940 
6941   if (Msize[Mc_AN]<Msize3[Mc_AN])
6942     csize = Msize3[Mc_AN] + 40;
6943   else
6944     csize = Msize[Mc_AN] + 40;
6945 
6946   KU_d1 = EKC_core_size[Mc_AN]*Msize2[Mc_AN];
6947   KU_d2 = Msize2[Mc_AN];
6948 
6949   /* allocation of arrays */
6950 
6951   Utmp = (double**)malloc(sizeof(double*)*rlmax_EC[Mc_AN]);
6952   for (i=0; i<rlmax_EC[Mc_AN]; i++){
6953     Utmp[i] = (double*)malloc(sizeof(double)*EKC_core_size[Mc_AN]);
6954   }
6955 
6956   U0 = (double***)malloc(sizeof(double**)*rlmax_EC[Mc_AN]);
6957   for (i=0; i<rlmax_EC[Mc_AN]; i++){
6958     U0[i] = (double**)malloc(sizeof(double*)*EKC_core_size[Mc_AN]);
6959     for (j=0; j<EKC_core_size[Mc_AN]; j++){
6960       U0[i][j] = (double*)malloc(sizeof(double)*(Msize2[Mc_AN]+3));
6961       for (k=0; k<(Msize2[Mc_AN]+3); k++)  U0[i][j][k] = 0.0;
6962     }
6963   }
6964 
6965   tmpmat0 = (double**)malloc(sizeof(double*)*(EKC_core_size[Mc_AN]+4));
6966   for (i=0; i<(EKC_core_size[Mc_AN]+4); i++){
6967     tmpmat0[i] = (double*)malloc(sizeof(double)*(EKC_core_size[Mc_AN]+4));
6968   }
6969 
6970   FS = (double**)malloc(sizeof(double*)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
6971   for (i=0; i<(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]; i++){
6972     FS[i] = (double*)malloc(sizeof(double)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
6973   }
6974 
6975   ko = (double*)malloc(sizeof(double)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
6976   iko = (double*)malloc(sizeof(double)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
6977 
6978   matRS0 = (double**)malloc(sizeof(double*)*(EKC_core_size[Mc_AN]+2));
6979   for (i=0; i<(EKC_core_size[Mc_AN]+2); i++){
6980     matRS0[i] = (double*)malloc(sizeof(double)*(Msize4[Mc_AN]+3));
6981   }
6982 
6983   matRS1 = (double**)malloc(sizeof(double*)*(Msize4[Mc_AN]+3));
6984   for (i=0; i<(Msize4[Mc_AN]+3); i++){
6985     matRS1[i] = (double*)malloc(sizeof(double)*(EKC_core_size[Mc_AN]+2));
6986   }
6987 
6988   /****************************************************
6989       initialize
6990   ****************************************************/
6991 
6992   if (measure_time==1){
6993     time1 = 0.0;
6994     time2 = 0.0;
6995     time3 = 0.0;
6996     time4 = 0.0;
6997     time5 = 0.0;
6998     time6 = 0.0;
6999     time7 = 0.0;
7000     time8 = 0.0;
7001     time9 = 0.0;
7002     time10 = 0.0;
7003   }
7004 
7005   if (measure_time==1) dtime(&Stime1);
7006 
7007   for (i=0; i<EKC_core_size_max; i++){
7008     for (j=0; j<Msize2_max; j++){
7009       tmpvec0[i][j] = 0.0;
7010       tmpvec1[i][j] = 0.0;
7011     }
7012   }
7013 
7014   /* find the nearest atom with distance of r0 */
7015 
7016   r0 = 1.0e+10;
7017   for (k=1; k<=FNAN[ct_AN]; k++){
7018     Gh_AN = natn[ct_AN][k];
7019     wanB = WhatSpecies[Gh_AN];
7020     if (Dis[ct_AN][k]<r0)  r0 = Dis[ct_AN][k];
7021   }
7022 
7023   /* starting vector */
7024 
7025   m = 0;
7026   for (k=0; k<=FNAN[ct_AN]; k++){
7027 
7028     Gh_AN = natn[ct_AN][k];
7029     wanB = WhatSpecies[Gh_AN];
7030 
7031     if ( Dis[ct_AN][k]<(scale_rc_EKC[Mc_AN]*r0) ){
7032 
7033       Anum = MP[k] - 1;
7034 
7035       for (i=0; i<Spe_Total_CNO[wanB]; i++){
7036 
7037         tmpvec0[m][Anum+i] = 1.0;
7038 
7039         m++;
7040       }
7041     }
7042   }
7043 
7044   S_orthonormalize_vec_trd( Mc_AN, ct_on, tmpvec0, tmpvec1, OLP0, tmpmat0, ko, iko, MP, Msize2 );
7045 
7046   for (n=0; n<EKC_core_size[Mc_AN]; n++){
7047     for (i=0; i<Msize2[Mc_AN]; i++){
7048       U0[0][n][i] = tmpvec0[n][i];
7049     }
7050   }
7051 
7052   if (measure_time==1){
7053     dtime(&Etime1);
7054     time1 = Etime1 - Stime1;
7055   }
7056 
7057   /****************************************************
7058            generate Krylov subspace vectors
7059   ****************************************************/
7060 
7061   for (rl=0; rl<(rlmax_EC[Mc_AN]-1); rl++){
7062 
7063     if (measure_time==1) dtime(&Stime1);
7064 
7065     /*******************************************************
7066                             H * |Wn)
7067     *******************************************************/
7068 
7069     for (n=0; n<EKC_core_size[Mc_AN]; n++){
7070       for (i=0; i<Msize2[Mc_AN]; i++){
7071 	tmpvec1[n][i]  = 0.0;
7072       }
7073     }
7074 
7075     for (i=0; i<=can; i++){
7076 
7077       ig = natn[ct_AN][i];
7078       ian = Spe_Total_CNO[WhatSpecies[ig]];
7079       Anum = MP[i] - 1;
7080       ih = S_G2M[ig];
7081 
7082       for (j=0; j<=can; j++){
7083 
7084 	kl = RMI1[Mc_AN][i][j];
7085 	jg = natn[ct_AN][j];
7086 	jan = Spe_Total_CNO[WhatSpecies[jg]];
7087         Bnum = MP[j] - 1;
7088 
7089         if (0<=kl){
7090 
7091 #ifdef nosse
7092 
7093       /* original version */
7094 
7095       	  for (m=0; m<ian; m++){
7096 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
7097 
7098 	      sum = 0.0;
7099 	      for (k=0; k<jan; k++){
7100 		sum += Hks[spin][ih][kl][m][k]*tmpvec0[n][Bnum+k];
7101 	      }
7102 
7103               tmpvec1[n][Anum+m] += sum;
7104 	    }
7105 	  }
7106 
7107 #else
7108       /* Loop Unrolling + SSE version */
7109 
7110 	  for (m=0; m<(ian-3); m+=4){
7111 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
7112 
7113 	      mmSum00 = _mm_setzero_pd();
7114 	      mmSum01 = _mm_setzero_pd();
7115 	      mmSum10 = _mm_setzero_pd();
7116 	      mmSum11 = _mm_setzero_pd();
7117 	      mmSum20 = _mm_setzero_pd();
7118 	      mmSum21 = _mm_setzero_pd();
7119 	      mmSum30 = _mm_setzero_pd();
7120 	      mmSum31 = _mm_setzero_pd();
7121 
7122 	      for (k=0; k<(jan-3); k+=4){
7123 		mmTmp0 = _mm_loadu_pd(&tmpvec0[n][Bnum+k+0]);
7124 		mmTmp1 = _mm_loadu_pd(&tmpvec0[n][Bnum+k+2]);
7125 
7126 		mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+0][k+0]),mmTmp0));
7127 		mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+0][k+2]),mmTmp1));
7128 
7129 		mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+1][k+0]),mmTmp0));
7130 		mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+1][k+2]),mmTmp1));
7131 
7132 		mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+2][k+0]),mmTmp0));
7133 		mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+2][k+2]),mmTmp1));
7134 
7135 		mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+3][k+0]),mmTmp0));
7136 		mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+3][k+2]),mmTmp1));
7137 	      }
7138 
7139 	      mmSum00 = _mm_add_pd(mmSum00, mmSum01);
7140 	      mmSum10 = _mm_add_pd(mmSum10, mmSum11);
7141 	      mmSum20 = _mm_add_pd(mmSum20, mmSum21);
7142 	      mmSum30 = _mm_add_pd(mmSum30, mmSum31);
7143 
7144 	      _mm_storeu_pd(&mmArr[0], mmSum00);
7145 	      _mm_storeu_pd(&mmArr[2], mmSum10);
7146 	      _mm_storeu_pd(&mmArr[4], mmSum20);
7147 	      _mm_storeu_pd(&mmArr[6], mmSum30);
7148 
7149 	      sum0 = mmArr[0] + mmArr[1];
7150 	      sum1 = mmArr[2] + mmArr[3];
7151 	      sum2 = mmArr[4] + mmArr[5];
7152 	      sum3 = mmArr[6] + mmArr[7];
7153 
7154 	      for (; k<jan; k++){
7155 		sum0 += Hks[spin][ih][kl][m+0][k]*tmpvec0[n][Bnum+k];
7156 		sum1 += Hks[spin][ih][kl][m+1][k]*tmpvec0[n][Bnum+k];
7157 		sum2 += Hks[spin][ih][kl][m+2][k]*tmpvec0[n][Bnum+k];
7158 		sum3 += Hks[spin][ih][kl][m+3][k]*tmpvec0[n][Bnum+k];
7159 	      }
7160 
7161               tmpvec1[n][Anum+m+0] += sum0;
7162               tmpvec1[n][Anum+m+1] += sum1;
7163               tmpvec1[n][Anum+m+2] += sum2;
7164               tmpvec1[n][Anum+m+3] += sum3;
7165 	    }
7166 	  }
7167 
7168 	  for (; m<ian; m++){
7169 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
7170 
7171 	      sum = 0.0;
7172 	      for (k=0; k<jan; k++){
7173 		sum += Hks[spin][ih][kl][m][k]*tmpvec0[n][Bnum+k];
7174 	      }
7175 
7176               tmpvec1[n][Anum+m] += sum;
7177 	    }
7178 	  }
7179 #endif
7180 	}
7181       }
7182     }
7183 
7184     if (measure_time==1){
7185       dtime(&Etime1);
7186       time2 += Etime1 - Stime1;
7187     }
7188 
7189     /*******************************************************
7190            S^{-1} * H * |Wn)
7191     *******************************************************/
7192 
7193     if (EKC_Exact_invS_flag==1){
7194 
7195       if (measure_time==1) dtime(&Stime1);
7196 
7197 #ifdef nosse
7198 
7199       /* original version */
7200       /*
7201       for (n=0; n<EKC_core_size[Mc_AN]; n++){
7202         for (i=0; i<Msize2[Mc_AN]; i++){
7203 	  sum = 0.0;
7204 	  for (j=0; j<Msize2[Mc_AN]; j++){
7205 	    sum += invS[i][j]*tmpvec1[n][j];
7206 	  }
7207 	  tmpvec0[n][i] = sum;
7208         }
7209       }
7210       */
7211       /* unrolling version */
7212 
7213       for (i=0; i<(Msize2[Mc_AN]-3); i+=4){
7214         for (n=0; n<EKC_core_size[Mc_AN]; n++){
7215 
7216 	  sum0 = 0.0;
7217 	  sum1 = 0.0;
7218 	  sum2 = 0.0;
7219 	  sum3 = 0.0;
7220 
7221 	  for (j=0; j<Msize2[Mc_AN]; j++){
7222 	    sum0 += invS[i+0][j]*tmpvec1[n][j];
7223 	    sum1 += invS[i+1][j]*tmpvec1[n][j];
7224 	    sum2 += invS[i+2][j]*tmpvec1[n][j];
7225 	    sum3 += invS[i+3][j]*tmpvec1[n][j];
7226 	  }
7227 
7228 	  tmpvec0[n][i+0] = sum0;
7229 	  tmpvec0[n][i+1] = sum1;
7230 	  tmpvec0[n][i+2] = sum2;
7231 	  tmpvec0[n][i+3] = sum3;
7232         }
7233       }
7234 
7235       is = Msize2[Mc_AN] - Msize2[Mc_AN]%4;
7236 
7237       for (i=is; i<Msize2[Mc_AN]; i++){
7238         for (n=0; n<EKC_core_size[Mc_AN]; n++){
7239 	  sum = 0.0;
7240 	  for (j=0; j<Msize2[Mc_AN]; j++){
7241 	    sum += invS[i][j]*tmpvec1[n][j];
7242 	  }
7243 	  tmpvec0[n][i] = sum;
7244         }
7245       }
7246 
7247 #else
7248       /* unrolling + SSE version */
7249 
7250       for (i=0; i<(Msize2[Mc_AN]-3); i+=4){
7251         for (n=0; n<EKC_core_size[Mc_AN]; n++){
7252 
7253 	  mmSum00 = _mm_setzero_pd();
7254 	  mmSum01 = _mm_setzero_pd();
7255 	  mmSum10 = _mm_setzero_pd();
7256 	  mmSum11 = _mm_setzero_pd();
7257 	  mmSum20 = _mm_setzero_pd();
7258 	  mmSum21 = _mm_setzero_pd();
7259 	  mmSum30 = _mm_setzero_pd();
7260 	  mmSum31 = _mm_setzero_pd();
7261 
7262 	  for (j=0; j<(Msize2[Mc_AN]-3); j+=4){
7263 	    mmTmp0 = _mm_loadu_pd(&tmpvec1[n][j+0]);
7264 	    mmTmp1 = _mm_loadu_pd(&tmpvec1[n][j+2]);
7265 
7266 	    mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&invS[i+0][j+0]),mmTmp0));
7267 	    mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&invS[i+0][j+2]),mmTmp1));
7268 
7269 	    mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&invS[i+1][j+0]),mmTmp0));
7270 	    mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&invS[i+1][j+2]),mmTmp1));
7271 
7272 	    mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&invS[i+2][j+0]),mmTmp0));
7273 	    mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&invS[i+2][j+2]),mmTmp1));
7274 
7275 	    mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&invS[i+3][j+0]),mmTmp0));
7276 	    mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&invS[i+3][j+2]),mmTmp1));
7277 	  }
7278 
7279 	  mmSum00 = _mm_add_pd(mmSum00, mmSum01);
7280 	  mmSum10 = _mm_add_pd(mmSum10, mmSum11);
7281 	  mmSum20 = _mm_add_pd(mmSum20, mmSum21);
7282 	  mmSum30 = _mm_add_pd(mmSum30, mmSum31);
7283 
7284 	  _mm_storeu_pd(&mmArr[0], mmSum00);
7285 	  _mm_storeu_pd(&mmArr[2], mmSum10);
7286 	  _mm_storeu_pd(&mmArr[4], mmSum20);
7287 	  _mm_storeu_pd(&mmArr[6], mmSum30);
7288 
7289 	  sum0 = mmArr[0] + mmArr[1];
7290 	  sum1 = mmArr[2] + mmArr[3];
7291 	  sum2 = mmArr[4] + mmArr[5];
7292 	  sum3 = mmArr[6] + mmArr[7];
7293 
7294 	  for (; j<Msize2[Mc_AN]; j++){
7295 	    sum0 += invS[i+0][j]*tmpvec1[n][j];
7296 	    sum1 += invS[i+1][j]*tmpvec1[n][j];
7297 	    sum2 += invS[i+2][j]*tmpvec1[n][j];
7298 	    sum3 += invS[i+3][j]*tmpvec1[n][j];
7299 	  }
7300 
7301 	  tmpvec0[n][i+0] = sum0;
7302 	  tmpvec0[n][i+1] = sum1;
7303 	  tmpvec0[n][i+2] = sum2;
7304 	  tmpvec0[n][i+3] = sum3;
7305         }
7306       }
7307 
7308       is = Msize2[Mc_AN] - Msize2[Mc_AN]%4;
7309 
7310       for (i=is; i<Msize2[Mc_AN]; i++){
7311         for (n=0; n<EKC_core_size[Mc_AN]; n++){
7312 	  sum = 0.0;
7313 	  for (j=0; j<Msize2[Mc_AN]; j++){
7314 	    sum += invS[i][j]*tmpvec1[n][j];
7315 	  }
7316 	  tmpvec0[n][i] = sum;
7317         }
7318       }
7319 
7320 #endif
7321 
7322       if (measure_time==1){
7323 	dtime(&Etime1);
7324 	time3 += Etime1 - Stime1;
7325       }
7326     }
7327 
7328     /*******************************************************
7329           U * RS^-1 * U^+ * H * |Wn)
7330     *******************************************************/
7331 
7332     else if (EKC_invS_flag==1){
7333 
7334       /*  U^+ * H * |Wn) */
7335 
7336       for (rl0=0; rl0<rlmax_EC2[Mc_AN]; rl0++){
7337 	for (n=0; n<EKC_core_size[Mc_AN]; n++){
7338 	  for (m=0; m<EKC_core_size[Mc_AN]; m++){
7339 
7340 	    sum = 0.0;
7341 	    for (i=0; i<Msize2[Mc_AN]; i++){
7342 	      sum += Krylov_U_OLP[rl0][n][i]*tmpvec1[m][i];
7343 	    }
7344 
7345 	    /* transpose the later calcualtion */
7346 	    matRS0[m][rl0*EKC_core_size[Mc_AN]+n] = sum;
7347 	  }
7348 	}
7349       }
7350 
7351       /*  RS^-1 * U^+ * H * |Wn) */
7352 
7353       for (rl0=0; rl0<rlmax_EC2[Mc_AN]; rl0++){
7354 	for (n=0; n<EKC_core_size[Mc_AN]; n++){
7355 
7356 	  for (m=0; m<EKC_core_size[Mc_AN]; m++){
7357 
7358 	    sum = 0.0;
7359 	    for (i=0; i<Msize4[Mc_AN]; i++){
7360 	      sum += inv_RS[rl0*EKC_core_size[Mc_AN]+n][i]*matRS0[m][i];
7361 	    }
7362 
7363 	    matRS1[rl0*EKC_core_size[Mc_AN]+n][m] = sum;
7364 	  }
7365 	}
7366       }
7367 
7368       /*  U * RS^-1 * U^+ * H * |Wn) */
7369 
7370       for (n=0; n<EKC_core_size[Mc_AN]; n++){
7371 	for (i=0; i<Msize2[Mc_AN]; i++){
7372 	  tmpvec0[n][i] = 0.0;
7373 	}
7374       }
7375 
7376       for (rl0=0; rl0<rlmax_EC2[Mc_AN]; rl0++){
7377 	for (n=0; n<EKC_core_size[Mc_AN]; n++){
7378 	  for (m=0; m<EKC_core_size[Mc_AN]; m++){
7379 	    tmp0 = matRS1[rl0*EKC_core_size[Mc_AN]+n][m];
7380 	    for (i=0; i<Msize2[Mc_AN]; i++){
7381 	      tmpvec0[m][i] += Krylov_U_OLP[rl0][n][i]*tmp0;
7382 	    }
7383 	  }
7384 	}
7385       }
7386     }
7387 
7388     else {
7389       for (n=0; n<EKC_core_size[Mc_AN]; n++){
7390         for (i=0; i<Msize2[Mc_AN]; i++){
7391 	  tmpvec0[n][i]  = tmpvec1[n][i];
7392         }
7393       }
7394     }
7395 
7396     if (measure_time==1) dtime(&Stime1);
7397 
7398     /*************************************************************
7399      S-orthogonalization by a classical block Gram-Schmidt method
7400     *************************************************************/
7401 
7402     /* |tmpvec2) = S * |tmpvec0) */
7403 
7404     for (n=0; n<EKC_core_size[Mc_AN]; n++){
7405       for (i=0; i<Msize2[Mc_AN]; i++){
7406 	tmpvec2[n][i] = 0.0;
7407       }
7408     }
7409 
7410     for (i=0; i<=can; i++){
7411 
7412       ig = natn[ct_AN][i];
7413       ian = Spe_Total_CNO[WhatSpecies[ig]];
7414       Anum = MP[i] - 1;
7415       ih = S_G2M[ig];
7416 
7417       for (j=0; j<=can; j++){
7418 
7419 	kl = RMI1[Mc_AN][i][j];
7420 	jg = natn[ct_AN][j];
7421 	jan = Spe_Total_CNO[WhatSpecies[jg]];
7422 	Bnum = MP[j] - 1;
7423 
7424 	if (0<=kl){
7425 
7426 #ifdef nosse
7427 
7428 	  /* Original version */
7429 
7430 	  for (m=0; m<ian; m++){
7431 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
7432 
7433 	      sum = 0.0;
7434 	      for (k=0; k<jan; k++){
7435 		sum += OLP0[ih][kl][m][k]*tmpvec0[n][Bnum+k];
7436 	      }
7437 
7438 	      tmpvec2[n][Anum+m] += sum;
7439 	    }
7440 	  }
7441 
7442 #else
7443 
7444 	  /* Unrolling + SSE version */
7445 
7446 	  for (m=0; m<(ian-3); m+=4){
7447 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
7448 
7449 	      mmSum00 = _mm_setzero_pd();
7450 	      mmSum01 = _mm_setzero_pd();
7451 	      mmSum10 = _mm_setzero_pd();
7452 	      mmSum11 = _mm_setzero_pd();
7453 	      mmSum20 = _mm_setzero_pd();
7454 	      mmSum21 = _mm_setzero_pd();
7455 	      mmSum30 = _mm_setzero_pd();
7456 	      mmSum31 = _mm_setzero_pd();
7457 
7458 	      for (k=0; k<(jan-3); k+=4){
7459 		mmTmp0 = _mm_loadu_pd(&tmpvec0[n][Bnum+k+0]);
7460 		mmTmp1 = _mm_loadu_pd(&tmpvec0[n][Bnum+k+2]);
7461 
7462 		mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+0]),mmTmp0));
7463 		mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+2]),mmTmp1));
7464 
7465 		mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+0]),mmTmp0));
7466 		mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+2]),mmTmp1));
7467 
7468 		mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+0]),mmTmp0));
7469 		mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+2]),mmTmp1));
7470 
7471 		mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+0]),mmTmp0));
7472 		mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+2]),mmTmp1));
7473 	      }
7474 
7475 	      mmSum00 = _mm_add_pd(mmSum00, mmSum01);
7476 	      mmSum10 = _mm_add_pd(mmSum10, mmSum11);
7477 	      mmSum20 = _mm_add_pd(mmSum20, mmSum21);
7478 	      mmSum30 = _mm_add_pd(mmSum30, mmSum31);
7479 
7480 	      _mm_storeu_pd(&mmArr[0], mmSum00);
7481 	      _mm_storeu_pd(&mmArr[2], mmSum10);
7482 	      _mm_storeu_pd(&mmArr[4], mmSum20);
7483 	      _mm_storeu_pd(&mmArr[6], mmSum30);
7484 
7485 	      sum0 = mmArr[0] + mmArr[1];
7486 	      sum1 = mmArr[2] + mmArr[3];
7487 	      sum2 = mmArr[4] + mmArr[5];
7488 	      sum3 = mmArr[6] + mmArr[7];
7489 
7490 	      for (; k<jan; k++){
7491 		sum0 += OLP0[ih][kl][m+0][k]*tmpvec0[n][Bnum+k];
7492 		sum1 += OLP0[ih][kl][m+1][k]*tmpvec0[n][Bnum+k];
7493 		sum2 += OLP0[ih][kl][m+2][k]*tmpvec0[n][Bnum+k];
7494 		sum3 += OLP0[ih][kl][m+3][k]*tmpvec0[n][Bnum+k];
7495 	      }
7496 
7497 	      tmpvec2[n][Anum+m+0] += sum0;
7498 	      tmpvec2[n][Anum+m+1] += sum1;
7499 	      tmpvec2[n][Anum+m+2] += sum2;
7500 	      tmpvec2[n][Anum+m+3] += sum3;
7501 	    }
7502 	  }
7503 
7504 	  for (; m<ian; m++){
7505 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
7506 
7507 	      sum = 0.0;
7508 	      for (k=0; k<jan; k++){
7509 		sum += OLP0[ih][kl][m][k]*tmpvec0[n][Bnum+k];
7510 	      }
7511 
7512 	      tmpvec2[n][Anum+m] += sum;
7513 	    }
7514 	  }
7515 
7516 #endif
7517 
7518 	}
7519       }
7520     }
7521 
7522     if (measure_time==1){
7523       dtime(&Etime1);
7524       time4 += Etime1 - Stime1;
7525     }
7526 
7527     if (measure_time==1) dtime(&Stime1);
7528 
7529 #ifdef nosse
7530 
7531     /* Original version */
7532 
7533     for (rl0=0; rl0<=rl; rl0++){
7534 
7535       /* (U_rl0|tmpvec2) */
7536 
7537       for (m=0; m<EKC_core_size[Mc_AN]; m++){
7538         for (n=0; n<EKC_core_size[Mc_AN]; n++){
7539           sum = 0.0;
7540   	  for (i=0; i<Msize2[Mc_AN]; i++){
7541             sum += U0[rl0][m][i]*tmpvec2[n][i];
7542 	  }
7543           tmpmat0[m][n] = sum;
7544 	}
7545       }
7546 
7547       /* |tmpvec0) - |U_rl0) * (U_rl0|tmpvec2) */
7548 
7549       for (n=0; n<EKC_core_size[Mc_AN]; n++){
7550 	for (k=0; k<EKC_core_size[Mc_AN]; k++){
7551 	  dum = tmpmat0[k][n];
7552 	  for (i=0; i<Msize2[Mc_AN]; i++)  tmpvec0[n][i] -= U0[rl0][k][i]*dum;
7553         }
7554       }
7555 
7556     }
7557 
7558 #else
7559 
7560     /* Unrolling + SSE version */
7561 
7562     for (rl0=0; rl0<=rl; rl0++){
7563 
7564       /* (U_rl0|tmpvec2) */
7565 
7566       for (m=0; m<(EKC_core_size[Mc_AN]-3); m+=4){
7567         for (n=0; n<EKC_core_size[Mc_AN]; n++){
7568 
7569 	  mmSum00 = _mm_setzero_pd();
7570 	  mmSum01 = _mm_setzero_pd();
7571 	  mmSum10 = _mm_setzero_pd();
7572 	  mmSum11 = _mm_setzero_pd();
7573 	  mmSum20 = _mm_setzero_pd();
7574 	  mmSum21 = _mm_setzero_pd();
7575 	  mmSum30 = _mm_setzero_pd();
7576 	  mmSum31 = _mm_setzero_pd();
7577 
7578   	  for (i=0; i<(Msize2[Mc_AN]-3); i+=4){
7579 	    mmTmp0 = _mm_loadu_pd(&tmpvec2[n][i+0]);
7580 	    mmTmp1 = _mm_loadu_pd(&tmpvec2[n][i+2]);
7581 
7582 	    mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+0][i+0]),mmTmp0));
7583 	    mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+0][i+2]),mmTmp1));
7584 
7585 	    mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+1][i+0]),mmTmp0));
7586 	    mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+1][i+2]),mmTmp1));
7587 
7588 	    mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+2][i+0]),mmTmp0));
7589 	    mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+2][i+2]),mmTmp1));
7590 
7591 	    mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+3][i+0]),mmTmp0));
7592 	    mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+3][i+2]),mmTmp1));
7593 	  }
7594 
7595 	  mmSum00 = _mm_add_pd(mmSum00, mmSum01);
7596 	  mmSum10 = _mm_add_pd(mmSum10, mmSum11);
7597 	  mmSum20 = _mm_add_pd(mmSum20, mmSum21);
7598 	  mmSum30 = _mm_add_pd(mmSum30, mmSum31);
7599 
7600 	  _mm_storeu_pd(&mmArr[0], mmSum00);
7601 	  _mm_storeu_pd(&mmArr[2], mmSum10);
7602 	  _mm_storeu_pd(&mmArr[4], mmSum20);
7603 	  _mm_storeu_pd(&mmArr[6], mmSum30);
7604 
7605 	  sum0 = mmArr[0] + mmArr[1];
7606 	  sum1 = mmArr[2] + mmArr[3];
7607 	  sum2 = mmArr[4] + mmArr[5];
7608 	  sum3 = mmArr[6] + mmArr[7];
7609 
7610   	  for (; i<Msize2[Mc_AN]; i++){
7611             sum0 += U0[rl0][m+0][i]*tmpvec2[n][i];
7612             sum1 += U0[rl0][m+1][i]*tmpvec2[n][i];
7613             sum2 += U0[rl0][m+2][i]*tmpvec2[n][i];
7614             sum3 += U0[rl0][m+3][i]*tmpvec2[n][i];
7615 	  }
7616 
7617           tmpmat0[m+0][n] = sum0;
7618           tmpmat0[m+1][n] = sum1;
7619           tmpmat0[m+2][n] = sum2;
7620           tmpmat0[m+3][n] = sum3;
7621 	}
7622       }
7623 
7624       for (; m<EKC_core_size[Mc_AN]; m++){
7625         for (n=0; n<EKC_core_size[Mc_AN]; n++){
7626           sum = 0.0;
7627   	  for (i=0; i<Msize2[Mc_AN]; i++){
7628             sum += U0[rl0][m][i]*tmpvec2[n][i];
7629 	  }
7630           tmpmat0[m][n] = sum;
7631 	}
7632       }
7633 
7634       /* |tmpvec0) - |U_rl0) * (U_rl0|tmpvec2) */
7635 
7636       for (n=0; n<EKC_core_size[Mc_AN]; n++){
7637 	for (k=0; k<EKC_core_size[Mc_AN]; k++){
7638 	  dum = tmpmat0[k][n];
7639 	  for (i=0; i<Msize2[Mc_AN]; i++)  tmpvec0[n][i] -= U0[rl0][k][i]*dum;
7640         }
7641       }
7642 
7643     }
7644 
7645 #endif
7646 
7647     if (measure_time==1){
7648       dtime(&Etime1);
7649       time5 += Etime1 - Stime1;
7650     }
7651 
7652     /*************************************************************
7653                    S-orthonormalization of tmpvec0
7654     *************************************************************/
7655 
7656     if (measure_time==1) dtime(&Stime1);
7657 
7658     S_orthonormalize_vec_trd( Mc_AN, ct_on, tmpvec0, tmpvec1, OLP0, tmpmat0, ko, iko, MP, Msize2 );
7659 
7660     for (n=0; n<EKC_core_size[Mc_AN]; n++){
7661       for (i=0; i<Msize2[Mc_AN]; i++){
7662         U0[rl+1][n][i] = tmpvec0[n][i];
7663       }
7664     }
7665 
7666     if (measure_time==1){
7667       dtime(&Etime1);
7668       time6 += Etime1 - Stime1;
7669     }
7670 
7671   } /* rl */
7672 
7673   /************************************************************
7674               orthogonalization by diagonalization
7675   ************************************************************/
7676 
7677   if (measure_time==1) dtime(&Stime1);
7678 
7679 #pragma omp parallel shared(EKC_core_size_max,Msize2_max,rlmax_EC,Mc_AN,EKC_core_size,Msize2,can,natn,ct_AN,ct_on,Spe_Total_CNO,WhatSpecies,MP,S_G2M,RMI1,Hks,spin,Msize,Msize4,OLP0,U0,rlmax_EC2,FS)   private(OMPID,Nthrds,Nprocs,rl,n,i,ig,ian,Anum,ih,j,kl,jg,jan,Bnum,m,sum,k,mmSum00,mmSum01,mmSum10,mmSum11,mmSum20,mmSum21,mmSum30,mmSum31,mmTmp0,mmTmp1,mmArr,sum0,sum1,sum2,sum3,rl0)
7680 
7681   {
7682 
7683     double **tmpvec1;
7684 
7685     tmpvec1 = (double**)malloc(sizeof(double*)*EKC_core_size_max);
7686     for (i=0; i<EKC_core_size_max; i++){
7687       tmpvec1[i] = (double*)malloc(sizeof(double)*Msize2_max);
7688     }
7689 
7690 
7691     OMPID = omp_get_thread_num();
7692     Nthrds = omp_get_num_threads();
7693     Nprocs = omp_get_num_procs();
7694 
7695     for (rl=0+OMPID; rl<rlmax_EC[Mc_AN]; rl+=Nthrds){
7696 
7697     /*  S * |Vn) */
7698 
7699     for (n=0; n<EKC_core_size[Mc_AN]; n++){
7700       for (i=0; i<Msize2[Mc_AN]; i++){
7701 	tmpvec1[n][i]  = 0.0;
7702       }
7703     }
7704 
7705     for (i=0; i<=can; i++){
7706 
7707       ig = natn[ct_AN][i];
7708       ian = Spe_Total_CNO[WhatSpecies[ig]];
7709       Anum = MP[i] - 1;
7710       ih = S_G2M[ig];
7711 
7712       for (j=0; j<=can; j++){
7713 
7714 	kl = RMI1[Mc_AN][i][j];
7715 	jg = natn[ct_AN][j];
7716 	jan = Spe_Total_CNO[WhatSpecies[jg]];
7717 	Bnum = MP[j] - 1;
7718 
7719 	if (0<=kl){
7720 
7721 #ifdef nosse
7722 
7723 	  /* Original version */
7724 	  /**/
7725 	  for (m=0; m<ian; m++){
7726 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
7727 
7728 	      sum = 0.0;
7729 	      for (k=0; k<jan; k++){
7730 		sum += OLP0[ih][kl][m][k]*U0[rl][n][Bnum+k];
7731 	      }
7732 	      tmpvec1[n][Anum+m] += sum;
7733 	    }
7734 	  }
7735 	  /**/
7736 
7737 #else
7738 
7739 	  /* Unrolling + SSE version */
7740 	  /**/
7741 	  for (m=0; m<(ian-3); m+=4){
7742 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
7743 
7744 	      mmSum00 = _mm_setzero_pd();
7745 	      mmSum01 = _mm_setzero_pd();
7746 	      mmSum10 = _mm_setzero_pd();
7747 	      mmSum11 = _mm_setzero_pd();
7748 	      mmSum20 = _mm_setzero_pd();
7749 	      mmSum21 = _mm_setzero_pd();
7750 	      mmSum30 = _mm_setzero_pd();
7751 	      mmSum31 = _mm_setzero_pd();
7752 
7753 	      for (k=0; k<(jan-3); k+=4){
7754 		mmTmp0 = _mm_loadu_pd(&U0[rl][n][Bnum+k+0]);
7755 		mmTmp1 = _mm_loadu_pd(&U0[rl][n][Bnum+k+2]);
7756 
7757 		mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+0]),mmTmp0));
7758 		mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+2]),mmTmp1));
7759 
7760 		mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+0]),mmTmp0));
7761 		mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+2]),mmTmp1));
7762 
7763 		mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+0]),mmTmp0));
7764 		mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+2]),mmTmp1));
7765 
7766 		mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+0]),mmTmp0));
7767 		mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+2]),mmTmp1));
7768 	      }
7769 
7770 	      mmSum00 = _mm_add_pd(mmSum00, mmSum01);
7771 	      mmSum10 = _mm_add_pd(mmSum10, mmSum11);
7772 	      mmSum20 = _mm_add_pd(mmSum20, mmSum21);
7773 	      mmSum30 = _mm_add_pd(mmSum30, mmSum31);
7774 
7775 	      _mm_storeu_pd(&mmArr[0], mmSum00);
7776 	      _mm_storeu_pd(&mmArr[2], mmSum10);
7777 	      _mm_storeu_pd(&mmArr[4], mmSum20);
7778 	      _mm_storeu_pd(&mmArr[6], mmSum30);
7779 
7780 	      sum0 = mmArr[0] + mmArr[1];
7781 	      sum1 = mmArr[2] + mmArr[3];
7782 	      sum2 = mmArr[4] + mmArr[5];
7783 	      sum3 = mmArr[6] + mmArr[7];
7784 
7785 	      for (; k<jan; k++){
7786 		sum0 += OLP0[ih][kl][m+0][k]*U0[rl][n][Bnum+k];
7787 		sum1 += OLP0[ih][kl][m+1][k]*U0[rl][n][Bnum+k];
7788 		sum2 += OLP0[ih][kl][m+2][k]*U0[rl][n][Bnum+k];
7789 		sum3 += OLP0[ih][kl][m+3][k]*U0[rl][n][Bnum+k];
7790 	      }
7791 
7792 	      tmpvec1[n][Anum+m+0] += sum0;
7793 	      tmpvec1[n][Anum+m+1] += sum1;
7794 	      tmpvec1[n][Anum+m+2] += sum2;
7795 	      tmpvec1[n][Anum+m+3] += sum3;
7796 	    }
7797 	  }
7798 
7799 	  for (; m<ian; m++){
7800 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
7801 
7802 	      sum = 0.0;
7803 	      for (k=0; k<jan; k++){
7804 		sum += OLP0[ih][kl][m][k]*U0[rl][n][Bnum+k];
7805 	      }
7806 	      tmpvec1[n][Anum+m] += sum;
7807 	    }
7808 	  }
7809 	  /**/
7810 #endif
7811 
7812 	}
7813       }
7814     }
7815 
7816 #ifdef nosse
7817 
7818     /* Original version */
7819     /**/
7820     for (rl0=rl; rl0<rlmax_EC[Mc_AN]; rl0++){
7821       for (m=0; m<EKC_core_size[Mc_AN]; m++){
7822         for (n=0; n<EKC_core_size[Mc_AN]; n++){
7823   	  sum = 0.0;
7824 	  for (i=0; i<Msize2[Mc_AN]; i++){
7825             sum += U0[rl0][m][i]*tmpvec1[n][i];
7826 	  }
7827           FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
7828           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum;
7829 	}
7830       }
7831     }
7832     /**/
7833 
7834 #else
7835 
7836     /* Unrolling + SSE version */
7837     /**/
7838     for (rl0=rl; rl0<rlmax_EC[Mc_AN]; rl0++){
7839       for (m=0; m<(EKC_core_size[Mc_AN]-3); m+=4){
7840         for (n=0; n<EKC_core_size[Mc_AN]; n++){
7841 
7842 	  mmSum00 = _mm_setzero_pd();
7843 	  mmSum01 = _mm_setzero_pd();
7844 	  mmSum10 = _mm_setzero_pd();
7845 	  mmSum11 = _mm_setzero_pd();
7846 	  mmSum20 = _mm_setzero_pd();
7847 	  mmSum21 = _mm_setzero_pd();
7848 	  mmSum30 = _mm_setzero_pd();
7849 	  mmSum31 = _mm_setzero_pd();
7850 
7851 	  for (i=0; i<(Msize2[Mc_AN]-3); i+=4){
7852 	    mmTmp0 = _mm_loadu_pd(&tmpvec1[n][i+0]);
7853 	    mmTmp1 = _mm_loadu_pd(&tmpvec1[n][i+2]);
7854 
7855 	    mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+0][i+0]),mmTmp0));
7856 	    mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+0][i+2]),mmTmp1));
7857 
7858 	    mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+1][i+0]),mmTmp0));
7859 	    mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+1][i+2]),mmTmp1));
7860 
7861 	    mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+2][i+0]),mmTmp0));
7862 	    mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+2][i+2]),mmTmp1));
7863 
7864 	    mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+3][i+0]),mmTmp0));
7865 	    mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+3][i+2]),mmTmp1));
7866 	  }
7867 
7868 	  mmSum00 = _mm_add_pd(mmSum00, mmSum01);
7869 	  mmSum10 = _mm_add_pd(mmSum10, mmSum11);
7870 	  mmSum20 = _mm_add_pd(mmSum20, mmSum21);
7871 	  mmSum30 = _mm_add_pd(mmSum30, mmSum31);
7872 
7873 	  _mm_storeu_pd(&mmArr[0], mmSum00);
7874 	  _mm_storeu_pd(&mmArr[2], mmSum10);
7875 	  _mm_storeu_pd(&mmArr[4], mmSum20);
7876 	  _mm_storeu_pd(&mmArr[6], mmSum30);
7877 
7878 	  sum0 = mmArr[0] + mmArr[1];
7879 	  sum1 = mmArr[2] + mmArr[3];
7880 	  sum2 = mmArr[4] + mmArr[5];
7881 	  sum3 = mmArr[6] + mmArr[7];
7882 
7883 	  for (; i<Msize2[Mc_AN]; i++){
7884             sum0 += U0[rl0][m+0][i]*tmpvec1[n][i];
7885             sum1 += U0[rl0][m+1][i]*tmpvec1[n][i];
7886             sum2 += U0[rl0][m+2][i]*tmpvec1[n][i];
7887             sum3 += U0[rl0][m+3][i]*tmpvec1[n][i];
7888 	  }
7889 
7890           FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum0;
7891           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum0;
7892 
7893           FS[rl0*EKC_core_size[Mc_AN]+m+2][rl*EKC_core_size[Mc_AN]+n+1] = sum1;
7894           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+2] = sum1;
7895 
7896           FS[rl0*EKC_core_size[Mc_AN]+m+3][rl*EKC_core_size[Mc_AN]+n+1] = sum2;
7897           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+3] = sum2;
7898 
7899           FS[rl0*EKC_core_size[Mc_AN]+m+4][rl*EKC_core_size[Mc_AN]+n+1] = sum3;
7900           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+4] = sum3;
7901 	}
7902       }
7903 
7904       for (; m<EKC_core_size[Mc_AN]; m++){
7905         for (n=0; n<EKC_core_size[Mc_AN]; n++){
7906   	  sum = 0.0;
7907 	  for (i=0; i<Msize2[Mc_AN]; i++){
7908             sum += U0[rl0][m][i]*tmpvec1[n][i];
7909 	  }
7910           FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
7911           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum;
7912 	}
7913       }
7914 
7915     }
7916     /**/
7917 
7918 #endif
7919 
7920   }
7921 
7922     for (i=0; i<EKC_core_size_max; i++){
7923       free(tmpvec1[i]);
7924     }
7925     free(tmpvec1);
7926 
7927   } /* #pragma omp parallel */
7928 
7929   if (measure_time==1){
7930     dtime(&Etime1);
7931     time7 += Etime1 - Stime1;
7932   }
7933 
7934   if (measure_time==1) dtime(&Stime1);
7935 
7936   info = Eigen_lapack_d(FS,ko,Msize3[Mc_AN],Msize3[Mc_AN]);
7937   if (info!=0){
7938     info = Eigen_lapack_x(FS,ko,Msize3[Mc_AN],Msize3[Mc_AN]);
7939     if (info!=0){
7940       Eigen_lapack_r(FS,ko,Msize3[Mc_AN],Msize3[Mc_AN]);
7941     }
7942   }
7943 
7944   if (measure_time==1){
7945     dtime(&Etime1);
7946     time8 += Etime1 - Stime1;
7947   }
7948 
7949   ZeroNum = 0;
7950 
7951   for (i=1; i<=Msize3[Mc_AN]; i++){
7952 
7953     if (error_check==1){
7954       printf("spin=%2d Mc_AN=%2d i=%3d ko[i]=%18.15f\n",spin,Mc_AN,i,ko[i]);
7955     }
7956 
7957     if (cutoff_value<ko[i]){
7958       ko[i]  = sqrt(fabs(ko[i]));
7959       iko[i] = 1.0/ko[i];
7960     }
7961     else{
7962       ZeroNum++;
7963       ko[i]  = 0.0;
7964       iko[i] = 0.0;
7965     }
7966   }
7967 
7968   if (error_check==1){
7969     printf("spin=%2d Mc_AN=%2d ZeroNum=%2d\n",spin,Mc_AN,ZeroNum);
7970   }
7971 
7972   for (i=1; i<=Msize3[Mc_AN]; i++){
7973     for (j=1; j<=Msize3[Mc_AN]; j++){
7974       FS[i][j] = FS[i][j]*iko[j];
7975     }
7976   }
7977 
7978   /* transpose for later calculation */
7979   for (i=1; i<=Msize3[Mc_AN]; i++){
7980     for (j=i+1; j<=Msize3[Mc_AN]; j++){
7981       tmp1 = FS[i][j];
7982       tmp2 = FS[j][i];
7983       FS[i][j] = tmp2;
7984       FS[j][i] = tmp1;
7985     }
7986   }
7987 
7988   if (measure_time==1) dtime(&Stime1);
7989 
7990   /* U0 * U * lamda^{-1/2} */
7991 
7992 #ifdef nosse
7993 
7994   /* original version */
7995   /*
7996   for (i=0; i<Msize2[Mc_AN]; i++){
7997     for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
7998       for (m=0; m<EKC_core_size[Mc_AN]; m++){
7999 
8000         m1 = rl0*EKC_core_size[Mc_AN] + m + 1;
8001 
8002 	sum = 0.0;
8003 	for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8004 
8005           n1 = rl*EKC_core_size[Mc_AN] + 1;
8006 
8007 	  for (n=0; n<EKC_core_size[Mc_AN]; n++){
8008 	    sum += U0[rl][n][i]*FS[m1][n1+n];
8009 	  }
8010 	}
8011 
8012         Utmp[rl0][m] = sum;
8013       }
8014     }
8015 
8016     for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
8017       for (m=0; m<EKC_core_size[Mc_AN]; m++){
8018         U0[rl0][m][i] = Utmp[rl0][m];
8019       }
8020     }
8021   }
8022   */
8023 
8024   /* unrolling version */
8025 
8026   for (i=0; i<Msize2[Mc_AN]; i++){
8027 
8028     for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8029       for (n=0; n<EKC_core_size[Mc_AN]; n++){
8030         Utmp[rl][n] = U0[rl][n][i];
8031       }
8032     }
8033 
8034     for (m1=1; m1<=(rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]-3); m1+=4){
8035 
8036       rl00 = (m1+0-1)/EKC_core_size[Mc_AN];
8037       rl01 = (m1+1-1)/EKC_core_size[Mc_AN];
8038       rl02 = (m1+2-1)/EKC_core_size[Mc_AN];
8039       rl03 = (m1+3-1)/EKC_core_size[Mc_AN];
8040 
8041       mm0 = (m1+0-1)%EKC_core_size[Mc_AN];
8042       mm1 = (m1+1-1)%EKC_core_size[Mc_AN];
8043       mm2 = (m1+2-1)%EKC_core_size[Mc_AN];
8044       mm3 = (m1+3-1)%EKC_core_size[Mc_AN];
8045 
8046       sum0 = 0.0;
8047       sum1 = 0.0;
8048       sum2 = 0.0;
8049       sum3 = 0.0;
8050 
8051       for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8052 
8053 	n1 = rl*EKC_core_size[Mc_AN] + 1;
8054 
8055 	for (n=0; n<EKC_core_size[Mc_AN]; n++){
8056 	  sum0 += Utmp[rl][n]*FS[m1+0][n1+n];
8057 	  sum1 += Utmp[rl][n]*FS[m1+1][n1+n];
8058 	  sum2 += Utmp[rl][n]*FS[m1+2][n1+n];
8059 	  sum3 += Utmp[rl][n]*FS[m1+3][n1+n];
8060 	}
8061       }
8062 
8063       U0[rl00][mm0][i] = sum0;
8064       U0[rl01][mm1][i] = sum1;
8065       U0[rl02][mm2][i] = sum2;
8066       U0[rl03][mm3][i] = sum3;
8067     }
8068 
8069     m1s = rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN] - (rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN])%4 + 1;
8070 
8071     for (m1=m1s; m1<=rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]; m1++){
8072 
8073       rl0 = (m1-1)/EKC_core_size[Mc_AN];
8074       m = (m1-1)%EKC_core_size[Mc_AN];
8075 
8076       sum = 0.0;
8077 
8078       for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8079 
8080 	n1 = rl*EKC_core_size[Mc_AN] + 1;
8081 
8082 	for (n=0; n<EKC_core_size[Mc_AN]; n++){
8083 	  sum += Utmp[rl][n]*FS[m1][n1+n];
8084 	}
8085       }
8086 
8087       U0[rl0][m][i] = sum;
8088     }
8089   } /* i */
8090 
8091 #else
8092 
8093   /* Unrolling + SSE version */
8094   /**/
8095 
8096 #pragma omp parallel shared(rlmax_EC,Mc_AN,EKC_core_size,Msize2,U0,FS)   private(OMPID,Nthrds,Nprocs,rl,n,i,j,m,sum,k,mmSum00,mmSum01,mmSum10,mmSum11,mmSum20,mmSum21,mmSum30,mmSum31,mmTmp0,mmTmp1,mmArr,sum0,sum1,sum2,sum3,rl0,rl00,rl01,rl02,rl03,mm0,mm1,mm2,mm3,n1,m1,m1s,Utmp)
8097 
8098   {
8099 
8100     double ** Utmp;
8101     Utmp = (double**)malloc(sizeof(double*)*rlmax_EC[Mc_AN]);
8102     for (i=0; i<rlmax_EC[Mc_AN]; i++){
8103       Utmp[i] = (double*)malloc(sizeof(double)*EKC_core_size[Mc_AN]);
8104     }
8105 
8106     OMPID = omp_get_thread_num();
8107     Nthrds = omp_get_num_threads();
8108     Nprocs = omp_get_num_procs();
8109 
8110   for (i=0+OMPID; i<Msize2[Mc_AN]; i+=Nthrds){
8111 
8112     for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8113       for (n=0; n<EKC_core_size[Mc_AN]; n++){
8114         Utmp[rl][n] = U0[rl][n][i];
8115       }
8116     }
8117 
8118     for (m1=1; m1<=(rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]-3); m1+=4){
8119 
8120       rl00 = (m1+0-1)/EKC_core_size[Mc_AN];
8121       rl01 = (m1+1-1)/EKC_core_size[Mc_AN];
8122       rl02 = (m1+2-1)/EKC_core_size[Mc_AN];
8123       rl03 = (m1+3-1)/EKC_core_size[Mc_AN];
8124 
8125       mm0 = (m1+0-1)%EKC_core_size[Mc_AN];
8126       mm1 = (m1+1-1)%EKC_core_size[Mc_AN];
8127       mm2 = (m1+2-1)%EKC_core_size[Mc_AN];
8128       mm3 = (m1+3-1)%EKC_core_size[Mc_AN];
8129 
8130       sum0 = 0.0;
8131       sum1 = 0.0;
8132       sum2 = 0.0;
8133       sum3 = 0.0;
8134 
8135       mmSum00 = _mm_setzero_pd();
8136       mmSum01 = _mm_setzero_pd();
8137       mmSum10 = _mm_setzero_pd();
8138       mmSum11 = _mm_setzero_pd();
8139       mmSum20 = _mm_setzero_pd();
8140       mmSum21 = _mm_setzero_pd();
8141       mmSum30 = _mm_setzero_pd();
8142       mmSum31 = _mm_setzero_pd();
8143 
8144       for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8145 
8146 	n1 = rl*EKC_core_size[Mc_AN] + 1;
8147 
8148 	if (0){
8149 	mmTmp0 = _mm_loadu_pd(&Utmp[rl][n+0]);
8150 	mmTmp1 = _mm_loadu_pd(&Utmp[rl][n+2]);
8151 	}
8152 
8153 	for (n=0; n<(EKC_core_size[Mc_AN]-3); n+=4){
8154 	  mmTmp0 = _mm_loadu_pd(&Utmp[rl][n+0]);
8155 	  mmTmp1 = _mm_loadu_pd(&Utmp[rl][n+2]);
8156 
8157 	  mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&FS[m1+0][n1+n+0]),mmTmp0));
8158 	  mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&FS[m1+0][n1+n+2]),mmTmp1));
8159 
8160 	  mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&FS[m1+1][n1+n+0]),mmTmp0));
8161 	  mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&FS[m1+1][n1+n+2]),mmTmp1));
8162 
8163 	  mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&FS[m1+2][n1+n+0]),mmTmp0));
8164 	  mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&FS[m1+2][n1+n+2]),mmTmp1));
8165 
8166 	  mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&FS[m1+3][n1+n+0]),mmTmp0));
8167 	  mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&FS[m1+3][n1+n+2]),mmTmp1));
8168 	}
8169 
8170 	for (; n<EKC_core_size[Mc_AN]; n++){
8171 	  sum0 += Utmp[rl][n]*FS[m1+0][n1+n];
8172 	  sum1 += Utmp[rl][n]*FS[m1+1][n1+n];
8173 	  sum2 += Utmp[rl][n]*FS[m1+2][n1+n];
8174 	  sum3 += Utmp[rl][n]*FS[m1+3][n1+n];
8175 	}
8176 
8177       }
8178 
8179       mmSum00 = _mm_add_pd(mmSum00, mmSum01);
8180       mmSum10 = _mm_add_pd(mmSum10, mmSum11);
8181       mmSum20 = _mm_add_pd(mmSum20, mmSum21);
8182       mmSum30 = _mm_add_pd(mmSum30, mmSum31);
8183 
8184       _mm_storeu_pd(&mmArr[0], mmSum00);
8185       _mm_storeu_pd(&mmArr[2], mmSum10);
8186       _mm_storeu_pd(&mmArr[4], mmSum20);
8187       _mm_storeu_pd(&mmArr[6], mmSum30);
8188 
8189       sum0 += mmArr[0] + mmArr[1];
8190       sum1 += mmArr[2] + mmArr[3];
8191       sum2 += mmArr[4] + mmArr[5];
8192       sum3 += mmArr[6] + mmArr[7];
8193 
8194       U0[rl00][mm0][i] = sum0;
8195       U0[rl01][mm1][i] = sum1;
8196       U0[rl02][mm2][i] = sum2;
8197       U0[rl03][mm3][i] = sum3;
8198     }
8199 
8200     m1s = rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN] - (rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN])%4 + 1;
8201 
8202     for (m1=m1s; m1<=rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]; m1++){
8203 
8204       rl0 = (m1-1)/EKC_core_size[Mc_AN];
8205       m = (m1-1)%EKC_core_size[Mc_AN];
8206 
8207       sum = 0.0;
8208 
8209       for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8210 
8211 	n1 = rl*EKC_core_size[Mc_AN] + 1;
8212 
8213 	for (n=0; n<EKC_core_size[Mc_AN]; n++){
8214 	  sum += Utmp[rl][n]*FS[m1][n1+n];
8215 	}
8216       }
8217 
8218       U0[rl0][m][i] = sum;
8219     }
8220   } /* i */
8221 
8222   for (i=0; i<rlmax_EC[Mc_AN]; i++){
8223     free(Utmp[i]);
8224   }
8225   free(Utmp);
8226 
8227 
8228   } /* #pragma omp parallel */
8229 
8230 #endif
8231 
8232   Krylov_U[spin][Mc_AN][0] = ZeroNum;
8233 
8234   for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8235     for (n=0; n<EKC_core_size[Mc_AN]; n++){
8236       for (i=0; i<Msize2[Mc_AN]; i++){
8237         Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+i+1] = U0[rl][n][i];
8238       }
8239     }
8240   }
8241 
8242   if (measure_time==1){
8243     dtime(&Etime1);
8244     time9 += Etime1 - Stime1;
8245   }
8246 
8247   /************************************************************
8248          check the orthonormality of Krylov vectors
8249   ************************************************************/
8250 
8251   if (error_check==1){
8252 
8253     for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8254 
8255       for (n=0; n<EKC_core_size[Mc_AN]; n++){
8256 	for (i=0; i<Msize2[Mc_AN]; i++){
8257 	  tmpvec1[n][i]  = 0.0;
8258 	}
8259       }
8260 
8261       for (i=0; i<=can; i++){
8262 
8263 	ig = natn[ct_AN][i];
8264 	ian = Spe_Total_CNO[WhatSpecies[ig]];
8265 	Anum = MP[i] - 1;
8266 	ih = S_G2M[ig];
8267 
8268 	for (j=0; j<=can; j++){
8269 
8270 	  kl = RMI1[Mc_AN][i][j];
8271 	  jg = natn[ct_AN][j];
8272 	  jan = Spe_Total_CNO[WhatSpecies[jg]];
8273 	  Bnum = MP[j] - 1;
8274 
8275 	  if (0<=kl){
8276 
8277 	    for (m=0; m<ian; m++){
8278 	      for (n=0; n<EKC_core_size[Mc_AN]; n++){
8279 
8280 		sum = 0.0;
8281 		for (k=0; k<jan; k++){
8282 		  sum += OLP0[ih][kl][m][k]*U0[rl][n][Bnum+k];
8283 		}
8284 
8285 		tmpvec1[n][Anum+m] += sum;
8286 	      }
8287 	    }
8288 	  }
8289 	}
8290       }
8291 
8292       for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
8293 	for (m=0; m<EKC_core_size[Mc_AN]; m++){
8294 	  for (n=0; n<EKC_core_size[Mc_AN]; n++){
8295 	    sum = 0.0;
8296 	    for (i=0; i<Msize2[Mc_AN]; i++){
8297 	      sum += U0[rl0][m][i]*tmpvec1[n][i];
8298 	    }
8299 
8300 	    if (rl==rl0 && m==n){
8301 	      if ( 1.0e-10<fabs(sum-1.0) ) {
8302 		printf("A spin=%2d Mc_AN=%2d rl=%2d rl0=%2d m=%2d n=%2d sum=%18.15f\n",
8303 		       spin,Mc_AN,rl,rl0,m,n,sum);
8304 	      }
8305 	    }
8306 	    else{
8307 	      if ( 1.0e-10<fabs(sum) ) {
8308 		printf("B spin=%2d Mc_AN=%2d rl=%2d rl0=%2d m=%2d n=%2d sum=%18.15f\n",
8309 		       spin,Mc_AN,rl,rl0,m,n,sum);
8310 	      }
8311 	    }
8312 
8313 	  }
8314 	}
8315       }
8316     }
8317   }
8318 
8319   if (measure_time==1){
8320     printf("pMatrix myid=%2d time1 =%5.3f time2 =%5.3f time3 =%5.3f time4 =%5.3f\n",
8321             myid,time1,time2,time3,time4);
8322     printf("pMatrix myid=%2d time5 =%5.3f time6 =%5.3f time7 =%5.3f time8 =%5.3f\n",
8323             myid,time5,time6,time7,time8);
8324     printf("pMatrix myid=%2d time9 =%5.3f\n",myid,time9);
8325   }
8326 
8327   /* freeing of arrays */
8328 
8329   for (i=0; i<rlmax_EC[Mc_AN]; i++){
8330     free(Utmp[i]);
8331   }
8332   free(Utmp);
8333 
8334   for (i=0; i<rlmax_EC[Mc_AN]; i++){
8335     for (j=0; j<EKC_core_size[Mc_AN]; j++){
8336       free(U0[i][j]);
8337     }
8338     free(U0[i]);
8339   }
8340   free(U0);
8341 
8342   for (i=0; i<(EKC_core_size[Mc_AN]+4); i++){
8343     free(tmpmat0[i]);
8344   }
8345   free(tmpmat0);
8346 
8347   for (i=0; i<(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]; i++){
8348     free(FS[i]);
8349   }
8350   free(FS);
8351 
8352   free(ko);
8353   free(iko);
8354 
8355   for (i=0; i<(EKC_core_size[Mc_AN]+2); i++){
8356     free(matRS0[i]);
8357   }
8358   free(matRS0);
8359 
8360   for (i=0; i<(Msize4[Mc_AN]+3); i++){
8361     free(matRS1[i]);
8362   }
8363   free(matRS1);
8364 }
8365 
8366 
8367 
8368 
8369 
8370 
8371 
8372 
Generate_pMatrix2_trd(int myid,int spin,int Mc_AN,double ***** Hks,double **** OLP0,double *** Krylov_U,int * MP,int * Msize,int * Msize2,int * Msize3,double ** tmpvec1)8373 void Generate_pMatrix2_trd( int myid, int spin, int Mc_AN, double *****Hks, double ****OLP0,
8374                         double ***Krylov_U, int *MP, int *Msize, int *Msize2, int *Msize3,
8375                         double **tmpvec1)
8376 
8377 /* This subroutine is exactly the same as the original Generate_pMatrix2,
8378 except that Eigen_lapack is replaced with Eigen_lapack_d, _x, or _r
8379 */
8380 
8381 {
8382   int rl,rl0,rl1,ct_AN,fan,san,can,wan,ct_on,i,j;
8383   int n,Anum,Bnum,k,ian,ih,kl,jg,ig,jan,m,m1,n1,info;
8384   int ZeroNum,rl_half;
8385   int KU_d1,KU_d2,csize;
8386   double sum,dum,tmp0,tmp1,tmp2,tmp3;
8387   double **Utmp;
8388   double *ko,*iko;
8389   double **FS;
8390   double ***U0;
8391 
8392   int rl00,rl01,rl02,rl03,rl04,rl05,rl06,rl07;
8393   int mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7,m1s;
8394   __m128d mmSum00,mmSum01,mmSum10,mmSum11,mmSum20,mmSum21,mmSum30,mmSum31, mmTmp0, mmTmp1, mmTmp2, mmTmp3, mmTmp4, mmTmp5;
8395 
8396   double mmArr[8];
8397   double sum0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
8398 
8399   ct_AN = M2G[Mc_AN];
8400   fan = FNAN[ct_AN];
8401   san = SNAN[ct_AN];
8402   can = fan + san;
8403   wan = WhatSpecies[ct_AN];
8404   ct_on = Spe_Total_CNO[wan];
8405 
8406   if (Msize[Mc_AN]<Msize3[Mc_AN])
8407     csize = Msize3[Mc_AN] + 40;
8408   else
8409     csize = Msize[Mc_AN] + 40;
8410 
8411   KU_d1 = EKC_core_size[Mc_AN]*Msize2[Mc_AN];
8412   KU_d2 = Msize2[Mc_AN];
8413 
8414   /* allocation of arrays */
8415 
8416   Utmp = (double**)malloc(sizeof(double*)*rlmax_EC[Mc_AN]);
8417   for (i=0; i<rlmax_EC[Mc_AN]; i++){
8418     Utmp[i] = (double*)malloc(sizeof(double)*EKC_core_size[Mc_AN]);
8419   }
8420 
8421   U0 = (double***)malloc(sizeof(double**)*rlmax_EC[Mc_AN]);
8422   for (i=0; i<rlmax_EC[Mc_AN]; i++){
8423     U0[i] = (double**)malloc(sizeof(double*)*EKC_core_size[Mc_AN]);
8424     for (j=0; j<EKC_core_size[Mc_AN]; j++){
8425       U0[i][j] = (double*)malloc(sizeof(double)*(Msize2[Mc_AN]+3));
8426       for (k=0; k<(Msize2[Mc_AN]+3); k++)  U0[i][j][k] = 0.0;
8427     }
8428   }
8429 
8430   FS = (double**)malloc(sizeof(double*)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
8431   for (i=0; i<(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]; i++){
8432     FS[i] = (double*)malloc(sizeof(double)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
8433   }
8434 
8435   ko = (double*)malloc(sizeof(double)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
8436   iko = (double*)malloc(sizeof(double)*(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]);
8437 
8438   /****************************************************
8439    initialize
8440   ****************************************************/
8441 
8442   for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8443     for (n=0; n<EKC_core_size[Mc_AN]; n++){
8444       for (i=0; i<Msize2[Mc_AN]; i++){
8445         U0[rl][n][i] = 0.0;
8446       }
8447     }
8448   }
8449 
8450   i = 0;
8451   for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8452     for (n=0; n<EKC_core_size[Mc_AN]; n++){
8453       if (i<Msize2[Mc_AN]) U0[rl][n][i] = 1.0;
8454       i++;
8455     }
8456   }
8457 
8458   /************************************************************
8459               orthogonalization by diagonalization
8460   ************************************************************/
8461 
8462   for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8463 
8464     /*  S * |Vn) */
8465 
8466     for (n=0; n<EKC_core_size[Mc_AN]; n++){
8467       for (i=0; i<Msize2[Mc_AN]; i++){
8468 	tmpvec1[n][i]  = 0.0;
8469       }
8470     }
8471 
8472     for (i=0; i<=can; i++){
8473 
8474       ig = natn[ct_AN][i];
8475       ian = Spe_Total_CNO[WhatSpecies[ig]];
8476       Anum = MP[i] - 1;
8477       ih = S_G2M[ig];
8478 
8479       for (j=0; j<=can; j++){
8480 
8481 	kl = RMI1[Mc_AN][i][j];
8482 	jg = natn[ct_AN][j];
8483 	jan = Spe_Total_CNO[WhatSpecies[jg]];
8484 	Bnum = MP[j] - 1;
8485 
8486 	if (0<=kl){
8487 
8488 #ifdef nosse
8489 
8490 	  /* Original version */
8491 	  /**/
8492 	  for (m=0; m<ian; m++){
8493 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
8494 
8495 	      sum = 0.0;
8496 	      for (k=0; k<jan; k++){
8497 		sum += OLP0[ih][kl][m][k]*U0[rl][n][Bnum+k];
8498 	      }
8499 	      tmpvec1[n][Anum+m] += sum;
8500 	    }
8501 	  }
8502 	  /**/
8503 
8504 #else
8505 
8506 	  /* Unrolling + SSE version */
8507 
8508 	  for (m=0; m<(ian-3); m+=4){
8509 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
8510 
8511 	      mmSum00 = _mm_setzero_pd();
8512 	      mmSum01 = _mm_setzero_pd();
8513 	      mmSum10 = _mm_setzero_pd();
8514 	      mmSum11 = _mm_setzero_pd();
8515 	      mmSum20 = _mm_setzero_pd();
8516 	      mmSum21 = _mm_setzero_pd();
8517 	      mmSum30 = _mm_setzero_pd();
8518 	      mmSum31 = _mm_setzero_pd();
8519 
8520 	      for (k=0; k<(jan-3); k+=4){
8521 		mmTmp0 = _mm_loadu_pd(&U0[rl][n][Bnum+k+0]);
8522 		mmTmp1 = _mm_loadu_pd(&U0[rl][n][Bnum+k+2]);
8523 
8524 		mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+0]),mmTmp0));
8525 		mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+2]),mmTmp1));
8526 
8527 		mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+0]),mmTmp0));
8528 		mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+2]),mmTmp1));
8529 
8530 		mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+0]),mmTmp0));
8531 		mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+2]),mmTmp1));
8532 
8533 		mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+0]),mmTmp0));
8534 		mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+2]),mmTmp1));
8535 	      }
8536 
8537 	      mmSum00 = _mm_add_pd(mmSum00, mmSum01);
8538 	      mmSum10 = _mm_add_pd(mmSum10, mmSum11);
8539 	      mmSum20 = _mm_add_pd(mmSum20, mmSum21);
8540 	      mmSum30 = _mm_add_pd(mmSum30, mmSum31);
8541 
8542 	      _mm_storeu_pd(&mmArr[0], mmSum00);
8543 	      _mm_storeu_pd(&mmArr[2], mmSum10);
8544 	      _mm_storeu_pd(&mmArr[4], mmSum20);
8545 	      _mm_storeu_pd(&mmArr[6], mmSum30);
8546 
8547 	      sum0 = mmArr[0] + mmArr[1];
8548 	      sum1 = mmArr[2] + mmArr[3];
8549 	      sum2 = mmArr[4] + mmArr[5];
8550 	      sum3 = mmArr[6] + mmArr[7];
8551 
8552 	      for (; k<jan; k++){
8553 		sum0 += OLP0[ih][kl][m+0][k]*U0[rl][n][Bnum+k];
8554 		sum1 += OLP0[ih][kl][m+1][k]*U0[rl][n][Bnum+k];
8555 		sum2 += OLP0[ih][kl][m+2][k]*U0[rl][n][Bnum+k];
8556 		sum3 += OLP0[ih][kl][m+3][k]*U0[rl][n][Bnum+k];
8557 	      }
8558 
8559 	      tmpvec1[n][Anum+m+0] += sum0;
8560 	      tmpvec1[n][Anum+m+1] += sum1;
8561 	      tmpvec1[n][Anum+m+2] += sum2;
8562 	      tmpvec1[n][Anum+m+3] += sum3;
8563 	    }
8564 	  }
8565 
8566 	  for (; m<ian; m++){
8567 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
8568 
8569 	      sum = 0.0;
8570 	      for (k=0; k<jan; k++){
8571 		sum += OLP0[ih][kl][m][k]*U0[rl][n][Bnum+k];
8572 	      }
8573 	      tmpvec1[n][Anum+m] += sum;
8574 	    }
8575 	  }
8576 
8577 #endif
8578 
8579 	}
8580       }
8581     }
8582 
8583 #ifdef nosse
8584 
8585     /* Original version */
8586     /**/
8587     for (rl0=rl; rl0<rlmax_EC[Mc_AN]; rl0++){
8588       for (m=0; m<EKC_core_size[Mc_AN]; m++){
8589         for (n=0; n<EKC_core_size[Mc_AN]; n++){
8590   	  sum = 0.0;
8591 	  for (i=0; i<Msize2[Mc_AN]; i++){
8592             sum += U0[rl0][m][i]*tmpvec1[n][i];
8593 	  }
8594           FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
8595           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum;
8596 	}
8597       }
8598     }
8599     /**/
8600 
8601 #else
8602 
8603     /* Unrolling + SSE version */
8604 
8605     for (rl0=rl; rl0<rlmax_EC[Mc_AN]; rl0++){
8606       for (m=0; m<(EKC_core_size[Mc_AN]-3); m+=4){
8607         for (n=0; n<EKC_core_size[Mc_AN]; n++){
8608 
8609 	  mmSum00 = _mm_setzero_pd();
8610 	  mmSum01 = _mm_setzero_pd();
8611 	  mmSum10 = _mm_setzero_pd();
8612 	  mmSum11 = _mm_setzero_pd();
8613 	  mmSum20 = _mm_setzero_pd();
8614 	  mmSum21 = _mm_setzero_pd();
8615 	  mmSum30 = _mm_setzero_pd();
8616 	  mmSum31 = _mm_setzero_pd();
8617 
8618 	  for (i=0; i<(Msize2[Mc_AN]-3); i+=4){
8619 	    mmTmp0 = _mm_loadu_pd(&tmpvec1[n][i+0]);
8620 	    mmTmp1 = _mm_loadu_pd(&tmpvec1[n][i+2]);
8621 
8622 	    mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+0][i+0]),mmTmp0));
8623 	    mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+0][i+2]),mmTmp1));
8624 
8625 	    mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+1][i+0]),mmTmp0));
8626 	    mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+1][i+2]),mmTmp1));
8627 
8628 	    mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+2][i+0]),mmTmp0));
8629 	    mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+2][i+2]),mmTmp1));
8630 
8631 	    mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+3][i+0]),mmTmp0));
8632 	    mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&U0[rl0][m+3][i+2]),mmTmp1));
8633 	  }
8634 
8635 	  mmSum00 = _mm_add_pd(mmSum00, mmSum01);
8636 	  mmSum10 = _mm_add_pd(mmSum10, mmSum11);
8637 	  mmSum20 = _mm_add_pd(mmSum20, mmSum21);
8638 	  mmSum30 = _mm_add_pd(mmSum30, mmSum31);
8639 
8640 	  _mm_storeu_pd(&mmArr[0], mmSum00);
8641 	  _mm_storeu_pd(&mmArr[2], mmSum10);
8642 	  _mm_storeu_pd(&mmArr[4], mmSum20);
8643 	  _mm_storeu_pd(&mmArr[6], mmSum30);
8644 
8645 	  sum0 = mmArr[0] + mmArr[1];
8646 	  sum1 = mmArr[2] + mmArr[3];
8647 	  sum2 = mmArr[4] + mmArr[5];
8648 	  sum3 = mmArr[6] + mmArr[7];
8649 
8650 	  for (; i<Msize2[Mc_AN]; i++){
8651             sum0 += U0[rl0][m+0][i]*tmpvec1[n][i];
8652             sum1 += U0[rl0][m+1][i]*tmpvec1[n][i];
8653             sum2 += U0[rl0][m+2][i]*tmpvec1[n][i];
8654             sum3 += U0[rl0][m+3][i]*tmpvec1[n][i];
8655 	  }
8656 
8657           FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum0;
8658           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum0;
8659 
8660           FS[rl0*EKC_core_size[Mc_AN]+m+2][rl*EKC_core_size[Mc_AN]+n+1] = sum1;
8661           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+2] = sum1;
8662 
8663           FS[rl0*EKC_core_size[Mc_AN]+m+3][rl*EKC_core_size[Mc_AN]+n+1] = sum2;
8664           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+3] = sum2;
8665 
8666           FS[rl0*EKC_core_size[Mc_AN]+m+4][rl*EKC_core_size[Mc_AN]+n+1] = sum3;
8667           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+4] = sum3;
8668 	}
8669       }
8670 
8671       for (; m<EKC_core_size[Mc_AN]; m++){
8672         for (n=0; n<EKC_core_size[Mc_AN]; n++){
8673   	  sum = 0.0;
8674 	  for (i=0; i<Msize2[Mc_AN]; i++){
8675             sum += U0[rl0][m][i]*tmpvec1[n][i];
8676 	  }
8677           FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
8678           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum;
8679 	}
8680       }
8681     }
8682 
8683 #endif
8684 
8685   }
8686 
8687 
8688   info = Eigen_lapack_d(FS,ko,Msize3[Mc_AN],Msize3[Mc_AN]);
8689   if (info!=0){
8690     info = Eigen_lapack_x(FS,ko,Msize3[Mc_AN],Msize3[Mc_AN]);
8691     if (info!=0){
8692       Eigen_lapack_r(FS,ko,Msize3[Mc_AN],Msize3[Mc_AN]);
8693     }
8694   }
8695 
8696   ZeroNum = 0;
8697 
8698   for (i=1; i<=Msize3[Mc_AN]; i++){
8699 
8700     if (error_check==1){
8701       printf("spin=%2d Mc_AN=%2d i=%3d ko[i]=%18.15f\n",spin,Mc_AN,i,ko[i]);
8702     }
8703 
8704     if (cutoff_value<ko[i]){
8705       ko[i]  = sqrt(fabs(ko[i]));
8706       iko[i] = 1.0/ko[i];
8707     }
8708     else{
8709       ZeroNum++;
8710       ko[i]  = 0.0;
8711       iko[i] = 0.0;
8712     }
8713   }
8714 
8715   if (error_check==1){
8716     printf("spin=%2d Mc_AN=%2d ZeroNum=%2d\n",spin,Mc_AN,ZeroNum);
8717   }
8718 
8719   for (i=1; i<=Msize3[Mc_AN]; i++){
8720     for (j=1; j<=Msize3[Mc_AN]; j++){
8721       FS[i][j] = FS[i][j]*iko[j];
8722     }
8723   }
8724 
8725   /* transpose for later calculation */
8726   for (i=1; i<=Msize3[Mc_AN]; i++){
8727     for (j=i+1; j<=Msize3[Mc_AN]; j++){
8728       tmp1 = FS[i][j];
8729       tmp2 = FS[j][i];
8730       FS[i][j] = tmp2;
8731       FS[j][i] = tmp1;
8732     }
8733   }
8734 
8735   /* U0 * U * lamda^{-1/2} */
8736 
8737 #ifdef nosse
8738 
8739   /* original version */
8740 
8741   /**/
8742   for (i=0; i<Msize2[Mc_AN]; i++){
8743     for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
8744       for (m=0; m<EKC_core_size[Mc_AN]; m++){
8745 
8746         m1 = rl0*EKC_core_size[Mc_AN] + m + 1;
8747 
8748 	sum = 0.0;
8749 	for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8750 
8751           n1 = rl*EKC_core_size[Mc_AN] + 1;
8752 
8753 	  for (n=0; n<EKC_core_size[Mc_AN]; n++){
8754 	    sum += U0[rl][n][i]*FS[m1][n1+n];
8755 	  }
8756 	}
8757 
8758         Utmp[rl0][m] = sum;
8759       }
8760     }
8761 
8762     for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
8763       for (m=0; m<EKC_core_size[Mc_AN]; m++){
8764         U0[rl0][m][i] = Utmp[rl0][m];
8765       }
8766     }
8767   }
8768   /**/
8769 
8770 #else
8771 
8772   /* Unrolling + SSE version */
8773   /**/
8774   for (i=0; i<Msize2[Mc_AN]; i++){
8775 
8776     for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8777       for (n=0; n<EKC_core_size[Mc_AN]; n++){
8778         Utmp[rl][n] = U0[rl][n][i];
8779       }
8780     }
8781 
8782     for (m1=1; m1<=(rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]-3); m1+=4){
8783 
8784       rl00 = (m1+0-1)/EKC_core_size[Mc_AN];
8785       rl01 = (m1+1-1)/EKC_core_size[Mc_AN];
8786       rl02 = (m1+2-1)/EKC_core_size[Mc_AN];
8787       rl03 = (m1+3-1)/EKC_core_size[Mc_AN];
8788 
8789       mm0 = (m1+0-1)%EKC_core_size[Mc_AN];
8790       mm1 = (m1+1-1)%EKC_core_size[Mc_AN];
8791       mm2 = (m1+2-1)%EKC_core_size[Mc_AN];
8792       mm3 = (m1+3-1)%EKC_core_size[Mc_AN];
8793 
8794       sum0 = 0.0;
8795       sum1 = 0.0;
8796       sum2 = 0.0;
8797       sum3 = 0.0;
8798 
8799       mmSum00 = _mm_setzero_pd();
8800       mmSum01 = _mm_setzero_pd();
8801       mmSum10 = _mm_setzero_pd();
8802       mmSum11 = _mm_setzero_pd();
8803       mmSum20 = _mm_setzero_pd();
8804       mmSum21 = _mm_setzero_pd();
8805       mmSum30 = _mm_setzero_pd();
8806       mmSum31 = _mm_setzero_pd();
8807 
8808       for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8809 
8810 	n1 = rl*EKC_core_size[Mc_AN] + 1;
8811 
8812         if (0){
8813 	  mmTmp0 = _mm_loadu_pd(&Utmp[rl][n+0]); /*???????????*/
8814   	  mmTmp1 = _mm_loadu_pd(&Utmp[rl][n+2]); /*???????????*/
8815         }
8816 
8817 	for (n=0; n<(EKC_core_size[Mc_AN]-3); n+=4){
8818 	  mmTmp0 = _mm_loadu_pd(&Utmp[rl][n+0]);
8819 	  mmTmp1 = _mm_loadu_pd(&Utmp[rl][n+2]);
8820 	  mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&FS[m1+0][n1+n+0]),mmTmp0));
8821 	  mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&FS[m1+0][n1+n+2]),mmTmp1));
8822 
8823 	  mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&FS[m1+1][n1+n+0]),mmTmp0));
8824 	  mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&FS[m1+1][n1+n+2]),mmTmp1));
8825 
8826 	  mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&FS[m1+2][n1+n+0]),mmTmp0));
8827 	  mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&FS[m1+2][n1+n+2]),mmTmp1));
8828 
8829 	  mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&FS[m1+3][n1+n+0]),mmTmp0));
8830 	  mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&FS[m1+3][n1+n+2]),mmTmp1));
8831 	}
8832 
8833 	for (; n<EKC_core_size[Mc_AN]; n++){
8834 	  sum0 += Utmp[rl][n]*FS[m1+0][n1+n];
8835 	  sum1 += Utmp[rl][n]*FS[m1+1][n1+n];
8836 	  sum2 += Utmp[rl][n]*FS[m1+2][n1+n];
8837 	  sum3 += Utmp[rl][n]*FS[m1+3][n1+n];
8838 	}
8839       }
8840 
8841       mmSum00 = _mm_add_pd(mmSum00, mmSum01);
8842       mmSum10 = _mm_add_pd(mmSum10, mmSum11);
8843       mmSum20 = _mm_add_pd(mmSum20, mmSum21);
8844       mmSum30 = _mm_add_pd(mmSum30, mmSum31);
8845 
8846       _mm_storeu_pd(&mmArr[0], mmSum00);
8847       _mm_storeu_pd(&mmArr[2], mmSum10);
8848       _mm_storeu_pd(&mmArr[4], mmSum20);
8849       _mm_storeu_pd(&mmArr[6], mmSum30);
8850 
8851       sum0 += mmArr[0] + mmArr[1];
8852       sum1 += mmArr[2] + mmArr[3];
8853       sum2 += mmArr[4] + mmArr[5];
8854       sum3 += mmArr[6] + mmArr[7];
8855 
8856       U0[rl00][mm0][i] = sum0;
8857       U0[rl01][mm1][i] = sum1;
8858       U0[rl02][mm2][i] = sum2;
8859       U0[rl03][mm3][i] = sum3;
8860     }
8861 
8862     m1s = rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN] - (rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN])%4 + 1;
8863 
8864     for (m1=m1s; m1<=rlmax_EC[Mc_AN]*EKC_core_size[Mc_AN]; m1++){
8865 
8866       rl0 = (m1-1)/EKC_core_size[Mc_AN];
8867       m = (m1-1)%EKC_core_size[Mc_AN];
8868 
8869       sum = 0.0;
8870 
8871       for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8872 
8873 	n1 = rl*EKC_core_size[Mc_AN] + 1;
8874 
8875 	for (n=0; n<EKC_core_size[Mc_AN]; n++){
8876 	  sum += Utmp[rl][n]*FS[m1][n1+n];
8877 	}
8878       }
8879 
8880       U0[rl0][m][i] = sum;
8881     }
8882   } /* i */
8883 
8884 #endif
8885 
8886   Krylov_U[spin][Mc_AN][0] = ZeroNum;
8887 
8888   for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8889     for (n=0; n<EKC_core_size[Mc_AN]; n++){
8890       for (i=0; i<Msize2[Mc_AN]; i++){
8891         Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+i+1] = U0[rl][n][i];
8892       }
8893     }
8894   }
8895 
8896   /************************************************************
8897          check the orthonormality of Krylov vectors
8898   ************************************************************/
8899 
8900   if (error_check==1){
8901 
8902     for (rl=0; rl<rlmax_EC[Mc_AN]; rl++){
8903 
8904       for (n=0; n<EKC_core_size[Mc_AN]; n++){
8905 	for (i=0; i<Msize2[Mc_AN]; i++){
8906 	  tmpvec1[n][i]  = 0.0;
8907 	}
8908       }
8909 
8910       for (i=0; i<=can; i++){
8911 
8912 	ig = natn[ct_AN][i];
8913 	ian = Spe_Total_CNO[WhatSpecies[ig]];
8914 	Anum = MP[i] - 1;
8915 	ih = S_G2M[ig];
8916 
8917 	for (j=0; j<=can; j++){
8918 
8919 	  kl = RMI1[Mc_AN][i][j];
8920 	  jg = natn[ct_AN][j];
8921 	  jan = Spe_Total_CNO[WhatSpecies[jg]];
8922 	  Bnum = MP[j] - 1;
8923 
8924 	  if (0<=kl){
8925 
8926 	    for (m=0; m<ian; m++){
8927 	      for (n=0; n<EKC_core_size[Mc_AN]; n++){
8928 
8929 		sum = 0.0;
8930 		for (k=0; k<jan; k++){
8931 		  sum += OLP0[ih][kl][m][k]*U0[rl][n][Bnum+k];
8932 		}
8933 
8934 		tmpvec1[n][Anum+m] += sum;
8935 	      }
8936 	    }
8937 	  }
8938 	}
8939       }
8940 
8941       for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
8942 	for (m=0; m<EKC_core_size[Mc_AN]; m++){
8943 	  for (n=0; n<EKC_core_size[Mc_AN]; n++){
8944 	    sum = 0.0;
8945 	    for (i=0; i<Msize2[Mc_AN]; i++){
8946 	      sum += U0[rl0][m][i]*tmpvec1[n][i];
8947 	    }
8948 
8949 	    if (rl==rl0 && m==n){
8950 	      if ( 1.0e-10<fabs(sum-1.0) ) {
8951 		printf("A spin=%2d Mc_AN=%2d rl=%2d rl0=%2d m=%2d n=%2d sum=%18.15f\n",
8952 		       spin,Mc_AN,rl,rl0,m,n,sum);
8953 	      }
8954 	    }
8955 	    else{
8956 	      if ( 1.0e-10<fabs(sum) ) {
8957 		printf("B spin=%2d Mc_AN=%2d rl=%2d rl0=%2d m=%2d n=%2d sum=%18.15f\n",
8958 		       spin,Mc_AN,rl,rl0,m,n,sum);
8959 	      }
8960 	    }
8961 
8962 	  }
8963 	}
8964       }
8965     }
8966   }
8967 
8968   /* freeing of arrays */
8969 
8970   for (i=0; i<rlmax_EC[Mc_AN]; i++){
8971     free(Utmp[i]);
8972   }
8973   free(Utmp);
8974 
8975   for (i=0; i<rlmax_EC[Mc_AN]; i++){
8976     for (j=0; j<EKC_core_size[Mc_AN]; j++){
8977       free(U0[i][j]);
8978     }
8979     free(U0[i]);
8980   }
8981   free(U0);
8982 
8983   for (i=0; i<(rlmax_EC[Mc_AN]+2)*EKC_core_size[Mc_AN]; i++){
8984     free(FS[i]);
8985   }
8986   free(FS);
8987 
8988   free(ko);
8989   free(iko);
8990 }
8991 
8992 
8993 
8994 
8995 
8996 
8997 
Embedding_Matrix_trd(int spin,int Mc_AN,double ***** Hks,double *** Krylov_U,double **** EC_matrix,int * MP,int * Msize,int * Msize2,int * Msize3,double ** tmpvec,int EKC_core_size_max,int Msize2_max)8998 void Embedding_Matrix_trd(int spin, int Mc_AN, double *****Hks,
8999                       double ***Krylov_U, double ****EC_matrix,
9000                       int *MP, int *Msize, int *Msize2, int *Msize3,
9001                       double **tmpvec, int EKC_core_size_max, int Msize2_max)
9002 
9003 /* This subroutine is exactly the same as the original Embedding_Matrix,
9004 except for the OMP parallelized loops
9005 */
9006 
9007 {
9008   int ct_AN,fan,san,can,wan,ct_on;
9009   int rl,rl0,m,n,i,j,k,kl,jg,jan,ih,ian;
9010   int Anum,Bnum,ig;
9011   int KU_d1,KU_d2,csize;
9012   double sum,tmp1,tmp2,tmp3;
9013 
9014   __m128d mmSum00,mmSum01,mmSum10,mmSum11,mmSum20,mmSum21,mmSum30,mmSum31, mmTmp0, mmTmp1;
9015   double mmArr[8];
9016   double sum0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
9017 
9018   /* for OpenMP */
9019   int OMPID,Nthrds,Nprocs;
9020 
9021   ct_AN = M2G[Mc_AN];
9022   fan = FNAN[ct_AN];
9023   san = SNAN[ct_AN];
9024   can = fan + san;
9025   wan = WhatSpecies[ct_AN];
9026   ct_on = Spe_Total_CNO[wan];
9027 
9028   if (Msize[Mc_AN]<Msize3[Mc_AN])
9029     csize = Msize3[Mc_AN] + 40;
9030   else
9031     csize = Msize[Mc_AN] + 40;
9032 
9033   KU_d1 = EKC_core_size[Mc_AN]*Msize2[Mc_AN];
9034   KU_d2 = Msize2[Mc_AN];
9035 
9036   /*******************************
9037             u1^+ C^+ u2
9038   *******************************/
9039 
9040 #pragma omp parallel shared(EKC_core_size_max,Msize2_max,rlmax_EC,Mc_AN,EKC_core_size,Msize2,fan,can,natn,ct_AN,Spe_Total_CNO,WhatSpecies,MP,S_G2M,RMI1,Hks,Krylov_U,spin,KU_d1,KU_d2,Msize,EC_matrix)   private(OMPID,Nthrds,Nprocs,rl,n,i,ig,ian,Anum,ih,j,kl,jg,jan,Bnum,m,sum,k,mmSum00,mmSum01,mmSum10,mmSum11,mmSum20,mmSum21,mmSum30,mmSum31,mmTmp0,mmTmp1,mmArr,sum0,sum1,sum2,sum3,rl0)
9041 
9042   {
9043 
9044     double **tmpvec1;
9045     tmpvec1 = (double**)malloc(sizeof(double*)*EKC_core_size_max);
9046     for (i=0; i<EKC_core_size_max; i++){
9047       tmpvec1[i] = (double*)malloc(sizeof(double)*Msize2_max);
9048     }
9049 
9050     /* get info. on OpenMP */
9051 
9052     OMPID = omp_get_thread_num();
9053     Nthrds = omp_get_num_threads();
9054     Nprocs = omp_get_num_procs();
9055 
9056     for (rl=0+OMPID; rl<rlmax_EC[Mc_AN]; rl+=Nthrds){
9057 
9058     /* C^+ u2 */
9059 
9060     for (n=0; n<EKC_core_size[Mc_AN]; n++){
9061       for (i=0; i<Msize2[Mc_AN]; i++){
9062 	tmpvec1[n][i]  = 0.0;
9063       }
9064     }
9065 
9066     for (i=0; i<=fan; i++){
9067 
9068       ig = natn[ct_AN][i];
9069       ian = Spe_Total_CNO[WhatSpecies[ig]];
9070       Anum = MP[i] - 1;
9071       ih = S_G2M[ig];
9072 
9073       for (j=fan+1; j<=can; j++){
9074 
9075 	kl = RMI1[Mc_AN][i][j];
9076 	jg = natn[ct_AN][j];
9077 	jan = Spe_Total_CNO[WhatSpecies[jg]];
9078 	Bnum = MP[j];
9079 
9080 	if (0<=kl){
9081 
9082 #ifdef nosse
9083 
9084 	  /* Original version */
9085 	  /**/
9086 	  for (m=0; m<ian; m++){
9087 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
9088 
9089 	      sum = 0.0;
9090 	      for (k=0; k<jan; k++){
9091 		sum += Hks[spin][ih][kl][m][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
9092 	      }
9093 
9094 	      tmpvec1[n][Anum+m] += sum;
9095 	    }
9096 	  }
9097 	  /**/
9098 
9099 #else
9100 
9101 	  /* Unrolling + SSE version */
9102 	  /**/
9103 	  for (m=0; m<(ian-3); m+=4){
9104 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
9105 
9106 	      mmSum00 = _mm_setzero_pd();
9107 	      mmSum01 = _mm_setzero_pd();
9108 	      mmSum10 = _mm_setzero_pd();
9109 	      mmSum11 = _mm_setzero_pd();
9110 	      mmSum20 = _mm_setzero_pd();
9111 	      mmSum21 = _mm_setzero_pd();
9112 	      mmSum30 = _mm_setzero_pd();
9113 	      mmSum31 = _mm_setzero_pd();
9114 
9115 	      for (k=0; k<(jan-3); k+=4){
9116 		mmTmp0 = _mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+0]);
9117 		mmTmp1 = _mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+2]);
9118 
9119 		mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+0][k+0]),mmTmp0));
9120 		mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+0][k+2]),mmTmp1));
9121 
9122 		mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+1][k+0]),mmTmp0));
9123 		mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+1][k+2]),mmTmp1));
9124 
9125 		mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+2][k+0]),mmTmp0));
9126 		mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+2][k+2]),mmTmp1));
9127 
9128 		mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+3][k+0]),mmTmp0));
9129 		mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+3][k+2]),mmTmp1));
9130 	      }
9131 
9132 	      mmSum00 = _mm_add_pd(mmSum00, mmSum01);
9133 	      mmSum10 = _mm_add_pd(mmSum10, mmSum11);
9134 	      mmSum20 = _mm_add_pd(mmSum20, mmSum21);
9135 	      mmSum30 = _mm_add_pd(mmSum30, mmSum31);
9136 
9137 	      _mm_storeu_pd(&mmArr[0], mmSum00);
9138 	      _mm_storeu_pd(&mmArr[2], mmSum10);
9139 	      _mm_storeu_pd(&mmArr[4], mmSum20);
9140 	      _mm_storeu_pd(&mmArr[6], mmSum30);
9141 
9142 	      sum0 = mmArr[0] + mmArr[1];
9143 	      sum1 = mmArr[2] + mmArr[3];
9144 	      sum2 = mmArr[4] + mmArr[5];
9145 	      sum3 = mmArr[6] + mmArr[7];
9146 
9147 	      for (; k<jan; k++){
9148 		sum0 += Hks[spin][ih][kl][m+0][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
9149 		sum1 += Hks[spin][ih][kl][m+1][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
9150 		sum2 += Hks[spin][ih][kl][m+2][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
9151 		sum3 += Hks[spin][ih][kl][m+3][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
9152 	      }
9153 
9154 	      tmpvec1[n][Anum+m+0] += sum0;
9155 	      tmpvec1[n][Anum+m+1] += sum1;
9156 	      tmpvec1[n][Anum+m+2] += sum2;
9157 	      tmpvec1[n][Anum+m+3] += sum3;
9158 	    }
9159 	  }
9160 
9161 	  for (; m<ian; m++){
9162 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
9163 
9164 	      sum = 0.0;
9165 	      for (k=0; k<jan; k++){
9166 		sum += Hks[spin][ih][kl][m][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
9167 	      }
9168 
9169 	      tmpvec1[n][Anum+m] += sum;
9170 	    }
9171 	  }
9172 	  /**/
9173 #endif
9174 
9175 	}
9176       }
9177     }
9178 
9179     /* u1^+ C^+ u2 */
9180 
9181 #ifdef nosse
9182 
9183     /* Original version */
9184     /**/
9185     for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
9186       for (m=0; m<EKC_core_size[Mc_AN]; m++){
9187         for (n=0; n<EKC_core_size[Mc_AN]; n++){
9188   	  sum = 0.0;
9189 	  for (i=0; i<Msize[Mc_AN]; i++){
9190             sum += Krylov_U[spin][Mc_AN][rl0*KU_d1+m*KU_d2+i+1]*tmpvec1[n][i];
9191 	  }
9192           EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
9193 	}
9194       }
9195     }
9196     /**/
9197 
9198 #else
9199 
9200     /* Unrolling + SSE version */
9201     /**/
9202     for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
9203       for (m=0; m<(EKC_core_size[Mc_AN]-3); m+=4){
9204         for (n=0; n<EKC_core_size[Mc_AN]; n++){
9205 
9206 	  mmSum00 = _mm_setzero_pd();
9207 	  mmSum01 = _mm_setzero_pd();
9208 	  mmSum10 = _mm_setzero_pd();
9209 	  mmSum11 = _mm_setzero_pd();
9210 	  mmSum20 = _mm_setzero_pd();
9211 	  mmSum21 = _mm_setzero_pd();
9212 	  mmSum30 = _mm_setzero_pd();
9213 	  mmSum31 = _mm_setzero_pd();
9214 
9215 	  for (i=0; i<(Msize[Mc_AN]-3); i+=4){
9216 	    mmTmp0 = _mm_loadu_pd(&tmpvec1[n][i+0]);
9217 	    mmTmp1 = _mm_loadu_pd(&tmpvec1[n][i+2]);
9218 
9219 	    mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+1]),mmTmp0));
9220 	    mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+3]),mmTmp1));
9221 
9222 	    mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+1)*KU_d2+i+1]),mmTmp0));
9223 	    mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+1)*KU_d2+i+3]),mmTmp1));
9224 
9225 	    mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+2)*KU_d2+i+1]),mmTmp0));
9226 	    mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+2)*KU_d2+i+3]),mmTmp1));
9227 
9228 	    mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+3)*KU_d2+i+1]),mmTmp0));
9229 	    mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+3)*KU_d2+i+3]),mmTmp1));
9230 	  }
9231 
9232 	  mmSum00 = _mm_add_pd(mmSum00, mmSum01);
9233 	  mmSum10 = _mm_add_pd(mmSum10, mmSum11);
9234 	  mmSum20 = _mm_add_pd(mmSum20, mmSum21);
9235 	  mmSum30 = _mm_add_pd(mmSum30, mmSum31);
9236 
9237 	  _mm_storeu_pd(&mmArr[0], mmSum00);
9238 	  _mm_storeu_pd(&mmArr[2], mmSum10);
9239 	  _mm_storeu_pd(&mmArr[4], mmSum20);
9240 	  _mm_storeu_pd(&mmArr[6], mmSum30);
9241 
9242 	  sum0 = mmArr[0] + mmArr[1];
9243 	  sum1 = mmArr[2] + mmArr[3];
9244 	  sum2 = mmArr[4] + mmArr[5];
9245 	  sum3 = mmArr[6] + mmArr[7];
9246 
9247 	  for (; i<Msize[Mc_AN]; i++){
9248             sum0 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+1]*tmpvec1[n][i];
9249             sum1 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+1)*KU_d2+i+1]*tmpvec1[n][i];
9250             sum2 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+2)*KU_d2+i+1]*tmpvec1[n][i];
9251             sum3 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+3)*KU_d2+i+1]*tmpvec1[n][i];
9252 	  }
9253 
9254           EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum0;
9255           EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+2][rl*EKC_core_size[Mc_AN]+n+1] = sum1;
9256           EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+3][rl*EKC_core_size[Mc_AN]+n+1] = sum2;
9257           EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+4][rl*EKC_core_size[Mc_AN]+n+1] = sum3;
9258 	}
9259       }
9260 
9261       for (; m<EKC_core_size[Mc_AN]; m++){
9262         for (n=0; n<EKC_core_size[Mc_AN]; n++){
9263   	  sum = 0.0;
9264 	  for (i=0; i<Msize[Mc_AN]; i++){
9265             sum += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+1]*tmpvec1[n][i];
9266 	  }
9267           EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
9268 	}
9269       }
9270 
9271     }
9272     /**/
9273 
9274 #endif
9275 
9276   } /* rl */
9277 
9278   for (i=0; i<EKC_core_size_max; i++){
9279     free(tmpvec1[i]);
9280   }
9281   free(tmpvec1);
9282 
9283 
9284   } /* #pragma omp parallel */
9285 
9286   /*******************************
9287             u2^+ C u1
9288   *******************************/
9289 
9290   for (i=1; i<=Msize3[Mc_AN]; i++){
9291     for (j=i; j<=Msize3[Mc_AN]; j++){
9292 
9293       tmp1 = EC_matrix[spin][Mc_AN][i][j];
9294       tmp2 = EC_matrix[spin][Mc_AN][j][i];
9295       tmp3 = tmp1 + tmp2;
9296 
9297       EC_matrix[spin][Mc_AN][i][j] = tmp3;
9298       EC_matrix[spin][Mc_AN][j][i] = tmp3;
9299     }
9300   }
9301 
9302   /*******************************
9303              u2^+ B u2
9304   *******************************/
9305 
9306 #pragma omp parallel shared(EKC_core_size_max,Msize2_max,rlmax_EC,Mc_AN,EKC_core_size,Msize2,fan,can,natn,ct_AN,Spe_Total_CNO,WhatSpecies,MP,S_G2M,RMI1,Hks,Krylov_U,spin,KU_d1,KU_d2,Msize,EC_matrix)   private(OMPID,Nthrds,Nprocs,rl,n,i,ig,ian,Anum,ih,j,kl,jg,jan,Bnum,m,sum,k,mmSum00,mmSum01,mmSum10,mmSum11,mmSum20,mmSum21,mmSum30,mmSum31,mmTmp0,mmTmp1,mmArr,sum0,sum1,sum2,sum3,rl0)
9307 
9308   {
9309 
9310     double **tmpvec1;
9311     tmpvec1 = (double**)malloc(sizeof(double*)*EKC_core_size_max);
9312     for (i=0; i<EKC_core_size_max; i++){
9313       tmpvec1[i] = (double*)malloc(sizeof(double)*Msize2_max);
9314     }
9315 
9316     /* get info. on OpenMP */
9317 
9318     OMPID = omp_get_thread_num();
9319     Nthrds = omp_get_num_threads();
9320     Nprocs = omp_get_num_procs();
9321 
9322   for (rl=0+OMPID; rl<rlmax_EC[Mc_AN]; rl+=Nthrds){
9323 
9324     /* B u2 */
9325 
9326     for (n=0; n<EKC_core_size[Mc_AN]; n++){
9327       for (i=0; i<Msize2[Mc_AN]; i++){
9328 	tmpvec1[n][i]  = 0.0;
9329       }
9330     }
9331 
9332     for (i=fan+1; i<=can; i++){
9333 
9334       ig = natn[ct_AN][i];
9335       ian = Spe_Total_CNO[WhatSpecies[ig]];
9336       Anum = MP[i] - 1;
9337       ih = S_G2M[ig];
9338 
9339       for (j=fan+1; j<=can; j++){
9340 
9341 	kl = RMI1[Mc_AN][i][j];
9342 	jg = natn[ct_AN][j];
9343 	jan = Spe_Total_CNO[WhatSpecies[jg]];
9344 	Bnum = MP[j];
9345 
9346 	if (0<=kl){
9347 
9348 #ifdef nosse
9349 
9350 	  /* Original version */
9351 	  /**/
9352 	  for (m=0; m<ian; m++){
9353 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
9354 
9355 	      sum = 0.0;
9356 	      for (k=0; k<jan; k++){
9357 		sum += Hks[spin][ih][kl][m][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
9358 	      }
9359 
9360 	      tmpvec1[n][Anum+m] += sum;
9361 	    }
9362 	  }
9363 	  /**/
9364 
9365 #else
9366 
9367 	  /* Unrolling + SSE version */
9368 	  /**/
9369 	  for (m=0; m<(ian-3); m+=4){
9370 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
9371 
9372 	      mmSum00 = _mm_setzero_pd();
9373 	      mmSum01 = _mm_setzero_pd();
9374 	      mmSum10 = _mm_setzero_pd();
9375 	      mmSum11 = _mm_setzero_pd();
9376 	      mmSum20 = _mm_setzero_pd();
9377 	      mmSum21 = _mm_setzero_pd();
9378 	      mmSum30 = _mm_setzero_pd();
9379 	      mmSum31 = _mm_setzero_pd();
9380 
9381 	      for (k=0; k<(jan-3); k+=4){
9382 		mmTmp0 = _mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+0]);
9383 		mmTmp1 = _mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+2]);
9384 
9385 		mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+0][k+0]),mmTmp0));
9386 		mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+0][k+2]),mmTmp1));
9387 
9388 		mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+1][k+0]),mmTmp0));
9389 		mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+1][k+2]),mmTmp1));
9390 
9391 		mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+2][k+0]),mmTmp0));
9392 		mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+2][k+2]),mmTmp1));
9393 
9394 		mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+3][k+0]),mmTmp0));
9395 		mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&Hks[spin][ih][kl][m+3][k+2]),mmTmp1));
9396 	      }
9397 
9398 	      mmSum00 = _mm_add_pd(mmSum00, mmSum01);
9399 	      mmSum10 = _mm_add_pd(mmSum10, mmSum11);
9400 	      mmSum20 = _mm_add_pd(mmSum20, mmSum21);
9401 	      mmSum30 = _mm_add_pd(mmSum30, mmSum31);
9402 
9403 	      _mm_storeu_pd(&mmArr[0], mmSum00);
9404 	      _mm_storeu_pd(&mmArr[2], mmSum10);
9405 	      _mm_storeu_pd(&mmArr[4], mmSum20);
9406 	      _mm_storeu_pd(&mmArr[6], mmSum30);
9407 
9408 	      sum0 = mmArr[0] + mmArr[1];
9409 	      sum1 = mmArr[2] + mmArr[3];
9410 	      sum2 = mmArr[4] + mmArr[5];
9411 	      sum3 = mmArr[6] + mmArr[7];
9412 
9413 	      for (; k<jan; k++){
9414 		sum0 += Hks[spin][ih][kl][m+0][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+0];
9415 		sum1 += Hks[spin][ih][kl][m+1][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+0];
9416 		sum2 += Hks[spin][ih][kl][m+2][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+0];
9417 		sum3 += Hks[spin][ih][kl][m+3][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k+0];
9418 	      }
9419 
9420 	      tmpvec1[n][Anum+m+0] += sum0;
9421 	      tmpvec1[n][Anum+m+1] += sum1;
9422 	      tmpvec1[n][Anum+m+2] += sum2;
9423 	      tmpvec1[n][Anum+m+3] += sum3;
9424 	    }
9425 	  }
9426 
9427 	  for (; m<ian; m++){
9428 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
9429 
9430 	      sum = 0.0;
9431 	      for (k=0; k<jan; k++){
9432 		sum += Hks[spin][ih][kl][m][k]*Krylov_U[spin][Mc_AN][rl*KU_d1+n*KU_d2+Bnum+k];
9433 	      }
9434 
9435 	      tmpvec1[n][Anum+m] += sum;
9436 	    }
9437 	  }
9438 	  /**/
9439 
9440 #endif
9441 
9442 	}
9443       }
9444     }
9445 
9446     /* u2^+ B u2 */
9447 
9448 #ifdef nosse
9449 
9450     /* Original version */
9451     /**/
9452     for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
9453       for (m=0; m<EKC_core_size[Mc_AN]; m++){
9454         for (n=0; n<EKC_core_size[Mc_AN]; n++){
9455   	  sum = 0.0;
9456 	  for (i=Msize[Mc_AN]; i<Msize2[Mc_AN]; i++){
9457             sum += Krylov_U[spin][Mc_AN][rl0*KU_d1+m*KU_d2+i+1]*tmpvec1[n][i];
9458 	  }
9459           EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] += sum;
9460 	}
9461       }
9462     }
9463     /**/
9464 
9465 #else
9466 
9467     /* Unrolling + SSE version */
9468     /**/
9469     for (rl0=0; rl0<rlmax_EC[Mc_AN]; rl0++){
9470       for (m=0; m<(EKC_core_size[Mc_AN]-3); m+=4){
9471         for (n=0; n<EKC_core_size[Mc_AN]; n++){
9472 
9473 	  mmSum00 = _mm_setzero_pd();
9474 	  mmSum01 = _mm_setzero_pd();
9475 	  mmSum10 = _mm_setzero_pd();
9476 	  mmSum11 = _mm_setzero_pd();
9477 	  mmSum20 = _mm_setzero_pd();
9478 	  mmSum21 = _mm_setzero_pd();
9479 	  mmSum30 = _mm_setzero_pd();
9480 	  mmSum31 = _mm_setzero_pd();
9481 
9482 	  for (i=Msize[Mc_AN]; i<(Msize2[Mc_AN]-3); i+=4){
9483 	    mmTmp0 = _mm_loadu_pd(&tmpvec1[n][i+0]);
9484 	    mmTmp1 = _mm_loadu_pd(&tmpvec1[n][i+2]);
9485 
9486 	    mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+1]),mmTmp0));
9487 	    mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+3]),mmTmp1));
9488 
9489 	    mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+1)*KU_d2+i+1]),mmTmp0));
9490 	    mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+1)*KU_d2+i+3]),mmTmp1));
9491 
9492 	    mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+2)*KU_d2+i+1]),mmTmp0));
9493 	    mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+2)*KU_d2+i+3]),mmTmp1));
9494 
9495 	    mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+3)*KU_d2+i+1]),mmTmp0));
9496 	    mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+3)*KU_d2+i+3]),mmTmp1));
9497 	  }
9498 
9499 	  mmSum00 = _mm_add_pd(mmSum00, mmSum01);
9500 	  mmSum10 = _mm_add_pd(mmSum10, mmSum11);
9501 	  mmSum20 = _mm_add_pd(mmSum20, mmSum21);
9502 	  mmSum30 = _mm_add_pd(mmSum30, mmSum31);
9503 
9504 	  _mm_storeu_pd(&mmArr[0], mmSum00);
9505 	  _mm_storeu_pd(&mmArr[2], mmSum10);
9506 	  _mm_storeu_pd(&mmArr[4], mmSum20);
9507 	  _mm_storeu_pd(&mmArr[6], mmSum30);
9508 
9509 	  sum0 = mmArr[0] + mmArr[1];
9510 	  sum1 = mmArr[2] + mmArr[3];
9511 	  sum2 = mmArr[4] + mmArr[5];
9512 	  sum3 = mmArr[6] + mmArr[7];
9513 
9514 	  for (; i<Msize2[Mc_AN]; i++){
9515             sum0 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+1]*tmpvec1[n][i];
9516             sum1 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+1)*KU_d2+i+1]*tmpvec1[n][i];
9517             sum2 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+2)*KU_d2+i+1]*tmpvec1[n][i];
9518             sum3 += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+3)*KU_d2+i+1]*tmpvec1[n][i];
9519 	  }
9520 
9521           EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] += sum0;
9522           EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+2][rl*EKC_core_size[Mc_AN]+n+1] += sum1;
9523           EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+3][rl*EKC_core_size[Mc_AN]+n+1] += sum2;
9524           EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+4][rl*EKC_core_size[Mc_AN]+n+1] += sum3;
9525 	}
9526       }
9527 
9528       for (; m<EKC_core_size[Mc_AN]; m++){
9529         for (n=0; n<EKC_core_size[Mc_AN]; n++){
9530   	  sum = 0.0;
9531 	  for (i=Msize[Mc_AN]; i<Msize2[Mc_AN]; i++){
9532             sum += Krylov_U[spin][Mc_AN][rl0*KU_d1+(m+0)*KU_d2+i+1]*tmpvec1[n][i];
9533 	  }
9534           EC_matrix[spin][Mc_AN][rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] += sum;
9535 	}
9536       }
9537 
9538     }
9539     /**/
9540 
9541 #endif
9542 
9543   } /* rl */
9544 
9545     for (i=0; i<EKC_core_size_max; i++){
9546       free(tmpvec1[i]);
9547     }
9548     free(tmpvec1);
9549 
9550   } /* #pragma omp parallel */
9551 
9552 }
9553 
9554 
9555 
9556 
9557 
9558 
9559 
9560 
9561 
9562 
Krylov_IOLP_trd(int Mc_AN,double **** OLP0,double *** Krylov_U_OLP,double ** inv_RS,int * MP,int * Msize2,int * Msize4,int Msize2_max,double ** tmpvec0,double ** tmpvec1)9563 void Krylov_IOLP_trd( int Mc_AN, double ****OLP0, double ***Krylov_U_OLP, double **inv_RS, int *MP,
9564                   int *Msize2, int *Msize4, int Msize2_max, double **tmpvec0, double **tmpvec1 )
9565 
9566 /* This subroutine is exactly the same as the original Krylov_IOLP,
9567 except that Eigen_lapack is replaced with Eigen_lapack_d, _x, or _r
9568 */
9569 
9570 {
9571   int rl,ct_AN,fan,san,can,wan,ct_on,i,j;
9572   int n,Anum,Bnum,k,ian,ih,kl,jg,ig,jan,m,m1,n1,info;
9573   int rl0,ZeroNum,Neumann_series,ns,Gh_AN,wanB;
9574   double sum,dum,tmp0,tmp1,tmp2,tmp3,rcutA,r0;
9575   double **tmpmat0;
9576   double *ko,*iko;
9577   double **FS;
9578 
9579   ct_AN = M2G[Mc_AN];
9580   fan = FNAN[ct_AN];
9581   san = SNAN[ct_AN];
9582   can = fan + san;
9583   wan = WhatSpecies[ct_AN];
9584   ct_on = Spe_Total_CNO[wan];
9585   rcutA = Spe_Atom_Cut1[wan];
9586 
9587   /* allocation of arrays */
9588 
9589   tmpmat0 = (double**)malloc(sizeof(double*)*(EKC_core_size[Mc_AN]+4));
9590   for (i=0; i<(EKC_core_size[Mc_AN]+4); i++){
9591     tmpmat0[i] = (double*)malloc(sizeof(double)*(EKC_core_size[Mc_AN]+4));
9592   }
9593 
9594   FS = (double**)malloc(sizeof(double*)*(rlmax_EC2[Mc_AN]+2)*EKC_core_size[Mc_AN]);
9595   for (i=0; i<(rlmax_EC2[Mc_AN]+2)*EKC_core_size[Mc_AN]; i++){
9596     FS[i] = (double*)malloc(sizeof(double)*(rlmax_EC2[Mc_AN]+2)*EKC_core_size[Mc_AN]);
9597   }
9598 
9599   ko = (double*)malloc(sizeof(double)*(rlmax_EC2[Mc_AN]+2)*EKC_core_size[Mc_AN]);
9600   iko = (double*)malloc(sizeof(double)*(rlmax_EC2[Mc_AN]+2)*EKC_core_size[Mc_AN]);
9601 
9602   /****************************************************
9603       initialize
9604   ****************************************************/
9605 
9606   for (i=0; i<EKC_core_size_max; i++){
9607     for (j=0; j<Msize2_max; j++){
9608       tmpvec0[i][j] = 0.0;
9609       tmpvec1[i][j] = 0.0;
9610     }
9611   }
9612 
9613   /* find the nearest atom with distance of r0 */
9614 
9615   r0 = 1.0e+10;
9616   for (k=1; k<=FNAN[ct_AN]; k++){
9617     Gh_AN = natn[ct_AN][k];
9618     wanB = WhatSpecies[Gh_AN];
9619     if (Dis[ct_AN][k]<r0)  r0 = Dis[ct_AN][k];
9620   }
9621 
9622   /* starting vector */
9623 
9624   m = 0;
9625   for (k=0; k<=FNAN[ct_AN]; k++){
9626 
9627     Gh_AN = natn[ct_AN][k];
9628     wanB = WhatSpecies[Gh_AN];
9629 
9630     if ( Dis[ct_AN][k]<(scale_rc_EKC[Mc_AN]*r0) ){
9631 
9632       Anum = MP[k] - 1;
9633 
9634       for (i=0; i<Spe_Total_CNO[wanB]; i++){
9635 
9636         tmpvec0[m][Anum+i]         = 1.0;
9637         Krylov_U_OLP[0][m][Anum+i] = 1.0;
9638 
9639         m++;
9640       }
9641     }
9642   }
9643 
9644   /*
9645   for (i=0; i<EKC_core_size[Mc_AN]; i++){
9646     tmpvec0[i][i]         = 1.0;
9647     Krylov_U_OLP[0][i][i] = 1.0;
9648   }
9649   */
9650 
9651   /****************************************************
9652            generate Krylov subspace vectors
9653   ****************************************************/
9654 
9655   for (rl=0; rl<(rlmax_EC2[Mc_AN]-1); rl++){
9656 
9657     /*******************************************************
9658                             S * |Wn)
9659     *******************************************************/
9660 
9661     for (n=0; n<EKC_core_size[Mc_AN]; n++){
9662       for (i=0; i<Msize2[Mc_AN]; i++){
9663 	tmpvec1[n][i]  = 0.0;
9664       }
9665     }
9666 
9667     for (i=0; i<=can; i++){
9668 
9669       ig = natn[ct_AN][i];
9670       ian = Spe_Total_CNO[WhatSpecies[ig]];
9671       Anum = MP[i] - 1;
9672       ih = S_G2M[ig];
9673 
9674       for (j=0; j<=can; j++){
9675 
9676 	kl = RMI1[Mc_AN][i][j];
9677 	jg = natn[ct_AN][j];
9678 	jan = Spe_Total_CNO[WhatSpecies[jg]];
9679         Bnum = MP[j] - 1;
9680 
9681         if (0<=kl){
9682 
9683 	  for (m=0; m<ian; m++){
9684 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
9685 
9686 	      sum = 0.0;
9687 	      for (k=0; k<jan; k++){
9688 		sum += OLP0[ih][kl][m][k]*tmpvec0[n][Bnum+k];
9689 	      }
9690 
9691               tmpvec1[n][Anum+m] += sum;
9692 	    }
9693 	  }
9694 	}
9695       }
9696     }
9697 
9698     /*************************************************************
9699       orthogonalization by a modified block Gram-Schmidt method
9700     *************************************************************/
9701 
9702     /* |tmpvec1) := (I - \sum_{rl0} |U_rl0)(U_rl0|)|tmpvec1) */
9703 
9704     for (rl0=0; rl0<=rl; rl0++){
9705 
9706       /* (U_rl0|tmpvec1) */
9707 
9708       for (m=0; m<EKC_core_size[Mc_AN]; m++){
9709         for (n=0; n<EKC_core_size[Mc_AN]; n++){
9710           sum = 0.0;
9711   	  for (i=0; i<Msize2[Mc_AN]; i++){
9712             sum += Krylov_U_OLP[rl0][m][i]*tmpvec1[n][i];
9713 	  }
9714           tmpmat0[m][n] = sum;
9715 	}
9716       }
9717 
9718       /* |tmpvec1) := |tmpvec1) - |U_rl0)(U_rl0|tmpvec1) */
9719 
9720       for (n=0; n<EKC_core_size[Mc_AN]; n++){
9721 	for (k=0; k<EKC_core_size[Mc_AN]; k++){
9722 	  dum = tmpmat0[k][n];
9723 	  for (i=0; i<Msize2[Mc_AN]; i++)  tmpvec1[n][i] -= Krylov_U_OLP[rl0][k][i]*dum;
9724         }
9725       }
9726     }
9727 
9728     /*************************************************************
9729                        normalization of tmpvec1
9730     *************************************************************/
9731 
9732     for (m=0; m<EKC_core_size[Mc_AN]; m++){
9733       for (n=0; n<EKC_core_size[Mc_AN]; n++){
9734 
9735         sum = 0.0;
9736         for (i=0; i<Msize2[Mc_AN]; i++){
9737           sum += tmpvec1[m][i]*tmpvec1[n][i];
9738 	}
9739 
9740         tmpmat0[m+1][n+1] = sum;
9741       }
9742     }
9743 
9744     /* diagonalize tmpmat0 */
9745 
9746     if ( EKC_core_size[Mc_AN]==1){
9747       ko[1] = tmpmat0[1][1];
9748       tmpmat0[1][1] = 1.0;
9749     }
9750     else{
9751 
9752       info = Eigen_lapack_d(tmpmat0, ko, EKC_core_size[Mc_AN], EKC_core_size[Mc_AN]);
9753       if (info!=0){
9754 	info = Eigen_lapack_x(tmpmat0, ko, EKC_core_size[Mc_AN], EKC_core_size[Mc_AN]);
9755 	if (info!=0){
9756 	  Eigen_lapack_r(tmpmat0, ko, EKC_core_size[Mc_AN], EKC_core_size[Mc_AN]);
9757 	}
9758       }
9759     }
9760 
9761     ZeroNum = 0;
9762 
9763     for (n=1; n<=EKC_core_size[Mc_AN]; n++){
9764       if (cutoff_value<ko[n]){
9765 	ko[n]  = sqrt(fabs(ko[n]));
9766 	iko[n] = 1.0/ko[n];
9767       }
9768       else{
9769 	ZeroNum++;
9770 	ko[n]  = 0.0;
9771 	iko[n] = 0.0;
9772       }
9773 
9774       if (error_check==1){
9775         printf("rl=%3d ko=%18.15f\n",rl,ko[n]);
9776       }
9777     }
9778 
9779     if (error_check==1){
9780       printf("rl=%3d ZeroNum=%3d\n",rl,ZeroNum);
9781     }
9782 
9783     /* tmpvec0 = tmpvec1 * tmpmat0^{-1/2} */
9784 
9785     for (n=0; n<EKC_core_size[Mc_AN]; n++){
9786       for (i=0; i<Msize2[Mc_AN]; i++){
9787 	tmpvec0[n][i] = 0.0;
9788       }
9789     }
9790 
9791     for (n=0; n<EKC_core_size[Mc_AN]; n++){
9792       for (k=0; k<EKC_core_size[Mc_AN]; k++){
9793 	dum = tmpmat0[k+1][n+1]*iko[n+1];
9794 	for (i=0; i<Msize2[Mc_AN]; i++)  tmpvec0[n][i] += tmpvec1[k][i]*dum;
9795       }
9796     }
9797 
9798     /*************************************************************
9799                          store Krylov vectors
9800     *************************************************************/
9801 
9802     for (n=0; n<EKC_core_size[Mc_AN]; n++){
9803       for (i=0; i<Msize2[Mc_AN]; i++){
9804         Krylov_U_OLP[rl+1][n][i] = tmpvec0[n][i];
9805       }
9806     }
9807 
9808   } /* rl */
9809 
9810   /************************************************************
9811        calculate the inverse of the reduced overlap matrix
9812   ************************************************************/
9813 
9814   /*  construct the reduced overlap matrix */
9815 
9816   for (rl=0; rl<rlmax_EC2[Mc_AN]; rl++){
9817 
9818     /*  S * |Vn) */
9819 
9820     for (n=0; n<EKC_core_size[Mc_AN]; n++){
9821       for (i=0; i<Msize2[Mc_AN]; i++){
9822 	tmpvec1[n][i]  = 0.0;
9823       }
9824     }
9825 
9826     for (i=0; i<=can; i++){
9827 
9828       ig = natn[ct_AN][i];
9829       ian = Spe_Total_CNO[WhatSpecies[ig]];
9830       Anum = MP[i] - 1;
9831       ih = S_G2M[ig];
9832 
9833       for (j=0; j<=can; j++){
9834 
9835 	kl = RMI1[Mc_AN][i][j];
9836 	jg = natn[ct_AN][j];
9837 	jan = Spe_Total_CNO[WhatSpecies[jg]];
9838 	Bnum = MP[j] - 1;
9839 
9840 	if (0<=kl){
9841 
9842 	  for (m=0; m<ian; m++){
9843 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
9844 
9845 	      sum = 0.0;
9846 	      for (k=0; k<jan; k++){
9847 		sum += OLP0[ih][kl][m][k]*Krylov_U_OLP[rl][n][Bnum+k];
9848 	      }
9849 	      tmpvec1[n][Anum+m] += sum;
9850 	    }
9851 	  }
9852 	}
9853       }
9854     }
9855 
9856     for (rl0=rl; rl0<rlmax_EC2[Mc_AN]; rl0++){
9857       for (m=0; m<EKC_core_size[Mc_AN]; m++){
9858         for (n=0; n<EKC_core_size[Mc_AN]; n++){
9859   	  sum = 0.0;
9860 	  for (i=0; i<Msize2[Mc_AN]; i++){
9861             sum += Krylov_U_OLP[rl0][m][i]*tmpvec1[n][i];
9862 	  }
9863           FS[rl0*EKC_core_size[Mc_AN]+m+1][rl*EKC_core_size[Mc_AN]+n+1] = sum;
9864           FS[rl*EKC_core_size[Mc_AN]+n+1][rl0*EKC_core_size[Mc_AN]+m+1] = sum;
9865 	}
9866       }
9867     }
9868   }
9869 
9870   /* diagonalize FS */
9871 
9872   info = Eigen_lapack_d(FS,ko,Msize4[Mc_AN],Msize4[Mc_AN]);
9873   if (info!=0){
9874     info = Eigen_lapack_x(FS,ko,Msize4[Mc_AN],Msize4[Mc_AN]);
9875     if (info!=0){
9876       Eigen_lapack_r(FS,ko,Msize4[Mc_AN],Msize4[Mc_AN]);
9877     }
9878   }
9879 
9880   /* find ill-conditioned eigenvalues */
9881 
9882   ZeroNum = 0;
9883   for (i=1; i<=Msize4[Mc_AN]; i++){
9884 
9885     if (error_check==1){
9886       printf("Mc_AN=%2d i=%3d ko[i]=%18.15f\n",Mc_AN,i,ko[i]);
9887     }
9888 
9889     if (cutoff_value<ko[i]){
9890       iko[i] = 1.0/ko[i];
9891     }
9892     else{
9893       ZeroNum++;
9894       ko[i]  = 0.0;
9895       iko[i] = 0.0;
9896     }
9897   }
9898 
9899   if (error_check==1){
9900     printf("Mc_AN=%2d ZeroNum=%2d\n",Mc_AN,ZeroNum);
9901   }
9902 
9903   /* construct the inverse */
9904 
9905   for (i=1; i<=Msize4[Mc_AN]; i++){
9906     for (j=1; j<=Msize4[Mc_AN]; j++){
9907       sum = 0.0;
9908       for (k=1; k<=Msize4[Mc_AN]; k++){
9909         sum += FS[i][k]*iko[k]*FS[j][k];
9910       }
9911       inv_RS[i-1][j-1] = sum;
9912     }
9913   }
9914 
9915   /* symmetrization of inv_RS */
9916 
9917   for (i=1; i<=Msize4[Mc_AN]; i++){
9918     for (j=i+1; j<=Msize4[Mc_AN]; j++){
9919       tmp0 = inv_RS[i-1][j-1];
9920       tmp1 = inv_RS[j-1][i-1];
9921       tmp2 = 0.5*(tmp0 + tmp1);
9922       inv_RS[i-1][j-1] = tmp2;
9923       inv_RS[j-1][i-1] = tmp2;
9924     }
9925   }
9926 
9927   /*
9928   {
9929 
9930 
9931     double mat1[1000][1000];
9932     double tsum;
9933     int myid;
9934 
9935     MPI_Comm_rank(mpi_comm_level1,&myid);
9936 
9937     if (myid==0){
9938 
9939       printf("check normalization\n");
9940 
9941       for (rl=0; rl<rlmax_EC2[Mc_AN]; rl++){
9942 	for (m=0; m<EKC_core_size[Mc_AN]; m++){
9943 	  for (rl0=0; rl0<rlmax_EC2[Mc_AN]; rl0++){
9944 	    for (n=0; n<EKC_core_size[Mc_AN]; n++){
9945 
9946               sum = 0.0;
9947    	      for (i=0; i<Msize2[Mc_AN]; i++){
9948                 sum += Krylov_U_OLP[rl][m][i]*Krylov_U_OLP[rl0][n][i];
9949 	      }
9950               printf("rl=%3d rl0=%3d m=%3d n=%3d  <|>=%18.15f\n",rl,rl0,m,n,sum);
9951 	    }
9952 	  }
9953 	}
9954       }
9955 
9956 
9957       printf("\n\ninvS\n");
9958 
9959 
9960       for (rl=0; rl<rlmax_EC2[Mc_AN]; rl++){
9961 	for (m=0; m<EKC_core_size[Mc_AN]; m++){
9962 
9963 	  for (i=0; i<Msize2[Mc_AN]; i++){
9964 
9965 	    sum = 0.0;
9966 
9967 	    for (rl0=0; rl0<rlmax_EC2[Mc_AN]; rl0++){
9968 	      for (n=0; n<EKC_core_size[Mc_AN]; n++){
9969 		sum += inv_RS[rl*EKC_core_size[Mc_AN]+m][rl0*EKC_core_size[Mc_AN]+n]*Krylov_U_OLP[rl0][n][i];
9970 	      }
9971 	    }
9972 
9973 	    mat1[rl*EKC_core_size[Mc_AN]+m][i] = sum;
9974 	  }
9975 	}
9976       }
9977 
9978 
9979       tsum = 0.0;
9980 
9981       for (i=0; i<Msize2[Mc_AN]; i++){
9982 	for (j=0; j<Msize2[Mc_AN]; j++){
9983 
9984 	  sum = 0.0;
9985 	  for (rl=0; rl<rlmax_EC2[Mc_AN]; rl++){
9986 	    for (m=0; m<EKC_core_size[Mc_AN]; m++){
9987 	      sum += Krylov_U_OLP[rl][m][i]*mat1[rl*EKC_core_size[Mc_AN]+m][j];
9988 	    }
9989 	  }
9990 
9991 	  printf("i=%4d j=%4d %18.15f\n",i,j,sum);
9992 
9993           tsum += fabs(sum);
9994 
9995 	}
9996       }
9997 
9998 
9999       printf("\n\ntsum=%18.15f\n",tsum);
10000 
10001     }
10002 
10003 
10004     MPI_Finalize();
10005     exit(0);
10006   }
10007   */
10008 
10009   /* freeing of arrays */
10010 
10011   for (i=0; i<(EKC_core_size[Mc_AN]+4); i++){
10012     free(tmpmat0[i]);
10013   }
10014   free(tmpmat0);
10015 
10016   for (i=0; i<(rlmax_EC2[Mc_AN]+2)*EKC_core_size[Mc_AN]; i++){
10017     free(FS[i]);
10018   }
10019   free(FS);
10020 
10021   free(ko);
10022   free(iko);
10023 }
10024 
10025 
10026 
10027 
10028 
10029 
10030 
10031 
10032 
10033 
10034 
10035 
S_orthonormalize_vec_trd(int Mc_AN,int ct_on,double ** vec,double ** workvec,double **** OLP0,double ** tmpmat0,double * ko,double * iko,int * MP,int * Msize2)10036 void S_orthonormalize_vec_trd( int Mc_AN, int ct_on, double **vec,
10037                            double **workvec, double ****OLP0,
10038                            double **tmpmat0, double *ko, double *iko,
10039                            int *MP, int *Msize2 )
10040 
10041 /* This subroutine is exactly the same as the original S_orthonormalize_vec,
10042 except that Eigen_lapack is replaced with Eigen_lapack_d, _x, or _r
10043 */
10044 
10045 {
10046   int n,i,j,can,san,fan,ct_AN;
10047   int k,m,ZeroNum,Anum,Bnum,ih;
10048   int kl,jg,jan,ig,ian,info;
10049   double dum,sum;
10050   double tmp0,tmp1,tmp2,tmp3;
10051   __m128d mmSum00,mmSum01,mmSum10,mmSum11,mmSum20,mmSum21,mmSum30,mmSum31, mmTmp0, mmTmp1, mmTmp2, mmTmp3, mmTmp4, mmTmp5;
10052   double mmArr[8];
10053   double sum0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
10054 
10055   ct_AN = M2G[Mc_AN];
10056   fan = FNAN[ct_AN];
10057   san = SNAN[ct_AN];
10058   can = fan + san;
10059 
10060   /* S|Vn) */
10061 
10062   for (n=0; n<EKC_core_size[Mc_AN]; n++){
10063     for (i=0; i<Msize2[Mc_AN]; i++){
10064       workvec[n][i]  = 0.0;
10065     }
10066   }
10067 
10068   for (i=0; i<=can; i++){
10069 
10070     ig = natn[ct_AN][i];
10071     ian = Spe_Total_CNO[WhatSpecies[ig]];
10072     Anum = MP[i] - 1;
10073     ih = S_G2M[ig];
10074 
10075     for (j=0; j<=can; j++){
10076 
10077       kl = RMI1[Mc_AN][i][j];
10078       jg = natn[ct_AN][j];
10079       jan = Spe_Total_CNO[WhatSpecies[jg]];
10080       Bnum = MP[j] - 1;
10081 
10082       if (0<=kl){
10083 
10084 #ifdef nosse
10085 
10086 	/* Original version */
10087 	/**/
10088 	for (m=0; m<ian; m++){
10089 	  for (n=0; n<EKC_core_size[Mc_AN]; n++){
10090 
10091 	    sum = 0.0;
10092 	    for (k=0; k<jan; k++){
10093 	      sum += OLP0[ih][kl][m][k]*vec[n][Bnum+k];
10094 	    }
10095 
10096 	    workvec[n][Anum+m] += sum;
10097 	  }
10098 	}
10099 	/**/
10100 
10101 #else
10102 
10103 	/* Unrolling + SSE version */
10104 	/**/
10105 	for (m=0; m<(ian-3); m+=4){
10106 	  for (n=0; n<EKC_core_size[Mc_AN]; n++){
10107 
10108 	    mmSum00 = _mm_setzero_pd();
10109 	    mmSum01 = _mm_setzero_pd();
10110 	    mmSum10 = _mm_setzero_pd();
10111 	    mmSum11 = _mm_setzero_pd();
10112 	    mmSum20 = _mm_setzero_pd();
10113 	    mmSum21 = _mm_setzero_pd();
10114 	    mmSum30 = _mm_setzero_pd();
10115 	    mmSum31 = _mm_setzero_pd();
10116 
10117 	    for (k=0; k<(jan-3); k+=4){
10118 	      mmTmp0 = _mm_loadu_pd(&vec[n][Bnum+k+0]);
10119 	      mmTmp1 = _mm_loadu_pd(&vec[n][Bnum+k+2]);
10120 
10121 	      mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+0]),mmTmp0));
10122 	      mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+0][k+2]),mmTmp1));
10123 
10124 	      mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+0]),mmTmp0));
10125 	      mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+1][k+2]),mmTmp1));
10126 
10127 	      mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+0]),mmTmp0));
10128 	      mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+2][k+2]),mmTmp1));
10129 
10130 	      mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+0]),mmTmp0));
10131 	      mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&OLP0[ih][kl][m+3][k+2]),mmTmp1));
10132 	    }
10133 
10134 	    mmSum00 = _mm_add_pd(mmSum00, mmSum01);
10135 	    mmSum10 = _mm_add_pd(mmSum10, mmSum11);
10136 	    mmSum20 = _mm_add_pd(mmSum20, mmSum21);
10137 	    mmSum30 = _mm_add_pd(mmSum30, mmSum31);
10138 
10139 	    _mm_storeu_pd(&mmArr[0], mmSum00);
10140 	    _mm_storeu_pd(&mmArr[2], mmSum10);
10141 	    _mm_storeu_pd(&mmArr[4], mmSum20);
10142 	    _mm_storeu_pd(&mmArr[6], mmSum30);
10143 
10144 	    sum0 = mmArr[0] + mmArr[1];
10145 	    sum1 = mmArr[2] + mmArr[3];
10146 	    sum2 = mmArr[4] + mmArr[5];
10147 	    sum3 = mmArr[6] + mmArr[7];
10148 
10149 	    for (; k<jan; k++){
10150 	      sum0 += OLP0[ih][kl][m+0][k]*vec[n][Bnum+k];
10151 	      sum1 += OLP0[ih][kl][m+1][k]*vec[n][Bnum+k];
10152 	      sum2 += OLP0[ih][kl][m+2][k]*vec[n][Bnum+k];
10153 	      sum3 += OLP0[ih][kl][m+3][k]*vec[n][Bnum+k];
10154 	    }
10155 
10156 	    workvec[n][Anum+m+0] += sum0;
10157 	    workvec[n][Anum+m+1] += sum1;
10158 	    workvec[n][Anum+m+2] += sum2;
10159 	    workvec[n][Anum+m+3] += sum3;
10160 	  }
10161 	}
10162 
10163 	for (; m<ian; m++){
10164 	  for (n=0; n<EKC_core_size[Mc_AN]; n++){
10165 
10166 	    sum = 0.0;
10167 	    for (k=0; k<jan; k++){
10168 	      sum += OLP0[ih][kl][m][k]*vec[n][Bnum+k];
10169 	    }
10170 
10171 	    workvec[n][Anum+m] += sum;
10172 	  }
10173 	}
10174 	/**/
10175 
10176 #endif
10177 
10178       }
10179     }
10180   }
10181 
10182   /*  (Vn|S|Vn) */
10183 
10184 #ifdef nosse
10185 
10186   /* Original version */
10187   /**/
10188   for (m=0; m<EKC_core_size[Mc_AN]; m++){
10189     for (n=m; n<EKC_core_size[Mc_AN]; n++){
10190       sum = 0.0;
10191       for (i=0; i<Msize2[Mc_AN]; i++){
10192 	sum += vec[m][i]*workvec[n][i];
10193       }
10194       tmpmat0[m+1][n+1] = sum;
10195       tmpmat0[n+1][m+1] = sum;
10196     }
10197   }
10198   /**/
10199 
10200 #else
10201 
10202   /* Unrolling + SSE version */
10203   /**/
10204   for (m=0; m<(EKC_core_size[Mc_AN]-3); m+=4){
10205     for (n=m; n<EKC_core_size[Mc_AN]; n++){
10206       mmSum00 = _mm_setzero_pd();
10207       mmSum01 = _mm_setzero_pd();
10208       mmSum10 = _mm_setzero_pd();
10209       mmSum11 = _mm_setzero_pd();
10210       mmSum20 = _mm_setzero_pd();
10211       mmSum21 = _mm_setzero_pd();
10212       mmSum30 = _mm_setzero_pd();
10213       mmSum31 = _mm_setzero_pd();
10214 
10215       for (i=0; i<(Msize2[Mc_AN]-3); i+=4){
10216 	mmTmp0 = _mm_loadu_pd(&workvec[n][i+0]);
10217 	mmTmp1 = _mm_loadu_pd(&workvec[n][i+2]);
10218 
10219 	mmSum00 = _mm_add_pd(mmSum00, _mm_mul_pd(_mm_loadu_pd(&vec[m+0][i+0]),mmTmp0));
10220 	mmSum01 = _mm_add_pd(mmSum01, _mm_mul_pd(_mm_loadu_pd(&vec[m+0][i+2]),mmTmp1));
10221 
10222 	mmSum10 = _mm_add_pd(mmSum10, _mm_mul_pd(_mm_loadu_pd(&vec[m+1][i+0]),mmTmp0));
10223 	mmSum11 = _mm_add_pd(mmSum11, _mm_mul_pd(_mm_loadu_pd(&vec[m+1][i+2]),mmTmp1));
10224 
10225 	mmSum20 = _mm_add_pd(mmSum20, _mm_mul_pd(_mm_loadu_pd(&vec[m+2][i+0]),mmTmp0));
10226 	mmSum21 = _mm_add_pd(mmSum21, _mm_mul_pd(_mm_loadu_pd(&vec[m+2][i+2]),mmTmp1));
10227 
10228 	mmSum30 = _mm_add_pd(mmSum30, _mm_mul_pd(_mm_loadu_pd(&vec[m+3][i+0]),mmTmp0));
10229 	mmSum31 = _mm_add_pd(mmSum31, _mm_mul_pd(_mm_loadu_pd(&vec[m+3][i+2]),mmTmp1));
10230       }
10231 
10232       mmSum00 = _mm_add_pd(mmSum00, mmSum01);
10233       mmSum10 = _mm_add_pd(mmSum10, mmSum11);
10234       mmSum20 = _mm_add_pd(mmSum20, mmSum21);
10235       mmSum30 = _mm_add_pd(mmSum30, mmSum31);
10236 
10237       _mm_storeu_pd(&mmArr[0], mmSum00);
10238       _mm_storeu_pd(&mmArr[2], mmSum10);
10239       _mm_storeu_pd(&mmArr[4], mmSum20);
10240       _mm_storeu_pd(&mmArr[6], mmSum30);
10241 
10242       sum0 = mmArr[0] + mmArr[1];
10243       sum1 = mmArr[2] + mmArr[3];
10244       sum2 = mmArr[4] + mmArr[5];
10245       sum3 = mmArr[6] + mmArr[7];
10246 
10247       for (; i<Msize2[Mc_AN]; i++){
10248 	sum0 += vec[m+0][i]*workvec[n][i];
10249 	sum1 += vec[m+1][i]*workvec[n][i];
10250 	sum2 += vec[m+2][i]*workvec[n][i];
10251 	sum3 += vec[m+3][i]*workvec[n][i];
10252       }
10253 
10254       tmpmat0[m+1][n+1] = sum0;
10255       tmpmat0[n+1][m+1] = sum0;
10256 
10257       tmpmat0[m+2][n+1] = sum1;
10258       tmpmat0[n+1][m+2] = sum1;
10259 
10260       tmpmat0[m+3][n+1] = sum2;
10261       tmpmat0[n+1][m+3] = sum2;
10262 
10263       tmpmat0[m+4][n+1] = sum3;
10264       tmpmat0[n+1][m+4] = sum3;
10265     }
10266   }
10267 
10268   for (; m<EKC_core_size[Mc_AN]; m++){
10269     for (n=m; n<EKC_core_size[Mc_AN]; n++){
10270       sum = 0.0;
10271       for (i=0; i<Msize2[Mc_AN]; i++){
10272 	sum += vec[m][i]*workvec[n][i];
10273       }
10274       tmpmat0[m+1][n+1] = sum;
10275       tmpmat0[n+1][m+1] = sum;
10276     }
10277   }
10278   /**/
10279 
10280 #endif
10281 
10282   /* diagonalize tmpmat0 */
10283 
10284   if ( EKC_core_size[Mc_AN]==1){
10285     ko[1] = tmpmat0[1][1];
10286     tmpmat0[1][1] = 1.0;
10287   }
10288   else{
10289 
10290     info = Eigen_lapack_d(tmpmat0, ko, EKC_core_size[Mc_AN], EKC_core_size[Mc_AN]);
10291     if (info!=0){
10292       info = Eigen_lapack_x(tmpmat0, ko, EKC_core_size[Mc_AN], EKC_core_size[Mc_AN]);
10293       if (info!=0){
10294 	Eigen_lapack_r(tmpmat0, ko, EKC_core_size[Mc_AN], EKC_core_size[Mc_AN]);
10295       }
10296     }
10297   }
10298 
10299   ZeroNum = 0;
10300 
10301   for (n=1; n<=EKC_core_size[Mc_AN]; n++){
10302     if (cutoff_value<ko[n]){
10303       ko[n]  = sqrt(fabs(ko[n]));
10304       iko[n] = 1.0/ko[n];
10305     }
10306     else{
10307       ZeroNum++;
10308       ko[n]  = 0.0;
10309       iko[n] = 0.0;
10310     }
10311   }
10312 
10313   /* U0 = vec * tmpmat0^{-1/2} */
10314 
10315   for (n=0; n<EKC_core_size[Mc_AN]; n++){
10316     for (i=0; i<Msize2[Mc_AN]; i++){
10317       workvec[n][i] = 0.0;
10318     }
10319   }
10320 
10321   for (n=0; n<EKC_core_size[Mc_AN]; n++){
10322     for (k=0; k<EKC_core_size[Mc_AN]; k++){
10323       dum = tmpmat0[k+1][n+1]*iko[n+1];
10324       for (i=0; i<Msize2[Mc_AN]; i++)  workvec[n][i] += vec[k][i]*dum;
10325     }
10326   }
10327 
10328   for (n=0; n<EKC_core_size[Mc_AN]; n++){
10329     for (i=0; i<Msize2[Mc_AN]; i++){
10330       vec[n][i] = workvec[n][i];
10331     }
10332   }
10333 }
10334 
10335 
Eigen_lapack_x(double ** a,double * ko,int n0,int EVmax)10336 int Eigen_lapack_x(double **a, double *ko, int n0, int EVmax)
10337 {
10338 
10339   /*
10340     F77_NAME(dsyevx,DSYEVX)()
10341 
10342     input:  n;
10343     input:  a[n][n];  matrix A
10344     output: a[n][n];  eigevectors
10345     output: ko[n];    eigenvalues
10346   */
10347 
10348   char *name="Eigen_lapack";
10349 
10350   char  *JOBZ="V";
10351   char  *RANGE="I";
10352   char  *UPLO="L";
10353 
10354   INTEGER n=n0;
10355   INTEGER LDA=n0;
10356   double VL,VU; /* dummy */
10357   INTEGER IL,IU;
10358   double ABSTOL=1.0e-13;
10359   INTEGER M;
10360 
10361   double *A,*Z;
10362   INTEGER LDZ=n;
10363   INTEGER LWORK;
10364   double *WORK;
10365   INTEGER *IWORK;
10366   INTEGER *IFAIL, INFO;
10367 
10368   int i,j;
10369 
10370   A=(double*)malloc(sizeof(double)*n*n);
10371   Z=(double*)malloc(sizeof(double)*n*n);
10372 
10373   LWORK=n*8;
10374   WORK=(double*)malloc(sizeof(double)*LWORK);
10375   IWORK=(INTEGER*)malloc(sizeof(INTEGER)*n*5);
10376   IFAIL=(INTEGER*)malloc(sizeof(INTEGER)*n);
10377 
10378   IL = 1;
10379   IU = EVmax;
10380 
10381   for (i=0; i<n; i++) {
10382     for (j=0; j<n; j++) {
10383        A[i*n+j] = a[i+1][j+1];
10384     }
10385   }
10386 
10387 #if 0
10388   printf("A=\n");
10389   for (i=0;i<n;i++) {
10390     for (j=0;j<n;j++) {
10391        printf("%f ",A[i*n+j]);
10392     }
10393     printf("\n");
10394   }
10395   fflush(stdout);
10396 #endif
10397 
10398   F77_NAME(dsyevx,DSYEVX)( JOBZ, RANGE, UPLO, &n, A, &LDA, &VL, &VU, &IL, &IU,
10399            &ABSTOL, &M, ko, Z, &LDZ, WORK, &LWORK, IWORK,
10400            IFAIL, &INFO );
10401 
10402   if (INFO==0){
10403 
10404     /* store eigenvectors */
10405     for (i=0;i<EVmax;i++) {
10406       for (j=0;j<n;j++) {
10407 	/*  a[i+1][j+1]= Z[i*n+j]; */
10408 	a[j+1][i+1]= Z[i*n+j];
10409       }
10410     }
10411 
10412     /* shift ko by 1 */
10413     for (i=EVmax; i>=1; i--){
10414       ko[i]= ko[i-1];
10415     }
10416   }
10417 
10418   if (INFO!=0) {
10419     /* printf("\n%s: error in dsyevx_, info=%d\n\n",name,INFO); */
10420   }
10421 
10422   free(IFAIL); free(IWORK); free(WORK); free(Z); free(A);
10423   return INFO;
10424 
10425 }
10426 
10427 
Eigen_lapack_d(double ** a,double * ko,int n0,int EVmax)10428 int Eigen_lapack_d(double **a, double *ko, int n0, int EVmax)
10429 {
10430 
10431   /*
10432     F77_NAME(dsyevd,DSYEVD)()
10433 
10434     input:  n;
10435     input:  a[n][n];  matrix A
10436     output: a[n][n];  eigevectors
10437     output: ko[n];    eigenvalues
10438   */
10439 
10440   static char *name="Eigen_lapack";
10441 
10442   char  *JOBZ="V";
10443   char  *UPLO="L";
10444 
10445   INTEGER n=n0;
10446   INTEGER LDA=n;
10447   double VL,VU; /* dummy */
10448   INTEGER IL,IU;
10449   double ABSTOL=1.0e-13;
10450   INTEGER M;
10451 
10452   double *A;
10453   INTEGER LDZ=n;
10454   INTEGER LWORK,LIWORK;
10455   double *WORK;
10456   INTEGER *IWORK;
10457   INTEGER INFO;
10458 
10459   int i,j;
10460 
10461   A=(double*)malloc(sizeof(double)*n*n);
10462 
10463   LWORK=  1 + 6*n + 2*n*n;
10464   WORK=(double*)malloc(sizeof(double)*LWORK);
10465 
10466   LIWORK = 3 + 5*n;
10467   IWORK=(INTEGER*)malloc(sizeof(INTEGER)*LIWORK);
10468 
10469 
10470   IL = 1;
10471   IU = EVmax;
10472 
10473   for (i=0;i<n;i++) {
10474     for (j=0;j<n;j++) {
10475        A[i*n+j]= a[i+1][j+1];
10476     }
10477   }
10478 
10479 #if 0
10480   printf("A=\n");
10481   for (i=0;i<n;i++) {
10482     for (j=0;j<n;j++) {
10483        printf("%f ",A[i*n+j]);
10484     }
10485     printf("\n");
10486   }
10487   fflush(stdout);
10488 #endif
10489 
10490   F77_NAME(dsyevd,DSYEVD)( JOBZ, UPLO, &n, A, &LDA, ko, WORK, &LWORK, IWORK, &LIWORK, &INFO );
10491 
10492   if (INFO==0){
10493 
10494     /* store eigenvectors */
10495     for (i=0;i<EVmax;i++) {
10496       for (j=0;j<n;j++) {
10497 	/* a[i+1][j+1]= Z[i*n+j]; */
10498 	a[j+1][i+1]= A[i*n+j];
10499       }
10500     }
10501 
10502     /* shift ko by 1 */
10503     for (i=EVmax; i>=1; i--){
10504       ko[i]= ko[i-1];
10505     }
10506   }
10507 
10508   if (INFO!=0) {
10509     /* printf("\n%s: error in dsyevd_, info=%d\n\n",name,INFO); */
10510   }
10511 
10512   free(IWORK); free(WORK); free(A);
10513   return INFO;
10514 }
10515 
10516 
Eigen_lapack_r(double ** a,double * ko,int n0,int EVmax)10517 int Eigen_lapack_r(double **a, double *ko, int n0, int EVmax)
10518 {
10519 
10520   /*
10521     F77_NAME(dsyevr,DSYEVR)()
10522 
10523     input:  n;
10524     input:  a[n][n];  matrix A
10525     output: a[n][n];  eigevectors
10526     output: ko[n];    eigenvalues
10527   */
10528 
10529   static char *name="Eigen_lapack";
10530 
10531   char  *JOBZ="V";
10532   char  *RANGE="I";
10533   char  *UPLO="L";
10534 
10535   INTEGER n=n0;
10536   INTEGER LDA=n;
10537   double VL,VU; /* dummy */
10538   INTEGER IL,IU;
10539   double ABSTOL=1.0e-13;
10540   INTEGER M;
10541 
10542   double *A,*Z;
10543   INTEGER LDZ=n;
10544   INTEGER LWORK,LIWORK;
10545   double *WORK;
10546   INTEGER *IWORK;
10547   INTEGER *ISUPPZ;
10548   INTEGER INFO;
10549 
10550   int i,j;
10551 
10552   A=(double*)malloc(sizeof(double)*n*n);
10553   Z=(double*)malloc(sizeof(double)*n*n);
10554 
10555   LWORK= (n+16)*n;   /*  n*26 of (n+6)*n */
10556   WORK=(double*)malloc(sizeof(double)*LWORK);
10557 
10558   LIWORK = (n+1)*n;  /*  n*10 or ??? */
10559   IWORK=(INTEGER*)malloc(sizeof(INTEGER)*LIWORK);
10560 
10561   ISUPPZ =(INTEGER*)malloc(sizeof(INTEGER)*n*2);
10562 
10563   IL = 1;
10564   IU = EVmax;
10565 
10566   for (i=0;i<n;i++) {
10567     for (j=0;j<n;j++) {
10568        A[i*n+j]= a[i+1][j+1];
10569     }
10570   }
10571 
10572 #if 0
10573   printf("A=\n");
10574   for (i=0;i<n;i++) {
10575     for (j=0;j<n;j++) {
10576        printf("%f ",A[i*n+j]);
10577     }
10578     printf("\n");
10579   }
10580   fflush(stdout);
10581 #endif
10582 
10583   F77_NAME(dsyevr,DSYEVR)( JOBZ, RANGE, UPLO, &n, A, &LDA, &VL, &VU, &IL, &IU,
10584            &ABSTOL, &M, ko, Z, &LDZ, ISUPPZ, WORK, &LWORK,
10585            IWORK, &LIWORK, &INFO );
10586 
10587   if (INFO==0){
10588 
10589     /* store eigenvectors */
10590     for (i=0;i<EVmax;i++) {
10591       for (j=0;j<n;j++) {
10592 	/*  a[i+1][j+1]= Z[i*n+j]; */
10593 	a[j+1][i+1]= Z[i*n+j];
10594       }
10595     }
10596 
10597     /* shift ko by 1 */
10598     for (i=EVmax; i>=1; i--){
10599       ko[i]= ko[i-1];
10600     }
10601   }
10602 
10603   if (INFO!=0) {
10604      printf("\n%s: error in dsyevr_, info=%d\n\n",name,INFO);
10605      MPI_Finalize();
10606      exit(10);
10607   }
10608 
10609   free(ISUPPZ); free(IWORK); free(WORK); free(Z); free(A);
10610 
10611   return INFO;
10612 }
10613