1 /**********************************************************************
2   Set_Allocate_Atom2CPU.c:
3 
4     Set_Allocate_Atom2CPU.c is a subroutine to allocate atoms to processors
5     for the MPI parallel computation.
6 
7   Log of Set_Allocate_Atom2CPU.c:
8 
9      22/Nov/2001  Released by T.Ozaki
10 
11 ***********************************************************************/
12 
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <math.h>
16 #include <time.h>
17 #include "openmx_common.h"
18 #include "mpi.h"
19 
20 static void Output_Atom2CPU();
21 static void Allocation_Species();
22 static void Allocation_Atoms_3D(int MD_iter, int NL_switch);
23 
24 
25 
Set_Allocate_Atom2CPU(int MD_iter,int species_flag,int weight_flag)26 int Set_Allocate_Atom2CPU(int MD_iter, int species_flag, int weight_flag)
27 {
28   double time0;
29   time_t TStime,TEtime;
30 
31   time(&TStime);
32 
33   if (species_flag==1)
34     Allocation_Species();
35   else{
36     Allocation_Atoms_3D(MD_iter, weight_flag);
37   }
38 
39   time(&TEtime);
40   time0 = TEtime - TStime;
41   return time0;
42 }
43 
44 
45 
46 #pragma optimization_level 1
Allocation_Atoms_3D(int MD_iter,int weight_flag)47 void Allocation_Atoms_3D(int MD_iter, int weight_flag)
48 {
49   /***************************************
50         allocate atoms to processes
51       by Modified Recursive Bisection
52   ***************************************/
53   int m,i,j,k,k0,Na,np,ID,n0;
54   int myid,numprocs,numprocs0;
55   int max_depth,n,depth,child;
56   int **Num_Procs_in_Child;
57   int **Num_Atoms_in_Child;
58   int ***List_AN_in_Child;
59   int *MatomN;
60   double t,ax,ay,az,sum;
61   double w0,sumw,min_diff;
62   double WMatomnum,longest_time;
63   double ***List_T_in_Child;
64   double **IMT,*weight;
65   double *ko,*WMatomN;
66   double xyz_c[4];
67 
68   /* MPI */
69   MPI_Comm_size(mpi_comm_level1,&numprocs);
70   MPI_Comm_rank(mpi_comm_level1,&myid);
71 
72   /* set numprocs0 */
73   if (numprocs<atomnum) numprocs0 = numprocs;
74   else                  numprocs0 = atomnum;
75 
76   /* max_depth of level */
77 
78   if (numprocs==1 || atomnum==1)
79     max_depth = 0;
80   else
81     max_depth = (int)(log(((double)numprocs0-1.0+1.0e-7))/log(2.0)) + 1;
82 
83   /****************************
84       allocation of arrays
85   ****************************/
86 
87   Num_Procs_in_Child = (int**)malloc(sizeof(int*)*(max_depth+1));
88   n = 1;
89   for (depth=0; depth<(max_depth+1); depth++){
90     Num_Procs_in_Child[depth] = (int*)malloc(sizeof(int)*n);
91     n *= 2;
92   }
93 
94   Num_Atoms_in_Child = (int**)malloc(sizeof(int*)*(max_depth+1));
95   n = 1;
96   for (depth=0; depth<(max_depth+1); depth++){
97     Num_Atoms_in_Child[depth] = (int*)malloc(sizeof(int)*n);
98     for (i=0; i<n; i++) Num_Atoms_in_Child[depth][i] = 0;
99     n *= 2;
100   }
101 
102   IMT = (double**)malloc(sizeof(double*)*4);
103   for (i=0; i<4; i++){
104     IMT[i] = (double*)malloc(sizeof(double)*4);
105   }
106 
107   ko = (double*)malloc(sizeof(double)*4);
108 
109   weight = (double*)malloc(sizeof(double)*(atomnum+1));
110 
111   /* set weight */
112 
113   if (weight_flag==0){
114     for (i=1; i<=atomnum; i++){
115       weight[i] = 1.0;
116     }
117   }
118   else if (weight_flag==1){
119 
120     longest_time = 0.0;
121     for (i=1; i<=atomnum; i++){
122       if (longest_time<time_per_atom[i]) longest_time = time_per_atom[i];
123     }
124 
125     for (i=1; i<=atomnum; i++){
126       weight[i] = time_per_atom[i]/longest_time;
127     }
128   }
129 
130   /* set Num_Procs_in_Child */
131 
132   n = 2;
133   Num_Procs_in_Child[0][0] = numprocs0;
134 
135   for (depth=1; depth<(max_depth+1); depth++){
136     for (i=0; i<n; i++){
137 
138       if (i%2==0){
139         Num_Procs_in_Child[depth][i] = (Num_Procs_in_Child[depth-1][i/2]-1)/2+1;
140       }
141       else{
142         Num_Procs_in_Child[depth][i] = Num_Procs_in_Child[depth-1][i/2]-Num_Procs_in_Child[depth][i-1];
143       }
144     }
145     n *= 2;
146   }
147 
148   /* set Num_Atoms_in_Child at depth=0 */
149 
150   depth = 0; child = 0;
151   Num_Atoms_in_Child[depth][child] = atomnum;
152 
153   /***************************************************************
154    modified recursive bisection to set AN_in_Child at each depth
155   ***************************************************************/
156 
157   /**************************************************************************
158    Since the size of the last index of List_AN_in_Child and List_T_in_Child
159    is determined by the modified recursive bisection, they are allocated
160    on-the-fly.
161   **************************************************************************/
162 
163   /* allocation of List_AN_in_Child and List_T_in_Child */
164   List_AN_in_Child = (int***)malloc(sizeof(int**)*(max_depth+1));
165   List_T_in_Child = (double***)malloc(sizeof(double**)*(max_depth+1));
166   List_AN_in_Child[0] = (int**)malloc(sizeof(int*)*1);
167   List_T_in_Child[0] = (double**)malloc(sizeof(double*)*1);
168   List_AN_in_Child[0][0] = (int*)malloc(sizeof(int)*(atomnum+1));
169   List_T_in_Child[0][0]  = (double*)malloc(sizeof(double)*(atomnum+1));
170   for (k=0; k<atomnum; k++)  List_AN_in_Child[depth][0][k] = k+1;
171 
172   n = 1;
173   for (depth=0; depth<max_depth; depth++){
174 
175     /* allocation of List_AN_in_Child and List_T_in_Child */
176     List_AN_in_Child[depth+1] = (int**)malloc(sizeof(int*)*n*2);
177     List_T_in_Child[depth+1] = (double**)malloc(sizeof(double*)*n*2);
178 
179     /**********************************************************
180      reordering of atoms at depth using the inertia tensor
181     **********************************************************/
182 
183     for (child=0; child<n; child++){
184 
185       /* get the number of atoms in the child */
186 
187       Na = Num_Atoms_in_Child[depth][child];
188 
189       /* calculate the centroid of atoms in the child */
190 
191       xyz_c[1] = 0.0;
192       xyz_c[2] = 0.0;
193       xyz_c[3] = 0.0;
194 
195       for (k=0; k<Na; k++){
196         m = List_AN_in_Child[depth][child][k];
197         xyz_c[1] += Gxyz[m][1]*weight[m];
198         xyz_c[2] += Gxyz[m][2]*weight[m];
199         xyz_c[3] += Gxyz[m][3]*weight[m];
200       }
201 
202       xyz_c[1] /= (double)Na;
203       xyz_c[2] /= (double)Na;
204       xyz_c[3] /= (double)Na;
205 
206       /* make inertia moment tensor */
207 
208       for (i=1; i<=3; i++){
209         for (j=1; j<=3; j++){
210 
211           sum = 0.0;
212           for (k=0; k<Na; k++){
213 	    m = List_AN_in_Child[depth][child][k];
214 	    sum += weight[m]*(Gxyz[m][i]-xyz_c[i])*(Gxyz[m][j]-xyz_c[j]);
215 	  }
216 
217           IMT[i][j] = sum;
218         }
219       }
220 
221       /* diagonalize the inertia moment tensor */
222 
223       Eigen_lapack(IMT,ko,3,3);
224 
225       /* find the principal axis */
226 
227       ax = IMT[1][3];
228       ay = IMT[2][3];
229       az = IMT[3][3];
230 
231       /* calculate the intervening variable, t */
232 
233       for (k=0; k<Na; k++){
234         m = List_AN_in_Child[depth][child][k];
235         t = ax*(Gxyz[m][1]-xyz_c[1]) + ay*(Gxyz[m][2]-xyz_c[2]) + az*(Gxyz[m][3]-xyz_c[3]);
236         List_T_in_Child[depth][child][k] = t;
237       }
238 
239       /* sorting atoms in the child based on t */
240 
241       qsort_double_int((long)Na,List_T_in_Child[depth][child],List_AN_in_Child[depth][child]);
242 
243       /* calculate the sum of weight in the child */
244 
245       sumw = 0.0;
246       for (k=0; k<Na; k++){
247         m = List_AN_in_Child[depth][child][k];
248         sumw += weight[m];
249       }
250 
251       /* find atomic index at which the bisection is made. */
252 
253       np = Num_Procs_in_Child[depth+1][2*child] + Num_Procs_in_Child[depth+1][2*child+1];
254       w0 = (sumw*(double)Num_Procs_in_Child[depth+1][2*child])/(double)np;
255 
256       sumw = 0.0;
257       min_diff = 10000000;
258       for (k=0; k<Na; k++){
259         m = List_AN_in_Child[depth][child][k];
260         sumw += weight[m];
261         if (fabs(w0-sumw)<min_diff){
262           min_diff = fabs(w0-sumw);
263           k0 = k;
264         }
265       }
266 
267       /* adjust k0 to avoid the case that (# of atoms)<(# of processes) */
268 
269       if ( (((k0+1)<Num_Procs_in_Child[depth+1][2*child])
270            ||
271            ((Na-(k0+1))<Num_Procs_in_Child[depth+1][2*child+1]))
272            &&
273            1<Na ){
274 
275         k0 = Num_Procs_in_Child[depth+1][2*child] - 1;
276       }
277 
278       /* bisection of atoms in the child based on Num_Procs_in_Child */
279 
280       Num_Atoms_in_Child[depth+1][2*child  ] = k0 + 1;
281       Num_Atoms_in_Child[depth+1][2*child+1] = Na - (k0+1);
282 
283       /* allocation of List_AN_in_Child and List_T_in_Child */
284       List_AN_in_Child[depth+1][2*child]   = (int*)malloc(sizeof(int)*Num_Atoms_in_Child[depth+1][2*child]);
285       List_T_in_Child[depth+1][2*child]   = (double*)malloc(sizeof(double)*Num_Atoms_in_Child[depth+1][2*child]);
286       List_AN_in_Child[depth+1][2*child+1] = (int*)malloc(sizeof(int)*Num_Atoms_in_Child[depth+1][2*child+1]);
287       List_T_in_Child[depth+1][2*child+1] = (double*)malloc(sizeof(double)*Num_Atoms_in_Child[depth+1][2*child+1]);
288 
289       /* copy depth -> depth+1 */
290 
291       for (k=0; k<Num_Atoms_in_Child[depth+1][2*child]; k++){
292         List_AN_in_Child[depth+1][2*child][k] = List_AN_in_Child[depth][child][k];
293       }
294 
295       for (k=0; k<Num_Atoms_in_Child[depth+1][2*child+1]; k++){
296         m = Num_Atoms_in_Child[depth+1][2*child]+k;
297         List_AN_in_Child[depth+1][2*child+1][k] = List_AN_in_Child[depth][child][m];
298       }
299 
300     } /* child */
301 
302     /* doubling of n */
303     n *= 2;
304 
305   } /* depth */
306 
307   /*
308   if (myid==0){
309     n = 1;
310     for (depth=0; depth<=max_depth; depth++){
311       for (child=0; child<n; child++){
312 
313 	Na = Num_Atoms_in_Child[depth][child];
314 
315 	for (k=0; k<Na; k++){
316 	  m = List_AN_in_Child[depth][child][k];
317 	  t = List_T_in_Child[depth][child][k];
318 	  printf("depth=%2d child=%2d k=%2d m=%2d t=%15.12f\n",depth,child,k,m,t);
319 	}
320       }
321       n *= 2;
322     }
323   }
324   */
325 
326   /***************************************************************
327                  allocation of atoms to processes
328   ***************************************************************/
329 
330   /*
331   sorting of atoms in each child at max_depth.
332   if the sorting is not performed, the force calculations
333   related to HNL3 and 4B will be failed.
334   */
335 
336   n = 1;
337   for (depth=0; depth<max_depth; depth++) n *= 2;
338 
339   for (child=0; child<n; child++){
340     Na = Num_Atoms_in_Child[max_depth][child];
341     qsort_int1((long)Na,List_AN_in_Child[depth][child]);
342   }
343 
344   /* set G2ID, M2G, Matomnum, and WMatomnum */
345 
346   n = 1;
347   for (depth=0; depth<max_depth; depth++) n *= 2;
348 
349   Matomnum = 0;
350   WMatomnum = 0.0;
351   ID = 0;
352 
353   for (child=0; child<n; child++){
354 
355     Na = Num_Atoms_in_Child[max_depth][child];
356 
357     if (Na!=0){
358 
359       for (k=0; k<Na; k++){
360         m = List_AN_in_Child[max_depth][child][k];
361         G2ID[m] = ID;
362       }
363 
364       if (myid==ID){
365 
366         Matomnum = Na;
367         if (alloc_first[10]==0) free(M2G);
368 	M2G = (int*)malloc(sizeof(int)*(Matomnum+2));
369 	alloc_first[10] = 0;
370 
371         for (k=0; k<Na; k++){
372           m = List_AN_in_Child[max_depth][child][k];
373           M2G[k+1] = m;
374         }
375 
376         WMatomnum = 0.0;
377         for (k=0; k<Na; k++){
378           m = List_AN_in_Child[max_depth][child][k];
379           WMatomnum += weight[m];
380         }
381       }
382 
383       ID++;
384     }
385   }
386 
387   /****************************************
388     find Max_Matomnum, MatomN and WMatomN
389   ****************************************/
390 
391   MatomN = (int*)malloc(sizeof(int)*numprocs);
392   WMatomN = (double*)malloc(sizeof(double)*numprocs);
393 
394   MatomN[myid]  = Matomnum;
395   WMatomN[myid] = WMatomnum;
396 
397   for (ID=0; ID<numprocs; ID++){
398     MPI_Bcast(&MatomN[ID],  1, MPI_INT, ID, mpi_comm_level1);
399     MPI_Bcast(&WMatomN[ID], 1, MPI_DOUBLE, ID, mpi_comm_level1);
400   }
401 
402   /* find Max_Matomnum */
403 
404   Max_Matomnum = 0;
405   for (ID=0; ID<numprocs; ID++){
406     if (Max_Matomnum<MatomN[ID]) Max_Matomnum = MatomN[ID];
407   }
408 
409   /*********************************************
410               print the information
411   *********************************************/
412 
413   if (myid==Host_ID && 1<=MD_iter && 0<level_stdout){
414     printf("\n");
415     printf("*******************************************************\n");
416     printf("  Allocation of atoms to proccesors at MD_iter=%5d     \n", MD_iter );
417     printf("*******************************************************\n\n");
418   }
419 
420   for (ID=0; ID<numprocs; ID++){
421 
422     if (myid==Host_ID && 1<=MD_iter && 0<level_stdout){
423       printf(" proc = %3d  # of atoms=%4d  estimated weight=%16.5f\n",
424 	     ID,MatomN[ID],WMatomN[ID]);
425     }
426   }
427 
428   if (myid==Host_ID && 1<=MD_iter && 0<level_stdout) printf("\n\n\n");
429 
430   /****************************************
431          initialize time_per_atom
432   ****************************************/
433 
434   for (k=1; k<=atomnum; k++) time_per_atom[k] = 0.0;
435 
436   /*
437   if (myid==Host_ID){
438 
439     int i,k;
440     char AtomName[38][10]=
441       {"E","H","He","Li","Be","B","C","N","O","F","Ne","Na","Mg","Al","Si",
442        "P","S","Cl","Ar","K","Ca","Sc","Ti","V","Cr","Mn","Fe","Co","Ni","Cu","Zn",
443        "Ga","Ge","As","Se","Br","Kr"
444       };
445 
446     for (i=1; i<=atomnum; i++){
447       k = (G2ID[i])%36+4;
448       printf("%5s  %15.12f %15.12f %15.12f  0.0 0.0 0.0\n",
449                AtomName[k],Gxyz[i][1]*BohrR,Gxyz[i][2]*BohrR,Gxyz[i][3]*BohrR);
450     }
451   }
452   MPI_Finalize();
453   exit(0);
454   */
455 
456   /*
457   {
458     int i1,i2,k1,k2;
459     double dx,dy,dz,r,rmax;
460 
461     rmax = 0.0;
462     for (i1=1; i1<=Matomnum; i1++){
463       k1 = M2G[i1];
464 
465       for (i2=1; i2<=Matomnum; i2++){
466         k2 = M2G[i2];
467 
468         dx = Gxyz[k1][1] - Gxyz[k2][1];
469         dy = Gxyz[k1][2] - Gxyz[k2][2];
470         dz = Gxyz[k1][3] - Gxyz[k2][3];
471         r = sqrt(dx*dx+dy*dy+dz*dz);
472         if (rmax<r) rmax = r;
473       }
474     }
475 
476     printf("ABC1 myid=%2d rmax=%15.12f\n",myid,rmax);fflush(stdout);
477   }
478 
479   MPI_Finalize();
480   exit(0);
481   */
482 
483 
484   /****************************************
485             freeing of arrays
486   ****************************************/
487 
488   free(WMatomN);
489   free(MatomN);
490 
491   free(List_AN_in_Child[0][0]);
492   free(List_T_in_Child[0][0]);
493   free(List_AN_in_Child[0]);
494   free(List_T_in_Child[0]);
495 
496   n = 1;
497   for (depth=0; depth<max_depth; depth++){
498 
499     for (i=0; i<n*2; i++){
500       free(List_T_in_Child[depth+1][i]);
501       free(List_AN_in_Child[depth+1][i]);
502     }
503     free(List_T_in_Child[depth+1]);
504     free(List_AN_in_Child[depth+1]);
505 
506     n *= 2;
507   }
508   free(List_T_in_Child);
509   free(List_AN_in_Child);
510 
511   free(weight);
512   free(ko);
513 
514   for (i=0; i<4; i++){
515     free(IMT[i]);
516   }
517   free(IMT);
518 
519   for (depth=0; depth<(max_depth+1); depth++){
520     free(Num_Atoms_in_Child[depth]);
521   }
522   free(Num_Atoms_in_Child);
523 
524   for (depth=0; depth<(max_depth+1); depth++){
525     free(Num_Procs_in_Child[depth]);
526   }
527   free(Num_Procs_in_Child);
528 }
529 
530 
531 #pragma optimization_level 1
Allocation_Species()532 void Allocation_Species()
533 {
534   int i,num1,num2;
535   int numprocs,myid;
536   /* MPI */
537   MPI_Comm_size(mpi_comm_level1,&numprocs);
538   MPI_Comm_rank(mpi_comm_level1,&myid);
539 
540   /****************************************************
541                   partition of species
542   ****************************************************/
543 
544   for (i=0; i<numprocs; i++){
545     Species_Top[i] = 0;
546     Species_End[i] = 0;
547   }
548 
549   if (SpeciesNum<numprocs) Num_Procs2 = SpeciesNum;
550   else                     Num_Procs2 = numprocs;
551 
552   num1 = SpeciesNum/Num_Procs2;
553   num2 = SpeciesNum%Num_Procs2;
554   for (i=0; i<Num_Procs2; i++){
555     Species_Top[i] = num1*i;
556     Species_End[i] = num1*(i + 1) - 1;
557   }
558   if (num2!=0){
559     for (i=0; i<num2; i++){
560       Species_Top[i] = Species_Top[i] + i;
561       Species_End[i] = Species_End[i] + i + 1;
562     }
563     for (i=num2; i<Num_Procs2; i++){
564       Species_Top[i] = Species_Top[i] + num2;
565       Species_End[i] = Species_End[i] + num2;
566     }
567   }
568 
569   if (myid==Host_ID && 2<=level_stdout){
570     for (i=0; i<Num_Procs2; i++){
571       printf("proc=%4d  Species_Top=%4d  Species_End=%4d\n",
572 	     i,Species_Top[i],Species_End[i]);
573     }
574   }
575 
576   if (myid<Num_Procs2)
577     MSpeciesNum = Species_End[myid] - Species_Top[myid] + 1;
578   else
579     MSpeciesNum = 0;
580 
581   if (myid==Host_ID && 2<=level_stdout){
582     printf("myid=%i  MSpeciesNum=%i\n",myid,MSpeciesNum);
583   }
584 
585 }
586 
587 
588 
589 
590 
591 
592 
593 
594 
595 /*
596 void Output_Atom2CPU()
597 {
598 
599   int i,numprocs;
600   char file_A2C[YOUSO10] = ".A2C";
601   FILE *fp;
602 
603   MPI_Comm_size(mpi_comm_level1,&numprocs);
604 
605   fnjoint(filepath,filename,file_A2C);
606 
607   if ((fp = fopen(file_A2C,"w")) != NULL){
608 
609     fprintf(fp,"\n");
610     fprintf(fp,"***********************************************************\n");
611     fprintf(fp,"***********************************************************\n");
612     fprintf(fp,"               Allocation of atoms to CPUs                 \n");
613     fprintf(fp,"***********************************************************\n");
614     fprintf(fp,"***********************************************************\n");
615     fprintf(fp,"\n");
616 
617     fprintf(fp,"   Average weight = %5.4f\n\n",eachw);
618     fprintf(fp,"   CPU    Atoms    Top     End     CPU_Weight\n");
619     for (i=0; i<numprocs; i++){
620       fprintf(fp,"  %4d   %4d    %4d    %4d    %4d\n",
621               i,
622               Gatom_End[i]-Gatom_Top[i]+1,
623               Gatom_Top[i],
624               Gatom_End[i],
625               CPU_Weight[i]);
626     }
627     fclose(fp);
628   }
629   else
630     printf("Failure of saving the Set_Allocate_Atom2CPU.\n");
631 }
632 */
633