1 /**********************************************************************
2 Set_Allocate_Atom2CPU.c:
3
4 Set_Allocate_Atom2CPU.c is a subroutine to allocate atoms to processors
5 for the MPI parallel computation.
6
7 Log of Set_Allocate_Atom2CPU.c:
8
9 22/Nov/2001 Released by T.Ozaki
10
11 ***********************************************************************/
12
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <math.h>
16 #include <time.h>
17 #include "openmx_common.h"
18 #include "mpi.h"
19
20 static void Output_Atom2CPU();
21 static void Allocation_Species();
22 static void Allocation_Atoms_3D(int MD_iter, int NL_switch);
23
24
25
Set_Allocate_Atom2CPU(int MD_iter,int species_flag,int weight_flag)26 int Set_Allocate_Atom2CPU(int MD_iter, int species_flag, int weight_flag)
27 {
28 double time0;
29 time_t TStime,TEtime;
30
31 time(&TStime);
32
33 if (species_flag==1)
34 Allocation_Species();
35 else{
36 Allocation_Atoms_3D(MD_iter, weight_flag);
37 }
38
39 time(&TEtime);
40 time0 = TEtime - TStime;
41 return time0;
42 }
43
44
45
46 #pragma optimization_level 1
Allocation_Atoms_3D(int MD_iter,int weight_flag)47 void Allocation_Atoms_3D(int MD_iter, int weight_flag)
48 {
49 /***************************************
50 allocate atoms to processes
51 by Modified Recursive Bisection
52 ***************************************/
53 int m,i,j,k,k0,Na,np,ID,n0;
54 int myid,numprocs,numprocs0;
55 int max_depth,n,depth,child;
56 int **Num_Procs_in_Child;
57 int **Num_Atoms_in_Child;
58 int ***List_AN_in_Child;
59 int *MatomN;
60 double t,ax,ay,az,sum;
61 double w0,sumw,min_diff;
62 double WMatomnum,longest_time;
63 double ***List_T_in_Child;
64 double **IMT,*weight;
65 double *ko,*WMatomN;
66 double xyz_c[4];
67
68 /* MPI */
69 MPI_Comm_size(mpi_comm_level1,&numprocs);
70 MPI_Comm_rank(mpi_comm_level1,&myid);
71
72 /* set numprocs0 */
73 if (numprocs<atomnum) numprocs0 = numprocs;
74 else numprocs0 = atomnum;
75
76 /* max_depth of level */
77
78 if (numprocs==1 || atomnum==1)
79 max_depth = 0;
80 else
81 max_depth = (int)(log(((double)numprocs0-1.0+1.0e-7))/log(2.0)) + 1;
82
83 /****************************
84 allocation of arrays
85 ****************************/
86
87 Num_Procs_in_Child = (int**)malloc(sizeof(int*)*(max_depth+1));
88 n = 1;
89 for (depth=0; depth<(max_depth+1); depth++){
90 Num_Procs_in_Child[depth] = (int*)malloc(sizeof(int)*n);
91 n *= 2;
92 }
93
94 Num_Atoms_in_Child = (int**)malloc(sizeof(int*)*(max_depth+1));
95 n = 1;
96 for (depth=0; depth<(max_depth+1); depth++){
97 Num_Atoms_in_Child[depth] = (int*)malloc(sizeof(int)*n);
98 for (i=0; i<n; i++) Num_Atoms_in_Child[depth][i] = 0;
99 n *= 2;
100 }
101
102 IMT = (double**)malloc(sizeof(double*)*4);
103 for (i=0; i<4; i++){
104 IMT[i] = (double*)malloc(sizeof(double)*4);
105 }
106
107 ko = (double*)malloc(sizeof(double)*4);
108
109 weight = (double*)malloc(sizeof(double)*(atomnum+1));
110
111 /* set weight */
112
113 if (weight_flag==0){
114 for (i=1; i<=atomnum; i++){
115 weight[i] = 1.0;
116 }
117 }
118 else if (weight_flag==1){
119
120 longest_time = 0.0;
121 for (i=1; i<=atomnum; i++){
122 if (longest_time<time_per_atom[i]) longest_time = time_per_atom[i];
123 }
124
125 for (i=1; i<=atomnum; i++){
126 weight[i] = time_per_atom[i]/longest_time;
127 }
128 }
129
130 /* set Num_Procs_in_Child */
131
132 n = 2;
133 Num_Procs_in_Child[0][0] = numprocs0;
134
135 for (depth=1; depth<(max_depth+1); depth++){
136 for (i=0; i<n; i++){
137
138 if (i%2==0){
139 Num_Procs_in_Child[depth][i] = (Num_Procs_in_Child[depth-1][i/2]-1)/2+1;
140 }
141 else{
142 Num_Procs_in_Child[depth][i] = Num_Procs_in_Child[depth-1][i/2]-Num_Procs_in_Child[depth][i-1];
143 }
144 }
145 n *= 2;
146 }
147
148 /* set Num_Atoms_in_Child at depth=0 */
149
150 depth = 0; child = 0;
151 Num_Atoms_in_Child[depth][child] = atomnum;
152
153 /***************************************************************
154 modified recursive bisection to set AN_in_Child at each depth
155 ***************************************************************/
156
157 /**************************************************************************
158 Since the size of the last index of List_AN_in_Child and List_T_in_Child
159 is determined by the modified recursive bisection, they are allocated
160 on-the-fly.
161 **************************************************************************/
162
163 /* allocation of List_AN_in_Child and List_T_in_Child */
164 List_AN_in_Child = (int***)malloc(sizeof(int**)*(max_depth+1));
165 List_T_in_Child = (double***)malloc(sizeof(double**)*(max_depth+1));
166 List_AN_in_Child[0] = (int**)malloc(sizeof(int*)*1);
167 List_T_in_Child[0] = (double**)malloc(sizeof(double*)*1);
168 List_AN_in_Child[0][0] = (int*)malloc(sizeof(int)*(atomnum+1));
169 List_T_in_Child[0][0] = (double*)malloc(sizeof(double)*(atomnum+1));
170 for (k=0; k<atomnum; k++) List_AN_in_Child[depth][0][k] = k+1;
171
172 n = 1;
173 for (depth=0; depth<max_depth; depth++){
174
175 /* allocation of List_AN_in_Child and List_T_in_Child */
176 List_AN_in_Child[depth+1] = (int**)malloc(sizeof(int*)*n*2);
177 List_T_in_Child[depth+1] = (double**)malloc(sizeof(double*)*n*2);
178
179 /**********************************************************
180 reordering of atoms at depth using the inertia tensor
181 **********************************************************/
182
183 for (child=0; child<n; child++){
184
185 /* get the number of atoms in the child */
186
187 Na = Num_Atoms_in_Child[depth][child];
188
189 /* calculate the centroid of atoms in the child */
190
191 xyz_c[1] = 0.0;
192 xyz_c[2] = 0.0;
193 xyz_c[3] = 0.0;
194
195 for (k=0; k<Na; k++){
196 m = List_AN_in_Child[depth][child][k];
197 xyz_c[1] += Gxyz[m][1]*weight[m];
198 xyz_c[2] += Gxyz[m][2]*weight[m];
199 xyz_c[3] += Gxyz[m][3]*weight[m];
200 }
201
202 xyz_c[1] /= (double)Na;
203 xyz_c[2] /= (double)Na;
204 xyz_c[3] /= (double)Na;
205
206 /* make inertia moment tensor */
207
208 for (i=1; i<=3; i++){
209 for (j=1; j<=3; j++){
210
211 sum = 0.0;
212 for (k=0; k<Na; k++){
213 m = List_AN_in_Child[depth][child][k];
214 sum += weight[m]*(Gxyz[m][i]-xyz_c[i])*(Gxyz[m][j]-xyz_c[j]);
215 }
216
217 IMT[i][j] = sum;
218 }
219 }
220
221 /* diagonalize the inertia moment tensor */
222
223 Eigen_lapack(IMT,ko,3,3);
224
225 /* find the principal axis */
226
227 ax = IMT[1][3];
228 ay = IMT[2][3];
229 az = IMT[3][3];
230
231 /* calculate the intervening variable, t */
232
233 for (k=0; k<Na; k++){
234 m = List_AN_in_Child[depth][child][k];
235 t = ax*(Gxyz[m][1]-xyz_c[1]) + ay*(Gxyz[m][2]-xyz_c[2]) + az*(Gxyz[m][3]-xyz_c[3]);
236 List_T_in_Child[depth][child][k] = t;
237 }
238
239 /* sorting atoms in the child based on t */
240
241 qsort_double_int((long)Na,List_T_in_Child[depth][child],List_AN_in_Child[depth][child]);
242
243 /* calculate the sum of weight in the child */
244
245 sumw = 0.0;
246 for (k=0; k<Na; k++){
247 m = List_AN_in_Child[depth][child][k];
248 sumw += weight[m];
249 }
250
251 /* find atomic index at which the bisection is made. */
252
253 np = Num_Procs_in_Child[depth+1][2*child] + Num_Procs_in_Child[depth+1][2*child+1];
254 w0 = (sumw*(double)Num_Procs_in_Child[depth+1][2*child])/(double)np;
255
256 sumw = 0.0;
257 min_diff = 10000000;
258 for (k=0; k<Na; k++){
259 m = List_AN_in_Child[depth][child][k];
260 sumw += weight[m];
261 if (fabs(w0-sumw)<min_diff){
262 min_diff = fabs(w0-sumw);
263 k0 = k;
264 }
265 }
266
267 /* adjust k0 to avoid the case that (# of atoms)<(# of processes) */
268
269 if ( (((k0+1)<Num_Procs_in_Child[depth+1][2*child])
270 ||
271 ((Na-(k0+1))<Num_Procs_in_Child[depth+1][2*child+1]))
272 &&
273 1<Na ){
274
275 k0 = Num_Procs_in_Child[depth+1][2*child] - 1;
276 }
277
278 /* bisection of atoms in the child based on Num_Procs_in_Child */
279
280 Num_Atoms_in_Child[depth+1][2*child ] = k0 + 1;
281 Num_Atoms_in_Child[depth+1][2*child+1] = Na - (k0+1);
282
283 /* allocation of List_AN_in_Child and List_T_in_Child */
284 List_AN_in_Child[depth+1][2*child] = (int*)malloc(sizeof(int)*Num_Atoms_in_Child[depth+1][2*child]);
285 List_T_in_Child[depth+1][2*child] = (double*)malloc(sizeof(double)*Num_Atoms_in_Child[depth+1][2*child]);
286 List_AN_in_Child[depth+1][2*child+1] = (int*)malloc(sizeof(int)*Num_Atoms_in_Child[depth+1][2*child+1]);
287 List_T_in_Child[depth+1][2*child+1] = (double*)malloc(sizeof(double)*Num_Atoms_in_Child[depth+1][2*child+1]);
288
289 /* copy depth -> depth+1 */
290
291 for (k=0; k<Num_Atoms_in_Child[depth+1][2*child]; k++){
292 List_AN_in_Child[depth+1][2*child][k] = List_AN_in_Child[depth][child][k];
293 }
294
295 for (k=0; k<Num_Atoms_in_Child[depth+1][2*child+1]; k++){
296 m = Num_Atoms_in_Child[depth+1][2*child]+k;
297 List_AN_in_Child[depth+1][2*child+1][k] = List_AN_in_Child[depth][child][m];
298 }
299
300 } /* child */
301
302 /* doubling of n */
303 n *= 2;
304
305 } /* depth */
306
307 /*
308 if (myid==0){
309 n = 1;
310 for (depth=0; depth<=max_depth; depth++){
311 for (child=0; child<n; child++){
312
313 Na = Num_Atoms_in_Child[depth][child];
314
315 for (k=0; k<Na; k++){
316 m = List_AN_in_Child[depth][child][k];
317 t = List_T_in_Child[depth][child][k];
318 printf("depth=%2d child=%2d k=%2d m=%2d t=%15.12f\n",depth,child,k,m,t);
319 }
320 }
321 n *= 2;
322 }
323 }
324 */
325
326 /***************************************************************
327 allocation of atoms to processes
328 ***************************************************************/
329
330 /*
331 sorting of atoms in each child at max_depth.
332 if the sorting is not performed, the force calculations
333 related to HNL3 and 4B will be failed.
334 */
335
336 n = 1;
337 for (depth=0; depth<max_depth; depth++) n *= 2;
338
339 for (child=0; child<n; child++){
340 Na = Num_Atoms_in_Child[max_depth][child];
341 qsort_int1((long)Na,List_AN_in_Child[depth][child]);
342 }
343
344 /* set G2ID, M2G, Matomnum, and WMatomnum */
345
346 n = 1;
347 for (depth=0; depth<max_depth; depth++) n *= 2;
348
349 Matomnum = 0;
350 WMatomnum = 0.0;
351 ID = 0;
352
353 for (child=0; child<n; child++){
354
355 Na = Num_Atoms_in_Child[max_depth][child];
356
357 if (Na!=0){
358
359 for (k=0; k<Na; k++){
360 m = List_AN_in_Child[max_depth][child][k];
361 G2ID[m] = ID;
362 }
363
364 if (myid==ID){
365
366 Matomnum = Na;
367 if (alloc_first[10]==0) free(M2G);
368 M2G = (int*)malloc(sizeof(int)*(Matomnum+2));
369 alloc_first[10] = 0;
370
371 for (k=0; k<Na; k++){
372 m = List_AN_in_Child[max_depth][child][k];
373 M2G[k+1] = m;
374 }
375
376 WMatomnum = 0.0;
377 for (k=0; k<Na; k++){
378 m = List_AN_in_Child[max_depth][child][k];
379 WMatomnum += weight[m];
380 }
381 }
382
383 ID++;
384 }
385 }
386
387 /****************************************
388 find Max_Matomnum, MatomN and WMatomN
389 ****************************************/
390
391 MatomN = (int*)malloc(sizeof(int)*numprocs);
392 WMatomN = (double*)malloc(sizeof(double)*numprocs);
393
394 MatomN[myid] = Matomnum;
395 WMatomN[myid] = WMatomnum;
396
397 for (ID=0; ID<numprocs; ID++){
398 MPI_Bcast(&MatomN[ID], 1, MPI_INT, ID, mpi_comm_level1);
399 MPI_Bcast(&WMatomN[ID], 1, MPI_DOUBLE, ID, mpi_comm_level1);
400 }
401
402 /* find Max_Matomnum */
403
404 Max_Matomnum = 0;
405 for (ID=0; ID<numprocs; ID++){
406 if (Max_Matomnum<MatomN[ID]) Max_Matomnum = MatomN[ID];
407 }
408
409 /*********************************************
410 print the information
411 *********************************************/
412
413 if (myid==Host_ID && 1<=MD_iter && 0<level_stdout){
414 printf("\n");
415 printf("*******************************************************\n");
416 printf(" Allocation of atoms to proccesors at MD_iter=%5d \n", MD_iter );
417 printf("*******************************************************\n\n");
418 }
419
420 for (ID=0; ID<numprocs; ID++){
421
422 if (myid==Host_ID && 1<=MD_iter && 0<level_stdout){
423 printf(" proc = %3d # of atoms=%4d estimated weight=%16.5f\n",
424 ID,MatomN[ID],WMatomN[ID]);
425 }
426 }
427
428 if (myid==Host_ID && 1<=MD_iter && 0<level_stdout) printf("\n\n\n");
429
430 /****************************************
431 initialize time_per_atom
432 ****************************************/
433
434 for (k=1; k<=atomnum; k++) time_per_atom[k] = 0.0;
435
436 /*
437 if (myid==Host_ID){
438
439 int i,k;
440 char AtomName[38][10]=
441 {"E","H","He","Li","Be","B","C","N","O","F","Ne","Na","Mg","Al","Si",
442 "P","S","Cl","Ar","K","Ca","Sc","Ti","V","Cr","Mn","Fe","Co","Ni","Cu","Zn",
443 "Ga","Ge","As","Se","Br","Kr"
444 };
445
446 for (i=1; i<=atomnum; i++){
447 k = (G2ID[i])%36+4;
448 printf("%5s %15.12f %15.12f %15.12f 0.0 0.0 0.0\n",
449 AtomName[k],Gxyz[i][1]*BohrR,Gxyz[i][2]*BohrR,Gxyz[i][3]*BohrR);
450 }
451 }
452 MPI_Finalize();
453 exit(0);
454 */
455
456 /*
457 {
458 int i1,i2,k1,k2;
459 double dx,dy,dz,r,rmax;
460
461 rmax = 0.0;
462 for (i1=1; i1<=Matomnum; i1++){
463 k1 = M2G[i1];
464
465 for (i2=1; i2<=Matomnum; i2++){
466 k2 = M2G[i2];
467
468 dx = Gxyz[k1][1] - Gxyz[k2][1];
469 dy = Gxyz[k1][2] - Gxyz[k2][2];
470 dz = Gxyz[k1][3] - Gxyz[k2][3];
471 r = sqrt(dx*dx+dy*dy+dz*dz);
472 if (rmax<r) rmax = r;
473 }
474 }
475
476 printf("ABC1 myid=%2d rmax=%15.12f\n",myid,rmax);fflush(stdout);
477 }
478
479 MPI_Finalize();
480 exit(0);
481 */
482
483
484 /****************************************
485 freeing of arrays
486 ****************************************/
487
488 free(WMatomN);
489 free(MatomN);
490
491 free(List_AN_in_Child[0][0]);
492 free(List_T_in_Child[0][0]);
493 free(List_AN_in_Child[0]);
494 free(List_T_in_Child[0]);
495
496 n = 1;
497 for (depth=0; depth<max_depth; depth++){
498
499 for (i=0; i<n*2; i++){
500 free(List_T_in_Child[depth+1][i]);
501 free(List_AN_in_Child[depth+1][i]);
502 }
503 free(List_T_in_Child[depth+1]);
504 free(List_AN_in_Child[depth+1]);
505
506 n *= 2;
507 }
508 free(List_T_in_Child);
509 free(List_AN_in_Child);
510
511 free(weight);
512 free(ko);
513
514 for (i=0; i<4; i++){
515 free(IMT[i]);
516 }
517 free(IMT);
518
519 for (depth=0; depth<(max_depth+1); depth++){
520 free(Num_Atoms_in_Child[depth]);
521 }
522 free(Num_Atoms_in_Child);
523
524 for (depth=0; depth<(max_depth+1); depth++){
525 free(Num_Procs_in_Child[depth]);
526 }
527 free(Num_Procs_in_Child);
528 }
529
530
531 #pragma optimization_level 1
Allocation_Species()532 void Allocation_Species()
533 {
534 int i,num1,num2;
535 int numprocs,myid;
536 /* MPI */
537 MPI_Comm_size(mpi_comm_level1,&numprocs);
538 MPI_Comm_rank(mpi_comm_level1,&myid);
539
540 /****************************************************
541 partition of species
542 ****************************************************/
543
544 for (i=0; i<numprocs; i++){
545 Species_Top[i] = 0;
546 Species_End[i] = 0;
547 }
548
549 if (SpeciesNum<numprocs) Num_Procs2 = SpeciesNum;
550 else Num_Procs2 = numprocs;
551
552 num1 = SpeciesNum/Num_Procs2;
553 num2 = SpeciesNum%Num_Procs2;
554 for (i=0; i<Num_Procs2; i++){
555 Species_Top[i] = num1*i;
556 Species_End[i] = num1*(i + 1) - 1;
557 }
558 if (num2!=0){
559 for (i=0; i<num2; i++){
560 Species_Top[i] = Species_Top[i] + i;
561 Species_End[i] = Species_End[i] + i + 1;
562 }
563 for (i=num2; i<Num_Procs2; i++){
564 Species_Top[i] = Species_Top[i] + num2;
565 Species_End[i] = Species_End[i] + num2;
566 }
567 }
568
569 if (myid==Host_ID && 2<=level_stdout){
570 for (i=0; i<Num_Procs2; i++){
571 printf("proc=%4d Species_Top=%4d Species_End=%4d\n",
572 i,Species_Top[i],Species_End[i]);
573 }
574 }
575
576 if (myid<Num_Procs2)
577 MSpeciesNum = Species_End[myid] - Species_Top[myid] + 1;
578 else
579 MSpeciesNum = 0;
580
581 if (myid==Host_ID && 2<=level_stdout){
582 printf("myid=%i MSpeciesNum=%i\n",myid,MSpeciesNum);
583 }
584
585 }
586
587
588
589
590
591
592
593
594
595 /*
596 void Output_Atom2CPU()
597 {
598
599 int i,numprocs;
600 char file_A2C[YOUSO10] = ".A2C";
601 FILE *fp;
602
603 MPI_Comm_size(mpi_comm_level1,&numprocs);
604
605 fnjoint(filepath,filename,file_A2C);
606
607 if ((fp = fopen(file_A2C,"w")) != NULL){
608
609 fprintf(fp,"\n");
610 fprintf(fp,"***********************************************************\n");
611 fprintf(fp,"***********************************************************\n");
612 fprintf(fp," Allocation of atoms to CPUs \n");
613 fprintf(fp,"***********************************************************\n");
614 fprintf(fp,"***********************************************************\n");
615 fprintf(fp,"\n");
616
617 fprintf(fp," Average weight = %5.4f\n\n",eachw);
618 fprintf(fp," CPU Atoms Top End CPU_Weight\n");
619 for (i=0; i<numprocs; i++){
620 fprintf(fp," %4d %4d %4d %4d %4d\n",
621 i,
622 Gatom_End[i]-Gatom_Top[i]+1,
623 Gatom_Top[i],
624 Gatom_End[i],
625 CPU_Weight[i]);
626 }
627 fclose(fp);
628 }
629 else
630 printf("Failure of saving the Set_Allocate_Atom2CPU.\n");
631 }
632 */
633