1 /*
2  * Copyright (c) 2003, 2007-14 Matteo Frigo
3  * Copyright (c) 1999-2003, 2007-8 Massachusetts Institute of Technology
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
18  *
19  */
20 
21 /**********************************************************************/
22 /* This is a modified and combined version of the sched.c and
23    test_sched.c files shipped with FFTW 2, written to implement and
24    test various all-to-all communications scheduling patterns.
25 
26    It is not used in FFTW 3, but I keep it around in case we ever want
27    to play with this again or to change algorithms.  In particular, I
28    used it to implement and test the fill1_comm_sched routine in
29    transpose-pairwise.c, which allows us to create a schedule for one
30    process at a time and is much more compact than the FFTW 2 code.
31 
32    Note that the scheduling algorithm is somewhat modified from that
33    of FFTW 2.  Originally, I thought that one "stall" in the schedule
34    was unavoidable for odd numbers of processes, since this is the
35    case for the soccer-timetabling problem.  However, because of the
36    self-communication step, we can use the self-communication to fill
37    in the stalls.  (Thanks to Ralf Wildenhues for pointing this out.)
38    This greatly simplifies the process re-sorting algorithm. */
39 
40 /**********************************************************************/
41 
42 #include <stdio.h>
43 #include <stdlib.h>
44 
45 /* This file contains routines to compute communications schedules for
46    all-to-all communications (complete exchanges) that are performed
47    in-place.  (That is, the block that processor x sends to processor
48    y gets replaced on processor x by a block received from processor y.)
49 
50    A schedule, int **sched, is a two-dimensional array where
51    sched[pe][i] is the processor that pe expects to exchange a message
52    with on the i-th step of the exchange.  sched[pe][i] == -1 for the
53    i after the last exchange scheduled on pe.
54 
55    Here, processors (pe's, for processing elements), are numbered from
56    0 to npes-1.
57 
58    There are a couple of constraints that a schedule should satisfy
59    (besides the obvious one that every processor has to communicate
60    with every other processor exactly once).
61 
62    * First, and most importantly, there must be no deadlocks.
63 
64    * Second, we would like to overlap communications as much as possible,
65    so that all exchanges occur in parallel.  It turns out that perfect
66    overlap is possible for all number of processes (npes).
67 
68    It turns out that this scheduling problem is actually well-studied,
69    and good solutions are known.  The problem is known as a
70    "time-tabling" problem, and is specifically the problem of
71    scheduling a sports competition (where n teams must compete exactly
72    once with every other team).  The problem is discussed and
73    algorithms are presented in:
74 
75    [1] J. A. M. Schreuder, "Constructing Timetables for Sport
76    Competitions," Mathematical Programming Study 13, pp. 58-67 (1980).
77 
78    [2] A. Schaerf, "Scheduling Sport Tournaments using Constraint
79    Logic Programming," Proc. of 12th Europ. Conf. on
80    Artif. Intell. (ECAI-96), pp. 634-639 (Budapest 1996).
81    http://hermes.dis.uniromal.it/~aschaerf/publications.html
82 
83    (These people actually impose a lot of additional constraints that
84    we don't care about, so they are solving harder problems. [1] gives
85    a simple enough algorithm for our purposes, though.)
86 
87    In the timetabling problem, N teams can all play one another in N-1
88    steps if N is even, and N steps if N is odd.  Here, however,
89    there is a "self-communication" step (a team must also "play itself")
90    and so we can always make an optimal N-step schedule regardless of N.
91 
92    However, we have to do more: for a particular processor, the
93    communications schedule must be sorted in ascending or descending
94    order of processor index.  (This is necessary so that the data
95    coming in for the transpose does not overwrite data that will be
96    sent later; for that processor the incoming and outgoing blocks are
97    of different non-zero sizes.)  Fortunately, because the schedule
98    is stall free, each parallel step of the schedule is independent
99    of every other step, and we can reorder the steps arbitrarily
100    to achieve any desired order on a particular process.
101 */
102 
free_comm_schedule(int ** sched,int npes)103 void free_comm_schedule(int **sched, int npes)
104 {
105      if (sched) {
106 	  int i;
107 
108 	  for (i = 0; i < npes; ++i)
109 	       free(sched[i]);
110 	  free(sched);
111      }
112 }
113 
empty_comm_schedule(int ** sched,int npes)114 void empty_comm_schedule(int **sched, int npes)
115 {
116      int i;
117      for (i = 0; i < npes; ++i)
118 	  sched[i][0] = -1;
119 }
120 
121 extern void fill_comm_schedule(int **sched, int npes);
122 
123 /* Create a new communications schedule for a given number of processors.
124    The schedule is initialized to a deadlock-free, maximum overlap
125    schedule.  Returns NULL on an error (may print a message to
126    stderr if there is a program bug detected).  */
make_comm_schedule(int npes)127 int **make_comm_schedule(int npes)
128 {
129      int **sched;
130      int i;
131 
132      sched = (int **) malloc(sizeof(int *) * npes);
133      if (!sched)
134 	  return NULL;
135 
136      for (i = 0; i < npes; ++i)
137 	  sched[i] = NULL;
138 
139      for (i = 0; i < npes; ++i) {
140 	  sched[i] = (int *) malloc(sizeof(int) * 10 * (npes + 1));
141 	  if (!sched[i]) {
142 	       free_comm_schedule(sched,npes);
143 	       return NULL;
144 	  }
145      }
146 
147      empty_comm_schedule(sched,npes);
148      fill_comm_schedule(sched,npes);
149 
150      if (!check_comm_schedule(sched,npes)) {
151 	  free_comm_schedule(sched,npes);
152 	  return NULL;
153      }
154 
155      return sched;
156 }
157 
add_dest_to_comm_schedule(int ** sched,int pe,int dest)158 static void add_dest_to_comm_schedule(int **sched, int pe, int dest)
159 {
160      int i;
161 
162      for (i = 0; sched[pe][i] != -1; ++i)
163 	  ;
164 
165      sched[pe][i] = dest;
166      sched[pe][i+1] = -1;
167 }
168 
add_pair_to_comm_schedule(int ** sched,int pe1,int pe2)169 static void add_pair_to_comm_schedule(int **sched, int pe1, int pe2)
170 {
171      add_dest_to_comm_schedule(sched, pe1, pe2);
172      if (pe1 != pe2)
173 	  add_dest_to_comm_schedule(sched, pe2, pe1);
174 }
175 
176 /* Simplification of algorithm presented in [1] (we have fewer
177    constraints).  Produces a perfect schedule (npes steps).  */
178 
fill_comm_schedule(int ** sched,int npes)179 void fill_comm_schedule(int **sched, int npes)
180 {
181      int pe, i, n;
182 
183      if (npes % 2 == 0) {
184 	  n = npes;
185 	  for (pe = 0; pe < npes; ++pe)
186 	       add_pair_to_comm_schedule(sched,pe,pe);
187      }
188      else
189 	  n = npes + 1;
190 
191      for (pe = 0; pe < n - 1; ++pe) {
192 	  add_pair_to_comm_schedule(sched, pe, npes % 2 == 0 ? npes - 1 : pe);
193 
194 	  for (i = 1; i < n/2; ++i) {
195 	       int pe_a, pe_b;
196 
197 	       pe_a = pe - i;
198 	       if (pe_a < 0)
199 		    pe_a += n - 1;
200 
201 	       pe_b = (pe + i) % (n - 1);
202 
203 	       add_pair_to_comm_schedule(sched,pe_a,pe_b);
204 	  }
205      }
206 }
207 
208 /* given an array sched[npes], fills it with the communications
209    schedule for process pe. */
fill1_comm_sched(int * sched,int which_pe,int npes)210 void fill1_comm_sched(int *sched, int which_pe, int npes)
211 {
212      int pe, i, n, s = 0;
213      if (npes % 2 == 0) {
214 	  n = npes;
215 	  sched[s++] = which_pe;
216      }
217      else
218 	  n = npes + 1;
219      for (pe = 0; pe < n - 1; ++pe) {
220 	  if (npes % 2 == 0) {
221 	       if (pe == which_pe) sched[s++] = npes - 1;
222 	       else if (npes - 1 == which_pe) sched[s++] = pe;
223 	  }
224 	  else if (pe == which_pe) sched[s++] = pe;
225 
226 	  if (pe != which_pe && which_pe < n - 1) {
227 	       i = (pe - which_pe + (n - 1)) % (n - 1);
228 	       if (i < n/2)
229 		    sched[s++] = (pe + i) % (n - 1);
230 
231 	       i = (which_pe - pe + (n - 1)) % (n - 1);
232 	       if (i < n/2)
233 		    sched[s++] = (pe - i + (n - 1)) % (n - 1);
234 	  }
235      }
236      if (s != npes) {
237 	  fprintf(stderr, "bug in fill1_com_schedule (%d, %d/%d)\n",
238 		  s, which_pe, npes);
239 	  exit(EXIT_FAILURE);
240      }
241 }
242 
243 /* sort the communication schedule sched for npes so that the schedule
244    on process sortpe is ascending or descending (!ascending). */
sort1_comm_sched(int * sched,int npes,int sortpe,int ascending)245 static void sort1_comm_sched(int *sched, int npes, int sortpe, int ascending)
246 {
247      int *sortsched, i;
248      sortsched = (int *) malloc(npes * sizeof(int) * 2);
249      fill1_comm_sched(sortsched, sortpe, npes);
250      if (ascending)
251           for (i = 0; i < npes; ++i)
252                sortsched[npes + sortsched[i]] = sched[i];
253      else
254           for (i = 0; i < npes; ++i)
255                sortsched[2*npes - 1 - sortsched[i]] = sched[i];
256      for (i = 0; i < npes; ++i)
257           sched[i] = sortsched[npes + i];
258      free(sortsched);
259 }
260 
261 /* Below, we have various checks in case of bugs: */
262 
263 /* check for deadlocks by simulating the schedule and looking for
264    cycles in the dependency list; returns 0 if there are deadlocks
265    (or other errors) */
check_schedule_deadlock(int ** sched,int npes)266 static int check_schedule_deadlock(int **sched, int npes)
267 {
268      int *step, *depend, *visited, pe, pe2, period, done = 0;
269      int counter = 0;
270 
271      /* step[pe] is the step in the schedule that a given pe is on */
272      step = (int *) malloc(sizeof(int) * npes);
273 
274      /* depend[pe] is the pe' that pe is currently waiting for a message
275 	from (-1 if none) */
276      depend = (int *) malloc(sizeof(int) * npes);
277 
278      /* visited[pe] tells whether we have visited the current pe already
279 	when we are looking for cycles. */
280      visited = (int *) malloc(sizeof(int) * npes);
281 
282      if (!step || !depend || !visited) {
283 	  free(step); free(depend); free(visited);
284 	  return 0;
285      }
286 
287      for (pe = 0; pe < npes; ++pe)
288 	  step[pe] = 0;
289 
290      while (!done) {
291 	  ++counter;
292 
293 	  for (pe = 0; pe < npes; ++pe)
294 	       depend[pe] = sched[pe][step[pe]];
295 
296 	  /* now look for cycles in the dependencies with period > 2: */
297 	  for (pe = 0; pe < npes; ++pe)
298 	       if (depend[pe] != -1) {
299 		    for (pe2 = 0; pe2 < npes; ++pe2)
300 			 visited[pe2] = 0;
301 
302 		    period = 0;
303 		    pe2 = pe;
304 		    do {
305 			 visited[pe2] = period + 1;
306 			 pe2 = depend[pe2];
307 			 period++;
308 		    } while (pe2 != -1 && !visited[pe2]);
309 
310 		    if (pe2 == -1) {
311 			 fprintf(stderr,
312 				 "BUG: unterminated cycle in schedule!\n");
313 			 free(step); free(depend);
314 			 free(visited);
315 			 return 0;
316 		    }
317 		    if (period - (visited[pe2] - 1) > 2) {
318 			 fprintf(stderr,"BUG: deadlock in schedule!\n");
319 			 free(step); free(depend);
320 			 free(visited);
321 			 return 0;
322 		    }
323 
324 		    if (pe2 == pe)
325 			 step[pe]++;
326 	       }
327 
328 	  done = 1;
329 	  for (pe = 0; pe < npes; ++pe)
330 	       if (sched[pe][step[pe]] != -1) {
331 		    done = 0;
332 		    break;
333 	       }
334      }
335 
336      free(step); free(depend); free(visited);
337      return (counter > 0 ? counter : 1);
338 }
339 
340 /* sanity checks; prints message and returns 0 on failure.
341    undocumented feature: the return value on success is actually the
342    number of steps required for the schedule to complete, counting
343    stalls. */
check_comm_schedule(int ** sched,int npes)344 int check_comm_schedule(int **sched, int npes)
345 {
346      int pe, i, comm_pe;
347 
348      for (pe = 0; pe < npes; ++pe) {
349 	  for (comm_pe = 0; comm_pe < npes; ++comm_pe) {
350 	       for (i = 0; sched[pe][i] != -1 && sched[pe][i] != comm_pe; ++i)
351 		    ;
352 	       if (sched[pe][i] == -1) {
353 		    fprintf(stderr,"BUG: schedule never sends message from "
354 			    "%d to %d.\n",pe,comm_pe);
355 		    return 0;  /* never send message to comm_pe */
356 	       }
357 	  }
358 	  for (i = 0; sched[pe][i] != -1; ++i)
359 	       ;
360 	  if (i != npes) {
361 	       fprintf(stderr,"BUG: schedule sends too many messages from "
362 		       "%d\n",pe);
363 	       return 0;
364 	  }
365      }
366      return check_schedule_deadlock(sched,npes);
367 }
368 
369 /* invert the order of all the schedules; this has no effect on
370    its required properties. */
invert_comm_schedule(int ** sched,int npes)371 void invert_comm_schedule(int **sched, int npes)
372 {
373      int pe, i;
374 
375      for (pe = 0; pe < npes; ++pe)
376 	  for (i = 0; i < npes/2; ++i) {
377 	       int dummy = sched[pe][i];
378 	       sched[pe][i] = sched[pe][npes-1-i];
379 	       sched[pe][npes-1-i] = dummy;
380 	  }
381 }
382 
383 /* Sort the schedule for sort_pe in ascending order of processor
384    index.  Unfortunately, for odd npes (when schedule has a stall
385    to begin with) this will introduce an extra stall due to
386    the motion of the self-communication past a stall.  We could
387    fix this if it were really important.  Actually, we don't
388    get an extra stall when sort_pe == 0 or npes-1, which is sufficient
389    for our purposes. */
sort_comm_schedule(int ** sched,int npes,int sort_pe)390 void sort_comm_schedule(int **sched, int npes, int sort_pe)
391 {
392      int i,j,pe;
393 
394      /* Note that we can do this sort in O(npes) swaps because we know
395 	that the numbers we are sorting are just 0...npes-1.   But we'll
396 	just do a bubble sort for simplicity here. */
397 
398      for (i = 0; i < npes - 1; ++i)
399 	  for (j = i + 1; j < npes; ++j)
400 	       if (sched[sort_pe][i] > sched[sort_pe][j]) {
401 		    for (pe = 0; pe < npes; ++pe) {
402 			 int s = sched[pe][i];
403 			 sched[pe][i] = sched[pe][j];
404 			 sched[pe][j] = s;
405 		    }
406 	       }
407 }
408 
409 /* print the schedule (for debugging purposes) */
print_comm_schedule(int ** sched,int npes)410 void print_comm_schedule(int **sched, int npes)
411 {
412      int pe, i, width;
413 
414      if (npes < 10)
415 	  width = 1;
416      else if (npes < 100)
417 	  width = 2;
418      else
419 	  width = 3;
420 
421      for (pe = 0; pe < npes; ++pe) {
422 	  printf("pe %*d schedule:", width, pe);
423 	  for (i = 0; sched[pe][i] != -1; ++i)
424 	       printf("  %*d",width,sched[pe][i]);
425 	  printf("\n");
426      }
427 }
428 
main(int argc,char ** argv)429 int main(int argc, char **argv)
430 {
431      int **sched;
432      int npes = -1, sortpe = -1, steps, i;
433 
434      if (argc >= 2) {
435 	  npes = atoi(argv[1]);
436 	  if (npes <= 0) {
437 	       fprintf(stderr,"npes must be positive!");
438 	       return 1;
439 	  }
440      }
441      if (argc >= 3) {
442 	  sortpe = atoi(argv[2]);
443 	  if (sortpe < 0 || sortpe >= npes) {
444 	       fprintf(stderr,"sortpe must be between 0 and npes-1.\n");
445 	       return 1;
446 	  }
447      }
448 
449      if (npes != -1) {
450 	  printf("Computing schedule for npes = %d:\n",npes);
451 	  sched = make_comm_schedule(npes);
452 	  if (!sched) {
453 	       fprintf(stderr,"Out of memory!");
454 	       return 6;
455 	  }
456 
457 	  if (steps = check_comm_schedule(sched,npes))
458 	       printf("schedule OK (takes %d steps to complete).\n", steps);
459 	  else
460 	       printf("schedule not OK.\n");
461 
462 	  print_comm_schedule(sched, npes);
463 
464 	  if (sortpe != -1) {
465 	       printf("\nRe-creating schedule for pe = %d...\n", sortpe);
466 	       int *sched1 = (int*) malloc(sizeof(int) * npes);
467 	       for (i = 0; i < npes; ++i) sched1[i] = -1;
468 	       fill1_comm_sched(sched1, sortpe, npes);
469 	       printf("  =");
470 	       for (i = 0; i < npes; ++i)
471 		    printf("  %*d", npes < 10 ? 1 : (npes < 100 ? 2 : 3),
472 			   sched1[i]);
473 	       printf("\n");
474 
475 	       printf("\nSorting schedule for sortpe = %d...\n", sortpe);
476 	       sort_comm_schedule(sched,npes,sortpe);
477 
478 	       if (steps = check_comm_schedule(sched,npes))
479 		    printf("schedule OK (takes %d steps to complete).\n",
480 			   steps);
481 	       else
482 		    printf("schedule not OK.\n");
483 
484 	       print_comm_schedule(sched, npes);
485 
486 	       printf("\nInverting schedule...\n");
487 	       invert_comm_schedule(sched,npes);
488 
489 	       if (steps = check_comm_schedule(sched,npes))
490 		    printf("schedule OK (takes %d steps to complete).\n",
491 			   steps);
492 	       else
493 		    printf("schedule not OK.\n");
494 
495 	       print_comm_schedule(sched, npes);
496 
497 	       free_comm_schedule(sched,npes);
498 
499 	       free(sched1);
500 	  }
501      }
502      else {
503 	  printf("Doing infinite tests...\n");
504 	  for (npes = 1; ; ++npes) {
505 	       int *sched1 = (int*) malloc(sizeof(int) * npes);
506 	       printf("npes = %d...",npes);
507 	       sched = make_comm_schedule(npes);
508 	       if (!sched) {
509 		    fprintf(stderr,"Out of memory!\n");
510 		    return 5;
511 	       }
512 	       for (sortpe = 0; sortpe < npes; ++sortpe) {
513 		    empty_comm_schedule(sched,npes);
514 		    fill_comm_schedule(sched,npes);
515 		    if (!check_comm_schedule(sched,npes)) {
516 			 fprintf(stderr,
517 				 "\n -- fill error for sortpe = %d!\n",sortpe);
518 			 return 2;
519 		    }
520 
521 		    for (i = 0; i < npes; ++i) sched1[i] = -1;
522 		    fill1_comm_sched(sched1, sortpe, npes);
523 		    for (i = 0; i < npes; ++i)
524 			 if (sched1[i] != sched[sortpe][i])
525 			      fprintf(stderr,
526 				      "\n -- fill1 error for pe = %d!\n",
527 				      sortpe);
528 
529 		    sort_comm_schedule(sched,npes,sortpe);
530 		    if (!check_comm_schedule(sched,npes)) {
531 			 fprintf(stderr,
532 				 "\n -- sort error for sortpe = %d!\n",sortpe);
533 			 return 3;
534 		    }
535 		    invert_comm_schedule(sched,npes);
536 		    if (!check_comm_schedule(sched,npes)) {
537 			 fprintf(stderr,
538 				 "\n -- invert error for sortpe = %d!\n",
539 				 sortpe);
540 			 return 4;
541 		    }
542 	       }
543 	       free_comm_schedule(sched,npes);
544 	       printf("OK\n");
545 	       if (npes % 50 == 0)
546 		    printf("(...Hit Ctrl-C to stop...)\n");
547 	       free(sched1);
548 	  }
549      }
550 
551      return 0;
552 }
553