1 /*
2 * Copyright (c) 2003, 2007-14 Matteo Frigo
3 * Copyright (c) 1999-2003, 2007-8 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 /**********************************************************************/
22 /* This is a modified and combined version of the sched.c and
23 test_sched.c files shipped with FFTW 2, written to implement and
24 test various all-to-all communications scheduling patterns.
25
26 It is not used in FFTW 3, but I keep it around in case we ever want
27 to play with this again or to change algorithms. In particular, I
28 used it to implement and test the fill1_comm_sched routine in
29 transpose-pairwise.c, which allows us to create a schedule for one
30 process at a time and is much more compact than the FFTW 2 code.
31
32 Note that the scheduling algorithm is somewhat modified from that
33 of FFTW 2. Originally, I thought that one "stall" in the schedule
34 was unavoidable for odd numbers of processes, since this is the
35 case for the soccer-timetabling problem. However, because of the
36 self-communication step, we can use the self-communication to fill
37 in the stalls. (Thanks to Ralf Wildenhues for pointing this out.)
38 This greatly simplifies the process re-sorting algorithm. */
39
40 /**********************************************************************/
41
42 #include <stdio.h>
43 #include <stdlib.h>
44
45 /* This file contains routines to compute communications schedules for
46 all-to-all communications (complete exchanges) that are performed
47 in-place. (That is, the block that processor x sends to processor
48 y gets replaced on processor x by a block received from processor y.)
49
50 A schedule, int **sched, is a two-dimensional array where
51 sched[pe][i] is the processor that pe expects to exchange a message
52 with on the i-th step of the exchange. sched[pe][i] == -1 for the
53 i after the last exchange scheduled on pe.
54
55 Here, processors (pe's, for processing elements), are numbered from
56 0 to npes-1.
57
58 There are a couple of constraints that a schedule should satisfy
59 (besides the obvious one that every processor has to communicate
60 with every other processor exactly once).
61
62 * First, and most importantly, there must be no deadlocks.
63
64 * Second, we would like to overlap communications as much as possible,
65 so that all exchanges occur in parallel. It turns out that perfect
66 overlap is possible for all number of processes (npes).
67
68 It turns out that this scheduling problem is actually well-studied,
69 and good solutions are known. The problem is known as a
70 "time-tabling" problem, and is specifically the problem of
71 scheduling a sports competition (where n teams must compete exactly
72 once with every other team). The problem is discussed and
73 algorithms are presented in:
74
75 [1] J. A. M. Schreuder, "Constructing Timetables for Sport
76 Competitions," Mathematical Programming Study 13, pp. 58-67 (1980).
77
78 [2] A. Schaerf, "Scheduling Sport Tournaments using Constraint
79 Logic Programming," Proc. of 12th Europ. Conf. on
80 Artif. Intell. (ECAI-96), pp. 634-639 (Budapest 1996).
81 http://hermes.dis.uniromal.it/~aschaerf/publications.html
82
83 (These people actually impose a lot of additional constraints that
84 we don't care about, so they are solving harder problems. [1] gives
85 a simple enough algorithm for our purposes, though.)
86
87 In the timetabling problem, N teams can all play one another in N-1
88 steps if N is even, and N steps if N is odd. Here, however,
89 there is a "self-communication" step (a team must also "play itself")
90 and so we can always make an optimal N-step schedule regardless of N.
91
92 However, we have to do more: for a particular processor, the
93 communications schedule must be sorted in ascending or descending
94 order of processor index. (This is necessary so that the data
95 coming in for the transpose does not overwrite data that will be
96 sent later; for that processor the incoming and outgoing blocks are
97 of different non-zero sizes.) Fortunately, because the schedule
98 is stall free, each parallel step of the schedule is independent
99 of every other step, and we can reorder the steps arbitrarily
100 to achieve any desired order on a particular process.
101 */
102
free_comm_schedule(int ** sched,int npes)103 void free_comm_schedule(int **sched, int npes)
104 {
105 if (sched) {
106 int i;
107
108 for (i = 0; i < npes; ++i)
109 free(sched[i]);
110 free(sched);
111 }
112 }
113
empty_comm_schedule(int ** sched,int npes)114 void empty_comm_schedule(int **sched, int npes)
115 {
116 int i;
117 for (i = 0; i < npes; ++i)
118 sched[i][0] = -1;
119 }
120
121 extern void fill_comm_schedule(int **sched, int npes);
122
123 /* Create a new communications schedule for a given number of processors.
124 The schedule is initialized to a deadlock-free, maximum overlap
125 schedule. Returns NULL on an error (may print a message to
126 stderr if there is a program bug detected). */
make_comm_schedule(int npes)127 int **make_comm_schedule(int npes)
128 {
129 int **sched;
130 int i;
131
132 sched = (int **) malloc(sizeof(int *) * npes);
133 if (!sched)
134 return NULL;
135
136 for (i = 0; i < npes; ++i)
137 sched[i] = NULL;
138
139 for (i = 0; i < npes; ++i) {
140 sched[i] = (int *) malloc(sizeof(int) * 10 * (npes + 1));
141 if (!sched[i]) {
142 free_comm_schedule(sched,npes);
143 return NULL;
144 }
145 }
146
147 empty_comm_schedule(sched,npes);
148 fill_comm_schedule(sched,npes);
149
150 if (!check_comm_schedule(sched,npes)) {
151 free_comm_schedule(sched,npes);
152 return NULL;
153 }
154
155 return sched;
156 }
157
add_dest_to_comm_schedule(int ** sched,int pe,int dest)158 static void add_dest_to_comm_schedule(int **sched, int pe, int dest)
159 {
160 int i;
161
162 for (i = 0; sched[pe][i] != -1; ++i)
163 ;
164
165 sched[pe][i] = dest;
166 sched[pe][i+1] = -1;
167 }
168
add_pair_to_comm_schedule(int ** sched,int pe1,int pe2)169 static void add_pair_to_comm_schedule(int **sched, int pe1, int pe2)
170 {
171 add_dest_to_comm_schedule(sched, pe1, pe2);
172 if (pe1 != pe2)
173 add_dest_to_comm_schedule(sched, pe2, pe1);
174 }
175
176 /* Simplification of algorithm presented in [1] (we have fewer
177 constraints). Produces a perfect schedule (npes steps). */
178
fill_comm_schedule(int ** sched,int npes)179 void fill_comm_schedule(int **sched, int npes)
180 {
181 int pe, i, n;
182
183 if (npes % 2 == 0) {
184 n = npes;
185 for (pe = 0; pe < npes; ++pe)
186 add_pair_to_comm_schedule(sched,pe,pe);
187 }
188 else
189 n = npes + 1;
190
191 for (pe = 0; pe < n - 1; ++pe) {
192 add_pair_to_comm_schedule(sched, pe, npes % 2 == 0 ? npes - 1 : pe);
193
194 for (i = 1; i < n/2; ++i) {
195 int pe_a, pe_b;
196
197 pe_a = pe - i;
198 if (pe_a < 0)
199 pe_a += n - 1;
200
201 pe_b = (pe + i) % (n - 1);
202
203 add_pair_to_comm_schedule(sched,pe_a,pe_b);
204 }
205 }
206 }
207
208 /* given an array sched[npes], fills it with the communications
209 schedule for process pe. */
fill1_comm_sched(int * sched,int which_pe,int npes)210 void fill1_comm_sched(int *sched, int which_pe, int npes)
211 {
212 int pe, i, n, s = 0;
213 if (npes % 2 == 0) {
214 n = npes;
215 sched[s++] = which_pe;
216 }
217 else
218 n = npes + 1;
219 for (pe = 0; pe < n - 1; ++pe) {
220 if (npes % 2 == 0) {
221 if (pe == which_pe) sched[s++] = npes - 1;
222 else if (npes - 1 == which_pe) sched[s++] = pe;
223 }
224 else if (pe == which_pe) sched[s++] = pe;
225
226 if (pe != which_pe && which_pe < n - 1) {
227 i = (pe - which_pe + (n - 1)) % (n - 1);
228 if (i < n/2)
229 sched[s++] = (pe + i) % (n - 1);
230
231 i = (which_pe - pe + (n - 1)) % (n - 1);
232 if (i < n/2)
233 sched[s++] = (pe - i + (n - 1)) % (n - 1);
234 }
235 }
236 if (s != npes) {
237 fprintf(stderr, "bug in fill1_com_schedule (%d, %d/%d)\n",
238 s, which_pe, npes);
239 exit(EXIT_FAILURE);
240 }
241 }
242
243 /* sort the communication schedule sched for npes so that the schedule
244 on process sortpe is ascending or descending (!ascending). */
sort1_comm_sched(int * sched,int npes,int sortpe,int ascending)245 static void sort1_comm_sched(int *sched, int npes, int sortpe, int ascending)
246 {
247 int *sortsched, i;
248 sortsched = (int *) malloc(npes * sizeof(int) * 2);
249 fill1_comm_sched(sortsched, sortpe, npes);
250 if (ascending)
251 for (i = 0; i < npes; ++i)
252 sortsched[npes + sortsched[i]] = sched[i];
253 else
254 for (i = 0; i < npes; ++i)
255 sortsched[2*npes - 1 - sortsched[i]] = sched[i];
256 for (i = 0; i < npes; ++i)
257 sched[i] = sortsched[npes + i];
258 free(sortsched);
259 }
260
261 /* Below, we have various checks in case of bugs: */
262
263 /* check for deadlocks by simulating the schedule and looking for
264 cycles in the dependency list; returns 0 if there are deadlocks
265 (or other errors) */
check_schedule_deadlock(int ** sched,int npes)266 static int check_schedule_deadlock(int **sched, int npes)
267 {
268 int *step, *depend, *visited, pe, pe2, period, done = 0;
269 int counter = 0;
270
271 /* step[pe] is the step in the schedule that a given pe is on */
272 step = (int *) malloc(sizeof(int) * npes);
273
274 /* depend[pe] is the pe' that pe is currently waiting for a message
275 from (-1 if none) */
276 depend = (int *) malloc(sizeof(int) * npes);
277
278 /* visited[pe] tells whether we have visited the current pe already
279 when we are looking for cycles. */
280 visited = (int *) malloc(sizeof(int) * npes);
281
282 if (!step || !depend || !visited) {
283 free(step); free(depend); free(visited);
284 return 0;
285 }
286
287 for (pe = 0; pe < npes; ++pe)
288 step[pe] = 0;
289
290 while (!done) {
291 ++counter;
292
293 for (pe = 0; pe < npes; ++pe)
294 depend[pe] = sched[pe][step[pe]];
295
296 /* now look for cycles in the dependencies with period > 2: */
297 for (pe = 0; pe < npes; ++pe)
298 if (depend[pe] != -1) {
299 for (pe2 = 0; pe2 < npes; ++pe2)
300 visited[pe2] = 0;
301
302 period = 0;
303 pe2 = pe;
304 do {
305 visited[pe2] = period + 1;
306 pe2 = depend[pe2];
307 period++;
308 } while (pe2 != -1 && !visited[pe2]);
309
310 if (pe2 == -1) {
311 fprintf(stderr,
312 "BUG: unterminated cycle in schedule!\n");
313 free(step); free(depend);
314 free(visited);
315 return 0;
316 }
317 if (period - (visited[pe2] - 1) > 2) {
318 fprintf(stderr,"BUG: deadlock in schedule!\n");
319 free(step); free(depend);
320 free(visited);
321 return 0;
322 }
323
324 if (pe2 == pe)
325 step[pe]++;
326 }
327
328 done = 1;
329 for (pe = 0; pe < npes; ++pe)
330 if (sched[pe][step[pe]] != -1) {
331 done = 0;
332 break;
333 }
334 }
335
336 free(step); free(depend); free(visited);
337 return (counter > 0 ? counter : 1);
338 }
339
340 /* sanity checks; prints message and returns 0 on failure.
341 undocumented feature: the return value on success is actually the
342 number of steps required for the schedule to complete, counting
343 stalls. */
check_comm_schedule(int ** sched,int npes)344 int check_comm_schedule(int **sched, int npes)
345 {
346 int pe, i, comm_pe;
347
348 for (pe = 0; pe < npes; ++pe) {
349 for (comm_pe = 0; comm_pe < npes; ++comm_pe) {
350 for (i = 0; sched[pe][i] != -1 && sched[pe][i] != comm_pe; ++i)
351 ;
352 if (sched[pe][i] == -1) {
353 fprintf(stderr,"BUG: schedule never sends message from "
354 "%d to %d.\n",pe,comm_pe);
355 return 0; /* never send message to comm_pe */
356 }
357 }
358 for (i = 0; sched[pe][i] != -1; ++i)
359 ;
360 if (i != npes) {
361 fprintf(stderr,"BUG: schedule sends too many messages from "
362 "%d\n",pe);
363 return 0;
364 }
365 }
366 return check_schedule_deadlock(sched,npes);
367 }
368
369 /* invert the order of all the schedules; this has no effect on
370 its required properties. */
invert_comm_schedule(int ** sched,int npes)371 void invert_comm_schedule(int **sched, int npes)
372 {
373 int pe, i;
374
375 for (pe = 0; pe < npes; ++pe)
376 for (i = 0; i < npes/2; ++i) {
377 int dummy = sched[pe][i];
378 sched[pe][i] = sched[pe][npes-1-i];
379 sched[pe][npes-1-i] = dummy;
380 }
381 }
382
383 /* Sort the schedule for sort_pe in ascending order of processor
384 index. Unfortunately, for odd npes (when schedule has a stall
385 to begin with) this will introduce an extra stall due to
386 the motion of the self-communication past a stall. We could
387 fix this if it were really important. Actually, we don't
388 get an extra stall when sort_pe == 0 or npes-1, which is sufficient
389 for our purposes. */
sort_comm_schedule(int ** sched,int npes,int sort_pe)390 void sort_comm_schedule(int **sched, int npes, int sort_pe)
391 {
392 int i,j,pe;
393
394 /* Note that we can do this sort in O(npes) swaps because we know
395 that the numbers we are sorting are just 0...npes-1. But we'll
396 just do a bubble sort for simplicity here. */
397
398 for (i = 0; i < npes - 1; ++i)
399 for (j = i + 1; j < npes; ++j)
400 if (sched[sort_pe][i] > sched[sort_pe][j]) {
401 for (pe = 0; pe < npes; ++pe) {
402 int s = sched[pe][i];
403 sched[pe][i] = sched[pe][j];
404 sched[pe][j] = s;
405 }
406 }
407 }
408
409 /* print the schedule (for debugging purposes) */
print_comm_schedule(int ** sched,int npes)410 void print_comm_schedule(int **sched, int npes)
411 {
412 int pe, i, width;
413
414 if (npes < 10)
415 width = 1;
416 else if (npes < 100)
417 width = 2;
418 else
419 width = 3;
420
421 for (pe = 0; pe < npes; ++pe) {
422 printf("pe %*d schedule:", width, pe);
423 for (i = 0; sched[pe][i] != -1; ++i)
424 printf(" %*d",width,sched[pe][i]);
425 printf("\n");
426 }
427 }
428
main(int argc,char ** argv)429 int main(int argc, char **argv)
430 {
431 int **sched;
432 int npes = -1, sortpe = -1, steps, i;
433
434 if (argc >= 2) {
435 npes = atoi(argv[1]);
436 if (npes <= 0) {
437 fprintf(stderr,"npes must be positive!");
438 return 1;
439 }
440 }
441 if (argc >= 3) {
442 sortpe = atoi(argv[2]);
443 if (sortpe < 0 || sortpe >= npes) {
444 fprintf(stderr,"sortpe must be between 0 and npes-1.\n");
445 return 1;
446 }
447 }
448
449 if (npes != -1) {
450 printf("Computing schedule for npes = %d:\n",npes);
451 sched = make_comm_schedule(npes);
452 if (!sched) {
453 fprintf(stderr,"Out of memory!");
454 return 6;
455 }
456
457 if (steps = check_comm_schedule(sched,npes))
458 printf("schedule OK (takes %d steps to complete).\n", steps);
459 else
460 printf("schedule not OK.\n");
461
462 print_comm_schedule(sched, npes);
463
464 if (sortpe != -1) {
465 printf("\nRe-creating schedule for pe = %d...\n", sortpe);
466 int *sched1 = (int*) malloc(sizeof(int) * npes);
467 for (i = 0; i < npes; ++i) sched1[i] = -1;
468 fill1_comm_sched(sched1, sortpe, npes);
469 printf(" =");
470 for (i = 0; i < npes; ++i)
471 printf(" %*d", npes < 10 ? 1 : (npes < 100 ? 2 : 3),
472 sched1[i]);
473 printf("\n");
474
475 printf("\nSorting schedule for sortpe = %d...\n", sortpe);
476 sort_comm_schedule(sched,npes,sortpe);
477
478 if (steps = check_comm_schedule(sched,npes))
479 printf("schedule OK (takes %d steps to complete).\n",
480 steps);
481 else
482 printf("schedule not OK.\n");
483
484 print_comm_schedule(sched, npes);
485
486 printf("\nInverting schedule...\n");
487 invert_comm_schedule(sched,npes);
488
489 if (steps = check_comm_schedule(sched,npes))
490 printf("schedule OK (takes %d steps to complete).\n",
491 steps);
492 else
493 printf("schedule not OK.\n");
494
495 print_comm_schedule(sched, npes);
496
497 free_comm_schedule(sched,npes);
498
499 free(sched1);
500 }
501 }
502 else {
503 printf("Doing infinite tests...\n");
504 for (npes = 1; ; ++npes) {
505 int *sched1 = (int*) malloc(sizeof(int) * npes);
506 printf("npes = %d...",npes);
507 sched = make_comm_schedule(npes);
508 if (!sched) {
509 fprintf(stderr,"Out of memory!\n");
510 return 5;
511 }
512 for (sortpe = 0; sortpe < npes; ++sortpe) {
513 empty_comm_schedule(sched,npes);
514 fill_comm_schedule(sched,npes);
515 if (!check_comm_schedule(sched,npes)) {
516 fprintf(stderr,
517 "\n -- fill error for sortpe = %d!\n",sortpe);
518 return 2;
519 }
520
521 for (i = 0; i < npes; ++i) sched1[i] = -1;
522 fill1_comm_sched(sched1, sortpe, npes);
523 for (i = 0; i < npes; ++i)
524 if (sched1[i] != sched[sortpe][i])
525 fprintf(stderr,
526 "\n -- fill1 error for pe = %d!\n",
527 sortpe);
528
529 sort_comm_schedule(sched,npes,sortpe);
530 if (!check_comm_schedule(sched,npes)) {
531 fprintf(stderr,
532 "\n -- sort error for sortpe = %d!\n",sortpe);
533 return 3;
534 }
535 invert_comm_schedule(sched,npes);
536 if (!check_comm_schedule(sched,npes)) {
537 fprintf(stderr,
538 "\n -- invert error for sortpe = %d!\n",
539 sortpe);
540 return 4;
541 }
542 }
543 free_comm_schedule(sched,npes);
544 printf("OK\n");
545 if (npes % 50 == 0)
546 printf("(...Hit Ctrl-C to stop...)\n");
547 free(sched1);
548 }
549 }
550
551 return 0;
552 }
553