/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 */

/*            MPI-3 distributed linked list construction example
 *            --------------------------------------------------
 *
 * Construct a distributed shared linked list using MPI-3 dynamic windows.
 * Initially process 0 creates the head of the list, attaches it to the
 * window, and broadcasts the pointer to all processes.  All processes then
 * concurrently append NUM_ELEMS new elements to the list.  When a process
 * attempts to attach its element to the tail of the list, it may discover
 * that its tail pointer is stale and must chase ahead to the new tail
 * before the element can be attached.
 */

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <assert.h>
#include "mpitest.h"

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

#define NUM_ELEMS 32
#define NPROBE    100
#define ELEM_PER_ROW 16

/* Linked list pointer: a "global pointer" is the pair of the owning
 * process's rank and a displacement within that process's address space. */
typedef struct {
    int rank;
    MPI_Aint disp;
} llist_ptr_t;

/* Linked list element */
typedef struct {
    int value;
    llist_ptr_t next;
} llist_elem_t;

static const llist_ptr_t nil = { -1, (MPI_Aint) MPI_BOTTOM };
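/* A next pointer equal to nil marks the end of the list: -1 is never a
 * valid rank, and MPI_BOTTOM plays the role of the null displacement. */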

static const int verbose = 0;

/* List of locally allocated list elements. */
static llist_elem_t **my_elems = NULL;
static int my_elems_size = 0;
static int my_elems_count = 0;

/* Allocate a new shared linked list element */
MPI_Aint alloc_elem(int value, MPI_Win win)
{
    MPI_Aint disp;
    llist_elem_t *elem_ptr;

    /* Allocate the new element and register it with the window */
    MPI_Alloc_mem(sizeof(llist_elem_t), MPI_INFO_NULL, &elem_ptr);
    elem_ptr->value = value;
    elem_ptr->next = nil;
    MPI_Win_attach(win, elem_ptr, sizeof(llist_elem_t));
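    /* Dynamic windows start out empty: each region must be attached before
     * remote processes can target it with RMA operations, and they address
     * it by the absolute address returned by MPI_Get_address below. */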

    /* Add the element to the list of local elements so we can free it later. */
    if (my_elems_size == my_elems_count) {
        my_elems_size += 100;
        my_elems = realloc(my_elems, my_elems_size * sizeof(void *));
        assert(my_elems != NULL);
    }
    my_elems[my_elems_count] = elem_ptr;
    my_elems_count++;

    MPI_Get_address(elem_ptr, &disp);
    return disp;
}

int main(int argc, char **argv)
{
    int procid, nproc, i;
    MPI_Win llist_win;
    llist_ptr_t head_ptr, tail_ptr;
    int errs = 0;

    MTest_Init(&argc, &argv);

    MPI_Comm_rank(MPI_COMM_WORLD, &procid);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);

    MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &llist_win);

    /* Process 0 creates the head node */
    if (procid == 0)
        head_ptr.disp = alloc_elem(-1, llist_win);

    /* Broadcast the head pointer to everyone */
    head_ptr.rank = 0;
    MPI_Bcast(&head_ptr.disp, 1, MPI_AINT, 0, MPI_COMM_WORLD);
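    /* Every process starts appending at the shared head and chases the tail
     * forward from there as elements accumulate. */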
    tail_ptr = head_ptr;

    /* All processes concurrently append NUM_ELEMS elements to the list */
    for (i = 0; i < NUM_ELEMS; i++) {
        llist_ptr_t new_elem_ptr;
        int success;

        /* Create a new list element and register it with the window */
        new_elem_ptr.rank = procid;
        new_elem_ptr.disp = alloc_elem(procid, llist_win);

        /* Append the new node to the list.  This might take multiple attempts if
         * others have already appended and our tail pointer is stale. */
        do {
            llist_ptr_t next_tail_ptr = nil;

            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tail_ptr.rank, 0, llist_win);
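            /* Atomically compare the tail element's next.rank with nil.rank
             * and, if they match, install our rank.  The value held before
             * the operation is returned in next_tail_ptr.rank, so we won the
             * race iff that value still equals nil.rank. */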

            MPI_Compare_and_swap((void *) &new_elem_ptr.rank, (void *) &nil.rank,
                                 (void *) &next_tail_ptr.rank, MPI_INT, tail_ptr.rank,
                                 (MPI_Aint) & (((llist_elem_t *) tail_ptr.disp)->next.rank),
                                 llist_win);

            MPI_Win_unlock(tail_ptr.rank, llist_win);
            success = (next_tail_ptr.rank == nil.rank);

            if (success) {
                int j, flag;

                MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tail_ptr.rank, 0, llist_win);
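                /* We won the race for next.rank, so no other process will
                 * write this element's next field; a plain Put suffices to
                 * publish the displacement.  Processes that lost the race
                 * spin in the else branch below until it becomes visible. */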

                MPI_Put(&new_elem_ptr.disp, 1, MPI_AINT, tail_ptr.rank,
                        (MPI_Aint) & (((llist_elem_t *) tail_ptr.disp)->next.disp), 1,
                        MPI_AINT, llist_win);

                MPI_Win_unlock(tail_ptr.rank, llist_win);
                tail_ptr = new_elem_ptr;

                /* For implementations that use pt-to-pt messaging, force progress for
                 * other processes' RMA operations. */
                for (j = 0; j < NPROBE; j++)
                    MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag,
                               MPI_STATUS_IGNORE);

            } else {
                /* Our tail pointer is stale: the CAS winner has set next.rank
                 * but may not have published next.disp yet, so poll until the
                 * displacement appears. */
                do {
                    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tail_ptr.rank, 0, llist_win);

                    MPI_Get(&next_tail_ptr.disp, 1, MPI_AINT, tail_ptr.rank,
                            (MPI_Aint) & (((llist_elem_t *) tail_ptr.disp)->next.disp),
                            1, MPI_AINT, llist_win);

                    MPI_Win_unlock(tail_ptr.rank, llist_win);
                } while (next_tail_ptr.disp == nil.disp);
                tail_ptr = next_tail_ptr;
            }
        } while (!success);
    }

    MPI_Barrier(MPI_COMM_WORLD);
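    /* All appends (and their passive-target epochs) have completed once every
     * process reaches this barrier, so rank 0 can walk a quiescent list. */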

    /* Traverse the list and verify that every process inserted exactly
     * NUM_ELEMS elements. */
    if (procid == 0) {
        int have_root = 0;
        int *counts, count = 0;

        counts = (int *) malloc(sizeof(int) * nproc);
        assert(counts != NULL);

        for (i = 0; i < nproc; i++)
            counts[i] = 0;

        tail_ptr = head_ptr;

        /* Walk the list and tally up the number of elements inserted by each rank */
        while (tail_ptr.disp != nil.disp) {
            llist_elem_t elem;

            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tail_ptr.rank, 0, llist_win);
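            /* Fetch the whole remote element (value and next pointer) as raw
             * bytes; this assumes the struct layout is identical across the
             * processes of the job, as it is on a homogeneous system. */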

            MPI_Get(&elem, sizeof(llist_elem_t), MPI_BYTE,
                    tail_ptr.rank, tail_ptr.disp, sizeof(llist_elem_t), MPI_BYTE, llist_win);

            MPI_Win_unlock(tail_ptr.rank, llist_win);

            tail_ptr = elem.next;

            /* This is not the root */
            if (have_root) {
                assert(elem.value >= 0 && elem.value < nproc);
                counts[elem.value]++;
                count++;

                if (verbose) {
                    int last_elem = tail_ptr.disp == nil.disp;
                    printf("%2d%s", elem.value, last_elem ? "" : " -> ");
                    if (count % ELEM_PER_ROW == 0 && !last_elem)
                        printf("\n");
                }
            }

            /* This is the root */
            else {
                assert(elem.value == -1);
                have_root = 1;
            }
        }

        if (verbose)
            printf("\n\n");

        /* Verify the counts we collected */
        for (i = 0; i < nproc; i++) {
            int expected = NUM_ELEMS;

            if (counts[i] != expected) {
                printf("Error: Rank %d inserted %d elements, expected %d\n", i, counts[i],
                       expected);
                errs++;
            }
        }

        free(counts);
    }

    MPI_Win_free(&llist_win);

    /* Freeing the window detaches all attached regions, so the locally
     * allocated elements can now be released. */
    for (; my_elems_count > 0; my_elems_count--)
        MPI_Free_mem(my_elems[my_elems_count - 1]);

    MTest_Finalize(errs);
    return MTestReturnValue(errs);
}