1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  */
5 
6 #include <stdio.h>
7 #include <stdlib.h>
8 #include <assert.h>
9 #include <strings.h>
10 
11 #include <mpi.h>
12 #include "mcs_mutex.h"
13 
14 /* TODO: Make these mutex operations no-ops for sequential runs */
15 
/** Create an MCS mutex.  Collective on comm.
  *
  * @param[in]  tail_rank rank of the process in comm that holds the tail
  *                  pointer
  * @param[in]  comm communicator containing all processes that will use the
  *                  mutex
  * @param[out] hdl_out handle to the mutex
  * @return          MPI status
  */
MCS_Mutex_create(int tail_rank,MPI_Comm comm,MCS_Mutex * hdl_out)25 int MCS_Mutex_create(int tail_rank, MPI_Comm comm, MCS_Mutex * hdl_out)
26 {
27     int rank, nproc;
28     MCS_Mutex hdl;
29 
30     hdl = malloc(sizeof(struct mcs_mutex_s));
31     assert(hdl != NULL);
32 
33     MPI_Comm_dup(comm, &hdl->comm);
34 
35     MPI_Comm_rank(hdl->comm, &rank);
36     MPI_Comm_size(hdl->comm, &nproc);
37 
38     hdl->tail_rank = tail_rank;
39 
40 #ifdef USE_WIN_SHARED
41     MPI_Win_allocate_shared(2 * sizeof(int), sizeof(int), MPI_INFO_NULL,
42                             hdl->comm, &hdl->base, &hdl->window);
43 #else
44 #ifdef USE_WIN_ALLOC_SHM
45     MPI_Info_create(&hdl->win_info);
46     MPI_Info_set(hdl->win_info, "alloc_shm", "true");
47 #else
48     MPI_Info_create(&hdl->win_info);
49     MPI_Info_set(hdl->win_info, "alloc_shm", "false");
50 #endif
51     MPI_Win_allocate(2 * sizeof(int), sizeof(int), hdl->win_info, hdl->comm,
52                      &hdl->base, &hdl->window);
53 #endif
54 
55     MPI_Win_lock_all(0, hdl->window);
56 
57     hdl->base[0] = MPI_PROC_NULL;
58     hdl->base[1] = MPI_PROC_NULL;
59 
60     MPI_Win_sync(hdl->window);
61     MPI_Barrier(hdl->comm);
62 
63     *hdl_out = hdl;
64     return MPI_SUCCESS;
65 }
66 
67 
/** Free an MCS mutex.  Collective on ranks in the communicator used at the
  * time of creation.
  *
  * @param[in,out] hdl_ptr handle to the mutex that will be freed; set to
  *                NULL on return
  * @return        MPI status
  */
MCS_Mutex_free(MCS_Mutex * hdl_ptr)74 int MCS_Mutex_free(MCS_Mutex * hdl_ptr)
75 {
76     MCS_Mutex hdl = *hdl_ptr;
77 
78     MPI_Win_unlock_all(hdl->window);
79 
80     MPI_Win_free(&hdl->window);
81     MPI_Comm_free(&hdl->comm);
82 #ifndef USE_WIN_SHARED
83     MPI_Info_free(&hdl->win_info);
84 #endif
85 
86     free(hdl);
87     hdl_ptr = NULL;
88 
89     return MPI_SUCCESS;
90 }
91 
92 
93 /** Lock a mutex.
94   *
95   * @param[in] hdl   Handle to the mutex
96   * @return          MPI status
97   */
MCS_Mutex_lock(MCS_Mutex hdl)98 int MCS_Mutex_lock(MCS_Mutex hdl)
99 {
100     int rank, nproc;
101     int prev;
102 
103     MPI_Comm_rank(hdl->comm, &rank);
104     MPI_Comm_size(hdl->comm, &nproc);
105 
106     /* This store is safe, since it cannot happen concurrently with a remote
107      * write */
108     hdl->base[MCS_MTX_ELEM_DISP] = MPI_PROC_NULL;
109     MPI_Win_sync(hdl->window);
110 
111     MPI_Fetch_and_op(&rank, &prev, MPI_INT, hdl->tail_rank, MCS_MTX_TAIL_DISP,
112                      MPI_REPLACE, hdl->window);
113     MPI_Win_flush(hdl->tail_rank, hdl->window);
114 
115     /* If there was a previous tail, update their next pointer and wait for
116      * notification.  Otherwise, the mutex was successfully acquired. */
117     if (prev != MPI_PROC_NULL) {
118         /* Wait for notification */
119         MPI_Status status;
120 
121         MPI_Accumulate(&rank, 1, MPI_INT, prev, MCS_MTX_ELEM_DISP, 1, MPI_INT, MPI_REPLACE,
122                        hdl->window);
123         MPI_Win_flush(prev, hdl->window);
124 
125         debug_print("%2d: LOCK   - waiting for notification from %d\n", rank, prev);
126         MPI_Recv(NULL, 0, MPI_BYTE, prev, MCS_MUTEX_TAG, hdl->comm, &status);
127     }
128 
129     debug_print("%2d: LOCK   - lock acquired\n", rank);
130 
131     return MPI_SUCCESS;
132 }
133 
134 
135 /** Attempt to acquire a mutex.
136   *
137   * @param[in] hdl   Handle to the mutex
138   * @param[out] success Indicates whether the mutex was acquired
139   * @return          MPI status
140   */
MCS_Mutex_trylock(MCS_Mutex hdl,int * success)141 int MCS_Mutex_trylock(MCS_Mutex hdl, int *success)
142 {
143     int rank, nproc;
144     int tail, nil = MPI_PROC_NULL;
145 
146     MPI_Comm_rank(hdl->comm, &rank);
147     MPI_Comm_size(hdl->comm, &nproc);
148 
149     /* This store is safe, since it cannot happen concurrently with a remote
150      * write */
151     hdl->base[MCS_MTX_ELEM_DISP] = MPI_PROC_NULL;
152     MPI_Win_sync(hdl->window);
153 
154     /* Check if the lock is available and claim it if it is. */
155     MPI_Compare_and_swap(&rank, &nil, &tail, MPI_INT, hdl->tail_rank,
156                          MCS_MTX_TAIL_DISP, hdl->window);
157     MPI_Win_flush(hdl->tail_rank, hdl->window);
158 
159     /* If the old tail was MPI_PROC_NULL, we have claimed the mutex */
160     *success = (tail == nil);
161 
162     debug_print("%2d: TRYLOCK - %s\n", rank, (*success) ? "Success" : "Non-success");
163 
164     return MPI_SUCCESS;
165 }
166 
167 
168 /** Unlock a mutex.
169   *
170   * @param[in] hdl   Handle to the mutex
171   * @return          MPI status
172   */
MCS_Mutex_unlock(MCS_Mutex hdl)173 int MCS_Mutex_unlock(MCS_Mutex hdl)
174 {
175     int rank, nproc, next;
176 
177     MPI_Comm_rank(hdl->comm, &rank);
178     MPI_Comm_size(hdl->comm, &nproc);
179 
180     MPI_Win_sync(hdl->window);
181 
182     /* Read my next pointer.  FOP is used since another process may write to
183      * this location concurrent with this read. */
184     MPI_Fetch_and_op(NULL, &next, MPI_INT, rank, MCS_MTX_ELEM_DISP, MPI_NO_OP, hdl->window);
185     MPI_Win_flush(rank, hdl->window);
186 
187     if (next == MPI_PROC_NULL) {
188         int tail;
189         int nil = MPI_PROC_NULL;
190 
191         /* Check if we are the at the tail of the lock queue.  If so, we're
192          * done.  If not, we need to send notification. */
193         MPI_Compare_and_swap(&nil, &rank, &tail, MPI_INT, hdl->tail_rank,
194                              MCS_MTX_TAIL_DISP, hdl->window);
195         MPI_Win_flush(hdl->tail_rank, hdl->window);
196 
197         if (tail != rank) {
198             debug_print("%2d: UNLOCK - waiting for next pointer (tail = %d)\n", rank, tail);
199             assert(tail >= 0 && tail < nproc);
200 
201             for (;;) {
202                 int flag;
203 
204                 MPI_Fetch_and_op(NULL, &next, MPI_INT, rank, MCS_MTX_ELEM_DISP,
205                                  MPI_NO_OP, hdl->window);
206 
207                 MPI_Win_flush(rank, hdl->window);
208                 if (next != MPI_PROC_NULL)
209                     break;
210 
211                 MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag, MPI_STATUS_IGNORE);
212             }
213         }
214     }
215 
216     /* Notify the next waiting process */
217     if (next != MPI_PROC_NULL) {
218         debug_print("%2d: UNLOCK - notifying %d\n", rank, next);
219         MPI_Send(NULL, 0, MPI_BYTE, next, MCS_MUTEX_TAG, hdl->comm);
220     }
221 
222     debug_print("%2d: UNLOCK - lock released\n", rank);
223 
224     return MPI_SUCCESS;
225 }
226