1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3  * Copyright (c) 2004-2005 The Trustees of Indiana University.
4  *                         All rights reserved.
5  * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
6  *                         All rights reserved.
7  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
8  *                         University of Stuttgart.  All rights reserved.
9  * Copyright (c) 2004-2005 The Regents of the University of California.
10  *                         All rights reserved.
11  * Copyright (c) 2007-2018 Los Alamos National Security, LLC.  All rights
12  *                         reserved.
13  * Copyright (c) 2010      IBM Corporation.  All rights reserved.
14  * Copyright (c) 2012-2013 Sandia National Laboratories.  All rights reserved.
15  * Copyright (c) 2018      Intel, Inc. All rights reserved.
16  * $COPYRIGHT$
17  *
18  * Additional copyrights may follow
19  *
20  * $HEADER$
21  */
22 
23 #include "ompi_config.h"
24 
25 #include "osc_rdma_passive_target.h"
26 #include "osc_rdma_comm.h"
27 
28 #include "mpi.h"
29 
30 
ompi_osc_rdma_sync(struct ompi_win_t * win)31 int ompi_osc_rdma_sync (struct ompi_win_t *win)
32 {
33     ompi_osc_rdma_progress (GET_MODULE(win));
34     return OMPI_SUCCESS;
35 }
36 
ompi_osc_rdma_flush(int target,struct ompi_win_t * win)37 int ompi_osc_rdma_flush (int target, struct ompi_win_t *win)
38 {
39     ompi_osc_rdma_module_t *module = GET_MODULE(win);
40     ompi_osc_rdma_sync_t *lock;
41     ompi_osc_rdma_peer_t *peer;
42 
43     assert (0 <= target);
44 
45     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "flush: %d, %s", target, win->w_name);
46 
47     OPAL_THREAD_LOCK(&module->lock);
48 
49     lock = ompi_osc_rdma_module_sync_lookup (module, target, &peer);
50     if (OPAL_UNLIKELY(NULL == lock || OMPI_OSC_RDMA_SYNC_TYPE_LOCK != lock->type)) {
51         OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "flush: target %d is not locked in window %s",
52                          target, win->w_name);
53         OPAL_THREAD_UNLOCK(&module->lock);
54         return OMPI_ERR_RMA_SYNC;
55     }
56     OPAL_THREAD_UNLOCK(&module->lock);
57 
58     /* finish all outstanding fragments */
59     ompi_osc_rdma_sync_rdma_complete (lock);
60 
61     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "flush on target %d complete", target);
62 
63     return OMPI_SUCCESS;
64 }
65 
66 
ompi_osc_rdma_flush_all(struct ompi_win_t * win)67 int ompi_osc_rdma_flush_all (struct ompi_win_t *win)
68 {
69     ompi_osc_rdma_module_t *module = GET_MODULE(win);
70     ompi_osc_rdma_sync_t *lock;
71     int ret = OMPI_SUCCESS;
72     uint32_t key;
73     void *node;
74 
75     /* flush is only allowed from within a passive target epoch */
76     if (!ompi_osc_rdma_in_passive_epoch (module)) {
77         return OMPI_ERR_RMA_SYNC;
78     }
79 
80     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "flush_all: %s", win->w_name);
81 
82     /* globally complete all outstanding rdma requests */
83     if (OMPI_OSC_RDMA_SYNC_TYPE_LOCK == module->all_sync.type) {
84         ompi_osc_rdma_sync_rdma_complete (&module->all_sync);
85     }
86 
87     /* flush all locks */
88     ret = opal_hash_table_get_first_key_uint32 (&module->outstanding_locks, &key, (void **) &lock, &node);
89     while (OPAL_SUCCESS == ret) {
90         OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "flushing lock %p", (void *) lock);
91         ompi_osc_rdma_sync_rdma_complete (lock);
92         ret = opal_hash_table_get_next_key_uint32 (&module->outstanding_locks, &key, (void **) &lock,
93                                                    node, &node);
94     }
95 
96     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "flush_all complete");
97 
98     return OPAL_SUCCESS;
99 }
100 
101 
/* Implement MPI_Win_flush_local. In this component RDMA completion is both
 * local and remote, so local flush is implemented as a full flush. */
int ompi_osc_rdma_flush_local (int target, struct ompi_win_t *win)
{
    return ompi_osc_rdma_flush (target, win);
}
106 
107 
/* Implement MPI_Win_flush_local_all. As with flush_local, local completion
 * is satisfied by full completion, so delegate to flush_all. */
int ompi_osc_rdma_flush_local_all (struct ompi_win_t *win)
{
    return ompi_osc_rdma_flush_all (win);
}
112 
113 /* locking via atomics */
/**
 * Acquire the peer-side lock for a lock epoch using remote atomics.
 *
 * For MPI_LOCK_EXCLUSIVE in two-level locking mode the leader's global lock
 * is taken shared first, then the peer's local lock is taken exclusive; on
 * failure of the second step the global lock is dropped and the whole
 * sequence retries. For MPI_LOCK_SHARED the peer's local lock is taken
 * shared directly. Both paths spin (driving progress) until they succeed.
 *
 * @param[in] module  osc rdma module
 * @param[in] peer    peer to lock
 * @param[in] lock    sync object describing the lock (type, assert, ...)
 *
 * @returns OMPI_SUCCESS (both paths loop until acquisition succeeds)
 */
static inline int ompi_osc_rdma_lock_atomic_internal (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
                                                      ompi_osc_rdma_sync_t *lock)
{
    const int locking_mode = module->locking_mode;
    int ret;

    if (MPI_LOCK_EXCLUSIVE == lock->sync.lock.type) {
        do {
            OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "incrementing global exclusive lock");
            if (OMPI_OSC_RDMA_LOCKING_TWO_LEVEL == locking_mode) {
                /* lock the master lock. this requires no rank has a global shared lock.
                 * NOTE: mask made explicitly unsigned (UL) for consistency with the
                 * global-lock masks used in lock_all (the value does not fit in a
                 * signed 64-bit long) */
                ret = ompi_osc_rdma_lock_acquire_shared (module, module->leader, 1, offsetof (ompi_osc_rdma_state_t, global_lock),
                                                         0xffffffff00000000UL);
                if (OMPI_SUCCESS != ret) {
                    ompi_osc_rdma_progress (module);
                    continue;
                }
            }

            OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "acquiring exclusive lock on peer");
            ret = ompi_osc_rdma_lock_try_acquire_exclusive (module, peer,  offsetof (ompi_osc_rdma_state_t, local_lock));
            if (ret) {
                /* release the global lock before retrying to avoid holding it
                 * while spinning on the peer lock */
                if (OMPI_OSC_RDMA_LOCKING_TWO_LEVEL == locking_mode) {
                    ompi_osc_rdma_lock_release_shared (module, module->leader, -1, offsetof (ompi_osc_rdma_state_t, global_lock));
                }
                ompi_osc_rdma_progress (module);
                continue;
            }

            peer->flags |= OMPI_OSC_RDMA_PEER_EXCLUSIVE;
            break;
        } while (1);
    } else {
        do {
            /* go right to the target to acquire a shared lock */
            OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "incrementing global shared lock");
            ret = ompi_osc_rdma_lock_acquire_shared (module, peer, 1, offsetof (ompi_osc_rdma_state_t, local_lock),
                                                     OMPI_OSC_RDMA_LOCK_EXCLUSIVE);
            if (OMPI_SUCCESS == ret) {
                return OMPI_SUCCESS;
            }

            ompi_osc_rdma_progress (module);
        } while (1);
    }

    return OMPI_SUCCESS;
}
163 
/**
 * Release the peer-side lock acquired by ompi_osc_rdma_lock_atomic_internal.
 *
 * Exclusive locks drop the peer's local lock and, in two-level mode, the
 * shared hold on the leader's global lock. Shared locks decrement the
 * peer's local lock count. The corresponding peer flag is cleared in
 * either case.
 *
 * @param[in] module  osc rdma module
 * @param[in] peer    peer to unlock
 * @param[in] lock    sync object describing the lock being released
 *
 * @returns OMPI_SUCCESS always
 */
static inline int ompi_osc_rdma_unlock_atomic_internal (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
                                                        ompi_osc_rdma_sync_t *lock)
{
    const bool exclusive = (MPI_LOCK_EXCLUSIVE == lock->sync.lock.type);

    if (!exclusive) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "decrementing global shared lock");
        ompi_osc_rdma_lock_release_shared (module, peer, -1, offsetof (ompi_osc_rdma_state_t, local_lock));
        peer->flags &= ~OMPI_OSC_RDMA_PEER_DEMAND_LOCKED;
        return OMPI_SUCCESS;
    }

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "releasing exclusive lock on peer");
    ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, local_lock));

    if (OMPI_OSC_RDMA_LOCKING_TWO_LEVEL == module->locking_mode) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "decrementing global exclusive lock");
        ompi_osc_rdma_lock_release_shared (module, module->leader, -1, offsetof (ompi_osc_rdma_state_t, global_lock));
    }

    peer->flags &= ~OMPI_OSC_RDMA_PEER_EXCLUSIVE;

    return OMPI_SUCCESS;
}
187 
/* Acquire the per-peer lock on first access within a lock_all epoch when
 * on-demand locking is in use. Safe to call multiple times per peer: once
 * OMPI_OSC_RDMA_PEER_DEMAND_LOCKED is set subsequent calls are no-ops.
 * The peer is recorded in all_sync's demand_locked_peers list so that
 * unlock_all can release it later. */
int ompi_osc_rdma_demand_lock_peer (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer)
{
    ompi_osc_rdma_sync_t *lock = &module->all_sync;
    int ret = OMPI_SUCCESS;

    /* check for bad usage: must be inside a lock_all epoch */
    assert (OMPI_OSC_RDMA_SYNC_TYPE_LOCK == lock->type);

    /* peer->lock serializes the check-and-lock so two threads do not both
     * acquire the remote lock for the same peer */
    OPAL_THREAD_SCOPED_LOCK(&peer->lock,
    do {
        if (!ompi_osc_rdma_peer_is_demand_locked (peer)) {
            ret = ompi_osc_rdma_lock_atomic_internal (module, peer, lock);
            /* lock->lock protects the demand_locked_peers list; the flag is
             * set only after the peer is on the list */
            OPAL_THREAD_SCOPED_LOCK(&lock->lock, opal_list_append (&lock->demand_locked_peers, &peer->super));
            peer->flags |= OMPI_OSC_RDMA_PEER_DEMAND_LOCKED;
        }
    } while (0);
    );

    return ret;
}
208 
/* Implement MPI_Win_lock. Allocates a per-target sync object, acquires the
 * remote lock (unless MPI_MODE_NOCHECK is asserted), and registers the lock
 * with the module so flush/unlock can find it. */
int ompi_osc_rdma_lock_atomic (int lock_type, int target, int assert, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t *peer = ompi_osc_rdma_module_peer (module, target);
    ompi_osc_rdma_sync_t *lock;
    int ret = OMPI_SUCCESS;

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "lock: %d, %d, %d, %s", lock_type, target, assert, win->w_name);

    if (module->no_locks) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "attempted to lock with no_locks set");
        return OMPI_ERR_RMA_SYNC;
    }

    if (module->all_sync.epoch_active && (OMPI_OSC_RDMA_SYNC_TYPE_LOCK != module->all_sync.type || MPI_LOCK_EXCLUSIVE == lock_type)) {
        /* impossible to get an exclusive lock while holding a global shared lock or in a active
         * target access epoch */
        return OMPI_ERR_RMA_SYNC;
    }

    /* clear the global sync object (in case MPI_Win_fence was called) */
    module->all_sync.type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;

    /* create lock item */
    lock = ompi_osc_rdma_sync_allocate (module);
    if (OPAL_UNLIKELY(NULL == lock)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    lock->type = OMPI_OSC_RDMA_SYNC_TYPE_LOCK;
    lock->sync.lock.target = target;
    lock->sync.lock.type = lock_type;
    lock->sync.lock.assert = assert;

    /* the sync object holds a reference on the peer for the duration of
     * the epoch; released in unlock */
    lock->peer_list.peer = peer;
    lock->num_peers = 1;
    OBJ_RETAIN(peer);

    /* MPI_MODE_NOCHECK means the user guarantees no conflicting lock is
     * held, so the remote atomic acquisition can be skipped entirely */
    if (0 == (assert & MPI_MODE_NOCHECK)) {
        ret = ompi_osc_rdma_lock_atomic_internal (module, peer, lock);
    }

    if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
        ++module->passive_target_access_epoch;

        /* make the epoch counter visible before the lock is published */
        opal_atomic_wmb ();

        OPAL_THREAD_SCOPED_LOCK(&module->lock, ompi_osc_rdma_module_lock_insert (module, lock));
    } else {
        OBJ_RELEASE(lock);
    }

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "lock %d complete", target);

    return ret;
}
265 
266 
/* Implement MPI_Win_unlock. Completes all outstanding RDMA on the target,
 * releases the remote lock (unless MPI_MODE_NOCHECK was asserted at lock
 * time), drops the peer reference taken in lock, and frees the sync
 * object. */
int ompi_osc_rdma_unlock_atomic (int target, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t *peer;
    ompi_osc_rdma_sync_t *lock;
    int ret = OMPI_SUCCESS;

    OPAL_THREAD_LOCK(&module->lock);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "unlock: %d, %s", target, win->w_name);

    lock = ompi_osc_rdma_module_lock_find (module, target, &peer);
    if (OPAL_UNLIKELY(NULL == lock)) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "target %d is not locked in window %s",
                         target, win->w_name);
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    /* remove from the outstanding-locks table before completing so flushes
     * from other threads no longer see this epoch */
    ompi_osc_rdma_module_lock_remove (module, lock);

    /* finish all outstanding fragments */
    ompi_osc_rdma_sync_rdma_complete (lock);

    if (!(lock->sync.lock.assert & MPI_MODE_NOCHECK)) {
        ret = ompi_osc_rdma_unlock_atomic_internal (module, peer, lock);
    }

    /* release our reference to this peer */
    OBJ_RELEASE(peer);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "unlock %d complete", target);

    --module->passive_target_access_epoch;

    /* make the epoch counter update visible before dropping the module lock */
    opal_atomic_wmb ();

    OPAL_THREAD_UNLOCK(&module->lock);

    /* delete the lock */
    ompi_osc_rdma_sync_return (lock);

    return ret;
}
311 
ompi_osc_rdma_lock_all_atomic(int assert,struct ompi_win_t * win)312 int ompi_osc_rdma_lock_all_atomic (int assert, struct ompi_win_t *win)
313 {
314     ompi_osc_rdma_module_t *module = GET_MODULE(win);
315     ompi_osc_rdma_sync_t *lock;
316     int ret = OMPI_SUCCESS;
317 
318     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "lock_all: %d, %s", assert, win->w_name);
319 
320     if (module->no_locks) {
321         OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "attempted to lock with no_locks set");
322         return OMPI_ERR_RMA_SYNC;
323     }
324 
325     OPAL_THREAD_LOCK(&module->lock);
326     if (module->all_sync.epoch_active) {
327         OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "attempted lock_all when active target epoch is %s "
328                          "and lock all epoch is %s",
329                          (OMPI_OSC_RDMA_SYNC_TYPE_LOCK != module->all_sync.type && module->all_sync.epoch_active) ?
330                          "active" : "inactive",
331                          (OMPI_OSC_RDMA_SYNC_TYPE_LOCK == module->all_sync.type) ? "active" : "inactive");
332         OPAL_THREAD_UNLOCK(&module->lock);
333         return OMPI_ERR_RMA_SYNC;
334     }
335 
336     /* set up lock */
337     lock = &module->all_sync;
338 
339     lock->type = OMPI_OSC_RDMA_SYNC_TYPE_LOCK;
340     lock->sync.lock.target = -1;
341     lock->sync.lock.type   = MPI_LOCK_SHARED;
342     lock->sync.lock.assert = assert;
343     lock->num_peers = ompi_comm_size (module->comm);
344 
345     lock->epoch_active = true;
346     /* NTH: TODO -- like fence it might be a good idea to create an array to access all peers
347      * without having to access the hash table. Such a change would likely increase performance
348      * at the expense of memory usage. Ex. if a window has 1M peers then 8MB per process would
349      * be needed for this array. */
350 
351     if (0 == (assert & MPI_MODE_NOCHECK)) {
352         /* increment the global shared lock */
353         if (OMPI_OSC_RDMA_LOCKING_TWO_LEVEL == module->locking_mode) {
354             ret = ompi_osc_rdma_lock_acquire_shared (module, module->leader, 0x0000000100000000UL,
355                                                      offsetof(ompi_osc_rdma_state_t, global_lock),
356                                                      0x00000000ffffffffUL);
357         } else {
358             /* always lock myself */
359             ret = ompi_osc_rdma_demand_lock_peer (module, module->my_peer);
360         }
361     }
362 
363     if (OPAL_LIKELY(OMPI_SUCCESS != ret)) {
364         lock->type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;
365         lock->num_peers = 0;
366         lock->epoch_active = false;
367     } else {
368         ++module->passive_target_access_epoch;
369     }
370 
371     opal_atomic_wmb ();
372 
373     OPAL_THREAD_UNLOCK(&module->lock);
374 
375     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "lock_all complete");
376 
377     return ret;
378 }
379 
/* Implement MPI_Win_unlock_all: complete all outstanding RDMA for the
 * lock_all epoch, release every remote lock taken (all on-demand peer locks
 * in on-demand mode, or the shared hold on the leader's global lock in
 * two-level mode), and tear down the epoch state. */
int ompi_osc_rdma_unlock_all_atomic (struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_sync_t *lock;

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "unlock_all: %s", win->w_name);

    OPAL_THREAD_LOCK(&module->lock);

    lock = &module->all_sync;
    if (OMPI_OSC_RDMA_SYNC_TYPE_LOCK != lock->type) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "not locked in window %s", win->w_name);
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    /* finish all outstanding fragments */
    ompi_osc_rdma_sync_rdma_complete (lock);

    /* if MPI_MODE_NOCHECK was asserted at lock_all time no remote locks
     * were acquired, so there is nothing to release */
    if (0 == (lock->sync.lock.assert & MPI_MODE_NOCHECK)) {
        if (OMPI_OSC_RDMA_LOCKING_ON_DEMAND == module->locking_mode) {
            ompi_osc_rdma_peer_t *peer, *next;

            /* drop all on-demand locks (SAFE iteration: items are removed
             * while walking the list) */
            OPAL_LIST_FOREACH_SAFE(peer, next, &lock->demand_locked_peers, ompi_osc_rdma_peer_t) {
                (void) ompi_osc_rdma_unlock_atomic_internal (module, peer, lock);
                opal_list_remove_item (&lock->demand_locked_peers, &peer->super);
            }
        } else {
            /* decrement the master lock shared count */
            (void) ompi_osc_rdma_lock_release_shared (module, module->leader, -0x0000000100000000UL,
                                                      offsetof (ompi_osc_rdma_state_t, global_lock));
        }
    }

    /* reset the global sync object so a new epoch can start */
    lock->type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;
    lock->num_peers = 0;
    lock->epoch_active = false;

    --module->passive_target_access_epoch;

    /* make the epoch counter update visible before dropping the module lock */
    opal_atomic_wmb ();

    OPAL_THREAD_UNLOCK(&module->lock);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "unlock_all complete");

    return OMPI_SUCCESS;
}
429