/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University.
 *                         All rights reserved.
 * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
 *                         All rights reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007-2018 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2010      IBM Corporation. All rights reserved.
 * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
 * Copyright (c) 2018      Intel, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
22
#include "ompi_config.h"

#include "osc_rdma_passive_target.h"
#include "osc_rdma_comm.h"

#include "mpi.h"
29
30
ompi_osc_rdma_sync(struct ompi_win_t * win)31 int ompi_osc_rdma_sync (struct ompi_win_t *win)
32 {
33 ompi_osc_rdma_progress (GET_MODULE(win));
34 return OMPI_SUCCESS;
35 }
36
/* MPI_Win_flush implementation: complete all outstanding RMA operations
 * initiated by this process that target rank `target` in `win`.
 *
 * Only legal while a passive-target (lock) epoch covers the target;
 * returns OMPI_ERR_RMA_SYNC otherwise, OMPI_SUCCESS on completion. */
int ompi_osc_rdma_flush (int target, struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_sync_t *lock;
    ompi_osc_rdma_peer_t *peer;

    assert (0 <= target);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "flush: %d, %s", target, win->w_name);

    OPAL_THREAD_LOCK(&module->lock);

    /* look up the synchronization object covering this target; flush is only
     * valid inside a lock epoch, so any other sync type is an error */
    lock = ompi_osc_rdma_module_sync_lookup (module, target, &peer);
    if (OPAL_UNLIKELY(NULL == lock || OMPI_OSC_RDMA_SYNC_TYPE_LOCK != lock->type)) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "flush: target %d is not locked in window %s",
                         target, win->w_name);
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }
    /* module lock dropped before waiting for completion so progress threads
     * are not blocked while we spin */
    OPAL_THREAD_UNLOCK(&module->lock);

    /* finish all outstanding fragments */
    ompi_osc_rdma_sync_rdma_complete (lock);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "flush on target %d complete", target);

    return OMPI_SUCCESS;
}
65
66
ompi_osc_rdma_flush_all(struct ompi_win_t * win)67 int ompi_osc_rdma_flush_all (struct ompi_win_t *win)
68 {
69 ompi_osc_rdma_module_t *module = GET_MODULE(win);
70 ompi_osc_rdma_sync_t *lock;
71 int ret = OMPI_SUCCESS;
72 uint32_t key;
73 void *node;
74
75 /* flush is only allowed from within a passive target epoch */
76 if (!ompi_osc_rdma_in_passive_epoch (module)) {
77 return OMPI_ERR_RMA_SYNC;
78 }
79
80 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "flush_all: %s", win->w_name);
81
82 /* globally complete all outstanding rdma requests */
83 if (OMPI_OSC_RDMA_SYNC_TYPE_LOCK == module->all_sync.type) {
84 ompi_osc_rdma_sync_rdma_complete (&module->all_sync);
85 }
86
87 /* flush all locks */
88 ret = opal_hash_table_get_first_key_uint32 (&module->outstanding_locks, &key, (void **) &lock, &node);
89 while (OPAL_SUCCESS == ret) {
90 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "flushing lock %p", (void *) lock);
91 ompi_osc_rdma_sync_rdma_complete (lock);
92 ret = opal_hash_table_get_next_key_uint32 (&module->outstanding_locks, &key, (void **) &lock,
93 node, &node);
94 }
95
96 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "flush_all complete");
97
98 return OPAL_SUCCESS;
99 }
100
101
/* MPI_Win_flush_local implementation.  For this transport a full flush is
 * performed, which trivially satisfies the weaker local-completion
 * requirement. */
int ompi_osc_rdma_flush_local (int target, struct ompi_win_t *win)
{
    int rc = ompi_osc_rdma_flush (target, win);

    return rc;
}
106
107
/* MPI_Win_flush_local_all implementation.  Delegates to the full flush_all;
 * remote completion implies the required local completion. */
int ompi_osc_rdma_flush_local_all (struct ompi_win_t *win)
{
    int rc = ompi_osc_rdma_flush_all (win);

    return rc;
}
112
113 /* locking via atomics */
/* locking via atomics */
/* Acquire the passive-target lock on a single peer using remote atomics.
 *
 * Shared locks increment the peer's local lock directly, retrying while an
 * exclusive holder is present.  Exclusive locks (in two-level mode) first
 * take a shared hold on the leader's global lock, then attempt the peer's
 * local lock exclusively, backing off and retrying on contention.  Both
 * paths spin (driving progress) until the lock is obtained. */
static inline int ompi_osc_rdma_lock_atomic_internal (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
                                                      ompi_osc_rdma_sync_t *lock)
{
    const int locking_mode = module->locking_mode;
    int ret;

    if (MPI_LOCK_EXCLUSIVE != lock->sync.lock.type) {
        /* shared lock: go right to the target to acquire it */
        for (;;) {
            OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "incrementing global shared lock");
            ret = ompi_osc_rdma_lock_acquire_shared (module, peer, 1, offsetof (ompi_osc_rdma_state_t, local_lock),
                                                     OMPI_OSC_RDMA_LOCK_EXCLUSIVE);
            if (OMPI_SUCCESS == ret) {
                return OMPI_SUCCESS;
            }

            ompi_osc_rdma_progress (module);
        }
    }

    /* exclusive lock */
    for (;;) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "incrementing global exclusive lock");
        if (OMPI_OSC_RDMA_LOCKING_TWO_LEVEL == locking_mode) {
            /* lock the master lock. this requires no rank has a global shared lock */
            ret = ompi_osc_rdma_lock_acquire_shared (module, module->leader, 1,
                                                     offsetof (ompi_osc_rdma_state_t, global_lock),
                                                     0xffffffff00000000L);
            if (OMPI_SUCCESS != ret) {
                ompi_osc_rdma_progress (module);
                continue;
            }
        }

        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "acquiring exclusive lock on peer");
        ret = ompi_osc_rdma_lock_try_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, local_lock));
        if (0 == ret) {
            break;
        }

        /* contention: back out of the global lock before retrying */
        if (OMPI_OSC_RDMA_LOCKING_TWO_LEVEL == locking_mode) {
            ompi_osc_rdma_lock_release_shared (module, module->leader, -1, offsetof (ompi_osc_rdma_state_t, global_lock));
        }
        ompi_osc_rdma_progress (module);
    }

    peer->flags |= OMPI_OSC_RDMA_PEER_EXCLUSIVE;

    return OMPI_SUCCESS;
}
163
/* Release the passive-target lock previously acquired on `peer` by
 * ompi_osc_rdma_lock_atomic_internal, mirroring its acquisition order. */
static inline int ompi_osc_rdma_unlock_atomic_internal (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
                                                        ompi_osc_rdma_sync_t *lock)
{
    if (MPI_LOCK_EXCLUSIVE != lock->sync.lock.type) {
        /* shared lock: decrement our hold on the peer's local lock */
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "decrementing global shared lock");
        ompi_osc_rdma_lock_release_shared (module, peer, -1, offsetof (ompi_osc_rdma_state_t, local_lock));
        peer->flags &= ~OMPI_OSC_RDMA_PEER_DEMAND_LOCKED;

        return OMPI_SUCCESS;
    }

    /* exclusive lock: drop the peer's local lock first, then (two-level mode
     * only) release our shared hold on the leader's global lock */
    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "releasing exclusive lock on peer");
    ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, local_lock));

    if (OMPI_OSC_RDMA_LOCKING_TWO_LEVEL == module->locking_mode) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "decrementing global exclusive lock");
        ompi_osc_rdma_lock_release_shared (module, module->leader, -1, offsetof (ompi_osc_rdma_state_t, global_lock));
    }

    peer->flags &= ~OMPI_OSC_RDMA_PEER_EXCLUSIVE;

    return OMPI_SUCCESS;
}
187
/* On-demand locking: acquire the lock on `peer` the first time it is
 * targeted during a lock_all epoch.  Idempotent — a peer already flagged
 * OMPI_OSC_RDMA_PEER_DEMAND_LOCKED is left alone.  Must only be called
 * while the global sync object is in a lock epoch (asserted below).
 *
 * NOTE(review): the return code of ompi_osc_rdma_lock_atomic_internal is
 * captured but the peer is appended/flagged regardless; presumably the
 * shared-lock path it takes here cannot fail (it retries until success) —
 * verify if the internal locking ever becomes fallible. */
int ompi_osc_rdma_demand_lock_peer (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer)
{
    ompi_osc_rdma_sync_t *lock = &module->all_sync;
    int ret = OMPI_SUCCESS;

    /* check for bad usage */
    assert (OMPI_OSC_RDMA_SYNC_TYPE_LOCK == lock->type);

    /* serialize per-peer so two threads don't double-lock the same peer;
     * the inner scoped lock protects the demand_locked_peers list */
    OPAL_THREAD_SCOPED_LOCK(&peer->lock,
    do {
        if (!ompi_osc_rdma_peer_is_demand_locked (peer)) {
            ret = ompi_osc_rdma_lock_atomic_internal (module, peer, lock);
            OPAL_THREAD_SCOPED_LOCK(&lock->lock, opal_list_append (&lock->demand_locked_peers, &peer->super));
            peer->flags |= OMPI_OSC_RDMA_PEER_DEMAND_LOCKED;
        }
    } while (0);
    );

    return ret;
}
208
/* MPI_Win_lock implementation: begin a passive-target access epoch on a
 * single rank.  A per-target sync object is allocated, the remote lock is
 * acquired (unless MPI_MODE_NOCHECK was asserted) and the object is
 * inserted into the module's outstanding-lock table. */
int ompi_osc_rdma_lock_atomic (int lock_type, int target, int assert, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t *peer = ompi_osc_rdma_module_peer (module, target);
    ompi_osc_rdma_sync_t *sync;
    int rc = OMPI_SUCCESS;

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "lock: %d, %d, %d, %s", lock_type, target, assert, win->w_name);

    if (module->no_locks) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "attempted to lock with no_locks set");
        return OMPI_ERR_RMA_SYNC;
    }

    /* an exclusive lock cannot coexist with a global shared lock, and no
     * lock may be taken inside an active-target access epoch */
    if (module->all_sync.epoch_active && (OMPI_OSC_RDMA_SYNC_TYPE_LOCK != module->all_sync.type || MPI_LOCK_EXCLUSIVE == lock_type)) {
        return OMPI_ERR_RMA_SYNC;
    }

    /* clear the global sync object (in case MPI_Win_fence was called) */
    module->all_sync.type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;

    /* set up the per-target lock object */
    sync = ompi_osc_rdma_sync_allocate (module);
    if (OPAL_UNLIKELY(NULL == sync)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    sync->type = OMPI_OSC_RDMA_SYNC_TYPE_LOCK;
    sync->sync.lock.target = target;
    sync->sync.lock.type = lock_type;
    sync->sync.lock.assert = assert;

    sync->peer_list.peer = peer;
    sync->num_peers = 1;
    OBJ_RETAIN(peer);

    /* with MPI_MODE_NOCHECK the user guarantees no conflicting locks, so
     * the remote atomics can be skipped entirely */
    if (0 == (assert & MPI_MODE_NOCHECK)) {
        rc = ompi_osc_rdma_lock_atomic_internal (module, peer, sync);
    }

    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        OBJ_RELEASE(sync);
    } else {
        ++module->passive_target_access_epoch;

        opal_atomic_wmb ();

        OPAL_THREAD_SCOPED_LOCK(&module->lock, ompi_osc_rdma_module_lock_insert (module, sync));
    }

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "lock %d complete", target);

    return rc;
}
265
266
/* MPI_Win_unlock implementation: end the passive-target access epoch on a
 * single rank.  Completes all outstanding RMA to the target, releases the
 * remote lock (unless MPI_MODE_NOCHECK was asserted) and frees the sync
 * object. */
int ompi_osc_rdma_unlock_atomic (int target, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t *peer;
    ompi_osc_rdma_sync_t *sync;
    int rc = OMPI_SUCCESS;

    OPAL_THREAD_LOCK(&module->lock);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "unlock: %d, %s", target, win->w_name);

    sync = ompi_osc_rdma_module_lock_find (module, target, &peer);
    if (OPAL_UNLIKELY(NULL == sync)) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "target %d is not locked in window %s",
                         target, win->w_name);
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    ompi_osc_rdma_module_lock_remove (module, sync);

    /* finish all outstanding fragments */
    ompi_osc_rdma_sync_rdma_complete (sync);

    if (!(sync->sync.lock.assert & MPI_MODE_NOCHECK)) {
        rc = ompi_osc_rdma_unlock_atomic_internal (module, peer, sync);
    }

    /* release our reference to this peer */
    OBJ_RELEASE(peer);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "unlock %d complete", target);

    --module->passive_target_access_epoch;

    opal_atomic_wmb ();

    OPAL_THREAD_UNLOCK(&module->lock);

    /* return the sync object to the free list */
    ompi_osc_rdma_sync_return (sync);

    return rc;
}
311
ompi_osc_rdma_lock_all_atomic(int assert,struct ompi_win_t * win)312 int ompi_osc_rdma_lock_all_atomic (int assert, struct ompi_win_t *win)
313 {
314 ompi_osc_rdma_module_t *module = GET_MODULE(win);
315 ompi_osc_rdma_sync_t *lock;
316 int ret = OMPI_SUCCESS;
317
318 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "lock_all: %d, %s", assert, win->w_name);
319
320 if (module->no_locks) {
321 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "attempted to lock with no_locks set");
322 return OMPI_ERR_RMA_SYNC;
323 }
324
325 OPAL_THREAD_LOCK(&module->lock);
326 if (module->all_sync.epoch_active) {
327 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "attempted lock_all when active target epoch is %s "
328 "and lock all epoch is %s",
329 (OMPI_OSC_RDMA_SYNC_TYPE_LOCK != module->all_sync.type && module->all_sync.epoch_active) ?
330 "active" : "inactive",
331 (OMPI_OSC_RDMA_SYNC_TYPE_LOCK == module->all_sync.type) ? "active" : "inactive");
332 OPAL_THREAD_UNLOCK(&module->lock);
333 return OMPI_ERR_RMA_SYNC;
334 }
335
336 /* set up lock */
337 lock = &module->all_sync;
338
339 lock->type = OMPI_OSC_RDMA_SYNC_TYPE_LOCK;
340 lock->sync.lock.target = -1;
341 lock->sync.lock.type = MPI_LOCK_SHARED;
342 lock->sync.lock.assert = assert;
343 lock->num_peers = ompi_comm_size (module->comm);
344
345 lock->epoch_active = true;
346 /* NTH: TODO -- like fence it might be a good idea to create an array to access all peers
347 * without having to access the hash table. Such a change would likely increase performance
348 * at the expense of memory usage. Ex. if a window has 1M peers then 8MB per process would
349 * be needed for this array. */
350
351 if (0 == (assert & MPI_MODE_NOCHECK)) {
352 /* increment the global shared lock */
353 if (OMPI_OSC_RDMA_LOCKING_TWO_LEVEL == module->locking_mode) {
354 ret = ompi_osc_rdma_lock_acquire_shared (module, module->leader, 0x0000000100000000UL,
355 offsetof(ompi_osc_rdma_state_t, global_lock),
356 0x00000000ffffffffUL);
357 } else {
358 /* always lock myself */
359 ret = ompi_osc_rdma_demand_lock_peer (module, module->my_peer);
360 }
361 }
362
363 if (OPAL_LIKELY(OMPI_SUCCESS != ret)) {
364 lock->type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;
365 lock->num_peers = 0;
366 lock->epoch_active = false;
367 } else {
368 ++module->passive_target_access_epoch;
369 }
370
371 opal_atomic_wmb ();
372
373 OPAL_THREAD_UNLOCK(&module->lock);
374
375 OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "lock_all complete");
376
377 return ret;
378 }
379
/* MPI_Win_unlock_all implementation: end the lock_all passive-target
 * access epoch.  Completes all outstanding RMA, releases any remote locks
 * (unless MPI_MODE_NOCHECK was asserted at lock_all) and resets the global
 * sync object.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERR_RMA_SYNC if no lock_all epoch is
 * active. */
int ompi_osc_rdma_unlock_all_atomic (struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_sync_t *lock;

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "unlock_all: %s", win->w_name);

    OPAL_THREAD_LOCK(&module->lock);

    lock = &module->all_sync;
    if (OMPI_OSC_RDMA_SYNC_TYPE_LOCK != lock->type) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "not locked in window %s", win->w_name);
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    /* finish all outstanding fragments */
    ompi_osc_rdma_sync_rdma_complete (lock);

    if (0 == (lock->sync.lock.assert & MPI_MODE_NOCHECK)) {
        if (OMPI_OSC_RDMA_LOCKING_ON_DEMAND == module->locking_mode) {
            ompi_osc_rdma_peer_t *peer, *next;

            /* drop all on-demand locks */
            OPAL_LIST_FOREACH_SAFE(peer, next, &lock->demand_locked_peers, ompi_osc_rdma_peer_t) {
                (void) ompi_osc_rdma_unlock_atomic_internal (module, peer, lock);
                opal_list_remove_item (&lock->demand_locked_peers, &peer->super);
            }
        } else {
            /* decrement the master lock shared count */
            (void) ompi_osc_rdma_lock_release_shared (module, module->leader, -0x0000000100000000UL,
                                                      offsetof (ompi_osc_rdma_state_t, global_lock));
        }
    }

    /* reset the global sync object so a new epoch (fence, lock, ...) can start */
    lock->type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;
    lock->num_peers = 0;
    lock->epoch_active = false;

    --module->passive_target_access_epoch;

    opal_atomic_wmb ();

    OPAL_THREAD_UNLOCK(&module->lock);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "unlock_all complete");

    return OMPI_SUCCESS;
}
429