/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2017 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2008      Sun Microsystems, Inc.  All rights reserved.
 * Copyright (c) 2013      Los Alamos National Security, LLC. All Rights
 *                         reserved.
 * Copyright (c) 2015-2016 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * Copyright (c) 2017      IBM Corporation. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "mpi.h"
#include "opal/util/bit_ops.h"
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"

/**
 * A quick version of MPI_Sendrecv implemented for the barrier.
 * No actual data is moved across the wire; we use 0-byte messages to
 * signal a two-peer synchronization.
 */
static inline int
ompi_coll_base_sendrecv_zero( int dest, int stag,
                              int source, int rtag,
                              MPI_Comm comm )
{
    int rc, line = 0;
    ompi_request_t *req = MPI_REQUEST_NULL;
    ompi_status_public_t status;

    /* post new irecv */
    rc = MCA_PML_CALL(irecv( NULL, 0, MPI_BYTE, source, rtag,
                             comm, &req ));
    if( MPI_SUCCESS != rc ) { line = __LINE__; goto error_handler; }

    /* send a zero-byte message to the destination peer */
    rc = MCA_PML_CALL(send( NULL, 0, MPI_BYTE, dest, stag,
                            MCA_PML_BASE_SEND_STANDARD, comm ));
    if( MPI_SUCCESS != rc ) { line = __LINE__; goto error_handler; }

    rc = ompi_request_wait( &req, &status );
    if( MPI_SUCCESS != rc ) { line = __LINE__; goto error_handler; }

    return (MPI_SUCCESS);

 error_handler:
    if( MPI_REQUEST_NULL != req ) {  /* cancel and complete the receive request */
        (void)ompi_request_cancel(req);
        (void)ompi_request_wait(&req, &status);
    }

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred\n",
                 __FILE__, line, rc));
    (void)line;  // silence compiler warning
    return rc;
}

/*
 * Barrier is meant to be a synchronous operation: some BTLs can mark
 * a request as done before it is passed to the NIC, and since progress might
 * not be made elsewhere, we cannot allow a process to exit the barrier until
 * its last [round of] sends has completed.
 *
 * It is the last round of sends rather than the 'last' individual send,
 * because each pair of peers can use different channels/devices/BTLs, and
 * the receiver of one of these sends might otherwise be forced to wait while
 * the sender leaves the collective and does not make progress until the next
 * MPI call.
 */
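
/*
 * Illustrative scenario (not from the original source): suppose rank A's
 * final barrier send is buffered by the BTL and marked complete before it
 * reaches the wire.  If A then leaves the barrier and spins in computation
 * without calling MPI, the buffered message may never be progressed, and
 * its receiver would be stuck inside the barrier.  Making the final round
 * of sends synchronous rules this out.
 */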

/*
 * Simple double-ring version of barrier.
 *
 * The synchronous guarantee is provided by the last ring of sends,
 * which are synchronous.
 */
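/*
 * Message-flow sketch (illustrative): with standard sends in round one and
 * synchronous sends in round two,
 *   round 1: 0 -> 1 -> ... -> size-1 -> 0   (everyone has entered)
 *   round 2: 0 -> 1 -> ... -> size-1 -> 0   (everyone may leave)
 */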
int ompi_coll_base_barrier_intra_doublering(struct ompi_communicator_t *comm,
                                            mca_coll_base_module_t *module)
{
    int rank, size, err = 0, line = 0, left, right;

    size = ompi_comm_size(comm);
    if( 1 == size )
        return OMPI_SUCCESS;
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"ompi_coll_base_barrier_intra_doublering rank %d", rank));

    left = ((rank-1+size)%size);  /* add size so the result stays non-negative for rank 0 */
    right = ((rank+1)%size);

    if (rank > 0) { /* receive message from the left */
        err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
                                MCA_COLL_BASE_TAG_BARRIER, comm,
                                MPI_STATUS_IGNORE));
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
    }

    /* Send message to the right */
    err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right,
                            MCA_COLL_BASE_TAG_BARRIER,
                            MCA_PML_BASE_SEND_STANDARD, comm));
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

    /* root needs to receive from the last node */
    if (rank == 0) {
        err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
                                MCA_COLL_BASE_TAG_BARRIER, comm,
                                MPI_STATUS_IGNORE));
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
    }

    /* Allow nodes to exit */
    if (rank > 0) { /* post receive from the left */
        err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
                                MCA_COLL_BASE_TAG_BARRIER, comm,
                                MPI_STATUS_IGNORE));
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
    }

    /* send a synchronous message to the right */
    err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right,
                            MCA_COLL_BASE_TAG_BARRIER,
                            MCA_PML_BASE_SEND_SYNCHRONOUS, comm));
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

    /* rank 0 posts a receive from the last node */
    if (rank == 0) {
        err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
                                MCA_COLL_BASE_TAG_BARRIER, comm,
                                MPI_STATUS_IGNORE));
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
    }

    return MPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line;  // silence compiler warning
    return err;
}


/*
 * To make the barrier synchronous, this uses sync sends and sync sendrecvs.
 */

int ompi_coll_base_barrier_intra_recursivedoubling(struct ompi_communicator_t *comm,
                                                   mca_coll_base_module_t *module)
{
    int rank, size, adjsize, err, line, mask, remote;

    size = ompi_comm_size(comm);
    if( 1 == size )
        return OMPI_SUCCESS;
    rank = ompi_comm_rank(comm);
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_barrier_intra_recursivedoubling rank %d",
                 rank));

    /* compute the largest power of 2 that is not larger than size */
    adjsize = opal_next_poweroftwo(size);
    adjsize >>= 1;
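
    /*
     * Illustrative example: size == 6 gives adjsize == 4.  Ranks 4 and 5
     * fold into ranks 0 and 1 for the exchange phase, and are released by
     * a final synchronous send once the power-of-two exchange completes.
     */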

    /* if size is not an exact power of two, perform an extra step */
    if (adjsize != size) {
        if (rank >= adjsize) {
            /* exchange a zero-byte message with the lower-ranked peer */
            remote = rank - adjsize;
            err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
                                               remote, MCA_COLL_BASE_TAG_BARRIER,
                                               comm);
            if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}

        } else if (rank < (size - adjsize)) {

            /* receive message from the higher-ranked peer */
            err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, rank+adjsize,
                                    MCA_COLL_BASE_TAG_BARRIER, comm,
                                    MPI_STATUS_IGNORE));

            if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
        }
    }

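    /*
     * Hypercube exchange among the adjsize power-of-two ranks: in round k
     * each rank pairs with (rank ^ 2^k), so after log2(adjsize) rounds
     * every rank has (transitively) synchronized with every other rank.
     */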
    /* exchange messages */
    if ( rank < adjsize ) {
        mask = 0x1;
        while ( mask < adjsize ) {
            remote = rank ^ mask;
            mask <<= 1;
            if (remote >= adjsize) continue;

            /* exchange zero-byte messages with the remote peer */
            err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
                                               remote, MCA_COLL_BASE_TAG_BARRIER,
                                               comm);
            if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
        }
    }

    /* non-power-of-two case */
    if (adjsize != size) {
        if (rank < (size - adjsize)) {
            /* send release message to the higher-ranked peer */
            remote = rank + adjsize;
            err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, remote,
                                    MCA_COLL_BASE_TAG_BARRIER,
                                    MCA_PML_BASE_SEND_SYNCHRONOUS, comm));

            if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
        }
    }

    return MPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line;  // silence compiler warning
    return err;
}


/*
 * To make the barrier synchronous, this uses sync sends and sync sendrecvs.
 */

int ompi_coll_base_barrier_intra_bruck(struct ompi_communicator_t *comm,
                                       mca_coll_base_module_t *module)
{
    int rank, size, distance, to, from, err, line = 0;

    size = ompi_comm_size(comm);
    if( 1 == size )
        return MPI_SUCCESS;
    rank = ompi_comm_rank(comm);
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_barrier_intra_bruck rank %d", rank));

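    /*
     * Illustrative example: with size == 5 the loop runs distances 1, 2, 4,
     * i.e. ceil(log2(size)) == 3 rounds; after the last round every rank
     * has (transitively) heard from every other rank.
     */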
    /* exchange data with rank-2^k and rank+2^k */
    for (distance = 1; distance < size; distance <<= 1) {
        from = (rank + size - distance) % size;
        to   = (rank + distance) % size;

        /* send to rank+distance, receive from rank-distance (mod size) */
        err = ompi_coll_base_sendrecv_zero(to, MCA_COLL_BASE_TAG_BARRIER,
                                           from, MCA_COLL_BASE_TAG_BARRIER,
                                           comm);
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
    }

    return MPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line;  // silence compiler warning
    return err;
}


/*
 * Special case for two processes: to make the barrier synchronous, a single
 * zero-byte sendrecv is used.
 */
int ompi_coll_base_barrier_intra_two_procs(struct ompi_communicator_t *comm,
                                           mca_coll_base_module_t *module)
{
    int remote, size, err;

    size = ompi_comm_size(comm);
    if( 1 == size )
        return MPI_SUCCESS;
    if( 2 != size ) {
        return MPI_ERR_UNSUPPORTED_OPERATION;
    }

    remote = ompi_comm_rank(comm);
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_barrier_intra_two_procs rank %d", remote));

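    /* Flip the low bit: in a two-process communicator this maps rank 0 to
     * peer 1 and rank 1 to peer 0. */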
    remote = (remote + 1) & 0x1;

    err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
                                       remote, MCA_COLL_BASE_TAG_BARRIER,
                                       comm);
    return (err);
}


/*
 * Linear functions are copied from the BASIC coll module.  They do not
 * segment the message and are simple implementations, but for some small
 * numbers of nodes and/or small data sizes they are just as fast as
 * base/tree-based segmenting operations, and as such may be selected by
 * the decision functions.  These are copied into this module due to the
 * way we select modules in V1; i.e. in V2 we will handle this differently
 * and so will not have to duplicate code.
 * GEF Oct05 after asking Jeff.
 */

/* copied function (with appropriate renaming) starts here */

int ompi_coll_base_barrier_intra_basic_linear(struct ompi_communicator_t *comm,
                                              mca_coll_base_module_t *module)
{
    int i, err, rank, size, line;
    ompi_request_t** requests = NULL;

    size = ompi_comm_size(comm);
    if( 1 == size )
        return MPI_SUCCESS;
    rank = ompi_comm_rank(comm);

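    /*
     * Flow sketch (illustrative): every non-root rank sends a zero-byte
     * message to root and then waits for root's reply; root waits for all
     * size-1 messages, then sends a release to each rank in turn.
     */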
    /* All non-root ranks send & receive a zero-length message. */
    if (rank > 0) {
        err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, 0,
                                 MCA_COLL_BASE_TAG_BARRIER,
                                 MCA_PML_BASE_SEND_STANDARD, comm));
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

        err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, 0,
                                 MCA_COLL_BASE_TAG_BARRIER,
                                 comm, MPI_STATUS_IGNORE));
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    /* The root collects and broadcasts the messages. */
    else {
        requests = ompi_coll_base_comm_get_reqs(module->base_data, size);
        if( NULL == requests ) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; }

        for (i = 1; i < size; ++i) {
            err = MCA_PML_CALL(irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
                                     MCA_COLL_BASE_TAG_BARRIER, comm,
                                     &(requests[i])));
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        }
        err = ompi_request_wait_all( size-1, requests+1, MPI_STATUSES_IGNORE );
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        requests = NULL; /* we're done; the requests array is clean */

        for (i = 1; i < size; ++i) {
            err = MCA_PML_CALL(send(NULL, 0, MPI_BYTE, i,
                                    MCA_COLL_BASE_TAG_BARRIER,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        }
    }

    /* All done */
    return MPI_SUCCESS;
 err_hndl:
    if( NULL != requests ) {
        /* find a real error code */
        if (MPI_ERR_IN_STATUS == err) {
            for( i = 0; i < size; i++ ) {
                if (MPI_REQUEST_NULL == requests[i]) continue;
                if (MPI_ERR_PENDING == requests[i]->req_status.MPI_ERROR) continue;
                err = requests[i]->req_status.MPI_ERROR;
                break;
            }
        }
        ompi_coll_base_free_reqs(requests, size);
    }
    OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
                  __FILE__, line, err, rank) );
    (void)line;  // silence compiler warning
    return err;
}
/* copied function (with appropriate renaming) ends here */

/*
 * Another recursive-doubling type algorithm, but in this case
 * we go up the tree and back down the tree.
 */
int ompi_coll_base_barrier_intra_tree(struct ompi_communicator_t *comm,
                                      mca_coll_base_module_t *module)
{
    int rank, size, depth, err, jump, partner;

    size = ompi_comm_size(comm);
    if( 1 == size )
        return MPI_SUCCESS;
    rank = ompi_comm_rank(comm);
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_barrier_intra_tree %d",
                 rank));

    /* Find the smallest power of 2 that is not smaller than the communicator size. */
    depth = opal_next_poweroftwo_inclusive(size);
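
    /*
     * Illustrative example: size == 5 gives depth == 8.  The fan-in loop
     * below runs jump = 1, 2, 4 (children report to parents), then depth
     * is halved and the fan-out loop runs jump = 4, 2, 1 (parents release
     * children).
     */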

    for (jump=1; jump<depth; jump<<=1) {
        partner = rank ^ jump;
        if (!(partner & (jump-1)) && partner < size) {
            if (partner > rank) {
                err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner,
                                         MCA_COLL_BASE_TAG_BARRIER, comm,
                                         MPI_STATUS_IGNORE));
                if (MPI_SUCCESS != err)
                    return err;
            } else if (partner < rank) {
                err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, partner,
                                         MCA_COLL_BASE_TAG_BARRIER,
                                         MCA_PML_BASE_SEND_STANDARD, comm));
                if (MPI_SUCCESS != err)
                    return err;
            }
        }
    }

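    /* Fan-out phase: walk back down the tree, parents releasing children. */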
    depth >>= 1;
    for (jump = depth; jump>0; jump>>=1) {
        partner = rank ^ jump;
        if (!(partner & (jump-1)) && partner < size) {
            if (partner > rank) {
                err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, partner,
                                         MCA_COLL_BASE_TAG_BARRIER,
                                         MCA_PML_BASE_SEND_STANDARD, comm));
                if (MPI_SUCCESS != err)
                    return err;
            } else if (partner < rank) {
                err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner,
                                         MCA_COLL_BASE_TAG_BARRIER, comm,
                                         MPI_STATUS_IGNORE));
                if (MPI_SUCCESS != err)
                    return err;
            }
        }
    }

    return MPI_SUCCESS;
}