/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2017 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2008      Sun Microsystems, Inc.  All rights reserved.
 * Copyright (c) 2013      Los Alamos National Security, LLC. All Rights
 *                         reserved.
 * Copyright (c) 2015-2016 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * Copyright (c) 2017      IBM Corporation. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "mpi.h"
#include "opal/util/bit_ops.h"
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"

/**
 * A quick version of MPI_Sendrecv implemented for the barrier.
 * No actual data is moved across the wire; we use 0-byte messages to
 * signal a two-peer synchronization.
 */
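/*
 * The zero-byte receive is posted first (non-blocking), then the zero-byte
 * send is issued, and finally the receive is waited on.  On error the
 * outstanding receive request is cancelled and completed so that it is not
 * leaked.
 */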
static inline int
ompi_coll_base_sendrecv_zero( int dest, int stag,
                              int source, int rtag,
                              MPI_Comm comm )
{
    int rc, line = 0;
    ompi_request_t *req = MPI_REQUEST_NULL;
    ompi_status_public_t status;

    /* post new irecv */
    rc = MCA_PML_CALL(irecv( NULL, 0, MPI_BYTE, source, rtag,
                             comm, &req ));
    if( MPI_SUCCESS != rc ) { line = __LINE__; goto error_handler; }

    /* send a zero-byte message to the destination */
    rc = MCA_PML_CALL(send( NULL, 0, MPI_BYTE, dest, stag,
                            MCA_PML_BASE_SEND_STANDARD, comm ));
    if( MPI_SUCCESS != rc ) { line = __LINE__; goto error_handler; }

    rc = ompi_request_wait( &req, &status );
    if( MPI_SUCCESS != rc ) { line = __LINE__; goto error_handler; }

    return (MPI_SUCCESS);

 error_handler:
    if( MPI_REQUEST_NULL != req ) {  /* cancel and complete the receive request */
        (void)ompi_request_cancel(req);
        (void)ompi_request_wait(&req, &status);
    }

    OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred\n",
                  __FILE__, line, rc));
    (void)line;  // silence compiler warning
    return rc;
}

/*
 * Barrier is meant to be a synchronous operation.  Because some BTLs can
 * mark a request as complete before it is actually passed to the NIC, and
 * progress might not be made elsewhere, we cannot allow a process to exit
 * the barrier until its last [round of] sends is completed.
 *
 * It is the last round of sends rather than the 'last' individual send,
 * as each pair of peers can use different channels/devices/BTLs and the
 * receiver of one of these sends might be forced to wait if the sender
 * leaves the collective and does not make progress until the next MPI call.
 */
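/*
 * In the implementations below this guarantee shows up either as an
 * explicit MCA_PML_BASE_SEND_SYNCHRONOUS on the last send (double ring,
 * the extra step of recursive doubling) or as a final
 * ompi_coll_base_sendrecv_zero, which only returns once the matching
 * zero-byte receive has completed.
 */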

/*
 * Simple double ring version of barrier.
 *
 * The synchronous guarantee is provided by making the last ring of sends
 * synchronous.
 */
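/*
 * Message pattern (ranks arranged in a ring):
 *   Pass 1: every rank except 0 first receives a zero-byte message from
 *           its left neighbor, then every rank sends to its right
 *           neighbor; rank 0 finally receives from rank size-1, at which
 *           point it knows all ranks have entered the barrier.
 *   Pass 2: the same token travels around the ring a second time so that
 *           every rank learns that everyone has arrived; the sends of
 *           this pass are synchronous.
 */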
int ompi_coll_base_barrier_intra_doublering(struct ompi_communicator_t *comm,
                                             mca_coll_base_module_t *module)
{
    int rank, size, err = 0, line = 0, left, right;

    size = ompi_comm_size(comm);
    if( 1 == size )
        return OMPI_SUCCESS;
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"ompi_coll_base_barrier_intra_doublering rank %d", rank));

    left = ((rank-1+size)%size);
    right = ((rank+1)%size);

    if (rank > 0) { /* receive message from the left */
        err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
                                MCA_COLL_BASE_TAG_BARRIER, comm,
                                MPI_STATUS_IGNORE));
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
    }

    /* Send message to the right */
    err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right,
                            MCA_COLL_BASE_TAG_BARRIER,
                            MCA_PML_BASE_SEND_STANDARD, comm));
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }

    /* root needs to receive from the last node */
    if (rank == 0) {
        err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
                                MCA_COLL_BASE_TAG_BARRIER, comm,
                                MPI_STATUS_IGNORE));
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
    }

    /* Allow nodes to exit */
    if (rank > 0) { /* post receive from the left */
        err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
                                MCA_COLL_BASE_TAG_BARRIER, comm,
                                MPI_STATUS_IGNORE));
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
    }

    /* Send message to the right; synchronous this time, per the note above */
    err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right,
                            MCA_COLL_BASE_TAG_BARRIER,
                            MCA_PML_BASE_SEND_SYNCHRONOUS, comm));
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }

    /* rank 0 posts a receive from the last node */
    if (rank == 0) {
        err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
                                MCA_COLL_BASE_TAG_BARRIER, comm,
                                MPI_STATUS_IGNORE));
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }
    }

    return MPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line;  // silence compiler warning
    return err;
}


/*
 * To make the operation synchronous, sync sends and sync sendrecvs are used.
 */
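/*
 * Outline, with adjsize the largest power of two <= size:
 *   1. If size is not a power of two, each of the top (size - adjsize)
 *      ranks does a zero-byte sendrecv with its partner (rank - adjsize).
 *      The receive half of that sendrecv is only matched by the release
 *      send in step 3, so these ranks effectively sit out the exchange.
 *   2. The adjsize remaining ranks perform log2(adjsize) rounds of
 *      pairwise zero-byte exchanges with partner = rank ^ mask.
 *   3. If size is not a power of two, the low-ranked partners release the
 *      top ranks with a synchronous zero-byte send.
 */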

int ompi_coll_base_barrier_intra_recursivedoubling(struct ompi_communicator_t *comm,
                                                    mca_coll_base_module_t *module)
{
    int rank, size, adjsize, err, line, mask, remote;

    size = ompi_comm_size(comm);
    if( 1 == size )
        return OMPI_SUCCESS;
    rank = ompi_comm_rank(comm);
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_barrier_intra_recursivedoubling rank %d",
                 rank));

    /* compute the largest power of two less than or equal to size */
    adjsize = opal_next_poweroftwo(size);
    adjsize >>= 1;

    /* if size is not an exact power of two, perform an extra step */
    if (adjsize != size) {
        if (rank >= adjsize) {
            /* exchange zero-byte messages with the lower-ranked partner */
            remote = rank - adjsize;
            err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
                                               remote, MCA_COLL_BASE_TAG_BARRIER,
                                               comm);
            if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}

        } else if (rank < (size - adjsize)) {

            /* receive message from the higher-ranked partner */
            err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, rank+adjsize,
                                    MCA_COLL_BASE_TAG_BARRIER, comm,
                                    MPI_STATUS_IGNORE));

            if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
        }
    }

    /* exchange messages */
    if ( rank < adjsize ) {
        mask = 0x1;
        while ( mask < adjsize ) {
            remote = rank ^ mask;
            mask <<= 1;
            if (remote >= adjsize) continue;

            /* pairwise zero-byte exchange with the remote node */
            err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
                                               remote, MCA_COLL_BASE_TAG_BARRIER,
                                               comm);
            if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
        }
    }

    /* non-power of 2 case */
    if (adjsize != size) {
        if (rank < (size - adjsize)) {
            /* send release message to the higher-ranked node */
            remote = rank + adjsize;
            err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, remote,
                                    MCA_COLL_BASE_TAG_BARRIER,
                                    MCA_PML_BASE_SEND_SYNCHRONOUS, comm));

            if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
        }
    }

    return MPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line;  // silence compiler warning
    return err;
}


/*
 * To make the operation synchronous, sync sends and sync sendrecvs are used.
 */
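/*
 * Outline (dissemination pattern): ceil(log2(size)) rounds; in round k
 * each rank sends a zero-byte message to (rank + 2^k) % size and receives
 * one from (rank - 2^k + size) % size.  After the last round every rank
 * has, directly or transitively, heard from every other rank.
 */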

int ompi_coll_base_barrier_intra_bruck(struct ompi_communicator_t *comm,
                                        mca_coll_base_module_t *module)
{
    int rank, size, distance, to, from, err, line = 0;

    size = ompi_comm_size(comm);
    if( 1 == size )
        return MPI_SUCCESS;
    rank = ompi_comm_rank(comm);
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_barrier_intra_bruck rank %d", rank));

    /* exchange data with rank-2^k and rank+2^k */
    for (distance = 1; distance < size; distance <<= 1) {
        from = (rank + size - distance) % size;
        to   = (rank + distance) % size;

        /* send to rank+distance and receive from rank-distance */
        err = ompi_coll_base_sendrecv_zero(to, MCA_COLL_BASE_TAG_BARRIER,
                                           from, MCA_COLL_BASE_TAG_BARRIER,
                                           comm);
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
    }

    return MPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line;  // silence compiler warning
    return err;
}


/*
 * To make the operation synchronous, sync sends and sync sendrecvs are used.
 */
/* special case for two processes */
int ompi_coll_base_barrier_intra_two_procs(struct ompi_communicator_t *comm,
                                            mca_coll_base_module_t *module)
{
    int remote, size, err;

    size = ompi_comm_size(comm);
    if( 1 == size )
        return MPI_SUCCESS;
    if( 2 != size ) {
        return MPI_ERR_UNSUPPORTED_OPERATION;
    }

    remote = ompi_comm_rank(comm);
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_barrier_intra_two_procs rank %d", remote));

    remote = (remote + 1) & 0x1;

    err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
                                       remote, MCA_COLL_BASE_TAG_BARRIER,
                                       comm);
    return (err);
}


/*
 * Linear functions are copied from the BASIC coll module.  They do not
 * segment the message and are simple implementations, but for some small
 * numbers of nodes and/or small data sizes they are just as fast as
 * base/tree-based segmenting operations, and as such may be selected by
 * the decision functions.  These are copied into this module due to the
 * way we select modules in V1, i.e. in V2 we will handle this differently
 * and so will not have to duplicate code.
 * GEF Oct05 after asking Jeff.
 */

/* copied function (with appropriate renaming) starts here */

int ompi_coll_base_barrier_intra_basic_linear(struct ompi_communicator_t *comm,
                                              mca_coll_base_module_t *module)
{
    int i, err, rank, size, line;
    ompi_request_t** requests = NULL;

    size = ompi_comm_size(comm);
    if( 1 == size )
        return MPI_SUCCESS;
    rank = ompi_comm_rank(comm);

    /* All non-root ranks send & receive a zero-length message. */
    if (rank > 0) {
        err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, 0,
                                 MCA_COLL_BASE_TAG_BARRIER,
                                 MCA_PML_BASE_SEND_STANDARD, comm));
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

        err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, 0,
                                 MCA_COLL_BASE_TAG_BARRIER,
                                 comm, MPI_STATUS_IGNORE));
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    /* The root collects and broadcasts the messages. */

    else {
        requests = ompi_coll_base_comm_get_reqs(module->base_data, size);
        if( NULL == requests ) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; }

        for (i = 1; i < size; ++i) {
            err = MCA_PML_CALL(irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
                                     MCA_COLL_BASE_TAG_BARRIER, comm,
                                     &(requests[i])));
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        }
        err = ompi_request_wait_all( size-1, requests+1, MPI_STATUSES_IGNORE );
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        requests = NULL;  /* we're done; the requests array is clean */

        for (i = 1; i < size; ++i) {
            err = MCA_PML_CALL(send(NULL, 0, MPI_BYTE, i,
                                    MCA_COLL_BASE_TAG_BARRIER,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        }
    }

    /* All done */
    return MPI_SUCCESS;
 err_hndl:
    if( NULL != requests ) {
        /* find a real error code */
        if (MPI_ERR_IN_STATUS == err) {
            for( i = 0; i < size; i++ ) {
                if (MPI_REQUEST_NULL == requests[i]) continue;
                if (MPI_ERR_PENDING == requests[i]->req_status.MPI_ERROR) continue;
                err = requests[i]->req_status.MPI_ERROR;
                break;
            }
        }
        ompi_coll_base_free_reqs(requests, size);
    }
    OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
                  __FILE__, line, err, rank) );
    (void)line;  // silence compiler warning
    return err;
}
/* copied function (with appropriate renaming) ends here */

/*
 * Another recursive doubling type algorithm, but in this case
 * we go up the tree and back down the tree.
 */
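/*
 * Outline: a fan-in phase followed by a fan-out phase over partners given
 * by rank ^ jump.  Going up, a rank receives from its higher-numbered
 * partners and then reports to a lower-numbered one; coming back down,
 * the lower-numbered ranks release the higher-numbered ones.
 */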
int ompi_coll_base_barrier_intra_tree(struct ompi_communicator_t *comm,
                                       mca_coll_base_module_t *module)
{
    int rank, size, depth, err, jump, partner;

    size = ompi_comm_size(comm);
    if( 1 == size )
        return MPI_SUCCESS;
    rank = ompi_comm_rank(comm);
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_barrier_intra_tree %d",
                 rank));

    /* Find the smallest power of two greater than or equal to the communicator size. */
    depth = opal_next_poweroftwo_inclusive(size);

    for (jump=1; jump<depth; jump<<=1) {
        partner = rank ^ jump;
        if (!(partner & (jump-1)) && partner < size) {
            if (partner > rank) {
                err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner,
                                         MCA_COLL_BASE_TAG_BARRIER, comm,
                                         MPI_STATUS_IGNORE));
                if (MPI_SUCCESS != err)
                    return err;
            } else if (partner < rank) {
                err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, partner,
                                         MCA_COLL_BASE_TAG_BARRIER,
                                         MCA_PML_BASE_SEND_STANDARD, comm));
                if (MPI_SUCCESS != err)
                    return err;
            }
        }
    }

    depth >>= 1;
    for (jump = depth; jump>0; jump>>=1) {
        partner = rank ^ jump;
        if (!(partner & (jump-1)) && partner < size) {
            if (partner > rank) {
                err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, partner,
                                         MCA_COLL_BASE_TAG_BARRIER,
                                         MCA_PML_BASE_SEND_STANDARD, comm));
                if (MPI_SUCCESS != err)
                    return err;
            } else if (partner < rank) {
                err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner,
                                         MCA_COLL_BASE_TAG_BARRIER, comm,
                                         MPI_STATUS_IGNORE));
                if (MPI_SUCCESS != err)
                    return err;
            }
        }
    }

    return MPI_SUCCESS;
}