1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  */
5 
6 #include "mpiimpl.h"
7 
8 /* FIXME This function uses some heuristsics based off of some testing on a
9  * cluster at Argonne.  We need a better system for detrmining and controlling
10  * the cutoff points for these algorithms.  If I've done this right, you should
11  * be able to make changes along these lines almost exclusively in this function
12  * and some new functions. [goodell@ 2008/01/07] */
MPIR_Bcast_intra_smp(void * buffer,int count,MPI_Datatype datatype,int root,MPIR_Comm * comm_ptr,MPIR_Errflag_t * errflag)13 int MPIR_Bcast_intra_smp(void *buffer, int count, MPI_Datatype datatype, int root,
14                          MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
15 {
16     int mpi_errno = MPI_SUCCESS;
17     int mpi_errno_ret = MPI_SUCCESS;
18     MPI_Aint type_size, nbytes = 0;
19     MPI_Status *status_p;
20 #ifdef HAVE_ERROR_CHECKING
21     MPI_Status status;
22     status_p = &status;
23     MPI_Aint recvd_size;
24 #else
25     status_p = MPI_STATUS_IGNORE;
26 #endif
27 
28 #ifdef HAVE_ERROR_CHECKING
29     MPIR_Assert(MPIR_Comm_is_parent_comm(comm_ptr));
30 #endif
31 
32     MPIR_Datatype_get_size_macro(datatype, type_size);
33 
34     nbytes = type_size * count;
35     if (nbytes == 0)
36         goto fn_exit;   /* nothing to do */
37 
38     if ((nbytes < MPIR_CVAR_BCAST_SHORT_MSG_SIZE) ||
39         (comm_ptr->local_size < MPIR_CVAR_BCAST_MIN_PROCS)) {
40         /* send to intranode-rank 0 on the root's node */
41         if (comm_ptr->node_comm != NULL && MPIR_Get_intranode_rank(comm_ptr, root) > 0) {       /* is not the node root (0) and is on our node (!-1) */
42             if (root == comm_ptr->rank) {
43                 mpi_errno = MPIC_Send(buffer, count, datatype, 0,
44                                       MPIR_BCAST_TAG, comm_ptr->node_comm, errflag);
45                 if (mpi_errno) {
46                     /* for communication errors, just record the error but continue */
47                     *errflag =
48                         MPIX_ERR_PROC_FAILED ==
49                         MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
50                     MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
51                     MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
52                 }
53             } else if (0 == comm_ptr->node_comm->rank) {
54                 mpi_errno =
55                     MPIC_Recv(buffer, count, datatype, MPIR_Get_intranode_rank(comm_ptr, root),
56                               MPIR_BCAST_TAG, comm_ptr->node_comm, status_p, errflag);
57                 if (mpi_errno) {
58                     /* for communication errors, just record the error but continue */
59                     *errflag =
60                         MPIX_ERR_PROC_FAILED ==
61                         MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
62                     MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
63                     MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
64                 }
65 #ifdef HAVE_ERROR_CHECKING
66                 /* check that we received as much as we expected */
67                 MPIR_Get_count_impl(status_p, MPI_BYTE, &recvd_size);
68                 if (recvd_size != nbytes) {
69                     if (*errflag == MPIR_ERR_NONE)
70                         *errflag = MPIR_ERR_OTHER;
71                     MPIR_ERR_SET2(mpi_errno, MPI_ERR_OTHER,
72                                   "**collective_size_mismatch",
73                                   "**collective_size_mismatch %d %d", recvd_size, nbytes);
74                     MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
75                 }
76 #endif
77             }
78 
79         }
80 
81         /* perform the internode broadcast */
82         if (comm_ptr->node_roots_comm != NULL) {
83             mpi_errno = MPIR_Bcast(buffer, count, datatype,
84                                    MPIR_Get_internode_rank(comm_ptr, root),
85                                    comm_ptr->node_roots_comm, errflag);
86             if (mpi_errno) {
87                 /* for communication errors, just record the error but continue */
88                 *errflag =
89                     MPIX_ERR_PROC_FAILED ==
90                     MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
91                 MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
92                 MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
93             }
94         }
95 
96         /* perform the intranode broadcast on all except for the root's node */
97         if (comm_ptr->node_comm != NULL) {
98             mpi_errno = MPIR_Bcast(buffer, count, datatype, 0, comm_ptr->node_comm, errflag);
99             if (mpi_errno) {
100                 /* for communication errors, just record the error but continue */
101                 *errflag =
102                     MPIX_ERR_PROC_FAILED ==
103                     MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
104                 MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
105                 MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
106             }
107         }
108     } else {    /* (nbytes > MPIR_CVAR_BCAST_SHORT_MSG_SIZE) && (comm_ptr->size >= MPIR_CVAR_BCAST_MIN_PROCS) */
109 
110         /* supposedly...
111          * smp+doubling good for pof2
112          * reg+ring better for non-pof2 */
113         if (nbytes < MPIR_CVAR_BCAST_LONG_MSG_SIZE && MPL_is_pof2(comm_ptr->local_size, NULL)) {
114             /* medium-sized msg and pof2 np */
115 
116             /* perform the intranode broadcast on the root's node */
117             if (comm_ptr->node_comm != NULL && MPIR_Get_intranode_rank(comm_ptr, root) > 0) {   /* is not the node root (0) and is on our node (!-1) */
118                 /* FIXME binomial may not be the best algorithm for on-node
119                  * bcast.  We need a more comprehensive system for selecting the
120                  * right algorithms here. */
121                 mpi_errno = MPIR_Bcast(buffer, count, datatype,
122                                        MPIR_Get_intranode_rank(comm_ptr, root),
123                                        comm_ptr->node_comm, errflag);
124                 if (mpi_errno) {
125                     /* for communication errors, just record the error but continue */
126                     *errflag =
127                         MPIX_ERR_PROC_FAILED ==
128                         MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
129                     MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
130                     MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
131                 }
132             }
133 
134             /* perform the internode broadcast */
135             if (comm_ptr->node_roots_comm != NULL) {
136                 mpi_errno = MPIR_Bcast(buffer, count, datatype,
137                                        MPIR_Get_internode_rank(comm_ptr, root),
138                                        comm_ptr->node_roots_comm, errflag);
139                 if (mpi_errno) {
140                     /* for communication errors, just record the error but continue */
141                     *errflag =
142                         MPIX_ERR_PROC_FAILED ==
143                         MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
144                     MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
145                     MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
146                 }
147             }
148 
149             /* perform the intranode broadcast on all except for the root's node */
150             if (comm_ptr->node_comm != NULL && MPIR_Get_intranode_rank(comm_ptr, root) <= 0) {  /* 0 if root was local root too, -1 if different node than root */
151                 /* FIXME binomial may not be the best algorithm for on-node
152                  * bcast.  We need a more comprehensive system for selecting the
153                  * right algorithms here. */
154                 mpi_errno = MPIR_Bcast(buffer, count, datatype, 0, comm_ptr->node_comm, errflag);
155                 if (mpi_errno) {
156                     /* for communication errors, just record the error but continue */
157                     *errflag =
158                         MPIX_ERR_PROC_FAILED ==
159                         MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
160                     MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
161                     MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
162                 }
163             }
164         } else {        /* large msg or non-pof2 */
165 
166             /* FIXME It would be good to have an SMP-aware version of this
167              * algorithm that (at least approximately) minimized internode
168              * communication. */
169             mpi_errno =
170                 MPIR_Bcast_intra_scatter_ring_allgather(buffer, count, datatype, root, comm_ptr,
171                                                         errflag);
172             if (mpi_errno) {
173                 /* for communication errors, just record the error but continue */
174                 *errflag =
175                     MPIX_ERR_PROC_FAILED ==
176                     MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
177                 MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
178                 MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
179             }
180         }
181     }
182 
183   fn_exit:
184     /* --BEGIN ERROR HANDLING-- */
185     if (mpi_errno_ret)
186         mpi_errno = mpi_errno_ret;
187     else if (*errflag != MPIR_ERR_NONE)
188         MPIR_ERR_SET(mpi_errno, *errflag, "**coll_fail");
189     /* --END ERROR HANDLING-- */
190     return mpi_errno;
191 }
192