1 /*
2 * Copyright (C) by Argonne National Laboratory
3 * See COPYRIGHT in top-level directory
4 */
5
6 #include "mpiimpl.h"
7
8 /* FIXME This function uses some heuristsics based off of some testing on a
9 * cluster at Argonne. We need a better system for detrmining and controlling
10 * the cutoff points for these algorithms. If I've done this right, you should
11 * be able to make changes along these lines almost exclusively in this function
12 * and some new functions. [goodell@ 2008/01/07] */
MPIR_Bcast_intra_smp(void * buffer,int count,MPI_Datatype datatype,int root,MPIR_Comm * comm_ptr,MPIR_Errflag_t * errflag)13 int MPIR_Bcast_intra_smp(void *buffer, int count, MPI_Datatype datatype, int root,
14 MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
15 {
16 int mpi_errno = MPI_SUCCESS;
17 int mpi_errno_ret = MPI_SUCCESS;
18 MPI_Aint type_size, nbytes = 0;
19 MPI_Status *status_p;
20 #ifdef HAVE_ERROR_CHECKING
21 MPI_Status status;
22 status_p = &status;
23 MPI_Aint recvd_size;
24 #else
25 status_p = MPI_STATUS_IGNORE;
26 #endif
27
28 #ifdef HAVE_ERROR_CHECKING
29 MPIR_Assert(MPIR_Comm_is_parent_comm(comm_ptr));
30 #endif
31
32 MPIR_Datatype_get_size_macro(datatype, type_size);
33
34 nbytes = type_size * count;
35 if (nbytes == 0)
36 goto fn_exit; /* nothing to do */
37
38 if ((nbytes < MPIR_CVAR_BCAST_SHORT_MSG_SIZE) ||
39 (comm_ptr->local_size < MPIR_CVAR_BCAST_MIN_PROCS)) {
40 /* send to intranode-rank 0 on the root's node */
41 if (comm_ptr->node_comm != NULL && MPIR_Get_intranode_rank(comm_ptr, root) > 0) { /* is not the node root (0) and is on our node (!-1) */
42 if (root == comm_ptr->rank) {
43 mpi_errno = MPIC_Send(buffer, count, datatype, 0,
44 MPIR_BCAST_TAG, comm_ptr->node_comm, errflag);
45 if (mpi_errno) {
46 /* for communication errors, just record the error but continue */
47 *errflag =
48 MPIX_ERR_PROC_FAILED ==
49 MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
50 MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
51 MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
52 }
53 } else if (0 == comm_ptr->node_comm->rank) {
54 mpi_errno =
55 MPIC_Recv(buffer, count, datatype, MPIR_Get_intranode_rank(comm_ptr, root),
56 MPIR_BCAST_TAG, comm_ptr->node_comm, status_p, errflag);
57 if (mpi_errno) {
58 /* for communication errors, just record the error but continue */
59 *errflag =
60 MPIX_ERR_PROC_FAILED ==
61 MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
62 MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
63 MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
64 }
65 #ifdef HAVE_ERROR_CHECKING
66 /* check that we received as much as we expected */
67 MPIR_Get_count_impl(status_p, MPI_BYTE, &recvd_size);
68 if (recvd_size != nbytes) {
69 if (*errflag == MPIR_ERR_NONE)
70 *errflag = MPIR_ERR_OTHER;
71 MPIR_ERR_SET2(mpi_errno, MPI_ERR_OTHER,
72 "**collective_size_mismatch",
73 "**collective_size_mismatch %d %d", recvd_size, nbytes);
74 MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
75 }
76 #endif
77 }
78
79 }
80
81 /* perform the internode broadcast */
82 if (comm_ptr->node_roots_comm != NULL) {
83 mpi_errno = MPIR_Bcast(buffer, count, datatype,
84 MPIR_Get_internode_rank(comm_ptr, root),
85 comm_ptr->node_roots_comm, errflag);
86 if (mpi_errno) {
87 /* for communication errors, just record the error but continue */
88 *errflag =
89 MPIX_ERR_PROC_FAILED ==
90 MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
91 MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
92 MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
93 }
94 }
95
96 /* perform the intranode broadcast on all except for the root's node */
97 if (comm_ptr->node_comm != NULL) {
98 mpi_errno = MPIR_Bcast(buffer, count, datatype, 0, comm_ptr->node_comm, errflag);
99 if (mpi_errno) {
100 /* for communication errors, just record the error but continue */
101 *errflag =
102 MPIX_ERR_PROC_FAILED ==
103 MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
104 MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
105 MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
106 }
107 }
108 } else { /* (nbytes > MPIR_CVAR_BCAST_SHORT_MSG_SIZE) && (comm_ptr->size >= MPIR_CVAR_BCAST_MIN_PROCS) */
109
110 /* supposedly...
111 * smp+doubling good for pof2
112 * reg+ring better for non-pof2 */
113 if (nbytes < MPIR_CVAR_BCAST_LONG_MSG_SIZE && MPL_is_pof2(comm_ptr->local_size, NULL)) {
114 /* medium-sized msg and pof2 np */
115
116 /* perform the intranode broadcast on the root's node */
117 if (comm_ptr->node_comm != NULL && MPIR_Get_intranode_rank(comm_ptr, root) > 0) { /* is not the node root (0) and is on our node (!-1) */
118 /* FIXME binomial may not be the best algorithm for on-node
119 * bcast. We need a more comprehensive system for selecting the
120 * right algorithms here. */
121 mpi_errno = MPIR_Bcast(buffer, count, datatype,
122 MPIR_Get_intranode_rank(comm_ptr, root),
123 comm_ptr->node_comm, errflag);
124 if (mpi_errno) {
125 /* for communication errors, just record the error but continue */
126 *errflag =
127 MPIX_ERR_PROC_FAILED ==
128 MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
129 MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
130 MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
131 }
132 }
133
134 /* perform the internode broadcast */
135 if (comm_ptr->node_roots_comm != NULL) {
136 mpi_errno = MPIR_Bcast(buffer, count, datatype,
137 MPIR_Get_internode_rank(comm_ptr, root),
138 comm_ptr->node_roots_comm, errflag);
139 if (mpi_errno) {
140 /* for communication errors, just record the error but continue */
141 *errflag =
142 MPIX_ERR_PROC_FAILED ==
143 MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
144 MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
145 MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
146 }
147 }
148
149 /* perform the intranode broadcast on all except for the root's node */
150 if (comm_ptr->node_comm != NULL && MPIR_Get_intranode_rank(comm_ptr, root) <= 0) { /* 0 if root was local root too, -1 if different node than root */
151 /* FIXME binomial may not be the best algorithm for on-node
152 * bcast. We need a more comprehensive system for selecting the
153 * right algorithms here. */
154 mpi_errno = MPIR_Bcast(buffer, count, datatype, 0, comm_ptr->node_comm, errflag);
155 if (mpi_errno) {
156 /* for communication errors, just record the error but continue */
157 *errflag =
158 MPIX_ERR_PROC_FAILED ==
159 MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
160 MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
161 MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
162 }
163 }
164 } else { /* large msg or non-pof2 */
165
166 /* FIXME It would be good to have an SMP-aware version of this
167 * algorithm that (at least approximately) minimized internode
168 * communication. */
169 mpi_errno =
170 MPIR_Bcast_intra_scatter_ring_allgather(buffer, count, datatype, root, comm_ptr,
171 errflag);
172 if (mpi_errno) {
173 /* for communication errors, just record the error but continue */
174 *errflag =
175 MPIX_ERR_PROC_FAILED ==
176 MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
177 MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
178 MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
179 }
180 }
181 }
182
183 fn_exit:
184 /* --BEGIN ERROR HANDLING-- */
185 if (mpi_errno_ret)
186 mpi_errno = mpi_errno_ret;
187 else if (*errflag != MPIR_ERR_NONE)
188 MPIR_ERR_SET(mpi_errno, *errflag, "**coll_fail");
189 /* --END ERROR HANDLING-- */
190 return mpi_errno;
191 }
192