1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  */
5 
6 #include "mpiimpl.h"
7 
8 /* This function implements a binomial tree reduce.
9 
10    Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma
11  */
MPIR_Reduce_intra_binomial(const void * sendbuf,void * recvbuf,int count,MPI_Datatype datatype,MPI_Op op,int root,MPIR_Comm * comm_ptr,MPIR_Errflag_t * errflag)12 int MPIR_Reduce_intra_binomial(const void *sendbuf,
13                                void *recvbuf,
14                                int count,
15                                MPI_Datatype datatype,
16                                MPI_Op op, int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
17 {
18     int mpi_errno = MPI_SUCCESS;
19     int mpi_errno_ret = MPI_SUCCESS;
20     MPI_Status status;
21     int comm_size, rank, is_commutative, type_size ATTRIBUTE((unused));
22     int mask, relrank, source, lroot;
23     MPI_Aint true_lb, true_extent, extent;
24     void *tmp_buf;
25     MPIR_CHKLMEM_DECL(2);
26 
27     if (count == 0)
28         return MPI_SUCCESS;
29 
30     comm_size = comm_ptr->local_size;
31     rank = comm_ptr->rank;
32 
33     /* Create a temporary buffer */
34 
35     MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent);
36     MPIR_Datatype_get_extent_macro(datatype, extent);
37 
38     is_commutative = MPIR_Op_is_commutative(op);
39 
40     MPIR_CHKLMEM_MALLOC(tmp_buf, void *, count * (MPL_MAX(extent, true_extent)),
41                         mpi_errno, "temporary buffer", MPL_MEM_BUFFER);
42     /* adjust for potential negative lower bound in datatype */
43     tmp_buf = (void *) ((char *) tmp_buf - true_lb);
44 
45     /* If I'm not the root, then my recvbuf may not be valid, therefore
46      * I have to allocate a temporary one */
47     if (rank != root) {
48         MPIR_CHKLMEM_MALLOC(recvbuf, void *,
49                             count * (MPL_MAX(extent, true_extent)),
50                             mpi_errno, "receive buffer", MPL_MEM_BUFFER);
51         recvbuf = (void *) ((char *) recvbuf - true_lb);
52     }
53 
54     if ((rank != root) || (sendbuf != MPI_IN_PLACE)) {
55         mpi_errno = MPIR_Localcopy(sendbuf, count, datatype, recvbuf, count, datatype);
56         MPIR_ERR_CHECK(mpi_errno);
57     }
58 
59     MPIR_Datatype_get_size_macro(datatype, type_size);
60 
61     /* This code is from MPICH-1. */
62 
63     /* Here's the algorithm.  Relative to the root, look at the bit pattern in
64      * my rank.  Starting from the right (lsb), if the bit is 1, send to
65      * the node with that bit zero and exit; if the bit is 0, receive from the
66      * node with that bit set and combine (as long as that node is within the
67      * group)
68      *
69      * Note that by receiving with source selection, we guarantee that we get
70      * the same bits with the same input.  If we allowed the parent to receive
71      * the children in any order, then timing differences could cause different
72      * results (roundoff error, over/underflows in some cases, etc).
73      *
74      * Because of the way these are ordered, if root is 0, then this is correct
75      * for both commutative and non-commutitive operations.  If root is not
76      * 0, then for non-commutitive, we use a root of zero and then send
77      * the result to the root.  To see this, note that the ordering is
78      * mask = 1: (ab)(cd)(ef)(gh)            (odds send to evens)
79      * mask = 2: ((ab)(cd))((ef)(gh))        (3,6 send to 0,4)
80      * mask = 4: (((ab)(cd))((ef)(gh)))      (4 sends to 0)
81      *
82      * Comments on buffering.
83      * If the datatype is not contiguous, we still need to pass contiguous
84      * data to the user routine.
85      * In this case, we should make a copy of the data in some format,
86      * and send/operate on that.
87      *
88      * In general, we can't use MPI_PACK, because the alignment of that
89      * is rather vague, and the data may not be re-usable.  What we actually
90      * need is a "squeeze" operation that removes the skips.
91      */
92     mask = 0x1;
93     if (is_commutative)
94         lroot = root;
95     else
96         lroot = 0;
97     relrank = (rank - lroot + comm_size) % comm_size;
98 
99     while (/*(mask & relrank) == 0 && */ mask < comm_size) {
100         /* Receive */
101         if ((mask & relrank) == 0) {
102             source = (relrank | mask);
103             if (source < comm_size) {
104                 source = (source + lroot) % comm_size;
105                 mpi_errno = MPIC_Recv(tmp_buf, count, datatype, source,
106                                       MPIR_REDUCE_TAG, comm_ptr, &status, errflag);
107                 if (mpi_errno) {
108                     /* for communication errors, just record the error but continue */
109                     *errflag =
110                         MPIX_ERR_PROC_FAILED ==
111                         MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
112                     MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
113                     MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
114                 }
115 
116                 /* The sender is above us, so the received buffer must be
117                  * the second argument (in the noncommutative case). */
118                 if (is_commutative) {
119                     mpi_errno = MPIR_Reduce_local(tmp_buf, recvbuf, count, datatype, op);
120                     MPIR_ERR_CHECK(mpi_errno);
121                 } else {
122                     mpi_errno = MPIR_Reduce_local(recvbuf, tmp_buf, count, datatype, op);
123                     MPIR_ERR_CHECK(mpi_errno);
124 
125                     mpi_errno = MPIR_Localcopy(tmp_buf, count, datatype, recvbuf, count, datatype);
126                     MPIR_ERR_CHECK(mpi_errno);
127                 }
128             }
129         } else {
130             /* I've received all that I'm going to.  Send my result to
131              * my parent */
132             source = ((relrank & (~mask)) + lroot) % comm_size;
133             mpi_errno = MPIC_Send(recvbuf, count, datatype,
134                                   source, MPIR_REDUCE_TAG, comm_ptr, errflag);
135             if (mpi_errno) {
136                 /* for communication errors, just record the error but continue */
137                 *errflag =
138                     MPIX_ERR_PROC_FAILED ==
139                     MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
140                 MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
141                 MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
142             }
143             break;
144         }
145         mask <<= 1;
146     }
147 
148     if (!is_commutative && (root != 0)) {
149         if (rank == 0) {
150             mpi_errno = MPIC_Send(recvbuf, count, datatype, root,
151                                   MPIR_REDUCE_TAG, comm_ptr, errflag);
152         } else if (rank == root) {
153             mpi_errno = MPIC_Recv(recvbuf, count, datatype, 0,
154                                   MPIR_REDUCE_TAG, comm_ptr, &status, errflag);
155         }
156         if (mpi_errno) {
157             /* for communication errors, just record the error but continue */
158             *errflag =
159                 MPIX_ERR_PROC_FAILED ==
160                 MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
161             MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
162             MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
163         }
164     }
165 
166   fn_exit:
167     MPIR_CHKLMEM_FREEALL();
168     if (mpi_errno_ret)
169         mpi_errno = mpi_errno_ret;
170     else if (*errflag != MPIR_ERR_NONE)
171         MPIR_ERR_SET(mpi_errno, *errflag, "**coll_fail");
172     return mpi_errno;
173   fn_fail:
174     goto fn_exit;
175 }
176