1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
2 /*
3  *  (C) 2010 by Argonne National Laboratory.
4  *      See COPYRIGHT in top-level directory.
5  */
6 
7 #include "mpiimpl.h"
8 #include "collutil.h"
9 
10 /* -- Begin Profiling Symbol Block for routine MPIX_Ireduce_scatter_block */
11 #if defined(HAVE_PRAGMA_WEAK)
12 #pragma weak MPIX_Ireduce_scatter_block = PMPIX_Ireduce_scatter_block
13 #elif defined(HAVE_PRAGMA_HP_SEC_DEF)
14 #pragma _HP_SECONDARY_DEF PMPIX_Ireduce_scatter_block  MPIX_Ireduce_scatter_block
15 #elif defined(HAVE_PRAGMA_CRI_DUP)
16 #pragma _CRI duplicate MPIX_Ireduce_scatter_block as PMPIX_Ireduce_scatter_block
17 #endif
18 /* -- End Profiling Symbol Block */
19 
20 /* Define MPICH_MPI_FROM_PMPI if weak symbols are not supported to build
21    the MPI routines */
22 #ifndef MPICH_MPI_FROM_PMPI
23 #undef MPIX_Ireduce_scatter_block
24 #define MPIX_Ireduce_scatter_block PMPIX_Ireduce_scatter_block
25 
26 /* any non-MPI functions go here, especially non-static ones */
27 
28 /* A recursive halving MPIX_Ireduce_scatter_block algorithm.  Requires that op is
29  * commutative.  Typically yields better performance for shorter messages. */
30 #undef FUNCNAME
31 #define FUNCNAME MPIR_Ireduce_scatter_block_rec_hlv
32 #undef FCNAME
33 #define FCNAME MPIU_QUOTE(FUNCNAME)
/* Appends a recursive-halving reduce_scatter_block to the schedule 's'.
 *
 * sendbuf   - input buffer, or MPI_IN_PLACE (input is then taken from recvbuf)
 * recvbuf   - receives this rank's recvcount-element result block
 * recvcount - per-rank result block size (uniform across all ranks)
 * datatype  - element datatype
 * op        - reduction operation; MUST be commutative (asserted below)
 * comm_ptr  - communicator to run over
 * s         - schedule being built; no data actually moves while this
 *             function executes, it only records operations
 *
 * Returns MPI_SUCCESS or an MPI error code.  Temporary buffers are
 * registered with the schedule (MPIR_SCHED_CHKPMEM_*) so they are freed
 * when the schedule completes or is reaped on failure. */
int MPIR_Ireduce_scatter_block_rec_hlv(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s)
{
    int mpi_errno = MPI_SUCCESS;
    int rank, comm_size, i;
    MPI_Aint extent, true_extent, true_lb;
    int  *disps;
    void *tmp_recvbuf, *tmp_results;
    int type_size ATTRIBUTE((unused)), total_count, dst;
    int mask;
    int *newcnts, *newdisps, rem, newdst, send_idx, recv_idx,
        last_idx, send_cnt, recv_cnt;
    int pof2, old_i, newrank;
    MPIR_SCHED_CHKPMEM_DECL(5);

    comm_size = comm_ptr->local_size;
    rank = comm_ptr->rank;

    MPID_Datatype_get_extent_macro(datatype, extent);
    MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent);

    /* this algorithm reduces blocks out of rank order, so a
       noncommutative op would produce wrong results */
    MPIU_Assert(MPIR_Op_is_commutative(op));

    MPIR_SCHED_CHKPMEM_MALLOC(disps, int *, comm_size * sizeof(int), mpi_errno, "disps");

    /* every rank contributes the same block size, so disps is a simple
       arithmetic progression and total_count == comm_size * recvcount */
    total_count = 0;
    for (i=0; i<comm_size; i++) {
        disps[i] = total_count;
        total_count += recvcount;
    }

    if (total_count == 0) {
        goto fn_exit;
    }

    MPID_Datatype_get_size_macro(datatype, type_size);

    /* allocate temp. buffer to receive incoming data */
    MPIR_SCHED_CHKPMEM_MALLOC(tmp_recvbuf, void *, total_count*(MPIR_MAX(true_extent,extent)), mpi_errno, "tmp_recvbuf");
    /* adjust for potential negative lower bound in datatype */
    tmp_recvbuf = (void *)((char*)tmp_recvbuf - true_lb);

    /* need to allocate another temporary buffer to accumulate
       results because recvbuf may not be big enough */
    MPIR_SCHED_CHKPMEM_MALLOC(tmp_results, void *, total_count*(MPIR_MAX(true_extent,extent)), mpi_errno, "tmp_results");
    /* adjust for potential negative lower bound in datatype */
    tmp_results = (void *)((char*)tmp_results - true_lb);

    /* copy sendbuf into tmp_results */
    if (sendbuf != MPI_IN_PLACE)
        mpi_errno = MPID_Sched_copy(sendbuf, total_count, datatype,
                                    tmp_results, total_count, datatype, s);
    else
        mpi_errno = MPID_Sched_copy(recvbuf, total_count, datatype,
                                    tmp_results, total_count, datatype, s);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    MPID_SCHED_BARRIER(s);

    /* pof2 = largest power of two <= comm_size */
    pof2 = 1;
    while (pof2 <= comm_size) pof2 <<= 1;
    pof2 >>=1;

    rem = comm_size - pof2;

    /* In the non-power-of-two case, all even-numbered
       processes of rank < 2*rem send their data to
       (rank+1). These even-numbered processes no longer
       participate in the algorithm until the very end. The
       remaining processes form a nice power-of-two. */

    if (rank < 2*rem) {
        if (rank % 2 == 0) { /* even */
            mpi_errno = MPID_Sched_send(tmp_results, total_count, datatype, rank+1, comm_ptr, s);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
            MPID_SCHED_BARRIER(s);

            /* temporarily set the rank to -1 so that this
               process does not pariticipate in recursive
               doubling */
            newrank = -1;
        }
        else { /* odd */
            mpi_errno = MPID_Sched_recv(tmp_recvbuf, total_count, datatype, rank-1, comm_ptr, s);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
            MPID_SCHED_BARRIER(s);

            /* do the reduction on received data. since the
               ordering is right, it doesn't matter whether
               the operation is commutative or not. */
            mpi_errno = MPID_Sched_reduce(tmp_recvbuf, tmp_results, total_count, datatype, op, s);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
            MPID_SCHED_BARRIER(s);

            /* change the rank */
            newrank = rank / 2;
        }
    }
    else  /* rank >= 2*rem */
        newrank = rank - rem;

    if (newrank != -1) {
        /* recalculate the recvcnts and disps arrays because the
           even-numbered processes who no longer participate will
           have their result calculated by the process to their
           right (rank+1). */

        MPIR_SCHED_CHKPMEM_MALLOC(newcnts, int *, pof2*sizeof(int), mpi_errno, "newcnts");
        MPIR_SCHED_CHKPMEM_MALLOC(newdisps, int *, pof2*sizeof(int), mpi_errno, "newdisps");

        for (i = 0; i < pof2; i++) {
            /* what does i map to in the old ranking? */
            old_i = (i < rem) ? i*2 + 1 : i + rem;
            if (old_i < 2*rem) {
                /* This process has to also do its left neighbor's
                   work */
                newcnts[i] = 2 * recvcount;
            }
            else
                newcnts[i] = recvcount;
        }

        newdisps[0] = 0;
        for (i=1; i<pof2; i++)
            newdisps[i] = newdisps[i-1] + newcnts[i-1];

        /* recursive halving: at each step exchange half of the currently
           active range [recv_idx, last_idx) with partner newrank^mask */
        mask = pof2 >> 1;
        send_idx = recv_idx = 0;
        last_idx = pof2;
        while (mask > 0) {
            newdst = newrank ^ mask;
            /* find real rank of dest */
            dst = (newdst < rem) ? newdst*2 + 1 : newdst + rem;

            send_cnt = recv_cnt = 0;
            if (newrank < newdst) {
                send_idx = recv_idx + mask;
                for (i=send_idx; i<last_idx; i++)
                    send_cnt += newcnts[i];
                for (i=recv_idx; i<send_idx; i++)
                    recv_cnt += newcnts[i];
            }
            else {
                recv_idx = send_idx + mask;
                for (i=send_idx; i<recv_idx; i++)
                    send_cnt += newcnts[i];
                for (i=recv_idx; i<last_idx; i++)
                    recv_cnt += newcnts[i];
            }

            /* Send data from tmp_results. Recv into tmp_recvbuf */
            {
                /* avoid sending and receiving pointless 0-byte messages */
                int send_dst = (send_cnt ? dst : MPI_PROC_NULL);
                int recv_dst = (recv_cnt ? dst : MPI_PROC_NULL);

                mpi_errno = MPID_Sched_send(((char *)tmp_results + newdisps[send_idx]*extent),
                                            send_cnt, datatype, send_dst, comm_ptr, s);
                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
                mpi_errno = MPID_Sched_recv(((char *) tmp_recvbuf + newdisps[recv_idx]*extent),
                                            recv_cnt, datatype, recv_dst, comm_ptr, s);
                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
                MPID_SCHED_BARRIER(s);
            }

            /* tmp_recvbuf contains data received in this step.
               tmp_results contains data accumulated so far */
            if (recv_cnt) {
                mpi_errno = MPID_Sched_reduce(((char *)tmp_recvbuf + newdisps[recv_idx]*extent),
                                              ((char *)tmp_results + newdisps[recv_idx]*extent),
                                              recv_cnt, datatype, op, s);
                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
                MPID_SCHED_BARRIER(s);
            }

            /* update send_idx for next iteration */
            send_idx = recv_idx;
            last_idx = recv_idx + mask;
            mask >>= 1;
        }

        /* copy this process's result from tmp_results to recvbuf */
        mpi_errno = MPID_Sched_copy(((char *)tmp_results + disps[rank]*extent),
                                    recvcount, datatype,
                                    recvbuf, recvcount, datatype, s);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        MPID_SCHED_BARRIER(s);

    }

    /* In the non-power-of-two case, all odd-numbered
       processes of rank < 2*rem send to (rank-1) the result they
       calculated for that process */
    if (rank < 2*rem) {
        if (rank % 2) { /* odd */
            mpi_errno = MPID_Sched_send(((char *)tmp_results + disps[rank-1]*extent),
                                        recvcount, datatype, rank-1, comm_ptr, s);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
            MPID_SCHED_BARRIER(s);
        }
        else  {   /* even */
            mpi_errno = MPID_Sched_recv(recvbuf, recvcount, datatype, rank+1, comm_ptr, s);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
            MPID_SCHED_BARRIER(s);
        }
    }


    MPIR_SCHED_CHKPMEM_COMMIT(s);
fn_exit:
    return mpi_errno;
fn_fail:
    MPIR_SCHED_CHKPMEM_REAP(s);
    goto fn_exit;
}
247 
248 /* A pairwise exchange algorithm for MPI_Ireduce_scatter_block.  Requires a
249  * commutative op and is intended for use with large messages. */
250 #undef FUNCNAME
251 #define FUNCNAME MPIR_Ireduce_scatter_block_rec_pairwise
252 #undef FCNAME
253 #define FCNAME MPIU_QUOTE(FUNCNAME)
MPIR_Ireduce_scatter_block_pairwise(const void * sendbuf,void * recvbuf,int recvcount,MPI_Datatype datatype,MPI_Op op,MPID_Comm * comm_ptr,MPID_Sched_t s)254 int MPIR_Ireduce_scatter_block_pairwise(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s)
255 {
256     int mpi_errno = MPI_SUCCESS;
257     int   rank, comm_size, i;
258     MPI_Aint extent, true_extent, true_lb;
259     int  *disps;
260     void *tmp_recvbuf;
261     int src, dst;
262     int is_commutative;
263     int total_count;
264     MPIR_SCHED_CHKPMEM_DECL(2);
265 
266     comm_size = comm_ptr->local_size;
267     rank = comm_ptr->rank;
268 
269     MPID_Datatype_get_extent_macro(datatype, extent);
270     MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent);
271 
272     is_commutative = MPIR_Op_is_commutative(op);
273     MPIU_Assert(is_commutative);
274 
275     MPIR_SCHED_CHKPMEM_MALLOC(disps, int *, comm_size * sizeof(int), mpi_errno, "disps");
276 
277     total_count = 0;
278     for (i=0; i<comm_size; i++) {
279         disps[i] = total_count;
280         total_count += recvcount;
281     }
282 
283     if (total_count == 0) {
284         goto fn_exit;
285     }
286     /* total_count*extent eventually gets malloced. it isn't added to
287      * a user-passed in buffer */
288     MPID_Ensure_Aint_fits_in_pointer(total_count * MPIR_MAX(true_extent, extent));
289 
290     if (sendbuf != MPI_IN_PLACE) {
291         /* copy local data into recvbuf */
292         mpi_errno = MPID_Sched_copy(((char *)sendbuf+disps[rank]*extent),
293                                     recvcount, datatype,
294                                     recvbuf, recvcount, datatype, s);
295         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
296         MPID_SCHED_BARRIER(s);
297     }
298 
299     /* allocate temporary buffer to store incoming data */
300     MPIR_SCHED_CHKPMEM_MALLOC(tmp_recvbuf, void *, recvcount*(MPIR_MAX(true_extent,extent))+1, mpi_errno, "tmp_recvbuf");
301     /* adjust for potential negative lower bound in datatype */
302     tmp_recvbuf = (void *)((char*)tmp_recvbuf - true_lb);
303 
304     for (i=1; i<comm_size; i++) {
305         src = (rank - i + comm_size) % comm_size;
306         dst = (rank + i) % comm_size;
307 
308         /* send the data that dst needs. recv data that this process
309            needs from src into tmp_recvbuf */
310         if (sendbuf != MPI_IN_PLACE) {
311             mpi_errno = MPID_Sched_send(((char *)sendbuf+disps[dst]*extent),
312                                         recvcount, datatype, dst, comm_ptr, s);
313             if (mpi_errno) MPIU_ERR_POP(mpi_errno);
314         }
315         else {
316             mpi_errno = MPID_Sched_send(((char *)recvbuf+disps[dst]*extent),
317                                         recvcount, datatype, dst, comm_ptr, s);
318             if (mpi_errno) MPIU_ERR_POP(mpi_errno);
319         }
320         mpi_errno = MPID_Sched_recv(tmp_recvbuf, recvcount, datatype, src, comm_ptr, s);
321         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
322         MPID_SCHED_BARRIER(s);
323 
324         /* FIXME does this algorithm actually work correctly for noncommutative ops?
325          * If so, relax restriction in assert and comments... */
326         if (is_commutative || (src < rank)) {
327             if (sendbuf != MPI_IN_PLACE) {
328                 mpi_errno = MPID_Sched_reduce(tmp_recvbuf, recvbuf, recvcount, datatype, op, s);
329                 if (mpi_errno) MPIU_ERR_POP(mpi_errno);
330             }
331             else {
332                 mpi_errno = MPID_Sched_reduce(tmp_recvbuf, ((char *)recvbuf+disps[rank]*extent),
333                                               recvcount, datatype, op, s);
334                 if (mpi_errno) MPIU_ERR_POP(mpi_errno);
335                 /* We can't store the result at the beginning of
336                    recvbuf right here because there is useful data there that
337                    other process/processes need.  At the end we will copy back
338                    the result to the beginning of recvbuf. */
339             }
340             MPID_SCHED_BARRIER(s);
341         }
342         else {
343             if (sendbuf != MPI_IN_PLACE) {
344                 mpi_errno = MPID_Sched_reduce(recvbuf, tmp_recvbuf, recvcount, datatype, op, s);
345                 if (mpi_errno) MPIU_ERR_POP(mpi_errno);
346                 MPID_SCHED_BARRIER(s);
347                 /* copy result back into recvbuf */
348                 mpi_errno = MPID_Sched_copy(tmp_recvbuf, recvcount, datatype,
349                                             recvbuf, recvcount, datatype, s);
350                 if (mpi_errno) MPIU_ERR_POP(mpi_errno);
351             }
352             else {
353                 mpi_errno = MPID_Sched_reduce(((char *)recvbuf+disps[rank]*extent),
354                                               tmp_recvbuf, recvcount, datatype, op, s);
355                 if (mpi_errno) MPIU_ERR_POP(mpi_errno);
356                 MPID_SCHED_BARRIER(s);
357                 /* copy result back into recvbuf */
358                 mpi_errno = MPID_Sched_copy(tmp_recvbuf, recvcount, datatype,
359                                             ((char *)recvbuf + disps[rank]*extent),
360                                             recvcount, datatype, s);
361                 if (mpi_errno) MPIU_ERR_POP(mpi_errno);
362             }
363             MPID_SCHED_BARRIER(s);
364         }
365     }
366 
367     /* if MPI_IN_PLACE, move output data to the beginning of
368        recvbuf. already done for rank 0. */
369     if ((sendbuf == MPI_IN_PLACE) && (rank != 0)) {
370         mpi_errno = MPID_Sched_copy(((char *)recvbuf + disps[rank]*extent),
371                                     recvcount, datatype,
372                                     recvbuf, recvcount, datatype, s);
373         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
374         MPID_SCHED_BARRIER(s);
375     }
376 
377     MPIR_SCHED_CHKPMEM_COMMIT(s);
378 fn_exit:
379     return mpi_errno;
380 fn_fail:
381     MPIR_SCHED_CHKPMEM_REAP(s);
382     goto fn_exit;
383 }
384 
385 /* A recursive doubling algorithm for MPI_Ireduce_scatter_block, suitable for
386  * noncommutative and (non-pof2 or block irregular). */
387 #undef FUNCNAME
388 #define FUNCNAME MPIR_Ireduce_scatter_block_rec_dbl
389 #undef FCNAME
390 #define FCNAME MPIU_QUOTE(FUNCNAME)
MPIR_Ireduce_scatter_block_rec_dbl(const void * sendbuf,void * recvbuf,int recvcount,MPI_Datatype datatype,MPI_Op op,MPID_Comm * comm_ptr,MPID_Sched_t s)391 int MPIR_Ireduce_scatter_block_rec_dbl(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s)
392 {
393     int mpi_errno = MPI_SUCCESS;
394     int rank, comm_size, i;
395     MPI_Aint extent, true_extent, true_lb;
396     int  *disps;
397     void *tmp_recvbuf, *tmp_results;
398     int type_size ATTRIBUTE((unused)), dis[2], blklens[2], total_count, dst;
399     int mask, dst_tree_root, my_tree_root, j, k;
400     int received;
401     MPI_Datatype sendtype, recvtype;
402     int nprocs_completed, tmp_mask, tree_root, is_commutative;
403     MPIR_SCHED_CHKPMEM_DECL(5);
404 
405     comm_size = comm_ptr->local_size;
406     rank = comm_ptr->rank;
407 
408     MPID_Datatype_get_extent_macro(datatype, extent);
409     MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent);
410     is_commutative = MPIR_Op_is_commutative(op);
411 
412     MPIR_SCHED_CHKPMEM_MALLOC(disps, int *, comm_size * sizeof(int), mpi_errno, "disps");
413 
414     total_count = 0;
415     for (i=0; i<comm_size; i++) {
416         disps[i] = total_count;
417         total_count += recvcount;
418     }
419 
420     if (total_count == 0) {
421         goto fn_exit;
422     }
423 
424     MPID_Datatype_get_size_macro(datatype, type_size);
425 
426     /* total_count*extent eventually gets malloced. it isn't added to
427      * a user-passed in buffer */
428     MPID_Ensure_Aint_fits_in_pointer(total_count * MPIR_MAX(true_extent, extent));
429 
430 
431     /* need to allocate temporary buffer to receive incoming data*/
432     MPIR_SCHED_CHKPMEM_MALLOC(tmp_recvbuf, void *, total_count*(MPIR_MAX(true_extent,extent)), mpi_errno, "tmp_recvbuf");
433     /* adjust for potential negative lower bound in datatype */
434     tmp_recvbuf = (void *)((char*)tmp_recvbuf - true_lb);
435 
436     /* need to allocate another temporary buffer to accumulate
437        results */
438     MPIR_SCHED_CHKPMEM_MALLOC(tmp_results, void *, total_count*(MPIR_MAX(true_extent,extent)), mpi_errno, "tmp_results");
439     /* adjust for potential negative lower bound in datatype */
440     tmp_results = (void *)((char*)tmp_results - true_lb);
441 
442     /* copy sendbuf into tmp_results */
443     if (sendbuf != MPI_IN_PLACE)
444         mpi_errno = MPID_Sched_copy(sendbuf, total_count, datatype,
445                                     tmp_results, total_count, datatype, s);
446     else
447         mpi_errno = MPID_Sched_copy(recvbuf, total_count, datatype,
448                                     tmp_results, total_count, datatype, s);
449 
450     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
451     MPID_SCHED_BARRIER(s);
452 
453     mask = 0x1;
454     i = 0;
455     while (mask < comm_size) {
456         dst = rank ^ mask;
457 
458         dst_tree_root = dst >> i;
459         dst_tree_root <<= i;
460 
461         my_tree_root = rank >> i;
462         my_tree_root <<= i;
463 
464         /* At step 1, processes exchange (n-n/p) amount of
465            data; at step 2, (n-2n/p) amount of data; at step 3, (n-4n/p)
466            amount of data, and so forth. We use derived datatypes for this.
467 
468            At each step, a process does not need to send data
469            indexed from my_tree_root to
470            my_tree_root+mask-1. Similarly, a process won't receive
471            data indexed from dst_tree_root to dst_tree_root+mask-1. */
472 
473         /* calculate sendtype */
474         blklens[0] = my_tree_root * recvcount;
475         blklens[1] = (comm_size - (my_tree_root + mask)) * recvcount;
476         if (blklens[1] < 0)
477             blklens[1] = 0;
478 
479         dis[0] = 0;
480         dis[1] = blklens[0] + recvcount * (MPIR_MIN((my_tree_root + mask), comm_size) - my_tree_root);
481 
482         mpi_errno = MPIR_Type_indexed_impl(2, blklens, dis, datatype, &sendtype);
483         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
484 
485         mpi_errno = MPIR_Type_commit_impl(&sendtype);
486         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
487 
488         /* calculate recvtype */
489         blklens[0] = recvcount * MPIR_MIN(dst_tree_root, comm_size);
490         blklens[1] = recvcount * (comm_size - (dst_tree_root + mask));
491         if (blklens[1] < 0)
492             blklens[1] = 0;
493 
494         dis[0] = 0;
495         dis[1] = blklens[0];
496         dis[1] = blklens[0] + recvcount * (MPIR_MIN((dst_tree_root + mask), comm_size) - dst_tree_root);
497 
498         mpi_errno = MPIR_Type_indexed_impl(2, blklens, dis, datatype, &recvtype);
499         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
500 
501         mpi_errno = MPIR_Type_commit_impl(&recvtype);
502         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
503 
504         received = 0;
505         if (dst < comm_size) {
506             /* tmp_results contains data to be sent in each step. Data is
507                received in tmp_recvbuf and then accumulated into
508                tmp_results. accumulation is done later below.   */
509             mpi_errno = MPID_Sched_send(tmp_results, 1, sendtype, dst, comm_ptr, s);
510             if (mpi_errno) MPIU_ERR_POP(mpi_errno);
511             mpi_errno = MPID_Sched_recv(tmp_recvbuf, 1, recvtype, dst, comm_ptr, s);
512             if (mpi_errno) MPIU_ERR_POP(mpi_errno);
513             MPID_SCHED_BARRIER(s);
514             received = 1;
515         }
516 
517         /* if some processes in this process's subtree in this step
518            did not have any destination process to communicate with
519            because of non-power-of-two, we need to send them the
520            result. We use a logarithmic recursive-halfing algorithm
521            for this. */
522 
523         if (dst_tree_root + mask > comm_size) {
524             nprocs_completed = comm_size - my_tree_root - mask;
525             /* nprocs_completed is the number of processes in this
526                subtree that have all the data. Send data to others
527                in a tree fashion. First find root of current tree
528                that is being divided into two. k is the number of
529                least-significant bits in this process's rank that
530                must be zeroed out to find the rank of the root */
531             j = mask;
532             k = 0;
533             while (j) {
534                 j >>= 1;
535                 k++;
536             }
537             k--;
538 
539             tmp_mask = mask >> 1;
540             while (tmp_mask) {
541                 dst = rank ^ tmp_mask;
542 
543                 tree_root = rank >> k;
544                 tree_root <<= k;
545 
546                 /* send only if this proc has data and destination
547                    doesn't have data. at any step, multiple processes
548                    can send if they have the data */
549                 if ((dst > rank) &&
550                     (rank < tree_root + nprocs_completed)
551                     && (dst >= tree_root + nprocs_completed))
552                 {
553                     /* send the current result */
554                     mpi_errno = MPID_Sched_send(tmp_recvbuf, 1, recvtype, dst, comm_ptr, s);
555                     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
556                     MPID_SCHED_BARRIER(s);
557                 }
558                 /* recv only if this proc. doesn't have data and sender
559                    has data */
560                 else if ((dst < rank) &&
561                          (dst < tree_root + nprocs_completed) &&
562                          (rank >= tree_root + nprocs_completed))
563                 {
564                     mpi_errno = MPID_Sched_recv(tmp_recvbuf, 1, recvtype, dst, comm_ptr, s);
565                     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
566                     MPID_SCHED_BARRIER(s);
567                     received = 1;
568                 }
569                 tmp_mask >>= 1;
570                 k--;
571             }
572         }
573 
574         /* N.B. The following comment comes from the FT version of
575          * MPI_Reduce_scatter.  It does not currently apply to this code, but
576          * will in the future when we update the NBC code to be fault-tolerant
577          * in roughly the same fashion. [goodell@ 2011-03-03] */
578         /* The following reduction is done here instead of after
579            the MPIC_Sendrecv_ft or MPIC_Recv_ft above. This is
580            because to do it above, in the noncommutative
581            case, we would need an extra temp buffer so as not to
582            overwrite temp_recvbuf, because temp_recvbuf may have
583            to be communicated to other processes in the
584            non-power-of-two case. To avoid that extra allocation,
585            we do the reduce here. */
586         if (received) {
587             if (is_commutative || (dst_tree_root < my_tree_root)) {
588                 mpi_errno = MPID_Sched_reduce(tmp_recvbuf, tmp_results, blklens[0], datatype, op, s);
589                 if (mpi_errno) MPIU_ERR_POP(mpi_errno);
590                 mpi_errno = MPID_Sched_reduce(((char *)tmp_recvbuf + dis[1]*extent),
591                                               ((char *)tmp_results + dis[1]*extent),
592                                               blklens[1], datatype, op, s);
593                 if (mpi_errno) MPIU_ERR_POP(mpi_errno);
594                 MPID_SCHED_BARRIER(s);
595             }
596             else {
597                 mpi_errno = MPID_Sched_reduce(tmp_results, tmp_recvbuf, blklens[0], datatype, op, s);
598                 if (mpi_errno) MPIU_ERR_POP(mpi_errno);
599                 mpi_errno = MPID_Sched_reduce(((char *)tmp_results + dis[1]*extent),
600                                               ((char *)tmp_recvbuf + dis[1]*extent),
601                                               blklens[1], datatype, op, s);
602                 if (mpi_errno) MPIU_ERR_POP(mpi_errno);
603                 MPID_SCHED_BARRIER(s);
604 
605                 /* copy result back into tmp_results */
606                 mpi_errno = MPID_Sched_copy(tmp_recvbuf, 1, recvtype,
607                                             tmp_results, 1, recvtype, s);
608                 if (mpi_errno) MPIU_ERR_POP(mpi_errno);
609                 MPID_SCHED_BARRIER(s);
610             }
611         }
612 
613         MPIR_Type_free_impl(&sendtype);
614         MPIR_Type_free_impl(&recvtype);
615 
616         mask <<= 1;
617         i++;
618     }
619 
620     /* now copy final results from tmp_results to recvbuf */
621     mpi_errno = MPID_Sched_copy(((char *)tmp_results+disps[rank]*extent),
622                                 recvcount, datatype,
623                                 recvbuf, recvcount, datatype, s);
624     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
625     MPID_SCHED_BARRIER(s);
626 
627     MPIR_SCHED_CHKPMEM_COMMIT(s);
628 fn_exit:
629     return mpi_errno;
630 fn_fail:
631     MPIR_SCHED_CHKPMEM_REAP(s);
632     goto fn_exit;
633 }
634 
635 /* Implements the reduce-scatter butterfly algorithm described in J. L. Traff's
636  * "An Improved Algorithm for (Non-commutative) Reduce-Scatter with an Application"
637  * from EuroPVM/MPI 2005.  This function currently only implements support for
638  * the power-of-2 case. */
#undef FUNCNAME
/* FIX: FUNCNAME previously said "MPIR_Reduce_scatter_block_noncomm" (missing
 * the "I"), mismatching the routine MPIR_Ireduce_scatter_block_noncomm that
 * follows; FCNAME-based error messages would have named the wrong function. */
#define FUNCNAME MPIR_Ireduce_scatter_block_noncomm
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
MPIR_Ireduce_scatter_block_noncomm(const void * sendbuf,void * recvbuf,int recvcount,MPI_Datatype datatype,MPI_Op op,MPID_Comm * comm_ptr,MPID_Sched_t s)643 int MPIR_Ireduce_scatter_block_noncomm(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s)
644 {
645     int mpi_errno = MPI_SUCCESS;
646     int comm_size = comm_ptr->local_size;
647     int rank = comm_ptr->rank;
648     int pof2;
649     int log2_comm_size;
650     int i, k;
651     int recv_offset, send_offset;
652     int block_size, total_count, size;
653     MPI_Aint true_extent, true_lb;
654     int buf0_was_inout;
655     void *tmp_buf0;
656     void *tmp_buf1;
657     void *result_ptr;
658     MPIR_SCHED_CHKPMEM_DECL(2);
659 
660     MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent);
661 
662     pof2 = 1;
663     log2_comm_size = 0;
664     while (pof2 < comm_size) {
665         pof2 <<= 1;
666         ++log2_comm_size;
667     }
668 
669     /* begin error checking */
670     MPIU_Assert(pof2 == comm_size); /* FIXME this version only works for power of 2 procs */
671     /* end error checking */
672 
673     /* size of a block (count of datatype per block, NOT bytes per block) */
674     block_size = recvcount;
675     total_count = block_size * comm_size;
676 
677     MPIR_SCHED_CHKPMEM_MALLOC(tmp_buf0, void *, true_extent * total_count, mpi_errno, "tmp_buf0");
678     MPIR_SCHED_CHKPMEM_MALLOC(tmp_buf1, void *, true_extent * total_count, mpi_errno, "tmp_buf1");
679     /* adjust for potential negative lower bound in datatype */
680     tmp_buf0 = (void *)((char*)tmp_buf0 - true_lb);
681     tmp_buf1 = (void *)((char*)tmp_buf1 - true_lb);
682 
683     /* Copy our send data to tmp_buf0.  We do this one block at a time and
684        permute the blocks as we go according to the mirror permutation. */
685     for (i = 0; i < comm_size; ++i) {
686         mpi_errno = MPID_Sched_copy(((char *)(sendbuf == MPI_IN_PLACE ? recvbuf : sendbuf) + (i * true_extent * block_size)),
687                                     block_size, datatype,
688                                     ((char *)tmp_buf0 + (MPIU_Mirror_permutation(i, log2_comm_size) * true_extent * block_size)),
689                                      block_size, datatype, s);
690         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
691     }
692     MPID_SCHED_BARRIER(s);
693     buf0_was_inout = 1;
694 
695     send_offset = 0;
696     recv_offset = 0;
697     size = total_count;
698     for (k = 0; k < log2_comm_size; ++k) {
699         /* use a double-buffering scheme to avoid local copies */
700         char *incoming_data = (buf0_was_inout ? tmp_buf1 : tmp_buf0);
701         char *outgoing_data = (buf0_was_inout ? tmp_buf0 : tmp_buf1);
702         int peer = rank ^ (0x1 << k);
703         size /= 2;
704 
705         if (rank > peer) {
706             /* we have the higher rank: send top half, recv bottom half */
707             recv_offset += size;
708         }
709         else {
710             /* we have the lower rank: recv top half, send bottom half */
711             send_offset += size;
712         }
713 
714         mpi_errno = MPID_Sched_send((outgoing_data + send_offset*true_extent),
715                                     size, datatype, peer, comm_ptr, s);
716         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
717         mpi_errno = MPID_Sched_recv((incoming_data + recv_offset*true_extent),
718                                     size, datatype, peer, comm_ptr, s);
719         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
720         MPID_SCHED_BARRIER(s);
721 
722         /* always perform the reduction at recv_offset, the data at send_offset
723            is now our peer's responsibility */
724         if (rank > peer) {
725             /* higher ranked value so need to call op(received_data, my_data) */
726             mpi_errno = MPID_Sched_reduce((incoming_data + recv_offset*true_extent),
727                                           (outgoing_data + recv_offset*true_extent),
728                                           size, datatype, op, s);
729             if (mpi_errno) MPIU_ERR_POP(mpi_errno);
730             buf0_was_inout = buf0_was_inout;
731         }
732         else {
733             /* lower ranked value so need to call op(my_data, received_data) */
734             mpi_errno = MPID_Sched_reduce((outgoing_data + recv_offset*true_extent),
735                                           (incoming_data + recv_offset*true_extent),
736                                           size, datatype, op, s);
737             if (mpi_errno) MPIU_ERR_POP(mpi_errno);
738             buf0_was_inout = !buf0_was_inout;
739         }
740         MPID_SCHED_BARRIER(s);
741 
742         /* the next round of send/recv needs to happen within the block (of size
743            "size") that we just received and reduced */
744         send_offset = recv_offset;
745     }
746 
747     MPIU_Assert(size == recvcount);
748 
749     /* copy the reduced data to the recvbuf */
750     result_ptr = (char *)(buf0_was_inout ? tmp_buf0 : tmp_buf1) + recv_offset * true_extent;
751     mpi_errno = MPID_Sched_copy(result_ptr, size, datatype,
752                                 recvbuf, size, datatype, s);
753     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
754 
755     MPIR_SCHED_CHKPMEM_COMMIT(s);
756 fn_exit:
757     return mpi_errno;
758 fn_fail:
759     MPIR_SCHED_CHKPMEM_REAP(s);
760     goto fn_exit;
761 }
762 
763 #undef FUNCNAME
764 #define FUNCNAME MPIR_Ireduce_scatter_block_intra
765 #undef FCNAME
766 #define FCNAME MPIU_QUOTE(FUNCNAME)
int MPIR_Ireduce_scatter_block_intra(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s)
{
    int mpi_errno = MPI_SUCCESS;
    int commutative;
    int nprocs, dtype_size, msg_bytes;

    commutative = MPIR_Op_is_commutative(op);
    nprocs = comm_ptr->local_size;

    /* a zero-element reduce-scatter is a no-op; schedule nothing */
    if (recvcount * nprocs == 0)
        goto fn_exit;

    MPID_Datatype_get_size_macro(datatype, dtype_size);
    msg_bytes = recvcount * nprocs * dtype_size;

    /* Algorithm selection: noncommutative ops are limited to the two
     * order-preserving schedules; commutative ops switch on total
     * message size. */
    if (!commutative) {
        if (MPIU_is_pof2(nprocs, NULL)) {
            /* noncommutative, power-of-two communicator size */
            mpi_errno = MPIR_Ireduce_scatter_block_noncomm(sendbuf, recvbuf, recvcount, datatype, op, comm_ptr, s);
        }
        else {
            /* noncommutative, non-power-of-two: recursive doubling */
            mpi_errno = MPIR_Ireduce_scatter_block_rec_dbl(sendbuf, recvbuf, recvcount, datatype, op, comm_ptr, s);
        }
    }
    else if (msg_bytes < MPIR_PARAM_REDSCAT_COMMUTATIVE_LONG_MSG_SIZE) {
        /* commutative, short message: recursive halving */
        mpi_errno = MPIR_Ireduce_scatter_block_rec_hlv(sendbuf, recvbuf, recvcount, datatype, op, comm_ptr, s);
    }
    else {
        /* commutative, long message: pairwise exchange */
        mpi_errno = MPIR_Ireduce_scatter_block_pairwise(sendbuf, recvbuf, recvcount, datatype, op, comm_ptr, s);
    }
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

fn_exit:
    return mpi_errno;
fn_fail:
    goto fn_exit;
}
811 
812 
813 #undef FUNCNAME
814 #define FUNCNAME MPIR_Ireduce_scatter_block_inter
815 #undef FCNAME
816 #define FCNAME MPIU_QUOTE(FUNCNAME)
int MPIR_Ireduce_scatter_block_inter(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s)
{
/* Intercommunicator Ireduce_scatter_block.
   We first do an intercommunicator reduce to rank 0 on left group,
   then an intercommunicator reduce to rank 0 on right group, followed
   by local intracommunicator scattervs in each group.

   Both reductions are scheduled back-to-back without an intervening
   barrier so that they may overlap; a single barrier after them
   orders the local scatter.  NOTE(review): the scatter below is a
   plain Iscatter (equal-size blocks), not a scatterv as the text
   above suggests -- fine for the fixed-block-size variant. */
    int mpi_errno = MPI_SUCCESS;
    int rank, root, local_size, total_count;
    MPI_Aint true_extent, true_lb = 0, extent;
    void *tmp_buf = NULL;   /* reduction staging buffer; stays NULL on all ranks except local rank 0 */
    MPID_Comm *newcomm_ptr = NULL;
    MPIR_SCHED_CHKPMEM_DECL(1);

    rank = comm_ptr->rank;              /* rank within our (local) group */
    local_size = comm_ptr->local_size;

    /* full reduction size: one block of recvcount elements per local process */
    total_count = recvcount * local_size;

    if (rank == 0) {
        /* In each group, rank 0 allocates a temp. buffer for the
           reduce */
        MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent);
        MPID_Datatype_get_extent_macro(datatype, extent);

        MPIR_SCHED_CHKPMEM_MALLOC(tmp_buf, void *, total_count*(MPIR_MAX(extent,true_extent)), mpi_errno, "tmp_buf");

        /* adjust for potential negative lower bound in datatype */
        tmp_buf = (void *)((char*)tmp_buf - true_lb);
    }

    /* first do a reduce from right group to rank 0 in left group,
       then from left group to rank 0 in right group.

       Intercomm root semantics: the root rank passes MPI_ROOT, its
       group peers pass MPI_PROC_NULL, and the remote group names the
       root's rank in the root's group. */
    MPIU_Assert(comm_ptr->coll_fns && comm_ptr->coll_fns->Ireduce);
    if (comm_ptr->is_low_group) {
        /* reduce from right group to rank 0*/
        root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
        mpi_errno = comm_ptr->coll_fns->Ireduce(sendbuf, tmp_buf, total_count,
                                                datatype, op, root, comm_ptr, s);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);

        /* sched barrier intentionally omitted here to allow both reductions to
         * proceed in parallel */

        /* reduce to rank 0 of right group */
        root = 0;
        mpi_errno = comm_ptr->coll_fns->Ireduce(sendbuf, tmp_buf, total_count,
                                                datatype, op, root, comm_ptr, s);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    }
    else {
        /* reduce to rank 0 of right group */
        root = 0;
        mpi_errno = comm_ptr->coll_fns->Ireduce(sendbuf, tmp_buf, total_count,
                                                datatype, op, root, comm_ptr, s);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);

        /* sched barrier intentionally omitted here to allow both reductions to
         * proceed in parallel */

        /* reduce from right group to rank 0*/
        root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL;
        mpi_errno = comm_ptr->coll_fns->Ireduce(sendbuf, tmp_buf, total_count,
                                                datatype, op, root, comm_ptr, s);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    }
    /* both reductions must complete before the local scatter reads tmp_buf */
    MPID_SCHED_BARRIER(s);

    /* Get the local intracommunicator */
    if (!comm_ptr->local_comm) {
        mpi_errno = MPIR_Setup_intercomm_localcomm(comm_ptr);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    }

    newcomm_ptr = comm_ptr->local_comm;

    /* scatter the reduced data from local rank 0 (the only rank whose
       tmp_buf is non-NULL) to every rank of the local group */
    MPIU_Assert(newcomm_ptr->coll_fns && newcomm_ptr->coll_fns->Iscatter);
    mpi_errno = newcomm_ptr->coll_fns->Iscatter(tmp_buf, recvcount, datatype,
                                                recvbuf, recvcount, datatype, 0,
                                                newcomm_ptr, s);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    MPIR_SCHED_CHKPMEM_COMMIT(s);
fn_exit:
    return mpi_errno;
fn_fail:
    MPIR_SCHED_CHKPMEM_REAP(s);
    goto fn_exit;
}
906 
907 #undef FUNCNAME
908 #define FUNCNAME MPIR_Ireduce_scatter_block_impl
909 #undef FCNAME
910 #define FCNAME MPIU_QUOTE(FUNCNAME)
int MPIR_Ireduce_scatter_block_impl(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPI_Request *request)
{
    /* Build and launch a nonblocking schedule for reduce_scatter_block:
     * allocate a tag and an empty schedule, let the communicator's
     * collective-function table populate it, then start it running. */
    int mpi_errno = MPI_SUCCESS;
    int sched_tag = -1;
    MPID_Request *req_ptr = NULL;
    MPID_Sched_t sched = MPID_SCHED_NULL;

    *request = MPI_REQUEST_NULL;

    mpi_errno = MPID_Sched_next_tag(comm_ptr, &sched_tag);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    mpi_errno = MPID_Sched_create(&sched);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    /* the table entry dispatches to the intra- or inter-communicator
     * scheduling routine */
    MPIU_Assert(comm_ptr->coll_fns != NULL);
    MPIU_Assert(comm_ptr->coll_fns->Ireduce_scatter_block != NULL);
    mpi_errno = comm_ptr->coll_fns->Ireduce_scatter_block(sendbuf, recvbuf, recvcount, datatype, op, comm_ptr, sched);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    mpi_errno = MPID_Sched_start(&sched, comm_ptr, sched_tag, &req_ptr);
    /* hand the request back before checking errno, matching the
     * start call's contract of possibly returning a request on error */
    if (req_ptr)
        *request = req_ptr->handle;
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

fn_exit:
    return mpi_errno;
fn_fail:
    goto fn_exit;
}
940 
941 #endif /* MPICH_MPI_FROM_PMPI */
942 
943 #undef FUNCNAME
944 #define FUNCNAME MPIX_Ireduce_scatter_block
945 #undef FCNAME
946 #define FCNAME MPIU_QUOTE(FUNCNAME)
947 /*@
MPIX_Ireduce_scatter_block - Combines values and scatters the results (in equal-sized blocks) in a nonblocking way
949 
950 Input Parameters:
951 + sendbuf - starting address of the send buffer (choice)
952 . recvcount - element count per block (non-negative integer)
953 . datatype - data type of elements of input buffer (handle)
954 . op - operation (handle)
955 - comm - communicator (handle)
956 
957 Output Parameters:
958 + recvbuf - starting address of the receive buffer (choice)
959 - request - communication request (handle)
960 
961 .N ThreadSafe
962 
963 .N Fortran
964 
965 .N Errors
966 @*/
int MPIX_Ireduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Request *request)
{
    /* Public MPI entry point: validate arguments, convert handles, and
     * delegate to MPIR_Ireduce_scatter_block_impl.  Fixes relative to
     * the previous revision: removed an unreachable duplicate
     * "goto fn_exit;" after the error-handling block, and replaced a
     * redundant "else if" (whose condition was the exact negation of
     * the preceding "if") with a plain "else". */
    int mpi_errno = MPI_SUCCESS;
    MPID_Comm *comm_ptr = NULL;
    MPID_MPI_STATE_DECL(MPID_STATE_MPIX_IREDUCE_SCATTER_BLOCK);

    MPIU_THREAD_CS_ENTER(ALLFUNC,);
    MPID_MPI_FUNC_ENTER(MPID_STATE_MPIX_IREDUCE_SCATTER_BLOCK);

    /* Validate parameters, especially handles needing to be converted */
#   ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS
        {
            MPIR_ERRTEST_DATATYPE(datatype, "datatype", mpi_errno);
            MPIR_ERRTEST_OP(op, mpi_errno);
            MPIR_ERRTEST_COMM(comm, mpi_errno);

            /* TODO more checks may be appropriate */
        }
        MPID_END_ERROR_CHECKS
    }
#   endif /* HAVE_ERROR_CHECKING */

    /* Convert MPI object handles to object pointers */
    MPID_Comm_get_ptr(comm, comm_ptr);

    /* Validate parameters and objects (post conversion) */
#   ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS
        {
            MPID_Comm_valid_ptr(comm_ptr, mpi_errno);

            /* user-defined datatypes must be valid and committed */
            if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) {
                MPID_Datatype *datatype_ptr = NULL;
                MPID_Datatype_get_ptr(datatype, datatype_ptr);
                MPID_Datatype_valid_ptr(datatype_ptr, mpi_errno);
                if (mpi_errno != MPI_SUCCESS) goto fn_fail;
                MPID_Datatype_committed_ptr(datatype_ptr, mpi_errno);
                if (mpi_errno != MPI_SUCCESS) goto fn_fail;
            }

            if (HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN) {
                MPID_Op *op_ptr = NULL;
                MPID_Op_get_ptr(op, op_ptr);
                MPID_Op_valid_ptr(op_ptr, mpi_errno);
            }
            else {
                /* builtin op: check that (op, datatype) is a legal pairing */
                mpi_errno = ( * MPIR_OP_HDL_TO_DTYPE_FN(op) )(datatype);
            }
            if (mpi_errno != MPI_SUCCESS) goto fn_fail;

            MPIR_ERRTEST_ARGNULL(request,"request", mpi_errno);
            /* TODO more checks may be appropriate (counts, in_place, buffer aliasing, etc) */
        }
        MPID_END_ERROR_CHECKS
    }
#   endif /* HAVE_ERROR_CHECKING */

    /* ... body of routine ...  */

    mpi_errno = MPIR_Ireduce_scatter_block_impl(sendbuf, recvbuf, recvcount, datatype, op, comm_ptr, request);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    /* ... end of body of routine ... */

fn_exit:
    MPID_MPI_FUNC_EXIT(MPID_STATE_MPIX_IREDUCE_SCATTER_BLOCK);
    MPIU_THREAD_CS_EXIT(ALLFUNC,);
    return mpi_errno;

fn_fail:
    /* --BEGIN ERROR HANDLING-- */
#   ifdef HAVE_ERROR_CHECKING
    {
        mpi_errno = MPIR_Err_create_code(
            mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER,
            "**mpix_ireduce_scatter_block", "**mpix_ireduce_scatter_block %p %p %d %D %O %C %p", sendbuf, recvbuf, recvcount, datatype, op, comm, request);
    }
#   endif
    mpi_errno = MPIR_Err_return_comm(comm_ptr, FCNAME, mpi_errno);
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
1052