1 /* begin_generated_IBM_copyright_prolog */
2 /* */
3 /* This is an automatically generated copyright prolog. */
4 /* After initializing, DO NOT MODIFY OR MOVE */
5 /* --------------------------------------------------------------- */
6 /* Licensed Materials - Property of IBM */
7 /* Blue Gene/Q 5765-PER 5765-PRP */
8 /* */
9 /* (C) Copyright IBM Corp. 2011, 2012 All Rights Reserved */
10 /* US Government Users Restricted Rights - */
11 /* Use, duplication, or disclosure restricted */
12 /* by GSA ADP Schedule Contract with IBM Corp. */
13 /* */
14 /* --------------------------------------------------------------- */
15 /* */
16 /* end_generated_IBM_copyright_prolog */
17 /* (C)Copyright IBM Corp. 2007, 2011 */
18 /**
19 * \file src/pt2pt/mpidi_sendmsg.c
20 * \brief Funnel point for starting all MPI messages
21 */
22 #include <mpidimpl.h>
23
24
25 static inline void
MPIDI_SendMsg_short(pami_context_t context,MPID_Request * sreq,pami_endpoint_t dest,void * sndbuf,unsigned sndlen,unsigned isSync)26 MPIDI_SendMsg_short(pami_context_t context,
27 MPID_Request * sreq,
28 pami_endpoint_t dest,
29 void * sndbuf,
30 unsigned sndlen,
31 unsigned isSync)
32 {
33 MPIDI_MsgInfo * msginfo = &sreq->mpid.envelope.msginfo;
34
35 pami_send_immediate_t params = {
36 .dispatch = MPIDI_Protocols_Short,
37 .dest = dest,
38 .header = {
39 .iov_base = msginfo,
40 .iov_len = sizeof(MPIDI_MsgInfo),
41 },
42 .data = {
43 .iov_base = sndbuf,
44 .iov_len = sndlen,
45 },
46 };
47 if (isSync)
48 params.dispatch = MPIDI_Protocols_ShortSync;
49
50 pami_result_t rc;
51 rc = PAMI_Send_immediate(context, ¶ms);
52 #ifdef TRACE_ON
53 if (rc)
54 {
55 TRACE_ERR("sizeof(msginfo)=%zu sizeof(data)=%u\n", sizeof(MPIDI_MsgInfo), sndlen);
56 }
57 #endif
58 MPID_assert(rc == PAMI_SUCCESS);
59 #ifdef MPIDI_TRACE
60 MPIDI_Out_cntr[dest].S[(sreq->mpid.idx)].mode=params.dispatch;
61 if (!isSync) {
62 MPIDI_Out_cntr[dest].S[(sreq->mpid.idx)].NoComp=1;
63 MPIDI_Out_cntr[dest].S[(sreq->mpid.idx)].sendShort=1;
64 } else
65 MPIDI_Out_cntr[dest].S[(sreq->mpid.idx)].sendEnvelop=1;
66
67 #endif
68
69 MPIDI_SendDoneCB_inline(context, sreq, PAMI_SUCCESS);
70 #if (MPIDI_STATISTICS)
71 MPID_NSTAT(mpid_statp->sendsComplete);
72 #endif
73 }
74
75 static void
76 MPIDI_SendMsg_eager(pami_context_t context,
77 MPID_Request * sreq,
78 pami_endpoint_t dest,
79 void * sndbuf,
80 unsigned sndlen)
81 __attribute__((__noinline__));
82 static void
MPIDI_SendMsg_eager(pami_context_t context,MPID_Request * sreq,pami_endpoint_t dest,void * sndbuf,unsigned sndlen)83 MPIDI_SendMsg_eager(pami_context_t context,
84 MPID_Request * sreq,
85 pami_endpoint_t dest,
86 void * sndbuf,
87 unsigned sndlen)
88 {
89 MPIDI_MsgInfo * msginfo = &sreq->mpid.envelope.msginfo;
90
91 pami_send_t params = {
92 .send = {
93 .dispatch = MPIDI_Protocols_Eager,
94 .dest = dest,
95 .header = {
96 .iov_base = msginfo,
97 .iov_len = sizeof(MPIDI_MsgInfo),
98 },
99 .data = {
100 .iov_base = sndbuf,
101 .iov_len = sndlen,
102 },
103 },
104 .events = {
105 .cookie = sreq,
106 .local_fn = MPIDI_SendDoneCB,
107 .remote_fn= NULL,
108 },
109 };
110
111 pami_result_t rc;
112 rc = PAMI_Send(context, ¶ms);
113 MPID_assert(rc == PAMI_SUCCESS);
114 #ifdef MPIDI_TRACE
115 MPIDI_Out_cntr[dest].S[(sreq->mpid.idx)].mode=MPIDI_Protocols_Eager;
116 MPIDI_Out_cntr[dest].S[(sreq->mpid.idx)].sendEager=1;
117 #endif
118 }
119
120
121 static void
122 MPIDI_SendMsg_rzv(pami_context_t context,
123 MPID_Request * sreq,
124 pami_endpoint_t dest,
125 void * sndbuf,
126 unsigned sndlen)
127 __attribute__((__noinline__));
128 static void
MPIDI_SendMsg_rzv(pami_context_t context,MPID_Request * sreq,pami_endpoint_t dest,void * sndbuf,unsigned sndlen)129 MPIDI_SendMsg_rzv(pami_context_t context,
130 MPID_Request * sreq,
131 pami_endpoint_t dest,
132 void * sndbuf,
133 unsigned sndlen)
134 {
135 pami_result_t rc;
136
137 /* Set the isRzv bit in the SEND request. This is important for
138 * canceling requests.
139 */
140 MPIDI_Request_setRzv(sreq, 1);
141
142 /* The rendezvous information, such as the origin/local/sender
143 * node's send buffer and the number of bytes the origin node wishes
144 * to send, is sent as the payload of the request-to-send (RTS)
145 * message.
146 */
147 #ifdef USE_PAMI_RDMA
148 size_t sndlen_out;
149 rc = PAMI_Memregion_create(context,
150 sndbuf,
151 sndlen,
152 &sndlen_out,
153 &sreq->mpid.envelope.memregion);
154 MPID_assert(rc == PAMI_SUCCESS);
155 MPID_assert(sndlen == sndlen_out);
156 TRACE_ERR("RZV send for mr=%#llx addr=%p *addr[0]=%#016llx *addr[1]=%#016llx bytes=%u\n",
157 *(unsigned long long*)&sreq->mpid.envelope.memregion,
158 sndbuf,
159 *(((unsigned long long*)sndbuf)+0),
160 *(((unsigned long long*)sndbuf)+1),
161 sndlen);
162 #else
163 sreq->mpid.envelope.memregion_used = 0;
164 #ifdef OUT_OF_ORDER_HANDLING
165 if ((!MPIDI_Process.mp_s_use_pami_get) && (!sreq->mpid.shm))
166 #else
167 if (!MPIDI_Process.mp_s_use_pami_get)
168 #endif
169 {
170 size_t sndlen_out;
171 rc = PAMI_Memregion_create(context,
172 sndbuf,
173 sndlen,
174 &sndlen_out,
175 &sreq->mpid.envelope.memregion);
176 if(rc == PAMI_SUCCESS)
177 {
178 MPID_assert(sndlen == sndlen_out);
179 TRACE_ERR("RZV send for mr=%#llx addr=%p *addr[0]=%#016llx *addr[1]=%#016llx bytes=%u\n",
180 *(unsigned long long*)&sreq->mpid.envelope.memregion,
181 sndbuf,
182 *(((unsigned long long*)sndbuf)+0),
183 *(((unsigned long long*)sndbuf)+1),
184 sndlen);
185 sreq->mpid.envelope.memregion_used = 1;
186 }
187 sreq->mpid.envelope.data = sndbuf;
188 } else {
189 TRACE_ERR("RZV send (failed registration for sreq=%p addr=%p *addr[0]=%#016llx *addr[1]=%#016llx bytes=%u\n",
190 sreq,sndbuf,
191 *(((unsigned long long*)sndbuf)+0),
192 *(((unsigned long long*)sndbuf)+1),
193 sndlen);
194 sreq->mpid.envelope.data = sndbuf;
195 }
196 #endif
197 sreq->mpid.envelope.length = sndlen;
198
199 /* Do not specify a callback function to be invoked when the RTS
200 * message has been sent. The MPI_Send is completed only when the
201 * target/remote/receiver node has completed an PAMI_Get from the
202 * origin node and has then sent a rendezvous acknowledgement (ACK)
203 * to the origin node to signify the end of the transfer. When the
204 * ACK message is received by the origin node the same callback
205 * function is used to complete the MPI_Send as the non-rendezvous
206 * case.
207 */
208 pami_send_immediate_t params = {
209 .dispatch = MPIDI_Protocols_RVZ,
210 .dest = dest,
211 .header = {
212 .iov_base = &sreq->mpid.envelope,
213 .iov_len = sizeof(MPIDI_MsgEnvelope),
214 },
215 .data = {
216 .iov_base = NULL,
217 .iov_len = 0,
218 },
219 };
220
221 rc = PAMI_Send_immediate(context, ¶ms);
222 MPID_assert(rc == PAMI_SUCCESS);
223 #ifdef MPIDI_TRACE
224 MPIDI_Out_cntr[dest].S[(sreq->mpid.idx)].bufaddr=sreq->mpid.envelope.data;
225 MPIDI_Out_cntr[dest].S[(sreq->mpid.idx)].mode=MPIDI_Protocols_RVZ;
226 MPIDI_Out_cntr[dest].S[(sreq->mpid.idx)].sendRzv=1;
227 MPIDI_Out_cntr[dest].S[(sreq->mpid.idx)].sendEnvelop=1;
228 MPIDI_Out_cntr[dest].S[(sreq->mpid.idx)].memRegion=sreq->mpid.envelope.memregion_used;
229 MPIDI_Out_cntr[dest].S[(sreq->mpid.idx)].use_pami_get=MPIDI_Process.mp_s_use_pami_get;
230 #endif
231 }
232
233
234 static void
235 MPIDI_SendMsg_rzv_zerobyte(pami_context_t context,
236 MPID_Request * sreq,
237 pami_endpoint_t dest)
238 __attribute__((__noinline__));
239 static void
MPIDI_SendMsg_rzv_zerobyte(pami_context_t context,MPID_Request * sreq,pami_endpoint_t dest)240 MPIDI_SendMsg_rzv_zerobyte(pami_context_t context,
241 MPID_Request * sreq,
242 pami_endpoint_t dest)
243 {
244 pami_result_t rc;
245
246 /* Set the isRzv bit in the SEND request. This is important for
247 * canceling requests.
248 */
249 MPIDI_Request_setRzv(sreq, 1);
250
251 /* The rendezvous information, such as the origin/local/sender
252 * node's send buffer and the number of bytes the origin node wishes
253 * to send, is sent as the payload of the request-to-send (RTS)
254 * message.
255 */
256
257 sreq->mpid.envelope.data = NULL;
258 sreq->mpid.envelope.length = 0;
259
260 /* Do not specify a callback function to be invoked when the RTS
261 * message has been sent. The MPI_Send is completed only when the
262 * target/remote/receiver node has matched the receive and has then
263 * sent a rendezvous acknowledgement (ACK) to the origin node to
264 * signify the end of the transfer. When the ACK message is received
265 * by the origin node the same callback function is used to complete
266 * the MPI_Send as the non-rendezvous case.
267 */
268 pami_send_immediate_t params = {
269 .dispatch = MPIDI_Protocols_RVZ_zerobyte,
270 .dest = dest,
271 .header = {
272 .iov_base = &sreq->mpid.envelope,
273 .iov_len = sizeof(MPIDI_MsgEnvelope),
274 },
275 .data = {
276 .iov_base = NULL,
277 .iov_len = 0,
278 },
279 };
280
281 rc = PAMI_Send_immediate(context, ¶ms);
282 MPID_assert(rc == PAMI_SUCCESS);
283 }
284
285
286
287 static void
288 MPIDI_SendMsg_process_userdefined_dt(MPID_Request * sreq,
289 void ** sndbuf,
290 size_t * data_sz)
291 __attribute__((__noinline__));
292 static void
MPIDI_SendMsg_process_userdefined_dt(MPID_Request * sreq,void ** _sndbuf,size_t * _data_sz)293 MPIDI_SendMsg_process_userdefined_dt(MPID_Request * sreq,
294 void ** _sndbuf,
295 size_t * _data_sz)
296 {
297 size_t data_sz;
298 int dt_contig;
299 MPI_Aint dt_true_lb;
300 MPID_Datatype * dt_ptr;
301 void * sndbuf;
302
303 /*
304 * Get the datatype info
305 */
306 MPIDI_Datatype_get_info(sreq->mpid.userbufcount,
307 sreq->mpid.datatype,
308 dt_contig,
309 data_sz,
310 dt_ptr,
311 dt_true_lb);
312
313 MPID_assert(sreq->mpid.uebuf == NULL);
314
315 /*
316 * Contiguous data type
317 */
318 if (likely(dt_contig))
319 {
320 sndbuf = sreq->mpid.userbuf + dt_true_lb;
321 }
322
323 /*
324 * Non-contiguous data type; allocate and populate temporary send
325 * buffer
326 */
327 else
328 {
329 MPID_Segment segment;
330
331 sreq->mpid.uebuf = sndbuf = MPIU_Malloc(data_sz);
332 if (unlikely(sndbuf == NULL))
333 {
334 sreq->status.MPI_ERROR = MPI_ERR_NO_SPACE;
335 sreq->status.count = 0;
336 MPID_Abort(NULL, MPI_ERR_NO_SPACE, -1,
337 "Unable to allocate non-contiguous buffer");
338 }
339 sreq->mpid.uebuf_malloc = 1;
340
341 DLOOP_Offset last = data_sz;
342 MPID_Segment_init(sreq->mpid.userbuf,
343 sreq->mpid.userbufcount,
344 sreq->mpid.datatype,
345 &segment,
346 0);
347 MPID_Segment_pack(&segment, 0, &last, sndbuf);
348 MPID_assert(last == data_sz);
349 }
350
351 *_sndbuf = sndbuf;
352 *_data_sz = data_sz;
353 }
354
355
356 static inline void
MPIDI_SendMsg(pami_context_t context,MPID_Request * sreq,unsigned isSync,const unsigned isInternal)357 MPIDI_SendMsg(pami_context_t context,
358 MPID_Request * sreq,
359 unsigned isSync,
360 const unsigned isInternal)
361 {
362 /* ------------------------------ */
363 /* special case: NULL destination */
364 /* ------------------------------ */
365 int rank = MPIDI_Request_getPeerRank_comm(sreq);
366 if (unlikely(rank == MPI_PROC_NULL))
367 {
368 if (isSync)
369 MPIDI_Request_complete(sreq);
370 MPIDI_Request_complete(sreq);
371 return;
372 }
373 else
374 {
375 MPIDI_Request_setPeerRank_pami(sreq, MPID_VCR_GET_LPID(sreq->comm->vcr, rank));
376 }
377
378 MPIDI_Request_setSync(sreq, isSync);
379 MPIDI_Request_setPeerRequestH(sreq);
380
381 /*
382 * Create the destination endpoint
383 */
384 pami_endpoint_t dest;
385 MPIDI_Context_endpoint(sreq, &dest);
386 pami_task_t dest_tid;
387 dest_tid=sreq->comm->vcr[rank];
388 #if (MPIDI_STATISTICS)
389 MPID_NSTAT(mpid_statp->sends);
390 #endif
391 #ifdef OUT_OF_ORDER_HANDLING
392 MPIDI_Out_cntr_t *out_cntr;
393
394 MPIU_THREAD_CS_ENTER(MSGQUEUE,0);
395 out_cntr = &MPIDI_Out_cntr[dest_tid];
396 out_cntr->nMsgs++;
397 MPIU_THREAD_CS_EXIT(MSGQUEUE,0);
398 MPIDI_Request_setMatchSeq(sreq, out_cntr->nMsgs);
399 #endif
400
401 size_t data_sz;
402 void * sndbuf;
403 if (likely(HANDLE_GET_KIND(sreq->mpid.datatype) == HANDLE_KIND_BUILTIN))
404 {
405 sndbuf = sreq->mpid.userbuf;
406 data_sz = sreq->mpid.userbufcount * MPID_Datatype_get_basic_size(sreq->mpid.datatype);
407 }
408 else
409 {
410 MPIDI_SendMsg_process_userdefined_dt(sreq, &sndbuf, &data_sz);
411 }
412 #ifdef MPIDI_TRACE
413 sreq->mpid.partner_id=dest;
414 GET_REC_S(sreq,context,isSync,data_sz)
415 #endif
416
417 #ifdef OUT_OF_ORDER_HANDLING
418 sreq->mpid.shm=0;
419 #endif
420
421 #ifdef WORKAROUND_UNIMPLEMENTED_SEND_IMMEDIATE_OVERFLOW
422 if (isInternal == 0)
423 #endif
424 {
425 if (unlikely(PAMIX_Task_is_local(dest_tid) != 0))
426 {
427 /*
428 * Always use the short protocol when data_sz is small.
429 */
430 if (likely(data_sz < MPIDI_Process.short_limit))
431 {
432 TRACE_ERR("Sending(short,intranode) bytes=%u (short_limit=%u)\n", data_sz, MPIDI_Process.short_limit);
433 MPIDI_SendMsg_short(context,
434 sreq,
435 dest,
436 sndbuf,
437 data_sz,
438 isSync);
439 }
440 /*
441 * Use the eager protocol when data_sz is less than the 'local' eager limit.
442 */
443 else if (data_sz < MPIDI_Process.eager_limit_local)
444 {
445 TRACE_ERR("Sending(eager,intranode) bytes=%u (eager_limit_local=%u)\n", data_sz, MPIDI_Process.eager_limit_local);
446 MPIDI_SendMsg_eager(context,
447 sreq,
448 dest,
449 sndbuf,
450 data_sz);
451 }
452 /*
453 * Use the default rendezvous protocol (glue implementation that
454 * guarantees no unexpected data).
455 */
456 else
457 {
458 TRACE_ERR("Sending(RZV,intranode) bytes=%u (eager_limit=%u)\n", data_sz, MPIDI_Process.eager_limit);
459 #ifdef OUT_OF_ORDER_HANDLING
460 sreq->mpid.shm=1;
461 #endif
462 MPIDI_SendMsg_rzv(context,
463 sreq,
464 dest,
465 sndbuf,
466 data_sz);
467 }
468 }
469 /*
470 * Always use the short protocol when data_sz is small.
471 */
472 else if (likely(data_sz < MPIDI_Process.short_limit))
473 {
474 TRACE_ERR("Sending(short) bytes=%u (eager_limit=%u)\n", data_sz, MPIDI_Process.eager_limit);
475 MPIDI_SendMsg_short(context,
476 sreq,
477 dest,
478 sndbuf,
479 data_sz,
480 isSync);
481 }
482 /*
483 * Use the eager protocol when data_sz is less than the eager limit.
484 */
485 else if (data_sz < MPIDI_Process.eager_limit)
486 {
487 TRACE_ERR("Sending(eager) bytes=%u (eager_limit=%u)\n", data_sz, MPIDI_Process.eager_limit);
488 MPIDI_SendMsg_eager(context,
489 sreq,
490 dest,
491 sndbuf,
492 data_sz);
493 #ifdef MPIDI_STATISTICS
494 if (MPID_cc_is_complete(&sreq->cc))
495 {
496 MPID_NSTAT(mpid_statp->sendsComplete);
497 }
498 #endif
499 }
500 /*
501 * Use the default rendezvous protocol (glue implementation that
502 * guarantees no unexpected data).
503 */
504 else
505 {
506 TRACE_ERR("Sending(RZV) bytes=%u (eager_limit=%u)\n", data_sz, MPIDI_Process.eager_limit);
507 if (likely(data_sz > 0))
508 {
509 MPIDI_SendMsg_rzv(context,
510 sreq,
511 dest,
512 sndbuf,
513 data_sz);
514 }
515 else
516 {
517 MPIDI_SendMsg_rzv_zerobyte(context, sreq, dest);
518 }
519 #ifdef MPIDI_STATISTICS
520 if (MPID_cc_is_complete(&sreq->cc))
521 {
522 MPID_NSTAT(mpid_statp->sendsComplete);
523 }
524 #endif
525 }
526 }
527
528 #ifdef WORKAROUND_UNIMPLEMENTED_SEND_IMMEDIATE_OVERFLOW
529 /* internal only == no send immediate */
530 else
531 {
532 const unsigned eager_limit =
533 PAMIX_Task_is_local(dest_tid)==0?
534 MPIDI_Process.eager_limit:
535 MPIDI_Process.eager_limit_local;
536
537 if (data_sz < eager_limit)
538 {
539 TRACE_ERR("Sending(eager) bytes=%u (eager_limit=%u)\n", data_sz, eager_limit);
540 MPIDI_SendMsg_eager(context,
541 sreq,
542 dest,
543 sndbuf,
544 data_sz);
545 }
546 else
547 {
548 TRACE_ERR("Sending(RZV) bytes=%u (eager_limit=NA)\n", data_sz);
549 #ifdef OUT_OF_ORDER_HANDLING
550 sreq->mpid.shm=(PAMIX_Task_is_local(dest_tid)==0);
551 #endif
552 MPIDI_SendMsg_rzv(context,
553 sreq,
554 dest,
555 sndbuf,
556 data_sz);
557 }
558
559 #ifdef MPIDI_STATISTICS
560 if (MPID_cc_is_complete(&sreq->cc))
561 {
562 MPID_NSTAT(mpid_statp->sendsComplete);
563 }
564 #endif
565 }
566 #endif /* WORKAROUND_UNIMPLEMENTED_SEND_IMMEDIATE_OVERFLOW */
567 }
568
569
570 /*
571 * \brief Central function for all low-level sends.
572 *
573 * This is assumed to have been posted to a context, and is now being
574 * called from inside advance. This has (unspecified) locking
575 * implications.
576 *
577 * Prerequisites:
578 * + Not sending to a NULL rank
579 * + Request already allocated
580 *
581 * \param[in] context The PAMI context on which to do the send operation
582 * \param[in,out] sreq Structure containing all relevant info about the message.
583 */
584 pami_result_t
MPIDI_Send_handoff(pami_context_t context,void * _sreq)585 MPIDI_Send_handoff(pami_context_t context,
586 void * _sreq)
587 {
588 MPID_Request * sreq = (MPID_Request*)_sreq;
589 MPID_assert(sreq != NULL);
590
591 MPIDI_SendMsg(context, sreq, 0, 0);
592 return PAMI_SUCCESS;
593 }
594
595
596 pami_result_t
MPIDI_Ssend_handoff(pami_context_t context,void * _sreq)597 MPIDI_Ssend_handoff(pami_context_t context,
598 void * _sreq)
599 {
600 MPID_Request * sreq = (MPID_Request*)_sreq;
601 MPID_assert(sreq != NULL);
602
603 MPIDI_SendMsg(context, sreq, 1, 0);
604 return PAMI_SUCCESS;
605 }
606
607
608 /*
609 * \brief Central function for all low-level sends.
610 *
611 * This is assumed to have been posted to a context, and is now being
612 * called from inside advance. This has (unspecified) locking
613 * implications.
614 *
615 * Prerequisites:
616 * + Not sending to a NULL rank
617 * + Request already allocated
618 *
619 * \param[in] context The PAMI context on which to do the send operation
620 * \param[in,out] sreq Structure containing all relevant info about the message.
621 */
622 pami_result_t
MPIDI_Isend_handoff(pami_context_t context,void * _sreq)623 MPIDI_Isend_handoff(pami_context_t context,
624 void * _sreq)
625 {
626 MPID_Request * sreq = (MPID_Request*)_sreq;
627 MPID_assert(sreq != NULL);
628
629 /* This initializes all the fields not set in MPI_Isend() */
630 MPIDI_Request_initialize(sreq);
631
632 /* Since this is only called from MPI_Isend(), it is not synchronous */
633 MPIDI_SendMsg(context, sreq, 0, 0);
634 return PAMI_SUCCESS;
635 }
636
637 pami_result_t
MPIDI_Isend_handoff_internal(pami_context_t context,void * _sreq)638 MPIDI_Isend_handoff_internal(pami_context_t context,
639 void * _sreq)
640 {
641 MPID_Request * sreq = (MPID_Request*)_sreq;
642 MPID_assert(sreq != NULL);
643
644 /* This initializes all the fields not set in MPI_Isend() */
645 MPIDI_Request_initialize(sreq);
646
647 /* Since this is only called from MPI_Isend(), it is not synchronous */
648 MPIDI_SendMsg(context, sreq, 0, 1);
649 return PAMI_SUCCESS;
650 }
651