1 /*
2 * Copyright (C) 2016 by Argonne National Laboratory.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 */
32 #ifndef _FI_BGQ_DIRECT_RMA_H_
33 #define _FI_BGQ_DIRECT_RMA_H_
34
35 #define FABRIC_DIRECT_RMA 1
36
37 #include <pthread.h>
38
39 #include "rdma/bgq/fi_bgq_compiler.h"
40 #include "rdma/bgq/fi_bgq_spi.h"
41
42 #ifdef __cplusplus
43 extern "C" {
44 #endif
45
/*
 * Sanity-check an endpoint and address-vector type before an RMA call.
 * All checks compile out unless DEBUG is defined, in which case -FI_EINVAL
 * is returned for a null/disabled endpoint, an unspecified av type, or an
 * av type that does not match the one the endpoint was configured with.
 */
static inline int fi_bgq_check_rma(struct fi_bgq_ep *bgq_ep,
		enum fi_av_type av_type)
{
#ifdef DEBUG
	if (!bgq_ep || bgq_ep->state != FI_BGQ_EP_ENABLED)
		return -FI_EINVAL;

	/* FI_AV_UNSPEC is never acceptable; a concrete type must agree
	 * with the endpoint's configured av type */
	if (av_type == FI_AV_UNSPEC)
		return -FI_EINVAL;
	if ((av_type == FI_AV_MAP || av_type == FI_AV_TABLE) &&
			(av_type != bgq_ep->av_type))
		return -FI_EINVAL;
#endif
	return 0;
}
64
65
/*
 * Emulated RMA read ("remote get").
 *
 * Injects one memory-fifo descriptor whose payload is an array of up to
 * eight direct-put descriptors - one per source iovec - that describe the
 * data movement back to this node (hdr->rma.ndesc carries the count).
 * Completion descriptors are appended to the tail of the same payload
 * when space permits:
 *
 *   - a counter-update direct-put descriptor, when 'enable_cntr' is set
 *     and a write counter is bound to the endpoint, and
 *   - a cq byte-counter decrement direct-put descriptor, when 'enable_cq'
 *     is set and 'tx_op_flags' contains FI_COMPLETION.
 *
 * If the eight-slot payload cannot hold the needed completion
 * descriptor(s), the data-movement descriptors are injected as-is and the
 * function recurses once with niov == 0 so the completion descriptor(s)
 * travel in a payload of their own.
 *
 * A call with niov == 0 and both completions enabled therefore acts as a
 * fence (see fi_bgq_write_fence()).
 *
 * NOTE(review): 'lock_required' is only forwarded to the cq pending-queue
 * enqueue; presumably the caller already holds any endpoint lock - verify
 * against callers.
 */
static inline void fi_bgq_readv_internal (struct fi_bgq_ep * bgq_ep,
		const struct iovec * iov,
		const size_t niov,
		const union fi_bgq_addr * bgq_target_addr,
		const uint64_t * addr,
		const uint64_t * key,
		union fi_bgq_context * bgq_context,
		const uint64_t tx_op_flags,
		const uint64_t enable_cq,
		const uint64_t enable_cntr,
		const int lock_required)
{
#ifdef FI_BGQ_TRACE
	fprintf(stderr,"fi_bgq_readv_internal starting - niov is %ld do_cntr is %d\n",niov,(enable_cntr && ( bgq_ep->write_cntr != 0)));
	fflush(stderr);
#endif
	/* the rget/mfifo payload holds at most 8 direct-put descriptors */
	assert(niov <= 8);

	/* report a libfabric completion only when requested for this op */
	const uint64_t do_cq = enable_cq && (tx_op_flags & FI_COMPLETION);

	struct fi_bgq_cntr * write_cntr = bgq_ep->write_cntr;
	const uint64_t do_cntr = enable_cntr && (write_cntr != 0);

	MUHWI_Descriptor_t * model = &bgq_ep->tx.read.emulation.mfifo_model;

	const uint64_t fifo_map = fi_bgq_addr_get_fifo_map(bgq_target_addr->fi);

	/* busy-wait until a fifo slot is available .. */
	MUHWI_Descriptor_t * desc =
		fi_bgq_spi_injfifo_tail_wait(&bgq_ep->tx.injfifo);

	/* copy the descriptor model into the injection fifo */
	qpx_memcpy64((void*)desc, (const void *)model);

	/* set the target torus address and fifo map */
	desc->PacketHeader.NetworkHeader.pt2pt.Destination = fi_bgq_uid_get_destination(bgq_target_addr->uid.fi);
	desc->Torus_FIFO_Map = fifo_map;

	/* locate the payload lookaside slot; the payload is itself an array
	 * of direct-put descriptors, one 64-byte slot per iovec */
	MUHWI_Descriptor_t * dput_desc =
		(MUHWI_Descriptor_t *)fi_bgq_spi_injfifo_immediate_payload(&bgq_ep->tx.injfifo,
			desc, &desc->Pa_Payload);
	desc->Message_Length = (niov << BGQ_MU_DESCRIPTOR_SIZE_IN_POWER_OF_2);


	desc->PacketHeader.messageUnitHeader.Packet_Types.Memory_FIFO.Rec_FIFO_Id =
		fi_bgq_addr_rec_fifo_id(bgq_target_addr->fi);

	union fi_bgq_mu_packet_hdr * hdr = (union fi_bgq_mu_packet_hdr *) &desc->PacketHeader;
	hdr->rma.ndesc = niov;

	/* TODO - how to specify multiple remote injection fifos? */

	union fi_bgq_mu_descriptor * fi_dput_desc = (union fi_bgq_mu_descriptor *) dput_desc;

	/* fill one direct-put descriptor per iovec: remote source offset in
	 * Pa_Payload/key, local destination paddr as the rec payload base */
	unsigned i;
	for (i = 0; i < niov; ++i) {	/* on fence this loop will compile out (niov is 0) */

		qpx_memcpy64((void*)&dput_desc[i],
			(const void*)&bgq_ep->tx.read.emulation.dput_model);

		dput_desc[i].Torus_FIFO_Map = fifo_map;
		dput_desc[i].Message_Length = iov[i].iov_len;
		dput_desc[i].Pa_Payload = addr[i];

		/* determine the physical address of the destination data location */
		uint64_t iov_base_paddr = 0;
		uint32_t cnk_rc __attribute__ ((unused));
		cnk_rc = fi_bgq_cnk_vaddr2paddr(iov[i].iov_base, iov[i].iov_len, &iov_base_paddr);
		assert(cnk_rc==0);
		MUSPI_SetRecPayloadBaseAddressInfo(&dput_desc[i], FI_BGQ_MU_BAT_ID_GLOBAL, iov_base_paddr);

		assert((key[i] & 0xFFFF000000000000ul) == 0);	/* TODO - change this when key size > 48b */
		fi_dput_desc[i].rma.key_lsb = key[i];
	}

	if (do_cntr && niov < 8) {	/* likely */
#ifdef FI_BGQ_TRACE
	fprintf(stderr,"fi_bgq_readv_internal do_cntr && niov %ld < 8\n",niov);
	fflush(stderr);
#endif
		/* add the counter update direct-put descriptor to the
		 * tail of the rget/mfifo payload */

		qpx_memcpy64((void*)&dput_desc[niov],
			(const void*)&bgq_ep->tx.read.cntr_model);

		dput_desc[niov].Torus_FIFO_Map = fifo_map;
		MUSPI_SetRecPayloadBaseAddressInfo(&dput_desc[niov],
			FI_BGQ_MU_BAT_ID_GLOBAL,
			MUSPI_GetAtomicAddress(write_cntr->std.paddr, MUHWI_ATOMIC_OPCODE_STORE_ADD));

		/* account for the extra descriptor in the payload */
		desc->Message_Length += sizeof(MUHWI_Descriptor_t);
		union fi_bgq_mu_packet_hdr * hdr = (union fi_bgq_mu_packet_hdr *) &desc->PacketHeader;
		hdr->rma.ndesc += 1;

		if (!do_cq) {	/* likely */

#ifdef FI_BGQ_TRACE
	fprintf(stderr,"fi_bgq_readv_internal do_cntr && niov < 8 AND (!do_cq)\n");
	fflush(stderr);
#endif
			MUSPI_InjFifoAdvanceDesc(bgq_ep->tx.injfifo.muspi_injfifo);

		} else if (niov < 7) {
			/* slot niov holds the cntr descriptor, so the cq
			 * descriptor needs slot niov+1 - hence niov < 7 */

			/* add the cq update direct-put descriptor to the
			 * tail of the rget/mfifo payload (after the cntr update) */

			/* initialize the completion entry */
			assert(bgq_context);
			assert(((uintptr_t)bgq_context & 0x07ull) == 0);	/* must be 8 byte aligned */
			bgq_context->flags = FI_RMA | FI_READ;
			bgq_context->len = 0;
			bgq_context->buf = NULL;
			bgq_context->byte_counter = 1;
			bgq_context->tag = 0;

			uint64_t byte_counter_paddr = 0;
			uint32_t cnk_rc __attribute__ ((unused));
			cnk_rc = fi_bgq_cnk_vaddr2paddr((void*)&bgq_context->byte_counter,
					sizeof(uint64_t), &byte_counter_paddr);
			assert(cnk_rc == 0);

			MUHWI_Descriptor_t * cq_desc = &dput_desc[niov+1];

			qpx_memcpy64((void*)cq_desc,
				(const void*)&bgq_ep->tx.read.cq_model);

			cq_desc->Torus_FIFO_Map = fifo_map;
			MUSPI_SetRecPayloadBaseAddressInfo(cq_desc,
				FI_BGQ_MU_BAT_ID_GLOBAL, byte_counter_paddr);

			desc->Message_Length += sizeof(MUHWI_Descriptor_t);
			union fi_bgq_mu_packet_hdr * hdr = (union fi_bgq_mu_packet_hdr *) &desc->PacketHeader;
			hdr->rma.ndesc += 1;

			MUSPI_InjFifoAdvanceDesc(bgq_ep->tx.injfifo.muspi_injfifo);

			/* the context is completed when its byte_counter is
			 * decremented to zero by the cq descriptor above */
			fi_bgq_cq_enqueue_pending(bgq_ep->send_cq, bgq_context, lock_required);

		} else {

			/* the rget/mfifo payload is full - inject the data
			 * movement descriptors, then inject the counter
			 * completion descriptor */
			MUSPI_InjFifoAdvanceDesc(bgq_ep->tx.injfifo.muspi_injfifo);

			/* be lazy and do a single recursive call */
			fi_bgq_readv_internal(bgq_ep,
				NULL, 0,	/* no iovec array */
				bgq_target_addr,
				NULL, NULL,	/* no addr array, no key array */
				bgq_context, tx_op_flags,
				1,	/* enable cq */
				0,	/* disable cntr */
				lock_required);
		}

	} else if (do_cntr) {	/* unlikely */

		/* the rget/mfifo payload is full - inject the data
		 * movement descriptors, then inject any counter or cq
		 * completion descriptor(s) via a recursive call */
		MUSPI_InjFifoAdvanceDesc(bgq_ep->tx.injfifo.muspi_injfifo);

		fi_bgq_readv_internal(bgq_ep,
			NULL, 0,	/* no iovec array */
			bgq_target_addr,
			NULL, NULL,	/* no addr array, no key array */
			bgq_context, tx_op_flags,
			do_cq,
			1,	/* enable cntr */
			lock_required);

	} else if (do_cq && niov < 8) {

		/* no cntr completion
		 *
		 * add the cq byte counter decrement direct-put
		 * descriptor to the tail of the rget/mfifo payload */

		/* initialize the completion entry */
		assert(bgq_context);
		assert(((uintptr_t)bgq_context & 0x07ull) == 0);	/* must be 8 byte aligned */
		bgq_context->flags = FI_RMA | FI_READ;
		bgq_context->len = 0;
		bgq_context->buf = NULL;
		bgq_context->byte_counter = 1;
		bgq_context->tag = 0;

		uint64_t byte_counter_paddr = 0;
		uint32_t cnk_rc __attribute__ ((unused));
		cnk_rc = fi_bgq_cnk_vaddr2paddr((void*)&bgq_context->byte_counter,
				sizeof(uint64_t), &byte_counter_paddr);
		assert(cnk_rc == 0);

		MUHWI_Descriptor_t * cq_desc = &dput_desc[niov];

		qpx_memcpy64((void*)cq_desc,
			(const void*)&bgq_ep->tx.read.cq_model);

		cq_desc->Torus_FIFO_Map = fifo_map;
		MUSPI_SetRecPayloadBaseAddressInfo(cq_desc,
			FI_BGQ_MU_BAT_ID_GLOBAL, byte_counter_paddr);

		desc->Message_Length += sizeof(MUHWI_Descriptor_t);
		union fi_bgq_mu_packet_hdr * hdr = (union fi_bgq_mu_packet_hdr *) &desc->PacketHeader;
		hdr->rma.ndesc += 1;

		MUSPI_InjFifoAdvanceDesc(bgq_ep->tx.injfifo.muspi_injfifo);

		fi_bgq_cq_enqueue_pending(bgq_ep->send_cq, bgq_context, lock_required);

	} else if (do_cq) {

		/* the rget/mfifo payload is full - inject the data
		 * movement descriptors, then inject the cq completion
		 * descriptor via a recursive call */
		MUSPI_InjFifoAdvanceDesc(bgq_ep->tx.injfifo.muspi_injfifo);

		fi_bgq_readv_internal(bgq_ep,
			NULL, 0,	/* no iovec array */
			bgq_target_addr,
			NULL, NULL,	/* no addr array, no key array */
			bgq_context, tx_op_flags,
			1,	/* enable cq */
			0,	/* disable cntr */
			lock_required);

	} else {
		/* no cntr and no cq? very unlikely, if not invalid */

		/* if there are no completion operations then there *must* be
		 * at least one data movement operations */
		assert(niov > 0);

		MUSPI_InjFifoAdvanceDesc(bgq_ep->tx.injfifo.muspi_injfifo);
	}
}
306
/*
 * FI_INJECT-style RMA write.
 *
 * The source data (at most sizeof(union fi_bgq_mu_packet_payload) bytes)
 * is memcpy'd into the injection fifo's immediate-payload lookaside slot,
 * so 'buf' is reusable as soon as this returns - the endpoint write
 * counter, if bound, is incremented immediately for the same reason.
 * No cq completion is generated (inject semantics).
 *
 * Remote addressing (selected at compile time):
 *  - FI_MR_BASIC:    direct-put; 'key' is the paddr of the remote memory
 *                    region and 'addr' a virtual address, so (addr - key)
 *                    is the remote physical offset.
 *  - FI_MR_SCALABLE: memory-fifo packet; 'key' indexes the remote base
 *                    address table and 'addr' is an offset into it.
 *
 * Returns 0 on success or a negative error from the check/lock helpers.
 */
static inline ssize_t fi_bgq_inject_write_generic(struct fid_ep *ep,
		const void *buf, size_t len, fi_addr_t dst_addr,
		uint64_t addr, uint64_t key,
		int lock_required)
{
#ifdef FI_BGQ_TRACE
	fprintf(stderr,"fi_bgq_inject_write_generic starting\n");
#endif
	int ret;
	struct fi_bgq_ep *bgq_ep;

	bgq_ep = container_of(ep, struct fi_bgq_ep, ep_fid);

	ret = fi_bgq_check_rma(bgq_ep, FI_BGQ_FABRIC_DIRECT_AV);
	if (ret) return ret;

//	if (av_type == FI_AV_TABLE)
//		dst_addr = bgq_ep->av->table[(size_t)dst_addr];

	ret = fi_bgq_lock_if_required(&bgq_ep->lock, lock_required);
	if (ret) return ret;

	/* pick the descriptor model matching the compile-time MR mode */
	MUHWI_Descriptor_t * model =
		(FI_BGQ_FABRIC_DIRECT_MR == FI_MR_BASIC) ?
			&bgq_ep->tx.write.direct.dput_model :
			&bgq_ep->tx.write.emulation.mfifo_model;

	/*
	 * busy-wait until a fifo slot is available ..
	 */
	MUHWI_Descriptor_t * desc =
		fi_bgq_spi_injfifo_tail_wait(&bgq_ep->tx.injfifo);

	/* copy the descriptor model into the injection fifo */
	qpx_memcpy64((void*)desc, (const void *)model);

	/* set the destination torus address and fifo map */
	union fi_bgq_addr * bgq_dst_addr = (union fi_bgq_addr *)&dst_addr;
	desc->PacketHeader.NetworkHeader.pt2pt.Destination = fi_bgq_uid_get_destination(bgq_dst_addr->uid.fi);
	desc->Torus_FIFO_Map = fi_bgq_addr_get_fifo_map(bgq_dst_addr->fi);
	desc->Message_Length = len;

	/* locate the payload lookaside slot and copy the source data into
	 * it - this is what makes the source buffer immediately reusable */
	void * payload =
		fi_bgq_spi_injfifo_immediate_payload(&bgq_ep->tx.injfifo,
			desc, &desc->Pa_Payload);
	assert(len <= sizeof(union fi_bgq_mu_packet_payload));
	memcpy(payload, buf, len);

	if (FI_BGQ_FABRIC_DIRECT_MR == FI_MR_BASIC) {		/* branch will compile out */
#ifdef FI_BGQ_TRACE
	fprintf(stderr,"fi_bgq_inject_write_generic - virtual addr is 0x%016lx physical addr is 0x%016lx key is %lu \n",addr,(addr-key),key);
#endif

		/* the 'key' is the paddr of the remote memory region */
		MUSPI_SetRecPayloadBaseAddressInfo(desc, FI_BGQ_MU_BAT_ID_GLOBAL, addr-key);

	} else 	if (FI_BGQ_FABRIC_DIRECT_MR == FI_MR_SCALABLE) {	/* branch will compile out */

		desc->PacketHeader.messageUnitHeader.Packet_Types.Memory_FIFO.Rec_FIFO_Id =
			fi_bgq_addr_rec_fifo_id(bgq_dst_addr->fi);

		/* the 'key' is used to index into the remote base address table */
		union fi_bgq_mu_packet_hdr * hdr = (union fi_bgq_mu_packet_hdr *) &desc->PacketHeader;
		hdr->rma.key = key;
		hdr->rma.offset = addr;
		hdr->rma.nbytes = len;
		hdr->rma.ndesc = 0;

	} else {
		assert(0);
	}

	/* the src buffer is available for reuse - increment the endpoint counter */
	struct fi_bgq_cntr * write_cntr = bgq_ep->write_cntr;
	if (write_cntr) L2_AtomicStoreAdd(write_cntr->std.l2_vaddr, 1);

	MUSPI_InjFifoAdvanceDesc(bgq_ep->tx.injfifo.muspi_injfifo);

	ret = fi_bgq_unlock_if_required(&bgq_ep->lock, lock_required);
	if (ret) return ret;

	return 0;
}
391
fi_bgq_write_fence(struct fi_bgq_ep * bgq_ep,const uint64_t tx_op_flags,const union fi_bgq_addr * bgq_dst_addr,union fi_bgq_context * bgq_context,const int lock_required)392 static inline void fi_bgq_write_fence (struct fi_bgq_ep * bgq_ep,
393 const uint64_t tx_op_flags,
394 const union fi_bgq_addr * bgq_dst_addr,
395 union fi_bgq_context * bgq_context,
396 const int lock_required)
397 {
398 fi_bgq_readv_internal(bgq_ep,
399 NULL, 0, /* no iovec array */
400 bgq_dst_addr,
401 NULL, NULL, /* no addr array, key array */
402 bgq_context, tx_op_flags,
403 1,
404 1,
405 lock_required);
406 }
407
/*
 * General RMA write.
 *
 * FI_INJECT path (tx_op_flags & FI_INJECT): the payload - which must fit
 * in a single packet payload - is copied into the injection fifo
 * lookaside slot, so the source buffer is reusable on return; any cq
 * completion is reported as already complete and the write counter is
 * incremented immediately.
 *
 * Non-inject path: packets are injected directly from the source buffer
 * (translated to a physical address via fi_bgq_cnk_vaddr2paddr()), one
 * packet payload at a time; for lengths larger than one payload the first
 * descriptor is cloned for each subsequent chunk.  Any cq and/or cntr
 * completion is then delivered through a zero-iovec
 * fi_bgq_readv_internal() fence so that it fires only after the data
 * movement.
 *
 * See fi_bgq_inject_write_generic() for the FI_MR_BASIC vs FI_MR_SCALABLE
 * remote-addressing rules; the same compile-time branch is used here.
 */
static inline void fi_bgq_write_internal (struct fi_bgq_ep * bgq_ep,
		const void * buf,
		size_t len,
		const union fi_bgq_addr * bgq_dst_addr,
		uint64_t addr,
		const uint64_t key,
		union fi_bgq_context * bgq_context,
		const uint64_t tx_op_flags,
		const uint64_t enable_cq,
		const uint64_t enable_cntr,
		const int lock_required)
{

#ifdef FI_BGQ_TRACE
	fprintf(stderr,"fi_bgq_write_internal starting\n");
#endif
	const uint64_t do_cq = enable_cq && ((tx_op_flags & FI_COMPLETION) == FI_COMPLETION);

	struct fi_bgq_cntr * write_cntr = bgq_ep->write_cntr;
	const uint64_t do_cntr = enable_cntr && (write_cntr != 0);

	/* pick the descriptor model matching the compile-time MR mode */
	MUHWI_Descriptor_t * model =
		(FI_BGQ_FABRIC_DIRECT_MR == FI_MR_BASIC) ?
			&bgq_ep->tx.write.direct.dput_model :
			&bgq_ep->tx.write.emulation.mfifo_model;

	/* busy-wait until a fifo slot is available .. */
	MUHWI_Descriptor_t * desc =
		fi_bgq_spi_injfifo_tail_wait(&bgq_ep->tx.injfifo);

	/* copy the descriptor model into the injection fifo */
	qpx_memcpy64((void*)desc, (const void *)model);

	/* set the destination torus address and fifo map */
	desc->PacketHeader.NetworkHeader.pt2pt.Destination = fi_bgq_uid_get_destination(bgq_dst_addr->uid.fi);
	desc->Torus_FIFO_Map = fi_bgq_addr_get_fifo_map(bgq_dst_addr->fi);

	if (tx_op_flags & FI_INJECT) {	/* unlikely */

		/* inject data must fit in a single immediate payload */
		assert(len <= sizeof(union fi_bgq_mu_packet_payload));

		/* locate the payload lookaside slot */
		void * payload =
			fi_bgq_spi_injfifo_immediate_payload(&bgq_ep->tx.injfifo,
				desc, &desc->Pa_Payload);

		memcpy(payload, buf, len);
		desc->Message_Length = len;

		if (FI_BGQ_FABRIC_DIRECT_MR == FI_MR_BASIC) {		/* branch will compile out */

#ifdef FI_BGQ_TRACE
	fprintf(stderr,"fi_bgq_write_internal tx_op_flags & FI_INJECT - virtual addr is 0x%016lx physical addr is 0x%016lx key is %lu \n",addr,(addr-key),key);
#endif
			/* the 'key' is the paddr of the remote memory region */
			MUSPI_SetRecPayloadBaseAddressInfo(desc, FI_BGQ_MU_BAT_ID_GLOBAL, addr-key);

		} else if (FI_BGQ_FABRIC_DIRECT_MR == FI_MR_SCALABLE) {	/* branch will compile out */

			desc->PacketHeader.messageUnitHeader.Packet_Types.Memory_FIFO.Rec_FIFO_Id =
				fi_bgq_addr_rec_fifo_id(bgq_dst_addr->fi);

			/* the 'key' is used to index into the remote base address table */
			union fi_bgq_mu_packet_hdr * hdr = (union fi_bgq_mu_packet_hdr *) &desc->PacketHeader;
			hdr->rma.key = key;
			hdr->rma.offset = addr;
			hdr->rma.nbytes = len;
			hdr->rma.ndesc = 0;

		} else {
			assert(0);
		}

		MUSPI_InjFifoAdvanceDesc(bgq_ep->tx.injfifo.muspi_injfifo);

		/* FI_TRANSMIT_COMPLETE and FI_DELIVERY_COMPLETE are not supported */
		assert((tx_op_flags & (FI_COMPLETION | FI_TRANSMIT_COMPLETE)) != (FI_COMPLETION | FI_TRANSMIT_COMPLETE));
		assert((tx_op_flags & (FI_COMPLETION | FI_DELIVERY_COMPLETE)) != (FI_COMPLETION | FI_DELIVERY_COMPLETE));

		if (do_cq) {

			/* the data was copied out above, so the completion can
			 * be reported immediately (byte_counter starts at 0) */
			assert(bgq_context);
			assert(((uintptr_t)bgq_context & 0x07ull) == 0);	/* must be 8 byte aligned */
			bgq_context->flags = FI_RMA | FI_WRITE;
			bgq_context->len = 0;
			bgq_context->buf = NULL;
			bgq_context->byte_counter = 0;
			bgq_context->tag = 0;

			fi_bgq_cq_enqueue_completed(bgq_ep->send_cq, bgq_context, lock_required);
		}

		/* the src buffer is available for reuse - increment the endpoint counter */
		if (do_cntr) L2_AtomicStoreAdd(write_cntr->std.l2_vaddr, 1);

	} else {
		/* non-inject path: send directly from the source buffer,
		 * one packet payload per descriptor */
		size_t xfer_bytes = MIN(len, sizeof(union fi_bgq_mu_packet_payload));

		if (FI_BGQ_FABRIC_DIRECT_MR == FI_MR_BASIC) {		/* branch will compile out */

#ifdef FI_BGQ_TRACE
	fprintf(stderr,"fi_bgq_write_internal - NOT tx_op_flags & FI_INJECT - virtual addr is 0x%016lx physical addr is 0x%016lx key is %lu \n",addr,(addr-key),key);
#endif
			/* the 'key' is the paddr of the remote memory region */
			MUSPI_SetRecPayloadBaseAddressInfo(desc, FI_BGQ_MU_BAT_ID_GLOBAL, addr-key);

		} else if (FI_BGQ_FABRIC_DIRECT_MR == FI_MR_SCALABLE) {	/* branch will compile out */

			desc->PacketHeader.messageUnitHeader.Packet_Types.Memory_FIFO.Rec_FIFO_Id =
				fi_bgq_addr_rec_fifo_id(bgq_dst_addr->fi);

			/* the 'key' is used to index into the remote base address table */
			union fi_bgq_mu_packet_hdr * hdr = (union fi_bgq_mu_packet_hdr *) &desc->PacketHeader;
			hdr->rma.key = key;
			hdr->rma.offset = addr;
			hdr->rma.nbytes = xfer_bytes;
			hdr->rma.ndesc = 0;

		} else {
			assert(0);
		}

		/* determine the physical address of the source data */
		uint64_t src_paddr = 0;
		uint32_t cnk_rc __attribute__ ((unused));
		cnk_rc = fi_bgq_cnk_vaddr2paddr(buf, len, &src_paddr);
		assert(cnk_rc==0);

		desc->Message_Length = xfer_bytes;
		desc->Pa_Payload = src_paddr;

		if (len <= sizeof(union fi_bgq_mu_packet_payload)) {	/* likely */

			/* everything fits in one packet */
			MUSPI_InjFifoAdvanceDesc(bgq_ep->tx.injfifo.muspi_injfifo);

		} else {

			/* keep a copy of the first descriptor as a template for
			 * the remaining chunks (NOTE: this local struct shadows
			 * the 'model' pointer above) */
			MUHWI_Descriptor_t model = *desc;
			MUSPI_InjFifoAdvanceDesc(bgq_ep->tx.injfifo.muspi_injfifo);

			src_paddr += xfer_bytes;
			len -= xfer_bytes;
			addr += xfer_bytes;

			while (len > 0) {
				desc = fi_bgq_spi_injfifo_tail_wait(&bgq_ep->tx.injfifo);

				qpx_memcpy64((void*)desc, (const void*)&model);

				xfer_bytes = MIN(len, sizeof(union fi_bgq_mu_packet_payload));
				desc->Message_Length = xfer_bytes;
				desc->Pa_Payload = src_paddr;

				union fi_bgq_mu_packet_hdr * hdr = (union fi_bgq_mu_packet_hdr *) &desc->PacketHeader;
				if (FI_BGQ_FABRIC_DIRECT_MR == FI_MR_BASIC) {
#ifdef FI_BGQ_TRACE
	fprintf(stderr,"fi_bgq_write_internal for multiple packets - NOT tx_op_flags & FI_INJECT - virtual addr is 0x%016lx physical addr is 0x%016lx key is %lu \n",addr,(addr-key),key);
#endif
					/* the 'key' is the paddr of the remote memory region */
					MUSPI_SetRecPayloadBaseAddressInfo(desc, FI_BGQ_MU_BAT_ID_GLOBAL, addr-key);

				}
				else if (FI_BGQ_FABRIC_DIRECT_MR == FI_MR_SCALABLE) {
					/* key was already copied via the template */
					hdr->rma.offset = addr;
					hdr->rma.nbytes = xfer_bytes;
				}
				else {
					assert(0);
				}


				MUSPI_InjFifoAdvanceDesc(bgq_ep->tx.injfifo.muspi_injfifo);

				src_paddr += xfer_bytes;
				len -= xfer_bytes;
				addr += xfer_bytes;
			}
		}

		/* completion must wait for the data movement - use a fence */
		if (do_cq || do_cntr)
			fi_bgq_readv_internal(bgq_ep, NULL, 0, bgq_dst_addr,
				NULL, NULL, bgq_context,
				tx_op_flags, do_cq, do_cntr, lock_required);
	}
}
593
594
595
596
597
fi_bgq_write_generic(struct fid_ep * ep,const void * buf,size_t len,void * desc,fi_addr_t dst_addr,uint64_t addr,uint64_t key,void * context,int lock_required)598 static inline ssize_t fi_bgq_write_generic(struct fid_ep *ep,
599 const void *buf, size_t len, void *desc, fi_addr_t dst_addr,
600 uint64_t addr, uint64_t key, void *context,
601 int lock_required)
602 {
603 int ret;
604 struct fi_bgq_ep *bgq_ep;
605
606 bgq_ep = container_of(ep, struct fi_bgq_ep, ep_fid);
607
608 ret = fi_bgq_check_rma(bgq_ep, FI_BGQ_FABRIC_DIRECT_AV);
609 if (ret) return ret;
610
611 ret = fi_bgq_lock_if_required(&bgq_ep->lock, lock_required);
612 if (ret) return ret;
613
614 fi_bgq_write_internal(bgq_ep, buf, len, (union fi_bgq_addr *)&dst_addr,
615 addr, key, (union fi_bgq_context *)context,
616 bgq_ep->tx.op_flags, 1, 1, lock_required);
617
618 ret = fi_bgq_unlock_if_required(&bgq_ep->lock, lock_required);
619 if (ret) {
620 return ret;
621 }
622
623 return 0;
624 }
625
fi_bgq_writev_generic(struct fid_ep * ep,const struct iovec * iov,void ** desc,size_t count,fi_addr_t dst_addr,uint64_t addr,uint64_t key,void * context,int lock_required)626 static inline ssize_t fi_bgq_writev_generic(struct fid_ep *ep,
627 const struct iovec *iov, void **desc, size_t count,
628 fi_addr_t dst_addr, uint64_t addr, uint64_t key, void *context,
629 int lock_required)
630 {
631 int ret;
632 struct fi_bgq_ep *bgq_ep;
633
634 bgq_ep = container_of(ep, struct fi_bgq_ep, ep_fid);
635
636 ret = fi_bgq_check_rma(bgq_ep, FI_BGQ_FABRIC_DIRECT_AV);
637 if (ret) return ret;
638
639 ret = fi_bgq_lock_if_required(&bgq_ep->lock, lock_required);
640 if (ret) return ret;
641
642 const union fi_bgq_addr bgq_dst_addr = *((union fi_bgq_addr *)&dst_addr);
643
644 size_t index = 0;
645 for (index = 0; index < count; ++index) {
646
647 size_t len = iov[index].iov_len;
648 void * buf = iov[index].iov_base;
649
650 fi_bgq_write_internal(bgq_ep, buf, len, &bgq_dst_addr,
651 addr, key, (union fi_bgq_context *)context,
652 0, 0, 0, lock_required);
653
654 addr += len;
655 }
656
657 fi_bgq_write_fence(bgq_ep, bgq_ep->tx.op_flags, &bgq_dst_addr, (union fi_bgq_context *)context,
658 lock_required);
659
660 ret = fi_bgq_unlock_if_required(&bgq_ep->lock, lock_required);
661 if (ret) return ret;
662
663 return 0;
664 }
665
666
/*
 * fi_writemsg() entry point: walks the local (msg_iov) and remote
 * (rma_iov) iovec lists in lock step, issuing one fi_bgq_write_internal()
 * per contiguous overlap with completions suppressed, then issues a
 * fence carrying the caller's flags and context so the cq/cntr completion
 * fires only after all of the writes.
 *
 * NOTE(review): the loop terminates when either list is exhausted;
 * presumably total msg_iov bytes == total rma_iov bytes - verify against
 * callers.
 */
static inline ssize_t fi_bgq_writemsg_generic(struct fid_ep *ep,
		const struct fi_msg_rma *msg, uint64_t flags,
		int lock_required)
{
	int ret;
	struct fi_bgq_ep *bgq_ep;

	bgq_ep = container_of(ep, struct fi_bgq_ep, ep_fid);

	ret = fi_bgq_check_rma(bgq_ep, FI_BGQ_FABRIC_DIRECT_AV);
	if (ret) return ret;

	ret = fi_bgq_lock_if_required(&bgq_ep->lock, lock_required);
	if (ret) return ret;

	union fi_bgq_addr * bgq_dst_addr = (union fi_bgq_addr *)&msg->addr;


	/* cursor over the remote (rma) iovec list */
	size_t rma_iov_index = 0;
	const size_t rma_iov_count = msg->rma_iov_count;
	uint64_t rma_iov_bytes = msg->rma_iov[rma_iov_index].len;
	uint64_t rma_iov_addr = msg->rma_iov[rma_iov_index].addr;
	uint64_t rma_iov_key = msg->rma_iov[rma_iov_index].key;

	/* cursor over the local (msg) iovec list */
	size_t msg_iov_index = 0;
	const size_t msg_iov_count = msg->iov_count;
	uint64_t msg_iov_bytes = msg->msg_iov[msg_iov_index].iov_len;
	uintptr_t msg_iov_vaddr = (uintptr_t)msg->msg_iov[msg_iov_index].iov_base;

#ifdef FI_BGQ_TRACE
	fprintf(stderr,"fi_bgq_writemsg_generic msg_iov_bytes is %lu rma_iov_bytes is %lu base vadder is 0x%016lx lock_required is %d\n",msg_iov_bytes,rma_iov_bytes,msg_iov_vaddr,lock_required);
	fflush(stderr);
#endif
	while (msg_iov_bytes != 0 && rma_iov_bytes != 0) {

		/* write the largest contiguous span covered by both the
		 * current local and current remote iovec */
		size_t len = (msg_iov_bytes <= rma_iov_bytes) ? msg_iov_bytes : rma_iov_bytes;

#ifdef FI_BGQ_TRACE
	fprintf(stderr,"fi_bgq_writemsg_generic calling fi_bgq_write_internal with msg_iov_vaddr 0x%016lx and len %lu\n",msg_iov_vaddr,len);
	fflush(stderr);
#endif
		fi_bgq_write_internal(bgq_ep, (void*)msg_iov_vaddr, len, bgq_dst_addr,
			rma_iov_addr, rma_iov_key, NULL, 0, 0, 0, lock_required);

		msg_iov_bytes -= len;
		msg_iov_vaddr += len;

		/* local iovec consumed - advance to the next one, if any */
		if ((msg_iov_bytes == 0) && ((msg_iov_index+1) < msg_iov_count)) {
			++msg_iov_index;
			msg_iov_bytes = msg->msg_iov[msg_iov_index].iov_len;
			msg_iov_vaddr = (uintptr_t)msg->msg_iov[msg_iov_index].iov_base;
		}

		rma_iov_bytes -= len;
		rma_iov_addr += len;

		/* remote iovec consumed - advance to the next one, if any */
		if ((rma_iov_bytes == 0) && ((rma_iov_index+1) < rma_iov_count)) {
			++rma_iov_index;
			rma_iov_bytes = msg->rma_iov[rma_iov_index].len;
			rma_iov_addr = msg->rma_iov[rma_iov_index].addr;
			rma_iov_key = msg->rma_iov[rma_iov_index].key;
		}
	}

#ifdef FI_BGQ_TRACE
	fprintf(stderr,"fi_bgq_writemsg_generic calling fi_bgq_write_fence\n");
	fflush(stderr);
#endif
	fi_bgq_write_fence(bgq_ep, flags, bgq_dst_addr,
		(union fi_bgq_context *)msg->context,
		lock_required);

	ret = fi_bgq_unlock_if_required(&bgq_ep->lock, lock_required);
	if (ret) return ret;

	return 0;
}
744
745
fi_bgq_read_generic(struct fid_ep * ep,void * buf,size_t len,void * desc,fi_addr_t src_addr,uint64_t addr,uint64_t key,void * context,int lock_required)746 static inline ssize_t fi_bgq_read_generic(struct fid_ep *ep,
747 void *buf, size_t len, void *desc, fi_addr_t src_addr,
748 uint64_t addr, uint64_t key, void *context,
749 int lock_required)
750 {
751 int ret;
752 struct fi_bgq_ep *bgq_ep;
753
754 bgq_ep = container_of(ep, struct fi_bgq_ep, ep_fid);
755
756 ret = fi_bgq_check_rma(bgq_ep, FI_BGQ_FABRIC_DIRECT_AV);
757 if (ret) return ret;
758
759 ret = fi_bgq_lock_if_required(&bgq_ep->lock, lock_required);
760 if (ret) return ret;
761
762 struct iovec iov;
763 iov.iov_base = buf;
764 iov.iov_len = len;
765
766 fi_bgq_readv_internal(bgq_ep, &iov, 1, (union fi_bgq_addr *)&src_addr,
767 &addr, &key, (union fi_bgq_context *)context,
768 bgq_ep->tx.op_flags, 1, 1, lock_required);
769
770 ret = fi_bgq_unlock_if_required(&bgq_ep->lock, lock_required);
771 if (ret)
772 return ret;
773
774 return 0;
775 }
776
/*
 * fi_readv() entry point: batches the local iovec array into groups of at
 * most 8 (the rget/mfifo payload limit) per fi_bgq_readv_internal() call,
 * with completions suppressed on the full groups; the final (possibly
 * zero-length) partial call carries the context and enables cq/cntr so it
 * also acts as a fence for the preceding groups.
 *
 * BUG FIX: the full-group loop previously compared 'index' (an element
 * index advancing by 8) against count >> 3 (the number of groups), so for
 * count >= 16 every group after the first was silently skipped and the
 * partial tail started at the wrong element.  'full_count' is now the
 * largest multiple of 8 <= count, i.e. the element index where the
 * partial tail begins.
 *
 * NOTE(review): all entries of addr_v/key_v hold the same remote
 * addr/key, so every iovec in a group reads from the same remote offset -
 * presumably intentional for this provider, but verify against fi_readv
 * semantics (a contiguous remote region would require advancing the
 * offset per iovec).
 */
static inline ssize_t fi_bgq_readv_generic (struct fid_ep *ep,
		const struct iovec *iov, void **desc, size_t count,
		fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context,
		int lock_required)
{

#ifdef FI_BGQ_TRACE
	fprintf(stderr,"fi_bgq_readv_generic count is %lu addr is 0x%016lx key is 0x%016lx\n",count,addr,key);
	fflush(stderr);
#endif

	int ret;
	struct fi_bgq_ep *bgq_ep;

	bgq_ep = container_of(ep, struct fi_bgq_ep, ep_fid);

	ret = fi_bgq_check_rma(bgq_ep, FI_BGQ_FABRIC_DIRECT_AV);
	if (ret) return ret;

	ret = fi_bgq_lock_if_required(&bgq_ep->lock, lock_required);
	if (ret) return ret;

	union fi_bgq_addr * bgq_addr = (union fi_bgq_addr *)&src_addr;
	union fi_bgq_context * bgq_context = (union fi_bgq_context *)context;
	const uint64_t tx_op_flags = bgq_ep->tx.op_flags;

	uint64_t addr_v[8] = { addr, addr, addr, addr, addr, addr, addr, addr };
	uint64_t key_v[8] = { key, key, key, key, key, key, key, key };

	/* max 8 descriptors (iovecs) per readv_internal */
	size_t index = 0;
	const size_t full_count = count & ~((size_t)7);	/* largest multiple of 8 <= count */
	for (index = 0; index < full_count; index += 8) {

		fi_bgq_readv_internal(bgq_ep, &iov[index], 8, bgq_addr,
			addr_v, key_v, NULL, 0, 0, 0,
			lock_required);
	}

	/* if 'partial_ndesc' is zero, the fi_bgq_readv_internal() will fence */
	const size_t partial_ndesc = count & 0x07ull;
	fi_bgq_readv_internal(bgq_ep, &iov[index], partial_ndesc, bgq_addr,
		addr_v, key_v, bgq_context, tx_op_flags, 1, 1,
		lock_required);

	ret = fi_bgq_unlock_if_required(&bgq_ep->lock, lock_required);
	if (ret)
		return ret;

	return 0;
}
828
829
/*
 * fi_readmsg() entry point: walks the remote (rma_iov, the read source)
 * and local (msg_iov, the read destination) iovec lists in lock step,
 * slicing them into contiguous overlaps and batching up to 8 slices per
 * fi_bgq_readv_internal() injection.  Full batches are issued with
 * completions disabled; the batch that consumes the last remote iovec is
 * issued with completion processing and returns from inside the loop -
 * the code after the while loop is unreachable (hence assert(0)).
 *
 * The cq completion is suppressed when no send cq is bound or when the cq
 * was bound with FI_SELECTIVE_COMPLETION and FI_COMPLETION is not set in
 * 'flags'.
 */
static inline ssize_t fi_bgq_readmsg_generic(struct fid_ep *ep,
		const struct fi_msg_rma *msg, uint64_t flags,
		int lock_required)
{
#ifdef FI_BGQ_TRACE
	fprintf(stderr,"fi_bgq_readmsg_generic starting\n");
	fflush(stderr);
#endif
	int ret;
	struct fi_bgq_ep *bgq_ep;

	bgq_ep = container_of(ep, struct fi_bgq_ep, ep_fid);

	ret = fi_bgq_check_rma(bgq_ep, FI_BGQ_FABRIC_DIRECT_AV);
	if (ret) return ret;

	ret = fi_bgq_lock_if_required(&bgq_ep->lock, lock_required);
	if (ret) return ret;

	/* suppress the cq completion for selective-completion cqs unless
	 * FI_COMPLETION was requested for this operation */
	struct fi_bgq_cq * cq = bgq_ep->send_cq;
	const uint64_t enable_cq =
		(cq == NULL) || ((cq != NULL) && ((cq->bflags & FI_SELECTIVE_COMPLETION) && (flags & FI_COMPLETION) == 0)) ? 0 : 1;

	union fi_bgq_context * bgq_context = (union fi_bgq_context *) msg->context;
	union fi_bgq_addr * bgq_src_addr = (union fi_bgq_addr *)&msg->addr;

	/* for fi_read*(), the 'src' is the remote data */
	size_t src_iov_index = 0;
	const size_t src_iov_count = msg->rma_iov_count;
	uint64_t src_iov_bytes = msg->rma_iov[0].len;
	uint64_t src_iov_addr = msg->rma_iov[0].addr;
	uint64_t src_iov_key = msg->rma_iov[0].key;

	/* for fi_read*(), the 'dst' is the local data */
	size_t dst_iov_index = 0;
	const size_t dst_iov_count = msg->iov_count;
	uint64_t dst_iov_bytes = msg->msg_iov[0].iov_len;
	void * dst_iov_vaddr = msg->msg_iov[0].iov_base;

	/* staging arrays for one batch of up to 8 slices */
	size_t niov;
	struct iovec iov[8];
	uint64_t addr[8];
	uint64_t key[8];

	while (src_iov_index < src_iov_count) {

		for (niov = 0; niov < 8; ++niov) {
			/* slice = largest contiguous span covered by both the
			 * current local and current remote iovec */
			const size_t len = (dst_iov_bytes <= src_iov_bytes) ? dst_iov_bytes : src_iov_bytes;
			iov[niov].iov_len = len;
			iov[niov].iov_base = dst_iov_vaddr;
			addr[niov] = src_iov_addr;
			key[niov] = src_iov_key;

			dst_iov_bytes -= len;
			src_iov_bytes -= len;

			if (src_iov_bytes == 0) {

				/* all done with this src rma iovec */

				if (src_iov_index == (src_iov_count-1)) {

					/* this is the last src rma iovec .. perform
					 * read with completion processing and return
					 *
					 * the 'dst_iov_bytes' must be zero and it must
					 * be the last dst iovec as well */
					assert(dst_iov_bytes==0);
					assert(dst_iov_index == (dst_iov_count-1));

					/* niov+1 because slot 'niov' was just filled */
					fi_bgq_readv_internal(bgq_ep, iov, niov+1,
						bgq_src_addr, addr, key,
						bgq_context,
						flags,
						enable_cq, 1,	/* enable_cq, enable_cntr */
						lock_required);

					ret = fi_bgq_unlock_if_required(&bgq_ep->lock, lock_required);
					if (ret) return ret;

					return 0;

				} else {

					/* advance to next src rma iovec */
					++src_iov_index;
					src_iov_bytes = msg->rma_iov[src_iov_index].len;
					src_iov_addr = msg->rma_iov[src_iov_index].addr;
					src_iov_key = msg->rma_iov[src_iov_index].key;
				}
			} else {
				src_iov_addr += len;
			}


			if (dst_iov_bytes == 0) {

				/* all done with this dst iovec */

				if (dst_iov_index == (dst_iov_count-1)) {
					/* this is the last dst iovec .. do nothing since
					 * the 'src_iov_bytes' must be zero and it must
					 * be the last src rma iovec as well */
					assert(src_iov_bytes==0);
					assert(src_iov_index == (src_iov_count-1));

					/* in fact, it should be impossible to get here */
					assert(0);
				} else {

					/* advance to next dst iovec */
					++dst_iov_index;
					dst_iov_bytes = msg->msg_iov[dst_iov_index].iov_len;
					dst_iov_vaddr = msg->msg_iov[dst_iov_index].iov_base;
				}
			} else {
				dst_iov_vaddr = (void*)((uintptr_t)dst_iov_vaddr + len);
			}


		}	/* end for */

		/* full batch of 8 slices - inject without completions and
		 * keep slicing */
		fi_bgq_readv_internal(bgq_ep, iov, 8, bgq_src_addr, addr, key,
			NULL, 0,
			0, 0,	/* disable_cq, disable_cntr */
			lock_required);

	}	/* end while */

	/* should never get here */
	assert(0);

	ret = fi_bgq_unlock_if_required(&bgq_ep->lock, lock_required);
	if (ret) return ret;

	return 0;
}
967
968
969 /* Declare specialized functions that qualify for FABRIC_DIRECT.
970 * - No locks
971 */
972
973 #define FI_BGQ_RMA_FABRIC_DIRECT_LOCK 0
974
FI_BGQ_RMA_SPECIALIZED_FUNC(FI_BGQ_RMA_FABRIC_DIRECT_LOCK)975 FI_BGQ_RMA_SPECIALIZED_FUNC(FI_BGQ_RMA_FABRIC_DIRECT_LOCK)
976
977 #ifdef FABRIC_DIRECT
978
979 #define fi_write(ep, buf, len, desc, dst_addr, addr, key, context) \
980 (FI_BGQ_RMA_SPECIALIZED_FUNC_NAME(write, \
981 FI_BGQ_RMA_FABRIC_DIRECT_LOCK) \
982 (ep, buf, len, desc, dst_addr, addr, key, context))
983
984 #define fi_inject_write(ep, buf, len, dst_addr, addr, key) \
985 (FI_BGQ_RMA_SPECIALIZED_FUNC_NAME(inject_write, \
986 FI_BGQ_RMA_FABRIC_DIRECT_LOCK) \
987 (ep, buf, len, dst_addr, addr, key))
988
989 #define fi_read(ep, buf, len, desc, src_addr, addr, key, context) \
990 (FI_BGQ_RMA_SPECIALIZED_FUNC_NAME(read, \
991 FI_BGQ_RMA_FABRIC_DIRECT_LOCK) \
992 (ep, buf, len, desc, src_addr, addr, key, context))
993
994 #define fi_readmsg(ep, msg, flags) \
995 (FI_BGQ_RMA_SPECIALIZED_FUNC_NAME(readmsg, \
996 FI_BGQ_RMA_FABRIC_DIRECT_LOCK) \
997 (ep, msg, flags))
998
999 static inline ssize_t
1000 fi_writemsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags)
1001 {
1002 return ep->rma->writemsg(ep, msg, flags);
1003 }
1004 static inline ssize_t
fi_writev(struct fid_ep * ep,const struct iovec * iov,void ** desc,size_t count,fi_addr_t dest_addr,uint64_t addr,uint64_t key,void * context)1005 fi_writev(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count,
1006 fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context)
1007 {
1008 return ep->rma->writev(ep, iov, desc, count, dest_addr, addr, key, context);
1009 }
1010
1011 #endif
1012
1013 #ifdef __cplusplus
1014 }
1015 #endif
1016
1017 #endif /* _FI_BGQ_DIRECT_RMA_H_ */
1018