1 /*
2 * Copyright (c) 2018-2019 Intel Corporation, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 */
32
33 #if HAVE_CONFIG_H
34 # include <config.h>
35 #endif /* HAVE_CONFIG_H */
36
37 #include <string.h>
38
39 #include <rdma/fabric.h>
40 #include <rdma/fi_atomic.h>
41 #include <rdma/fi_cm.h>
42 #include <rdma/fi_domain.h>
43 #include <rdma/fi_endpoint.h>
44 #include <rdma/fi_eq.h>
45 #include <rdma/fi_rma.h>
46 #include <rdma/fi_tagged.h>
47 #include "rdma/providers/fi_log.h"
48
49 #include <ofi.h>
50 #include <ofi_util.h>
51 #include <ofi_iov.h>
52 #include <ofi_list.h>
53 #include <ofi_proto.h>
54 #include <ofi_prov.h>
55 #include <ofi_enosys.h>
56
/* Capacity of the mrail_info_vec[] array declared in this header. */
#define MRAIL_MAX_INFO 100

/* Mask of the TX completion-level op flags.
 * NOTE(review): "PASSTHRU" presumably means these bits are handed through to
 * the underlying rails unchanged -- confirm at the point of use. */
#define MRAIL_PASSTHRU_TX_OP_FLAGS	(FI_INJECT_COMPLETE | \
					 FI_TRANSMIT_COMPLETE | \
					 FI_DELIVERY_COMPLETE)
/* The RX pass-through mask is empty. */
#define MRAIL_PASSTHRU_RX_OP_FLAGS	(0ULL)
/* TX / RX op-flag masks.
 * NOTE(review): presumably the op flags mrail handles itself -- confirm. */
#define MRAIL_TX_OP_FLAGS		(FI_INJECT | FI_COMPLETION)
#define MRAIL_RX_OP_FLAGS		(FI_COMPLETION)

/* Mode and MR-mode masks that follow the same PASSTHRU naming as above. */
#define MRAIL_PASSTHRU_MODES		(0ULL)
#define MRAIL_PASSTHRU_MR_MODES		(OFI_MR_BASIC_MAP)

/* CQ entry format constant for rail CQs: the tagged format. */
#define MRAIL_RAIL_CQ_FORMAT		FI_CQ_FORMAT_TAGGED
70
/* Provider-wide objects; only declared here, defined in another file. */
extern struct fi_info mrail_info;
extern struct fi_provider mrail_prov;
extern struct util_prov mrail_util_prov;
extern struct fi_fabric_attr mrail_fabric_attr;

/* Table of cached fi_info pointers and the number of valid entries in it.
 * mrail_get_info_cached() searches entries [0, mrail_num_info) by fabric
 * name. */
extern struct fi_info *mrail_info_vec[MRAIL_MAX_INFO];
extern size_t mrail_num_info;
78
/* Rail-selection policies. mrail_get_tx_rail() uses the endpoint's
 * default_tx_rail for MRAIL_POLICY_FIXED and falls back to round-robin
 * selection for every other value; how MRAIL_POLICY_STRIPING is handled
 * beyond that is not visible in this header. */
enum {
	MRAIL_POLICY_FIXED,
	MRAIL_POLICY_ROUND_ROBIN,
	MRAIL_POLICY_STRIPING
};
84
/* Capacity of the mrail_config[] table. */
#define MRAIL_MAX_CONFIG 8

/* One row of the size-to-policy table consulted by mrail_get_policy(). */
struct mrail_config {
	/* Inclusive upper bound: the row matches when size <= max_size. */
	size_t		max_size;
	/* Policy returned for a matching size; presumably one of
	 * MRAIL_POLICY_* -- confirm where the table is filled in. */
	int		policy;
};
91
/* Size-to-policy table and its entry count (see mrail_get_policy()). */
extern struct mrail_config mrail_config[MRAIL_MAX_CONFIG];
extern int mrail_num_config;
/* NOTE(review): presumably the rank of this process on the local node --
 * confirm where it is assigned. */
extern int mrail_local_rank;

/* RMA operations table; defined in another file. */
extern struct fi_ops_rma mrail_ops_rma;
97
/* Address/tag pair.
 * NOTE(review): presumably the argument handed to the recv-queue match
 * callbacks (match_recv / match_unexp) -- confirm in the implementation. */
struct mrail_match_attr {
	fi_addr_t addr;
	uint64_t tag;
};
102
/* List node describing a message that arrived before a matching receive
 * was posted (naming-based description; queued via `entry`). */
struct mrail_unexp_msg_entry {
	struct dlist_entry	entry;		/* list linkage */
	fi_addr_t		addr;
	uint64_t		tag;
	void			*context;
	char			data[];		/* completion entry */
};
110
/* Forward declaration so the callback type below can refer to the queue. */
struct mrail_recv_queue;

/* Callback type: given a receive queue and an opaque context, returns a
 * pointer to an unexpected-message entry. */
typedef struct mrail_unexp_msg_entry *
(*mrail_get_unexp_msg_entry_func)(struct mrail_recv_queue *recv_queue, void *context);
115
/* Receive-matching state: two lists plus the callbacks used to search them.
 * struct mrail_ep embeds two of these (recv_queue and trecv_queue). */
struct mrail_recv_queue {
	struct fi_provider		*prov;
	struct dlist_entry		recv_list;	/* list head */
	struct dlist_entry		unexp_msg_list;	/* list head */
	dlist_func_t			*match_recv;
	dlist_func_t			*match_unexp;
	mrail_get_unexp_msg_entry_func	get_unexp_msg_entry;
};
124
/* Defined in another file.
 * NOTE(review): judging by the name, looks up a posted receive matching
 * (tag, addr) and records the message as unexpected when none matches --
 * confirm the exact return contract in the implementation. */
struct mrail_recv *
mrail_match_recv_handle_unexp(struct mrail_recv_queue *recv_queue, uint64_t tag,
			      uint64_t addr, char *data, size_t len, void *context);
128
/* mrail protocol */
/* Protocol version constant; presumably the value stored in
 * mrail_hdr.version -- confirm at the point of use. */
#define MRAIL_HDR_VERSION 2

/* Protocol identifiers; presumably the values of mrail_hdr.protocol. */
enum {
	MRAIL_PROTO_EAGER,
	MRAIL_PROTO_RNDV
};

/* Rendezvous sub-commands; presumably the values of
 * mrail_hdr.protocol_cmd. */
enum {
	MRAIL_RNDV_REQ,
	MRAIL_RNDV_ACK
};
141
/* Header placed in front of the payload (see struct mrail_pkt). Field order
 * and widths define the layout and must not be rearranged. */
struct mrail_hdr {
	uint8_t		version;
	uint8_t		op;
	uint8_t		protocol;
	uint8_t		protocol_cmd;
	uint32_t	seq;
	uint64_t	tag;
};
150
/* Size of the fixed iov / desc / rma_iov arrays used by the structures in
 * this header. */
#define MRAIL_IOV_LIMIT		5

/* Bits 60-63 are provider-defined; mrail uses bit 60 as its rendezvous
 * flag. */
#define MRAIL_RNDV_FLAG		(1ULL << 60)
155
/* Rendezvous header: a single 64-bit context value. */
struct mrail_rndv_hdr {
	uint64_t context;
};
159
/* Rendezvous request: length and counts, up to MRAIL_IOV_LIMIT RMA iovs,
 * and a variable-length raw-key area at the end. */
struct mrail_rndv_req {
	size_t			len;
	size_t			count;
	size_t			mr_count;
	struct fi_rma_iov	rma_iov[MRAIL_IOV_LIMIT];
	size_t			rawkey_size;
	uint8_t			rawkey[]; /* rawkey + base_addr */
};
168
/* Per-send buffer: caller context, owning endpoint, flags, the mrail header
 * and the rendezvous state. */
struct mrail_tx_buf {
	/* context must stay the first member; it gets overwritten when the
	 * util buf is released */
	void			*context;
	struct mrail_ep		*ep;
	/* flags holds both operation flags (FI_COMPLETION) and completion
	 * flags (FI_MSG, FI_TAGGED, etc.) */
	uint64_t		flags;
	struct mrail_hdr	hdr;
	struct mrail_rndv_hdr	rndv_hdr;
	struct mrail_rndv_req	*rndv_req;
	fid_t			rndv_mr_fid;
};
182
/* Packet layout: mrail header immediately followed by the payload bytes. */
struct mrail_pkt {
	struct mrail_hdr	hdr;
	char			data[];	/* flexible array member */
};
187
/* TX & RX processing */

/* Receive buffer: a rail endpoint pointer followed by the packet. pkt ends
 * in a flexible array member, so it must remain the last member. */
struct mrail_rx_buf {
	struct fid_ep		*rail_ep;
	struct mrail_pkt	pkt;
};
194
/* Rendezvous-specific receive state; embedded in struct mrail_recv as
 * `rndv`. */
struct mrail_rndv_recv {
	void		*context;
	uint64_t	flags;
	uint64_t	tag;
	uint64_t	data;
	size_t		len;
};
202
/* Receive entry. Entries come from the endpoint's free stack
 * (mrail_pop_recv()) and are returned with mrail_push_recv(), which finds
 * the owning endpoint through `ep`. */
struct mrail_recv {
	struct iovec		iov[MRAIL_IOV_LIMIT];
	void			*desc[MRAIL_IOV_LIMIT];
	uint8_t			count;	/* presumably the number of iov[] entries in use */
	void			*context;
	uint64_t		flags;
	uint64_t		comp_flags;
	struct mrail_hdr	hdr;
	struct mrail_ep		*ep;	/* owning endpoint */
	struct dlist_entry	entry;	/* list linkage */
	fi_addr_t		addr;
	uint64_t		tag;
	uint64_t		ignore;
	struct mrail_rndv_recv	rndv;
};
/* Declares the free-stack type struct mrail_recv_fs holding mrail_recv
 * entries (used as mrail_ep.recv_fs). */
DECLARE_FREESTACK(struct mrail_recv, mrail_recv_fs);
219
/* Defined in another file. Takes a tagged CQ entry and a receive entry and
 * returns an int status; the exact contract is not visible in this header. */
int mrail_cq_process_buf_recv(struct fi_cq_tagged_entry *comp,
			      struct mrail_recv *recv);
222
/* mrail fabric object: the util fabric plus an array of fabric handles and
 * its length (presumably one per rail). */
struct mrail_fabric {
	struct util_fabric	util_fabric;
	struct fi_info		*info;
	struct fid_fabric	**fabrics;
	size_t			num_fabrics;
};
229
/* mrail domain object: the util domain plus an array of domain handles and
 * its length (presumably one per rail). */
struct mrail_domain {
	struct util_domain	util_domain;
	struct fi_info		*info;
	struct fid_domain	**domains;
	size_t			num_domains;
	size_t			addrlen;
};
237
/* mrail address vector: the util AV plus an array of AV handles, a
 * per-rail address-length array and the array length. */
struct mrail_av {
	struct util_av		util_av;
	struct fid_av		**avs;
	size_t			*rail_addrlen;
	size_t			num_avs;
};
244
/* Per-peer sequencing state: a queue of out-of-order receives, the peer
 * address and two sequence counters.
 * NOTE(review): presumably seq_no is the next sequence number to send and
 * expected_seq_no the next one expected on receive -- confirm. */
struct mrail_peer_info {
	struct slist	ooo_recv_queue;
	fi_addr_t	addr;
	uint32_t	seq_no;
	uint32_t	expected_seq_no;
};
251
/* Singly-linked list node holding a tagged completion entry together with
 * its sequence number (presumably queued on
 * mrail_peer_info.ooo_recv_queue). */
struct mrail_ooo_recv {
	struct slist_entry		entry;
	struct fi_cq_tagged_entry	comp;
	uint32_t			seq_no;
};
257
/* Callback type that processes one tagged completion entry for the given
 * source address and returns an int status. */
typedef int (*mrail_cq_process_comp_func_t)(struct fi_cq_tagged_entry *comp,
					    fi_addr_t src_addr);
/* mrail completion queue: the util CQ, an array of CQ handles with its
 * length, and the completion-processing callback. */
struct mrail_cq {
	struct util_cq			util_cq;
	struct fid_cq			**cqs;
	size_t				num_cqs;
	mrail_cq_process_comp_func_t	process_comp;
};
266
/* mrail endpoint: the util endpoint plus per-rail endpoints, rail-selection
 * state, receive queues and buffer pools. */
struct mrail_ep {
	struct util_ep		util_ep;
	struct fi_info		*info;
	/* Array of per-rail endpoint / info pairs. */
	struct {
		struct fid_ep 		*ep;
		struct fi_info		*info;
	} *rails;
	/* Modulus for round-robin rail selection (mrail_get_tx_rail_rr());
	 * presumably the length of rails[]. */
	size_t			num_eps;
	/* Counter incremented atomically for round-robin TX selection. */
	ofi_atomic32_t		tx_rail;
	ofi_atomic32_t		rx_rail;
	/* Rail used when the policy is MRAIL_POLICY_FIXED. */
	int			default_tx_rail;

	/* Free stack of receive entries; accessed under the endpoint lock by
	 * mrail_pop_recv() / mrail_push_recv(). */
	struct mrail_recv_fs	*recv_fs;
	struct mrail_recv_queue recv_queue;
	struct mrail_recv_queue trecv_queue;

	/* Pool that mrail_alloc_req() allocates from. */
	struct ofi_bufpool	*req_pool;
	struct ofi_bufpool 	*ooo_recv_pool;
	struct ofi_bufpool	*tx_buf_pool;
	struct slist		deferred_reqs;
};
288
/* Base address / key pair, both 64-bit. */
struct mrail_addr_key {
	uint64_t base_addr;
	uint64_t key;
};
293
/* mrail memory region: the MR fid followed by a variable-length array of
 * base-address / MR pairs (presumably num_mrs of them, one per rail). */
struct mrail_mr {
	struct fid_mr mr_fid;
	size_t num_mrs;
	struct {
		uint64_t base_addr;
		struct fid_mr *mr;
	} rails[];	/* flexible array member */
};
302
/* Object constructors, all defined in other files. Each returns an int
 * status and hands the new object back through its `struct fid_* **`
 * out-parameter. */
int mrail_fabric_open(struct fi_fabric_attr *attr, struct fid_fabric **fabric,
		      void *context);
int mrail_domain_open(struct fid_fabric *fabric, struct fi_info *info,
		      struct fid_domain **domain, void *context);
int mrail_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr,
		  struct fid_cq **cq_fid, void *context);
int mrail_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr,
		  struct fid_av **av_fid, void *context);
int mrail_ep_open(struct fid_domain *domain, struct fi_info *info,
		  struct fid_ep **ep_fid, void *context);
313
314 static inline struct mrail_recv *
mrail_pop_recv(struct mrail_ep * mrail_ep)315 mrail_pop_recv(struct mrail_ep *mrail_ep)
316 {
317 struct mrail_recv *recv;
318 ofi_ep_lock_acquire(&mrail_ep->util_ep);
319 recv = freestack_isempty(mrail_ep->recv_fs) ? NULL :
320 freestack_pop(mrail_ep->recv_fs);
321 ofi_ep_lock_release(&mrail_ep->util_ep);
322 return recv;
323 }
324
325 static inline void
mrail_push_recv(struct mrail_recv * recv)326 mrail_push_recv(struct mrail_recv *recv)
327 {
328 ofi_ep_lock_acquire(&recv->ep->util_ep);
329 freestack_push(recv->ep->recv_fs, recv);
330 ofi_ep_lock_release(&recv->ep->util_ep);
331 }
332
mrail_get_info_cached(char * name)333 static inline struct fi_info *mrail_get_info_cached(char *name)
334 {
335 struct fi_info *info;
336 size_t i;
337
338 for (i = 0; i < mrail_num_info; i++) {
339 info = mrail_info_vec[i];
340 if (!strcmp(info->fabric_attr->name, name))
341 return info;
342 }
343
344 FI_WARN(&mrail_prov, FI_LOG_CORE, "Unable to find matching "
345 "fi_info in mrail_info_vec for given fabric name\n");
346 return NULL;
347 }
348
/*
 * Close every non-NULL fid in fids[0 .. count).
 *
 * All entries are attempted even after a failure. Returns 0 when every
 * fi_close() call succeeded, otherwise the status of the last call that
 * failed.
 */
static inline int mrail_close_fids(struct fid **fids, size_t count)
{
	int last_err = 0;
	size_t idx;

	for (idx = 0; idx < count; idx++) {
		int rc;

		if (!fids[idx])
			continue;

		rc = fi_close(fids[idx]);
		if (rc)
			last_err = rc;
	}

	return last_err;
}
363
mrail_get_tx_rail_rr(struct mrail_ep * mrail_ep)364 static inline size_t mrail_get_tx_rail_rr(struct mrail_ep *mrail_ep)
365 {
366 return (ofi_atomic_inc32(&mrail_ep->tx_rail) - 1) % mrail_ep->num_eps;
367 }
368
mrail_get_policy(size_t size)369 static inline int mrail_get_policy(size_t size)
370 {
371 int i;
372
373 for (i = 0; i < mrail_num_config - 1; i++)
374 if (size <= mrail_config[i].max_size)
375 break;
376
377 return mrail_config[i].policy;
378 }
379
mrail_get_tx_rail(struct mrail_ep * mrail_ep,int policy)380 static inline size_t mrail_get_tx_rail(struct mrail_ep *mrail_ep, int policy)
381 {
382 return policy == MRAIL_POLICY_FIXED ?
383 mrail_ep->default_tx_rail :
384 mrail_get_tx_rail_rr(mrail_ep);
385
386 }
387
/* One sub-request of a struct mrail_req: its own fi_context, a back
 * pointer to the parent request, and fixed-size descriptor / iov / RMA-iov
 * arrays with their element counts. */
struct mrail_subreq {
	struct fi_context context;
	struct mrail_req *parent;
	void *descs[MRAIL_IOV_LIMIT];
	struct iovec iov[MRAIL_IOV_LIMIT];
	struct fi_rma_iov rma_iov[MRAIL_IOV_LIMIT];
	size_t iov_count;
	size_t rma_iov_count;
};
397
/* Request object allocated from mrail_ep.req_pool (mrail_alloc_req()). It
 * ends with a variable-length array of sub-requests. */
struct mrail_req {
	struct slist_entry entry;	/* singly-linked list linkage */
	uint64_t flags;
	uint64_t data;
	struct mrail_ep *mrail_ep;
	struct mrail_peer_info *peer_info;
	struct fi_cq_tagged_entry comp;
	/* NOTE(review): presumably the number of sub-request completions
	 * still outstanding -- confirm in the completion path. */
	ofi_atomic32_t expected_subcomps;
	int op_type;
	int pending_subreq;
	struct mrail_subreq subreqs[];	/* flexible array member */
};
410
411 static inline
mrail_alloc_req(struct mrail_ep * mrail_ep)412 struct mrail_req *mrail_alloc_req(struct mrail_ep *mrail_ep)
413 {
414 struct mrail_req *req;
415
416 ofi_ep_lock_acquire(&mrail_ep->util_ep);
417 req = ofi_buf_alloc(mrail_ep->req_pool);
418 ofi_ep_lock_release(&mrail_ep->util_ep);
419
420 return req;
421 }
422
423 static inline
mrail_free_req(struct mrail_ep * mrail_ep,struct mrail_req * req)424 void mrail_free_req(struct mrail_ep *mrail_ep, struct mrail_req *req)
425 {
426 ofi_ep_lock_acquire(&mrail_ep->util_ep);
427 ofi_buf_free(req);
428 ofi_ep_lock_release(&mrail_ep->util_ep);
429 }
430
/* Defined in another file; presumably drains mrail_ep->deferred_reqs --
 * confirm in the implementation. */
void mrail_progress_deferred_reqs(struct mrail_ep *mrail_ep);

/* Defined in another file; takes the util CQ to poll. */
void mrail_poll_cq(struct util_cq *cq);
434
mrail_cntr_incerr(struct util_cntr * cntr)435 static inline void mrail_cntr_incerr(struct util_cntr *cntr)
436 {
437 if (cntr) {
438 cntr->cntr_fid.ops->adderr(&cntr->cntr_fid, 1);
439 }
440 }
441
/* Defined in another file. Judging by the name, sends a rendezvous ACK to
 * dest_addr and blocks until it is done; returns an int status.
 * NOTE(review): confirm the blocking semantics and the role of mrail_cq in
 * the implementation. */
int mrail_send_rndv_ack_blocking(struct mrail_ep *mrail_ep,
				 struct mrail_cq *mrail_cq,
				 fi_addr_t dest_addr,
				 void *context);
446