/*
 * Copyright (c) 2018-2019 Intel Corporation, Inc.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#if HAVE_CONFIG_H
#  include <config.h>
#endif /* HAVE_CONFIG_H */

#include <string.h>

#include <rdma/fabric.h>
#include <rdma/fi_atomic.h>
#include <rdma/fi_cm.h>
#include <rdma/fi_domain.h>
#include <rdma/fi_endpoint.h>
#include <rdma/fi_eq.h>
#include <rdma/fi_rma.h>
#include <rdma/fi_tagged.h>
#include "rdma/providers/fi_log.h"

#include <ofi.h>
#include <ofi_util.h>
#include <ofi_iov.h>
#include <ofi_list.h>
#include <ofi_proto.h>
#include <ofi_prov.h>
#include <ofi_enosys.h>

#define MRAIL_MAX_INFO 100

#define MRAIL_PASSTHRU_TX_OP_FLAGS	(FI_INJECT_COMPLETE | \
					 FI_TRANSMIT_COMPLETE | \
					 FI_DELIVERY_COMPLETE)
#define MRAIL_PASSTHRU_RX_OP_FLAGS	(0ULL)
#define MRAIL_TX_OP_FLAGS		(FI_INJECT | FI_COMPLETION)
#define MRAIL_RX_OP_FLAGS		(FI_COMPLETION)

#define MRAIL_PASSTHRU_MODES	(0ULL)
#define MRAIL_PASSTHRU_MR_MODES	(OFI_MR_BASIC_MAP)

#define MRAIL_RAIL_CQ_FORMAT	FI_CQ_FORMAT_TAGGED

extern struct fi_info mrail_info;
extern struct fi_provider mrail_prov;
extern struct util_prov mrail_util_prov;
extern struct fi_fabric_attr mrail_fabric_attr;

extern struct fi_info *mrail_info_vec[MRAIL_MAX_INFO];
extern size_t mrail_num_info;

enum {
	MRAIL_POLICY_FIXED,
	MRAIL_POLICY_ROUND_ROBIN,
	MRAIL_POLICY_STRIPING
};

#define MRAIL_MAX_CONFIG		8

struct mrail_config {
	size_t		max_size;
	int		policy;
};

extern struct mrail_config mrail_config[MRAIL_MAX_CONFIG];
extern int mrail_num_config;
extern int mrail_local_rank;
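
/*
 * Illustrative sketch of how this configuration is meant to be read (the
 * values below are hypothetical, not provider defaults): mrail_config[] is
 * an ascending table of message-size thresholds, and mrail_get_policy()
 * further down returns the policy of the first entry whose max_size covers
 * the transfer, falling back to the last entry otherwise.
 *
 *	mrail_config[0] = (struct mrail_config) {
 *		.max_size = 16384, .policy = MRAIL_POLICY_ROUND_ROBIN };
 *	mrail_config[1] = (struct mrail_config) {
 *		.max_size = SIZE_MAX, .policy = MRAIL_POLICY_STRIPING };
 *	mrail_num_config = 2;
 */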

extern struct fi_ops_rma mrail_ops_rma;

struct mrail_match_attr {
	fi_addr_t addr;
	uint64_t tag;
};

struct mrail_unexp_msg_entry {
	struct dlist_entry	entry;
	fi_addr_t		addr;
	uint64_t		tag;
	void			*context;
	char			data[];		/* completion entry */
};

struct mrail_recv_queue;

typedef struct mrail_unexp_msg_entry *
(*mrail_get_unexp_msg_entry_func)(struct mrail_recv_queue *recv_queue, void *context);

struct mrail_recv_queue {
	struct fi_provider		*prov;
	struct dlist_entry		recv_list;
	struct dlist_entry		unexp_msg_list;
	dlist_func_t			*match_recv;
	dlist_func_t			*match_unexp;
	mrail_get_unexp_msg_entry_func	get_unexp_msg_entry;
};

struct mrail_recv *
mrail_match_recv_handle_unexp(struct mrail_recv_queue *recv_queue, uint64_t tag,
			      uint64_t addr, char *data, size_t len, void *context);

/* mrail protocol */
#define MRAIL_HDR_VERSION 2

enum {
	MRAIL_PROTO_EAGER,
	MRAIL_PROTO_RNDV
};

enum {
	MRAIL_RNDV_REQ,
	MRAIL_RNDV_ACK
};

struct mrail_hdr {
	uint8_t		version;
	uint8_t		op;
	uint8_t		protocol;
	uint8_t		protocol_cmd;
	uint32_t	seq;
	uint64_t	tag;
};

#define MRAIL_IOV_LIMIT		5

/* bits 60-63 are provider defined */
#define MRAIL_RNDV_FLAG		(1ULL << 60)

struct mrail_rndv_hdr {
	uint64_t		context;
};

struct mrail_rndv_req {
	size_t			len;
	size_t			count;
	size_t			mr_count;
	struct fi_rma_iov	rma_iov[MRAIL_IOV_LIMIT];
	size_t			rawkey_size;
	uint8_t			rawkey[]; /* rawkey + base_addr */
};
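
/*
 * Presumed rendezvous flow (a sketch inferred from the structures above,
 * not a normative description): the sender registers the source buffer and
 * sends an MRAIL_RNDV_REQ carrying the fi_rma_iov array and raw MR keys;
 * the receiver pulls the data with RMA reads and then returns an
 * MRAIL_RNDV_ACK referencing the sender's context (mrail_rndv_hdr.context)
 * so the sender can release the MR and complete the operation.
 */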

struct mrail_tx_buf {
	/* context must stay at the top; it is overwritten when the
	 * util buf is released */
	void			*context;
	struct mrail_ep		*ep;
	/* flags carries both operation flags (FI_COMPLETION) and
	 * completion flags (FI_MSG, FI_TAGGED, etc.) */
	uint64_t		flags;
	struct mrail_hdr	hdr;
	struct mrail_rndv_hdr	rndv_hdr;
	struct mrail_rndv_req	*rndv_req;
	fid_t			rndv_mr_fid;
};

struct mrail_pkt {
	struct mrail_hdr	hdr;
	char			data[];
};

/* TX & RX processing */

struct mrail_rx_buf {
	struct fid_ep		*rail_ep;
	struct mrail_pkt	pkt;
};

struct mrail_rndv_recv {
	void			*context;
	uint64_t		flags;
	uint64_t		tag;
	uint64_t		data;
	size_t			len;
};

struct mrail_recv {
	struct iovec		iov[MRAIL_IOV_LIMIT];
	void			*desc[MRAIL_IOV_LIMIT];
	uint8_t			count;
	void			*context;
	uint64_t		flags;
	uint64_t		comp_flags;
	struct mrail_hdr	hdr;
	struct mrail_ep		*ep;
	struct dlist_entry	entry;
	fi_addr_t		addr;
	uint64_t		tag;
	uint64_t		ignore;
	struct mrail_rndv_recv	rndv;
};
DECLARE_FREESTACK(struct mrail_recv, mrail_recv_fs);

int mrail_cq_process_buf_recv(struct fi_cq_tagged_entry *comp,
			      struct mrail_recv *recv);

struct mrail_fabric {
	struct util_fabric util_fabric;
	struct fi_info *info;
	struct fid_fabric **fabrics;
	size_t num_fabrics;
};

struct mrail_domain {
	struct util_domain util_domain;
	struct fi_info *info;
	struct fid_domain **domains;
	size_t num_domains;
	size_t addrlen;
};

struct mrail_av {
	struct util_av util_av;
	struct fid_av **avs;
	size_t *rail_addrlen;
	size_t num_avs;
};

struct mrail_peer_info {
	struct slist	ooo_recv_queue;
	fi_addr_t	addr;
	uint32_t	seq_no;
	uint32_t	expected_seq_no;
};

struct mrail_ooo_recv {
	struct slist_entry		entry;
	struct fi_cq_tagged_entry	comp;
	uint32_t			seq_no;
};
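
/*
 * Sketch of the presumed ordering logic (inferred from the fields above):
 * sends to a peer are stamped with seq_no, and a received completion is
 * delivered only when it matches the peer's expected_seq_no; completions
 * that arrive early are parked on the peer's ooo_recv_queue as
 * mrail_ooo_recv entries until the gap is filled.
 */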

typedef int (*mrail_cq_process_comp_func_t)(struct fi_cq_tagged_entry *comp,
					    fi_addr_t src_addr);
struct mrail_cq {
	struct util_cq			util_cq;
	struct fid_cq			**cqs;
	size_t				num_cqs;
	mrail_cq_process_comp_func_t	process_comp;
};

struct mrail_ep {
	struct util_ep		util_ep;
	struct fi_info		*info;
	struct {
		struct fid_ep		*ep;
		struct fi_info		*info;
	}			*rails;
	size_t			num_eps;
	ofi_atomic32_t		tx_rail;
	ofi_atomic32_t		rx_rail;
	int			default_tx_rail;

	struct mrail_recv_fs	*recv_fs;
	struct mrail_recv_queue recv_queue;
	struct mrail_recv_queue trecv_queue;

	struct ofi_bufpool	*req_pool;
	struct ofi_bufpool	*ooo_recv_pool;
	struct ofi_bufpool	*tx_buf_pool;
	struct slist		deferred_reqs;
};

struct mrail_addr_key {
	uint64_t base_addr;
	uint64_t key;
};

struct mrail_mr {
	struct fid_mr mr_fid;
	size_t num_mrs;
	struct {
		uint64_t base_addr;
		struct fid_mr *mr;
	} rails[];
};

int mrail_fabric_open(struct fi_fabric_attr *attr, struct fid_fabric **fabric,
		      void *context);
int mrail_domain_open(struct fid_fabric *fabric, struct fi_info *info,
		      struct fid_domain **domain, void *context);
int mrail_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr,
		  struct fid_cq **cq_fid, void *context);
int mrail_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr,
		  struct fid_av **av_fid, void *context);
int mrail_ep_open(struct fid_domain *domain, struct fi_info *info,
		  struct fid_ep **ep_fid, void *context);

static inline struct mrail_recv *
mrail_pop_recv(struct mrail_ep *mrail_ep)
{
	struct mrail_recv *recv;
	ofi_ep_lock_acquire(&mrail_ep->util_ep);
	recv = freestack_isempty(mrail_ep->recv_fs) ? NULL :
		freestack_pop(mrail_ep->recv_fs);
	ofi_ep_lock_release(&mrail_ep->util_ep);
	return recv;
}

static inline void
mrail_push_recv(struct mrail_recv *recv)
{
	ofi_ep_lock_acquire(&recv->ep->util_ep);
	freestack_push(recv->ep->recv_fs, recv);
	ofi_ep_lock_release(&recv->ep->util_ep);
}
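
/*
 * Typical use of the receive freestack by a posting path, as a sketch; the
 * surrounding logic here is illustrative, not the provider's actual code:
 *
 *	struct mrail_recv *recv = mrail_pop_recv(mrail_ep);
 *	if (!recv)
 *		return -FI_EAGAIN;
 *	... fill in iov, tag, context, then match or queue the recv ...
 *	mrail_push_recv(recv);	// return the entry once the recv completes
 */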

static inline struct fi_info *mrail_get_info_cached(char *name)
{
	struct fi_info *info;
	size_t i;

	for (i = 0; i < mrail_num_info; i++) {
		info = mrail_info_vec[i];
		if (!strcmp(info->fabric_attr->name, name))
			return info;
	}

	FI_WARN(&mrail_prov, FI_LOG_CORE, "Unable to find matching "
		"fi_info in mrail_info_vec for given fabric name\n");
	return NULL;
}

static inline int mrail_close_fids(struct fid **fids, size_t count)
{
	int ret, retv = 0;
	size_t i;

	for (i = 0; i < count; i++) {
		if (fids[i]) {
			ret = fi_close(fids[i]);
			if (ret)
				retv = ret;
		}
	}
	return retv;
}

static inline size_t mrail_get_tx_rail_rr(struct mrail_ep *mrail_ep)
{
	return (ofi_atomic_inc32(&mrail_ep->tx_rail) - 1) % mrail_ep->num_eps;
}

static inline int mrail_get_policy(size_t size)
{
	int i;

	for (i = 0; i < mrail_num_config - 1; i++)
		if (size <= mrail_config[i].max_size)
			break;

	return mrail_config[i].policy;
}

static inline size_t mrail_get_tx_rail(struct mrail_ep *mrail_ep, int policy)
{
	return policy == MRAIL_POLICY_FIXED ?
				mrail_ep->default_tx_rail :
				mrail_get_tx_rail_rr(mrail_ep);
}
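
/*
 * Sketch of how a send path might combine the helpers above to pick a rail
 * for a non-striped transfer of 'len' bytes (the fi_tsend call and the
 * variable names are illustrative only; striped transfers instead fan out
 * into subreqs, see below):
 *
 *	int policy = mrail_get_policy(len);
 *	size_t rail = mrail_get_tx_rail(mrail_ep, policy);
 *	ret = fi_tsend(mrail_ep->rails[rail].ep, buf, len, desc,
 *		       rail_addr, tag, context);
 */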

struct mrail_subreq {
	struct fi_context context;
	struct mrail_req *parent;
	void *descs[MRAIL_IOV_LIMIT];
	struct iovec iov[MRAIL_IOV_LIMIT];
	struct fi_rma_iov rma_iov[MRAIL_IOV_LIMIT];
	size_t iov_count;
	size_t rma_iov_count;
};

struct mrail_req {
	struct slist_entry entry;
	uint64_t flags;
	uint64_t data;
	struct mrail_ep *mrail_ep;
	struct mrail_peer_info *peer_info;
	struct fi_cq_tagged_entry comp;
	ofi_atomic32_t expected_subcomps;
	int op_type;
	int pending_subreq;
	struct mrail_subreq subreqs[];
};
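
/*
 * Presumed striping model (inferred from the fields above): a striped
 * operation is tracked by one mrail_req whose subreqs[] fan out across the
 * rails; expected_subcomps counts the rail-level completions still
 * outstanding, and the parent's comp entry is reported to the user CQ only
 * once that count drains to zero.
 */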

static inline
struct mrail_req *mrail_alloc_req(struct mrail_ep *mrail_ep)
{
	struct mrail_req *req;

	ofi_ep_lock_acquire(&mrail_ep->util_ep);
	req = ofi_buf_alloc(mrail_ep->req_pool);
	ofi_ep_lock_release(&mrail_ep->util_ep);

	return req;
}

static inline
void mrail_free_req(struct mrail_ep *mrail_ep, struct mrail_req *req)
{
	ofi_ep_lock_acquire(&mrail_ep->util_ep);
	ofi_buf_free(req);
	ofi_ep_lock_release(&mrail_ep->util_ep);
}

void mrail_progress_deferred_reqs(struct mrail_ep *mrail_ep);

void mrail_poll_cq(struct util_cq *cq);

static inline void mrail_cntr_incerr(struct util_cntr *cntr)
{
	if (cntr) {
		cntr->cntr_fid.ops->adderr(&cntr->cntr_fid, 1);
	}
}

int mrail_send_rndv_ack_blocking(struct mrail_ep *mrail_ep,
				 struct mrail_cq *mrail_cq,
				 fi_addr_t dest_addr,
				 void *context);