1 /*
2  * Copyright (c) 2015-2018 Cray Inc. All rights reserved.
3  * Copyright (c) 2015-2018 Los Alamos National Security, LLC.
4  *                         All rights reserved.
5  *
6  * This software is available to you under a choice of one of two
7  * licenses.  You may choose to be licensed under the terms of the GNU
8  * General Public License (GPL) Version 2, available from the file
9  * COPYING in the main directory of this source tree, or the
10  * BSD license below:
11  *
12  *     Redistribution and use in source and binary forms, with or
13  *     without modification, are permitted provided that the following
14  *     conditions are met:
15  *
16  *      - Redistributions of source code must retain the above
17  *        copyright notice, this list of conditions and the following
18  *        disclaimer.
19  *
20  *      - Redistributions in binary form must reproduce the above
21  *        copyright notice, this list of conditions and the following
22  *        disclaimer in the documentation and/or other materials
23  *        provided with the distribution.
24  *
25  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32  * SOFTWARE.
33  */
34 
35 #include <stdlib.h>
36 #include <string.h>
37 #include <assert.h>
38 #include <sys/mman.h>
39 #include <signal.h>
40 
41 #include "gnix.h"
42 #include "gnix_nic.h"
43 #include "gnix_cm_nic.h"
44 #include "gnix_vc.h"
45 #include "gnix_mbox_allocator.h"
46 #include "gnix_util.h"
47 #include "fi_ext_gni.h"
48 
49 /*
50  * TODO: make this a domain parameter
51  */
52 #define GNIX_VC_FL_MIN_SIZE 128
53 #define GNIX_VC_FL_INIT_REFILL_SIZE 10
54 
55 static int gnix_nics_per_ptag[GNI_PTAG_MAX];
56 struct dlist_entry gnix_nic_list_ptag[GNI_PTAG_MAX];
57 DLIST_HEAD(gnix_nic_list);
58 pthread_mutex_t gnix_nic_list_lock = PTHREAD_MUTEX_INITIALIZER;
59 
60 /*
61  * globals
62  */
63 
64 uint32_t gnix_max_nics_per_ptag = GNIX_DEF_MAX_NICS_PER_PTAG;
65 
66 /*
67  * local variables
68  */
69 
70 static struct gnix_nic_attr default_attr = {
71 		.gni_cdm_hndl        = NULL,
72 		.gni_nic_hndl        = NULL
73 };
74 
75 /*******************************************************************************
76  * Helper functions.
77  ******************************************************************************/
78 
79 /*
80  * this function is intended to be invoked as an argument to pthread_create,
81  */
/*
 * this function is intended to be invoked as an argument to pthread_create.
 *
 * Progress-thread body: blocks in GNI_CqVectorMonitor() on the NIC's
 * blocking TX and RX CQs and drives _gnix_nic_progress() whenever an
 * event arrives.  Loops until the thread is cancelled (see
 * __nic_destruct, which calls pthread_cancel/pthread_join).
 *
 * @param the_arg  pointer to the struct gnix_nic this thread services
 * @return NULL (the return value is not used)
 */
static void *__gnix_nic_prog_thread_fn(void *the_arg)
{
	int ret = FI_SUCCESS, prev_state;
	int retry = 0;
	uint32_t which;
	struct gnix_nic *nic = (struct gnix_nic *)the_arg;
	sigset_t  sigmask;
	gni_cq_handle_t cqv[2];
	gni_return_t status;
	gni_cq_entry_t cqe;

	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");

	/*
	 * temporarily disable cancelability while we set up
	 * some stuff
	 */

	pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &prev_state);

	/*
	 * help out Cray core-spec, say we're not an app thread
	 * and can be run on core-spec cpus.
	 */

	ret = _gnix_task_is_not_app();
	if (ret)
		GNIX_WARN(FI_LOG_EP_CTRL,
			"_gnix_task_is_not_app call returned %d\n",
			ret);

	/*
	 * block all signals, don't want this thread to catch
	 * signals that may be for app threads
	 */

	memset(&sigmask, 0, sizeof(sigset_t));
	ret = sigfillset(&sigmask);
	if (ret) {
		GNIX_WARN(FI_LOG_EP_CTRL,
		"sigfillset call returned %d\n", ret);
	} else {

		ret = pthread_sigmask(SIG_SETMASK,
					&sigmask, NULL);
		if (ret)
			GNIX_WARN(FI_LOG_EP_CTRL,
			"pthread_sigmask call returned %d\n", ret);
	}

	/*
	 * okay now we're ready to be cancelable.
	 */

	pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &prev_state);

	pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

	/* monitor both blocking CQs; 'which' reports the one that fired */
	cqv[0] = nic->tx_cq_blk;
	cqv[1] = nic->rx_cq_blk;

try_again:
	status = GNI_CqVectorMonitor(cqv,
				     2,
				     -1,
				     &which);

	switch (status) {
	case GNI_RC_SUCCESS:

		/*
		 * first dequeue RX CQEs
		 *
		 * When a separate polling RX CQ exists, events on the
		 * blocking RX CQ are only wakeups; drain them here and
		 * let _gnix_nic_progress handle the real work.
		 */
		if (nic->rx_cq_blk != nic->rx_cq && which == 1) {
			do {
				status = GNI_CqGetEvent(nic->rx_cq_blk,
							&cqe);
			} while (status == GNI_RC_SUCCESS);
		}
		/* don't allow cancellation mid-progress; locks may be held */
		pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &prev_state);
		_gnix_nic_progress(nic);
		pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &prev_state);
		retry = 1;
		break;
	case GNI_RC_TIMEOUT:
	case GNI_RC_NOT_DONE:
        /* Invalid state indicates call interrupted by signal using various tools */
	case GNI_RC_INVALID_STATE:
		retry = 1;
		break;
	case GNI_RC_INVALID_PARAM:
	case GNI_RC_ERROR_RESOURCE:
	case GNI_RC_ERROR_NOMEM:
		retry = 0;
		GNIX_WARN(FI_LOG_EP_CTRL,
			  "GNI_CqGetEvent returned %s\n",
			  gni_err_str[status]);
		break;
	default:
		retry = 0;
		GNIX_WARN(FI_LOG_EP_CTRL,
			  "GNI_CqGetEvent returned unexpected code %s\n",
			  gni_err_str[status]);
		break;
	}

	if (retry)
		goto try_again;

	return NULL;
}
193 
194 /*
195  * setup memory registration for remote GNI_PostCqWrite's to target
196  */
197 
/*
 * Allocate and register a single page that remote peers can target
 * with GNI_PostCqWrite to raise "interrupts" (wakeups) on this NIC.
 *
 * On success nic->irq_mmap_addr/irq_mmap_len/irq_mem_hndl are filled in;
 * cleanup is performed by __nic_teardown_irq_cq.
 *
 * @param nic  the NIC to set up
 * @return FI_SUCCESS on success, -errno from mmap, or a gnixu-translated
 *         error from GNI_MemRegister
 */
static int __nic_setup_irq_cq(struct gnix_nic *nic)
{
	int ret = FI_SUCCESS;
	size_t len;
	gni_return_t status;
	int fd = -1;
	void *mmap_addr;
	int vmdh_index = -1;
	int flags = GNI_MEM_READWRITE;
	struct gnix_auth_key *info;
	struct fi_gni_auth_key key;

	/* one page is sufficient for the remote CQ-write target */
	len = (size_t)sysconf(_SC_PAGESIZE);

	mmap_addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_ANON, fd, 0);
	if (mmap_addr == MAP_FAILED) {
		GNIX_WARN(FI_LOG_EP_CTRL, "mmap failed - %s\n",
			strerror(errno));
		ret = -errno;
		goto err;
	}

	nic->irq_mmap_addr = mmap_addr;
	nic->irq_mmap_len = len;

	/* On some systems, the page may not be zero'd from first use.
		 Memset it here */
	memset(mmap_addr, 0x0, len);

	if (nic->using_vmdh) {
		/* VMDH registration needs a reserved key from the auth key
		 * associated with this NIC's cookie */
		key.type = GNIX_AKT_RAW;
		key.raw.protection_key = nic->cookie;

		info = _gnix_auth_key_lookup((uint8_t *) &key, sizeof(key));
		assert(info);

		if (!nic->mdd_resources_set) {
			/* check to see if the ptag registration limit was set
			   yet or not -- becomes read-only after success */
			ret = _gnix_auth_key_enable(info);
			if (ret != FI_SUCCESS && ret != -FI_EBUSY) {
				GNIX_WARN(FI_LOG_DOMAIN,
					"failed to enable authorization key, "
					"unexpected error rc=%d\n", ret);
			}

			status = GNI_SetMddResources(nic->gni_nic_hndl,
					(info->attr.prov_key_limit +
					info->attr.user_key_limit));
			if (status != GNI_RC_SUCCESS) {
				GNIX_FATAL(FI_LOG_DOMAIN,
					"failed to set MDD resources, rc=%d\n",
					status);
			}

			nic->mdd_resources_set = 1;
		}
		vmdh_index = _gnix_get_next_reserved_key(info);
		if (vmdh_index <= 0) {
			GNIX_FATAL(FI_LOG_DOMAIN,
				"failed to get next reserved key, "
				"rc=%d\n", vmdh_index);
		}

		flags |= GNI_MEM_USE_VMDH;
	}

	/* register the page against the blocking RX CQ so remote
	 * GNI_PostCqWrite's generate events there */
	status = GNI_MemRegister(nic->gni_nic_hndl,
				(uint64_t) nic->irq_mmap_addr,
				len,
				nic->rx_cq_blk,
				flags,
				vmdh_index,
				 &nic->irq_mem_hndl);
	if (status != GNI_RC_SUCCESS) {
		ret = gnixu_to_fi_errno(status);
		GNIX_WARN(FI_LOG_EP_CTRL,
			  "GNI_MemRegister returned %s\n",
			  gni_err_str[status]);
		goto err_w_mmap;
	}

#if 0
	fprintf(stderr,"registered ireq memhndl 0x%016lx 0x%016lx\n",
		nic->irq_mem_hndl.qword1,
		nic->irq_mem_hndl.qword2);
#endif


	return ret;

err_w_mmap:
	munmap(mmap_addr, len);
err:
	return ret;
}
295 
296 /*
297  * release resources previously set up for remote
298  * GNI_PostCqWrite's to target
299  */
__nic_teardown_irq_cq(struct gnix_nic * nic)300 static int __nic_teardown_irq_cq(struct gnix_nic *nic)
301 {
302 	int ret = FI_SUCCESS;
303 	gni_return_t status;
304 
305 	if (nic == NULL)
306 		return ret;
307 
308 	if (nic->irq_mmap_addr == NULL)
309 		return ret;
310 
311 	if ((nic->irq_mem_hndl.qword1) ||
312 		(nic->irq_mem_hndl.qword2)) {
313 		status = GNI_MemDeregister(nic->gni_nic_hndl,
314 					  &nic->irq_mem_hndl);
315 		if (status != GNI_RC_SUCCESS) {
316 			ret = gnixu_to_fi_errno(status);
317 			GNIX_WARN(FI_LOG_EP_CTRL,
318 				  "GNI_MemDeregister returned %s\n",
319 				  gni_err_str[status]);
320 		}
321 	}
322 
323 	munmap(nic->irq_mmap_addr,
324 		nic->irq_mmap_len);
325 	return ret;
326 }
327 
328 
329 /*
330  * place holder for better attributes checker
331  */
__gnix_nic_check_attr_sanity(struct gnix_nic_attr * attr)332 static int __gnix_nic_check_attr_sanity(struct gnix_nic_attr *attr)
333 {
334 	return FI_SUCCESS;
335 }
336 
337 static inline struct gnix_tx_descriptor *
__desc_lkup_by_id(struct gnix_nic * nic,int desc_id)338 __desc_lkup_by_id(struct gnix_nic *nic, int desc_id)
339 {
340 	struct gnix_tx_descriptor *tx_desc;
341 
342 	assert((desc_id >= 0) && (desc_id <= nic->max_tx_desc_id));
343 	tx_desc = &nic->tx_desc_base[desc_id];
344 	return tx_desc;
345 }
346 
/*
 * Recover from an RX CQ overrun: drain whatever is left in the CQ,
 * then schedule every live VC on this NIC for RX processing, since we
 * can no longer tell which VCs the lost CQEs belonged to.
 *
 * @param nic  the NIC whose RX CQ overran
 * @return FI_SUCCESS
 */
static int __nic_rx_overrun(struct gnix_nic *nic)
{
	int i, max_id, ret;
	struct gnix_vc *vc;
	gni_return_t status;
	gni_cq_entry_t cqe;

	GNIX_WARN(FI_LOG_EP_DATA, "\n");

	/* clear out the CQ */
	/*
	 * TODO:  really need to process CQEs better for error reporting,
	 * etc.
	 */
	while ((status = GNI_CqGetEvent(nic->rx_cq, &cqe)) == GNI_RC_SUCCESS);
	assert(status == GNI_RC_NOT_DONE);

	/* snapshot the id count under the lock; the bitmap scan below
	 * runs without it */
	COND_ACQUIRE(nic->requires_lock, &nic->vc_id_lock);
	max_id = nic->vc_id_table_count;
	COND_RELEASE(nic->requires_lock, &nic->vc_id_lock);
	/*
	 * TODO: optimization would
	 * be to keep track of last time
	 * this happened and where smsg msgs.
	 * were found.
	 */
	for (i = 0; i < max_id; i++) {
		/* a set bit marks an id currently assigned to a VC */
		ret = _gnix_test_bit(&nic->vc_id_bitmap, i);
		if (ret) {
			vc = __gnix_nic_elem_by_rem_id(nic, i);
			ret = _gnix_vc_rx_schedule(vc);
			assert(ret == FI_SUCCESS);
		}
	}

	return FI_SUCCESS;
}
384 
/*
 * Handle one RX CQE: look up the VC whose remote id is carried in the
 * CQE's inst id and, if it is connecting or connected, schedule it for
 * RX processing.
 */
static int __process_rx_cqe(struct gnix_nic *nic, gni_cq_entry_t cqe)
{
	struct gnix_vc *vc;
	int ret = FI_SUCCESS;
	int vc_id = GNI_CQ_GET_INST_ID(cqe);

	/*
	 * its possible this vc has been destroyed, so may get NULL
	 * back.
	 */

	vc = __gnix_nic_elem_by_rem_id(nic, vc_id);
	if (vc == NULL)
		return ret;

	if (vc->conn_state == GNIX_VC_CONNECTING) {
		GNIX_DEBUG(FI_LOG_EP_DATA,
			  "Scheduling VC for RX processing (%p)\n",
			  vc);
		ret = _gnix_vc_rx_schedule(vc);
		assert(ret == FI_SUCCESS);
	} else if (vc->conn_state == GNIX_VC_CONNECTED) {
		GNIX_DEBUG(FI_LOG_EP_DATA,
			  "Processing VC RX (%p)\n",
			  vc);
		ret = _gnix_vc_rx_schedule(vc);
		assert(ret == FI_SUCCESS);
	}
	/* any other state: VC not in a state for scheduling or
	 * SMSG processing */

	return ret;
}
422 
/*
 * Drain the NIC's polling RX CQ, scheduling the VC associated with
 * each CQE.  A GNI_RC_ERROR_RESOURCE status indicates the CQ overran,
 * in which case all VCs are scheduled via __nic_rx_overrun.
 *
 * @param nic  the NIC to progress
 * @return FI_SUCCESS, or a gnixu-translated error on unexpected CQ status
 */
static int __nic_rx_progress(struct gnix_nic *nic)
{
	int ret = FI_SUCCESS;
	gni_return_t status = GNI_RC_NOT_DONE;
	gni_cq_entry_t cqe;

	/* cheap lock-free peek before taking the NIC lock */
	status = GNI_CqTestEvent(nic->rx_cq);
	if (status == GNI_RC_NOT_DONE)
		return FI_SUCCESS;

	COND_ACQUIRE(nic->requires_lock, &nic->lock);

	do {
		status = GNI_CqGetEvent(nic->rx_cq, &cqe);
		if (OFI_UNLIKELY(status == GNI_RC_NOT_DONE)) {
			/* CQ drained */
			ret = FI_SUCCESS;
			break;
		}

		if (OFI_LIKELY(status == GNI_RC_SUCCESS)) {
			/* Find and schedule the associated VC. */
			ret = __process_rx_cqe(nic, cqe);
			if (ret != FI_SUCCESS) {
				GNIX_WARN(FI_LOG_EP_DATA,
					  "process_rx_cqe() failed: %d\n",
					  ret);
			}
		} else if (status == GNI_RC_ERROR_RESOURCE) {
			/* The remote CQ was overrun.  Events related to any VC
			 * could have been missed.  Schedule each VC to be sure
			 * all messages are processed. */
			assert(GNI_CQ_OVERRUN(cqe));
			__nic_rx_overrun(nic);
		} else {
			GNIX_WARN(FI_LOG_EP_DATA,
				  "GNI_CqGetEvent returned %s\n",
				  gni_err_str[status]);
			ret = gnixu_to_fi_errno(status);
			break;
		}
	} while (1);

	COND_RELEASE(nic->requires_lock, &nic->lock);

	return ret;
}
469 
/*
 * Queue a TX descriptor on the NIC's error list; the next TX progress
 * pass returns it with GNI_RC_TRANSACTION_ERROR status
 * (see __nic_get_completed_txd).
 */
void _gnix_nic_txd_err_inject(struct gnix_nic *nic,
			      struct gnix_tx_descriptor *txd)
{
	slist_insert_tail(&txd->err_list, &nic->err_txds);
}
475 
__gnix_nic_txd_err_get(struct gnix_nic * nic,struct gnix_tx_descriptor ** txd)476 static int __gnix_nic_txd_err_get(struct gnix_nic *nic,
477 				  struct gnix_tx_descriptor **txd)
478 {
479 	struct slist_entry *list_entry;
480 	struct gnix_tx_descriptor *txd_p;
481 
482 	list_entry = slist_remove_head(&nic->err_txds);
483 	if (list_entry) {
484 		txd_p = container_of(list_entry,
485 				     struct gnix_tx_descriptor,
486 				     err_list);
487 		*txd = txd_p;
488 		return 1;
489 	}
490 
491 	return 0;
492 }
493 
/*
 * Fetch the next completed TX descriptor from a hardware CQ.
 *
 * Error-injected descriptors (see _gnix_nic_txd_err_inject) take
 * priority and are returned with GNI_RC_TRANSACTION_ERROR.  Otherwise
 * one CQE is dequeued and mapped back to its descriptor: POST-type
 * events via GNI_GetCompleted, SMSG-type events via the msg id.
 *
 * On return, *txd is NULL (with *tx_status == GNI_RC_NOT_DONE) when no
 * completion was available; otherwise *txd points at the completed
 * descriptor and *tx_status carries its final status.
 */
static void __nic_get_completed_txd(struct gnix_nic *nic,
				   gni_cq_handle_t hw_cq,
				   struct gnix_tx_descriptor **txd,
				   gni_return_t *tx_status)
{
	gni_post_descriptor_t *gni_desc;
	struct gnix_tx_descriptor *txd_p = NULL;
	struct gnix_fab_req *req;
	gni_return_t status;
	int msg_id;
	gni_cq_entry_t cqe;
	uint32_t recov = 1;

	/* injected errors are reported before touching the hardware CQ */
	if (__gnix_nic_txd_err_get(nic, &txd_p)) {
		*txd = txd_p;
		*tx_status = GNI_RC_TRANSACTION_ERROR;
		return;
	}

	status = GNI_CqGetEvent(hw_cq, &cqe);
	if (status == GNI_RC_NOT_DONE) {
		*txd = NULL;
		*tx_status = GNI_RC_NOT_DONE;
		return;
	}

	assert(status == GNI_RC_SUCCESS ||
	       status == GNI_RC_TRANSACTION_ERROR);

	if (OFI_UNLIKELY(status == GNI_RC_TRANSACTION_ERROR)) {
		/* determine whether the error is recoverable (i.e. the
		 * transaction may be retried) */
		status = GNI_CqErrorRecoverable(cqe, &recov);
		if (status == GNI_RC_SUCCESS) {
			if (!recov) {
				char ebuf[512];

				GNI_CqErrorStr(cqe, ebuf, sizeof(ebuf));
				GNIX_WARN(FI_LOG_EP_DATA,
					  "CQ error status: %s\n",
					   ebuf);
			}
		} else {
			GNIX_WARN(FI_LOG_EP_DATA,
				  "GNI_CqErrorRecover returned: %s\n",
				   gni_err_str[status]);
			recov = 0;  /* assume something bad has happened */
		}
	}

	if (GNI_CQ_GET_TYPE(cqe) == GNI_CQ_EVENT_TYPE_POST) {
		/* RDMA/FMA post completion: recover the embedded
		 * gni_post_descriptor and map back to our descriptor */
		status = GNI_GetCompleted(hw_cq, cqe, &gni_desc);

		assert(status == GNI_RC_SUCCESS ||
		       status == GNI_RC_TRANSACTION_ERROR);

		txd_p = container_of(gni_desc,
				   struct gnix_tx_descriptor,
				   gni_desc);
	} else if (GNI_CQ_GET_TYPE(cqe) == GNI_CQ_EVENT_TYPE_SMSG) {
		/* SMSG completion: the msg id is the descriptor id */
		msg_id = GNI_CQ_GET_MSG_ID(cqe);
		txd_p = __desc_lkup_by_id(nic, msg_id);
	}

	if (OFI_UNLIKELY(txd_p == NULL))
		GNIX_FATAL(FI_LOG_EP_DATA, "Unexpected CQE: 0x%lx", cqe);

	/*
	 * set retry count on the request to max to force
	 * delivering error'd CQ event to application
	 */
	if (!recov) {
		status = GNI_RC_TRANSACTION_ERROR;
		req = txd_p->req;
		if (req)
			req->tx_failures = UINT_MAX;
	}

	*tx_status = status;
	*txd = txd_p;

}
574 
/*
 * Reap completed TX descriptors from the given CQ and run each
 * descriptor's completion callback until the CQ is drained or a
 * callback fails.
 *
 * @param nic  the NIC being progressed
 * @param cq   the TX CQ to drain (polling or blocking)
 * @return FI_SUCCESS, or the first non-success completer return value
 */
static int __nic_tx_progress(struct gnix_nic *nic, gni_cq_handle_t cq)
{
	int ret = FI_SUCCESS;
	gni_return_t tx_status;
	struct gnix_tx_descriptor *txd;

	do {
		txd = NULL;

		/* only the CQ dequeue is done under the NIC lock; the
		 * completer runs unlocked */
		COND_ACQUIRE(nic->requires_lock, &nic->lock);
		__nic_get_completed_txd(nic, cq, &txd,
					&tx_status);
		COND_RELEASE(nic->requires_lock, &nic->lock);

		if (txd && txd->completer_fn) {
			ret = txd->completer_fn(txd, tx_status);
			if (ret != FI_SUCCESS) {
				/*
				 * TODO: need to post error to CQ
				 */
				GNIX_WARN(FI_LOG_EP_DATA,
					  "TXD completer failed: %d", ret);
			}
		}

		/* stop when the CQ is drained or a completer reported
		 * failure (ret carries over from the callback above) */
		if ((txd == NULL) || ret != FI_SUCCESS)
			break;
	} while (1);

	return ret;
}
606 
_gnix_nic_progress(void * arg)607 int _gnix_nic_progress(void *arg)
608 {
609 	struct gnix_nic *nic = (struct gnix_nic *)arg;
610 	int ret = FI_SUCCESS;
611 
612 	ret =  __nic_tx_progress(nic, nic->tx_cq);
613 	if (OFI_UNLIKELY(ret != FI_SUCCESS))
614 		return ret;
615 
616 	if (nic->tx_cq_blk && nic->tx_cq_blk != nic->tx_cq) {
617 		ret =  __nic_tx_progress(nic, nic->tx_cq_blk);
618 		if (OFI_UNLIKELY(ret != FI_SUCCESS))
619 			return ret;
620 	}
621 
622 	ret = __nic_rx_progress(nic);
623 	if (ret != FI_SUCCESS)
624 		return ret;
625 
626 	ret = _gnix_vc_nic_progress(nic);
627 	if (ret != FI_SUCCESS)
628 		return ret;
629 
630 	return ret;
631 }
632 
_gnix_nic_free_rem_id(struct gnix_nic * nic,int remote_id)633 int _gnix_nic_free_rem_id(struct gnix_nic *nic, int remote_id)
634 {
635 	assert(nic);
636 
637 	if ((remote_id < 0) || (remote_id > nic->vc_id_table_count))
638 		return -FI_EINVAL;
639 
640 	_gnix_clear_bit(&nic->vc_id_bitmap, remote_id);
641 
642 	return FI_SUCCESS;
643 }
644 
645 /*
646  * this function is needed to allow for quick lookup of a vc based on
647  * the contents of the GNI CQE coming off of the GNI RX CQ associated
648  * with GNI nic being used by this VC.  Using a bitmap to expedite
649  * scanning vc's in the case of a GNI CQ overrun.
650  */
651 
_gnix_nic_get_rem_id(struct gnix_nic * nic,int * remote_id,void * entry)652 int _gnix_nic_get_rem_id(struct gnix_nic *nic, int *remote_id, void *entry)
653 {
654 	int ret = FI_SUCCESS;
655 	void **table_base;
656 
657 	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");
658 
659 	/*
660 	 * TODO:  really need to search bitmap for clear
661 	 * bit before resizing the table
662 	 */
663 
664 	COND_ACQUIRE(nic->requires_lock, &nic->vc_id_lock);
665 	if (nic->vc_id_table_capacity == nic->vc_id_table_count) {
666 		table_base = realloc(nic->vc_id_table,
667 				     2 * nic->vc_id_table_capacity *
668 				     sizeof(void *));
669 		if (table_base == NULL) {
670 			ret =  -FI_ENOMEM;
671 			goto err;
672 		}
673 		nic->vc_id_table_capacity *= 2;
674 		nic->vc_id_table = table_base;
675 
676 		ret = _gnix_realloc_bitmap(&nic->vc_id_bitmap,
677 					   nic->vc_id_table_capacity);
678 		if (ret != FI_SUCCESS) {
679 			assert(ret == -FI_ENOMEM);
680 			goto err;
681 		}
682 	}
683 
684 	nic->vc_id_table[nic->vc_id_table_count] = entry;
685 	*remote_id = nic->vc_id_table_count;
686 
687 	/*
688 	 * set bit in the bitmap
689 	 */
690 
691 	_gnix_set_bit(&nic->vc_id_bitmap, nic->vc_id_table_count);
692 
693 	++(nic->vc_id_table_count);
694 err:
695 	COND_RELEASE(nic->requires_lock, &nic->vc_id_lock);
696 	return ret;
697 }
698 
699 /*
700  * allocate a free list of tx descs for a gnix_nic struct.
701  */
702 
__gnix_nic_tx_freelist_init(struct gnix_nic * nic,int n_descs)703 static int __gnix_nic_tx_freelist_init(struct gnix_nic *nic, int n_descs)
704 {
705 	int i, ret = FI_SUCCESS;
706 	struct gnix_tx_descriptor *desc_base, *desc_ptr;
707 
708 	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");
709 
710 	/*
711 	 * set up free list of tx descriptors.
712 	 */
713 
714 	desc_base = calloc(n_descs, sizeof(struct gnix_tx_descriptor));
715 	if (desc_base == NULL) {
716 		ret = -FI_ENOMEM;
717 		goto err;
718 	}
719 
720 	dlist_init(&nic->tx_desc_free_list);
721 	dlist_init(&nic->tx_desc_active_list);
722 
723 	for (i = 0, desc_ptr = desc_base; i < n_descs; i++, desc_ptr++) {
724 		desc_ptr->id = i;
725 		dlist_insert_tail(&desc_ptr->list,
726 				  &nic->tx_desc_free_list);
727 	}
728 
729 	nic->max_tx_desc_id = n_descs - 1;
730 	nic->tx_desc_base = desc_base;
731 
732 	fastlock_init(&nic->tx_desc_lock);
733 
734 	return ret;
735 
736 err:
737 	return ret;
738 
739 }
740 
741 /*
742  * clean up the tx descs free list
743  */
/*
 * Release the TX descriptor array and its lock, undoing
 * __gnix_nic_tx_freelist_init.  The free/active lists reference
 * memory inside tx_desc_base, so freeing the base releases everything.
 */
static void __gnix_nic_tx_freelist_destroy(struct gnix_nic *nic)
{
	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");

	free(nic->tx_desc_base);
	fastlock_destroy(&nic->tx_desc_lock);
}
751 
752 /*
753  * free a gnix nic and associated resources if refcnt drops to 0
754  */
755 
__nic_destruct(void * obj)756 static void __nic_destruct(void *obj)
757 {
758 	int ret = FI_SUCCESS;
759 	gni_return_t status = GNI_RC_SUCCESS;
760 	struct gnix_nic *nic = (struct gnix_nic *) obj;
761 
762 	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");
763 
764 	/* Get us out of the progression tables we are destroying the nic
765 	 * and we don't want the wait progression thread to progress us
766 	 * after our structures are destroyed.
767 	 */
768 	pthread_mutex_lock(&gnix_nic_list_lock);
769 
770 	dlist_remove(&nic->gnix_nic_list);
771 	--gnix_nics_per_ptag[nic->ptag];
772 	dlist_remove(&nic->ptag_nic_list);
773 
774 	pthread_mutex_unlock(&gnix_nic_list_lock);
775 	__gnix_nic_tx_freelist_destroy(nic);
776 
777 	/*
778 	 *free irq cq related resources
779 	 */
780 
781 	ret = __nic_teardown_irq_cq(nic);
782 	if (ret != FI_SUCCESS)
783 		GNIX_WARN(FI_LOG_EP_CTRL,
784 			  "__nic_teardown_irq_cq returned %s\n",
785 			  fi_strerror(-ret));
786 
787 	/*
788 	 * kill off progress thread, if any
789 	 */
790 
791 	if (nic->progress_thread) {
792 
793 		ret = pthread_cancel(nic->progress_thread);
794 		if ((ret != 0) && (ret != ESRCH)) {
795 			GNIX_WARN(FI_LOG_EP_CTRL,
796 			"pthread_cancel returned %d\n", ret);
797 			goto err;
798 		}
799 
800 		ret = pthread_join(nic->progress_thread,
801 				   NULL);
802 		if ((ret != 0) && (ret != ESRCH)) {
803 			GNIX_WARN(FI_LOG_EP_CTRL,
804 			"pthread_join returned %d\n", ret);
805 			goto err;
806 		}
807 
808 		GNIX_INFO(FI_LOG_EP_CTRL, "pthread_join returned %d\n", ret);
809 		nic->progress_thread = 0;
810 	}
811 
812 	/* Must free mboxes first, because the MR has a pointer to the
813 	 * nic handles below */
814 	ret = _gnix_mbox_allocator_destroy(nic->mbox_hndl);
815 	if (ret != FI_SUCCESS)
816 		GNIX_WARN(FI_LOG_EP_CTRL,
817 			  "_gnix_mbox_allocator_destroy returned %s\n",
818 			  fi_strerror(-ret));
819 
820 	/*
821 	 * see comments in the nic constructor about why
822 	 * the following code section is currently stubbed out.
823 	 */
824 #if 0
825 	ret = _gnix_mbox_allocator_destroy(nic->s_rdma_buf_hndl);
826 	if (ret != FI_SUCCESS)
827 		GNIX_WARN(FI_LOG_EP_CTRL,
828 			  "_gnix_mbox_allocator_destroy returned %s\n",
829 			  fi_strerror(-ret));
830 
831 	ret = _gnix_mbox_allocator_destroy(nic->r_rdma_buf_hndl);
832 	if (ret != FI_SUCCESS)
833 		GNIX_WARN(FI_LOG_EP_CTRL,
834 			  "_gnix_mbox_allocator_destroy returned %s\n",
835 			  fi_strerror(-ret));
836 #endif
837 
838 	if (!nic->gni_cdm_hndl) {
839 		GNIX_WARN(FI_LOG_EP_CTRL, "No CDM attached to nic, nic=%p");
840 	}
841 
842 	assert(nic->gni_cdm_hndl != NULL);
843 
844 	if (nic->rx_cq != NULL && nic->rx_cq != nic->rx_cq_blk) {
845 		status = GNI_CqDestroy(nic->rx_cq);
846 		if (status != GNI_RC_SUCCESS) {
847 			GNIX_WARN(FI_LOG_EP_CTRL,
848 				  "GNI_CqDestroy returned %s\n",
849 				 gni_err_str[status]);
850 			ret = gnixu_to_fi_errno(status);
851 			goto err;
852 		}
853 	}
854 
855 	if (nic->rx_cq_blk != NULL) {
856 		status = GNI_CqDestroy(nic->rx_cq_blk);
857 		if (status != GNI_RC_SUCCESS) {
858 			GNIX_WARN(FI_LOG_EP_CTRL,
859 				  "GNI_CqDestroy returned %s\n",
860 				 gni_err_str[status]);
861 			ret = gnixu_to_fi_errno(status);
862 			goto err;
863 		}
864 	}
865 
866 	if (nic->tx_cq != NULL && nic->tx_cq != nic->tx_cq_blk) {
867 		status = GNI_CqDestroy(nic->tx_cq);
868 		if (status != GNI_RC_SUCCESS) {
869 			GNIX_WARN(FI_LOG_EP_CTRL,
870 				  "GNI_CqDestroy returned %s\n",
871 				 gni_err_str[status]);
872 			ret = gnixu_to_fi_errno(status);
873 			goto err;
874 		}
875 	}
876 
877 	if (nic->tx_cq_blk != NULL) {
878 		status = GNI_CqDestroy(nic->tx_cq_blk);
879 		if (status != GNI_RC_SUCCESS) {
880 			GNIX_WARN(FI_LOG_EP_CTRL,
881 				  "GNI_CqDestroy returned %s\n",
882 				 gni_err_str[status]);
883 			ret = gnixu_to_fi_errno(status);
884 			goto err;
885 		}
886 	}
887 
888 	if (nic->allocd_gni_res & GNIX_NIC_CDM_ALLOCD) {
889 		status = GNI_CdmDestroy(nic->gni_cdm_hndl);
890 		if (status != GNI_RC_SUCCESS) {
891 			GNIX_WARN(FI_LOG_EP_CTRL,
892 				  "GNI_CdmDestroy returned %s\n",
893 				  gni_err_str[status]);
894 			ret = gnixu_to_fi_errno(status);
895 			goto err;
896 		}
897 	}
898 
899 	if (nic->vc_id_table != NULL) {
900 		free(nic->vc_id_table);
901 	} else {
902 		GNIX_WARN(FI_LOG_EP_CTRL, "vc_id_table was NULL\n");
903 	}
904 
905 	/*
906 	 * destroy VC free list associated with this nic
907 	 */
908 
909 	_gnix_fl_destroy(&nic->vc_freelist);
910 
911 	/*
912 	 * remove the nic from the linked lists
913 	 * for the domain and the global nic list
914 	 */
915 
916 err:
917 	_gnix_free_bitmap(&nic->vc_id_bitmap);
918 
919 	free(nic);
920 }
921 
_gnix_nic_free(struct gnix_nic * nic)922 int _gnix_nic_free(struct gnix_nic *nic)
923 {
924 	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");
925 
926 	if (nic == NULL)
927 		return -FI_EINVAL;
928 
929 	_gnix_ref_put(nic);
930 
931 	return FI_SUCCESS;
932 }
933 
934 /*
935  * allocate a gnix_nic struct using attributes of the domain
936  */
937 
gnix_nic_alloc(struct gnix_fid_domain * domain,struct gnix_nic_attr * attr,struct gnix_nic ** nic_ptr)938 int gnix_nic_alloc(struct gnix_fid_domain *domain,
939 		   struct gnix_nic_attr *attr,
940 		   struct gnix_nic **nic_ptr)
941 {
942 	int ret = FI_SUCCESS;
943 	struct gnix_nic *nic = NULL;
944 	uint32_t device_addr;
945 	gni_return_t status;
946 	uint32_t fake_cdm_id = GNIX_CREATE_CDM_ID;
947 	gni_smsg_attr_t smsg_mbox_attr;
948 	struct gnix_nic_attr *nic_attr = &default_attr;
949 	uint32_t num_corespec_cpus = 0;
950 	bool must_alloc_nic = false;
951 	bool free_list_inited = false;
952 	struct gnix_auth_key *auth_key;
953 
954 	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");
955 
956 	*nic_ptr = NULL;
957 	nic_attr->gni_cdm_modes = gnix_cdm_modes;
958 
959 	if (attr) {
960 		ret = __gnix_nic_check_attr_sanity(attr);
961 		if (ret != FI_SUCCESS)
962 			return ret;
963 		nic_attr = attr;
964 		must_alloc_nic = nic_attr->must_alloc;
965 	}
966 
967 	auth_key = nic_attr->auth_key;
968 
969 	/*
970 	 * If we've maxed out the number of nics for this domain/ptag,
971 	 * search the list of existing nics.  Take the gnix_nic_list_lock
972 	 * here since the gnix_nic_list will be manipulated whether or
973 	 * not we attach to an existing nic or create a new one.
974 	 *
975 	 * Should not matter much that this is a pretty fat critical section
976 	 * since endpoint setup for RDM type will typically occur near
977 	 * app startup, likely in a single threaded region, and for the
978 	 * case of MSG, where there will likely be many 100s of EPs, after
979 	 * a few initial slow times through this section when nics are created,
980 	 * max nic count for the ptag will be reached and only the first part
981 	 * of the critical section - iteration over existing nics - will be
982 	 * happening.
983 	 */
984 
985 	pthread_mutex_lock(&gnix_nic_list_lock);
986 
987 	/*
988 	 * we can reuse previously allocated nics as long as a
989 	 * must_alloc is not specified in the nic_attr arg.
990 	 */
991 
992 	if ((must_alloc_nic == false) &&
993 	    (gnix_nics_per_ptag[auth_key->ptag] >= gnix_max_nics_per_ptag)) {
994 		assert(!dlist_empty(&gnix_nic_list_ptag[auth_key->ptag]));
995 
996 		nic = dlist_first_entry(&gnix_nic_list_ptag[auth_key->ptag],
997 					struct gnix_nic, ptag_nic_list);
998 		dlist_remove(&nic->ptag_nic_list);
999 		dlist_insert_tail(&nic->ptag_nic_list,
1000 				  &gnix_nic_list_ptag[auth_key->ptag]);
1001 		_gnix_ref_get(nic);
1002 
1003 		GNIX_INFO(FI_LOG_EP_CTRL, "Reusing NIC:%p\n", nic);
1004 	}
1005 
1006 	/*
1007 	 * no nic found create a cdm and attach
1008 	 */
1009 
1010 	if (!nic) {
1011 
1012 		nic = calloc(1, sizeof(struct gnix_nic));
1013 		if (nic == NULL) {
1014 			ret = -FI_ENOMEM;
1015 			goto err;
1016 		}
1017 
1018 		nic->using_vmdh = domain->using_vmdh;
1019 
1020 		if (nic_attr->use_cdm_id == false) {
1021 			ret = _gnix_cm_nic_create_cdm_id(domain, &fake_cdm_id);
1022 			if (ret != FI_SUCCESS) {
1023 				GNIX_WARN(FI_LOG_EP_CTRL,
1024 					  "_gnix_cm_nic_create_cdm_id returned %s\n",
1025 					  fi_strerror(-ret));
1026 				goto err;
1027 			}
1028 		} else
1029 			fake_cdm_id = nic_attr->cdm_id;
1030 
1031 		if (nic_attr->gni_cdm_hndl == NULL) {
1032 			status = GNI_CdmCreate(fake_cdm_id,
1033 						auth_key->ptag,
1034 						auth_key->cookie,
1035 						gnix_cdm_modes,
1036 						&nic->gni_cdm_hndl);
1037 			if (status != GNI_RC_SUCCESS) {
1038 				GNIX_WARN(FI_LOG_EP_CTRL, "GNI_CdmCreate returned %s\n",
1039 					 gni_err_str[status]);
1040 				ret = gnixu_to_fi_errno(status);
1041 				goto err1;
1042 			}
1043 			nic->allocd_gni_res |= GNIX_NIC_CDM_ALLOCD;
1044 		} else {
1045 			nic->gni_cdm_hndl = nic_attr->gni_cdm_hndl;
1046 		}
1047 
1048 		/*
1049 		 * Okay, now go for the attach
1050 		*/
1051 
1052 		if (nic_attr->gni_nic_hndl == NULL) {
1053 			status = GNI_CdmAttach(nic->gni_cdm_hndl,
1054 						0,
1055 						&device_addr,
1056 						&nic->gni_nic_hndl);
1057 			if (status != GNI_RC_SUCCESS) {
1058 				GNIX_WARN(FI_LOG_EP_CTRL, "GNI_CdmAttach returned %s\n",
1059 					 gni_err_str[status]);
1060 				_gnix_dump_gni_res(auth_key->ptag);
1061 				ret = gnixu_to_fi_errno(status);
1062 				goto err1;
1063 			}
1064 		} else
1065 			nic->gni_nic_hndl = nic_attr->gni_nic_hndl;
1066 
1067 		/*
1068 		 * create TX CQs - first polling, then blocking
1069 		 */
1070 
1071 		status = GNI_CqCreate(nic->gni_nic_hndl,
1072 					domain->params.tx_cq_size,
1073 					0,                  /* no delay count */
1074 					GNI_CQ_BLOCKING |
1075 						domain->gni_cq_modes,
1076 					NULL,              /* useless handler */
1077 					NULL,               /* useless handler
1078 								context */
1079 					&nic->tx_cq_blk);
1080 		if (status != GNI_RC_SUCCESS) {
1081 			GNIX_WARN(FI_LOG_EP_CTRL,
1082 				  "GNI_CqCreate returned %s\n",
1083 				  gni_err_str[status]);
1084 			_gnix_dump_gni_res(auth_key->ptag);
1085 			ret = gnixu_to_fi_errno(status);
1086 			goto err1;
1087 		}
1088 
1089 		/* Use blocking CQs for all operations if eager_auto_progress
1090 		 * is used.  */
1091 		if (domain->params.eager_auto_progress) {
1092 			nic->tx_cq = nic->tx_cq_blk;
1093 		} else {
1094 			status = GNI_CqCreate(nic->gni_nic_hndl,
1095 						domain->params.tx_cq_size,
1096 						0, /* no delay count */
1097 						domain->gni_cq_modes,
1098 						NULL, /* useless handler */
1099 						NULL, /* useless handler ctx */
1100 						&nic->tx_cq);
1101 			if (status != GNI_RC_SUCCESS) {
1102 				GNIX_WARN(FI_LOG_EP_CTRL,
1103 					  "GNI_CqCreate returned %s\n",
1104 					  gni_err_str[status]);
1105 				_gnix_dump_gni_res(auth_key->ptag);
1106 				ret = gnixu_to_fi_errno(status);
1107 				goto err1;
1108 			}
1109 		}
1110 
1111 
1112 		/*
1113 		 * create RX CQs - first polling, then blocking
1114 		 */
1115 
1116 		status = GNI_CqCreate(nic->gni_nic_hndl,
1117 					domain->params.rx_cq_size,
1118 					0,
1119 					GNI_CQ_BLOCKING |
1120 						domain->gni_cq_modes,
1121 					NULL,
1122 					NULL,
1123 					&nic->rx_cq_blk);
1124 		if (status != GNI_RC_SUCCESS) {
1125 			GNIX_WARN(FI_LOG_EP_CTRL,
1126 				  "GNI_CqCreate returned %s\n",
1127 				  gni_err_str[status]);
1128 			_gnix_dump_gni_res(auth_key->ptag);
1129 			ret = gnixu_to_fi_errno(status);
1130 			goto err1;
1131 		}
1132 
1133 		/* Use blocking CQs for all operations if eager_auto_progress
1134 		 * is used.  */
1135 		if (domain->params.eager_auto_progress) {
1136 			nic->rx_cq = nic->rx_cq_blk;
1137 		} else {
1138 			status = GNI_CqCreate(nic->gni_nic_hndl,
1139 						domain->params.rx_cq_size,
1140 						0,
1141 						domain->gni_cq_modes,
1142 						NULL,
1143 						NULL,
1144 						&nic->rx_cq);
1145 			if (status != GNI_RC_SUCCESS) {
1146 				GNIX_WARN(FI_LOG_EP_CTRL,
1147 					  "GNI_CqCreate returned %s\n",
1148 					  gni_err_str[status]);
1149 				_gnix_dump_gni_res(auth_key->ptag);
1150 				ret = gnixu_to_fi_errno(status);
1151 				goto err1;
1152 			}
1153 		}
1154 
1155 		nic->device_addr = device_addr;
1156 		nic->ptag = auth_key->ptag;
1157 		nic->cookie = auth_key->cookie;
1158 
1159 		nic->vc_id_table_capacity = domain->params.vc_id_table_capacity;
1160 		nic->vc_id_table = malloc(sizeof(void *) *
1161 					       nic->vc_id_table_capacity);
1162 		if (nic->vc_id_table == NULL) {
1163 			GNIX_WARN(FI_LOG_EP_CTRL,
1164 				  "malloc of vc_id_table failed\n");
1165 			ret = -FI_ENOMEM;
1166 			goto err1;
1167 		}
1168 
1169 		ret = _gnix_alloc_bitmap(&nic->vc_id_bitmap,
1170 					 nic->vc_id_table_capacity, NULL);
1171 		if (ret != FI_SUCCESS) {
1172 			GNIX_WARN(FI_LOG_EP_CTRL,
1173 				  "alloc_bitmap returned %d\n", ret);
1174 			goto err1;
1175 		}
1176 		fastlock_init(&nic->vc_id_lock);
1177 
1178 		/*
1179 		 * initialize free list for VC's
1180 		 * In addition to hopefully allowing for a more compact
		 * allocation of VC structs, the free list is also important
1182 		 * because there is a window of time when using auto progress
1183 		 * that a thread may be going through the progress engine
1184 		 * while one of the application threads is actively tearing
1185 		 * down an endpoint (and hence its associated VCs) before the
1186 		 * rem_id for the vc is removed from the vector.
1187 		 * As a consequence, it is important that
1188 		 * the memory allocated within the freelist allocator not be
1189 		 * returned to the system prior to the freelist being destroyed
1190 		 * as part of the nic destructor procedure.  The freelist is
1191 		 * destroyed in that procedure after the progress thread
1192 		 * has been joined.
1193 		 */
1194 
1195 		ret = _gnix_fl_init_ts(sizeof(struct gnix_vc),
1196 				       offsetof(struct gnix_vc, fr_list),
1197 				       GNIX_VC_FL_MIN_SIZE,
1198 				       GNIX_VC_FL_INIT_REFILL_SIZE,
1199 				       0,
1200 				       0,
1201 				       &nic->vc_freelist);
1202 		if (ret == FI_SUCCESS) {
1203 			free_list_inited = true;
1204 		} else {
1205 			GNIX_DEBUG(FI_LOG_EP_DATA, "_gnix_fl_init returned: %s\n",
1206 				   fi_strerror(-ret));
1207 			goto err1;
1208 		}
1209 
1210 		fastlock_init(&nic->lock);
1211 
1212 		ret = __gnix_nic_tx_freelist_init(nic,
1213 						  domain->params.tx_cq_size);
1214 		if (ret != FI_SUCCESS)
1215 			goto err1;
1216 
1217 		fastlock_init(&nic->prog_vcs_lock);
1218 		dlist_init(&nic->prog_vcs);
1219 
1220 		_gnix_ref_init(&nic->ref_cnt, 1, __nic_destruct);
1221 
1222 		smsg_mbox_attr.msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
1223 		smsg_mbox_attr.mbox_maxcredit = domain->params.mbox_maxcredit;
1224 		smsg_mbox_attr.msg_maxsize =  domain->params.mbox_msg_maxsize;
1225 
1226 		status = GNI_SmsgBufferSizeNeeded(&smsg_mbox_attr,
1227 						  &nic->mem_per_mbox);
1228 		if (status != GNI_RC_SUCCESS) {
1229 			GNIX_WARN(FI_LOG_EP_CTRL,
1230 				  "GNI_SmsgBufferSizeNeeded returned %s\n",
1231 				  gni_err_str[status]);
1232 			ret = gnixu_to_fi_errno(status);
1233 			goto err1;
1234 		}
1235 
1236 		/*
1237 		 * set up mailbox allocator for SMSG mailboxes
1238 		 */
1239 
1240 		ret = _gnix_mbox_allocator_create(nic,
1241 					  nic->rx_cq,
1242 					  domain->params.mbox_page_size,
1243 					  (size_t)nic->mem_per_mbox,
1244 					  domain->params.mbox_num_per_slab,
1245 					  &nic->mbox_hndl);
1246 
1247 		if (ret != FI_SUCCESS) {
1248 			GNIX_WARN(FI_LOG_EP_CTRL,
1249 				  "_gnix_mbox_alloc returned %s\n",
1250 				  fi_strerror(-ret));
1251 			goto err1;
1252 		}
1253 
1254 		/*
1255 		 * use the mailbox allocator system to set up an
1256 		 * pre-pinned RDMA bounce buffers for longer eager
1257 		 * messages and other cases where zero-copy
1258 		 * can't be safely used.
1259 		 *
1260 		 * One set of blocks is used for the send side.
1261 		 * A second set of blocks is used for the receive
1262 		 * side.  Both sets of blocks are registered against
1263 		 * the blocking RX CQ for this nic.
1264 		 *
1265 		 * TODO: hardwired constants, uff
1266 		 * TODO: better to use a buddy allocator or some other
1267 		 * allocator
1268 		 * Disable these for now as we're not using and they
1269 		 * chew up a lot of IOMMU space per nic.
1270 		 */
1271 
1272 #if 0
1273 		ret = _gnix_mbox_allocator_create(nic,
1274 						  NULL,
1275 						  GNIX_PAGE_2MB,
1276 						  65536,
1277 						  512,
1278 						  &nic->s_rdma_buf_hndl);
1279 		if (ret != FI_SUCCESS) {
1280 			GNIX_WARN(FI_LOG_EP_CTRL,
1281 				  "_gnix_mbox_alloc returned %s\n",
1282 				  fi_strerror(-ret));
1283 			_gnix_dump_gni_res(domain->ptag);
1284 			goto err1;
1285 		}
1286 
1287 		ret = _gnix_mbox_allocator_create(nic,
1288 						  NULL,
1289 						  GNIX_PAGE_2MB,
1290 						  65536,
1291 						  512,
1292 						  &nic->r_rdma_buf_hndl);
1293 		if (ret != FI_SUCCESS) {
1294 			GNIX_WARN(FI_LOG_EP_CTRL,
1295 				  "_gnix_mbox_alloc returned %s\n",
1296 				  fi_strerror(-ret));
1297 			_gnix_dump_gni_res(domain->ptag);
1298 			goto err1;
1299 		}
1300 #endif
1301 
1302 		ret =  __nic_setup_irq_cq(nic);
1303 		if (ret != FI_SUCCESS) {
1304 			GNIX_WARN(FI_LOG_EP_CTRL,
1305 				  "__nic_setup_irq_cq returned %s\n",
1306 				  fi_strerror(-ret));
1307 			_gnix_dump_gni_res(auth_key->ptag);
1308 			goto err1;
1309 		}
1310 
1311 		/*
1312  		 * if the domain is using PROGRESS_AUTO for data, set up
1313  		 * a progress thread.
1314  		 */
1315 
1316 		if (domain->data_progress == FI_PROGRESS_AUTO) {
1317 
1318 			/*
1319 			 * tell CLE job container that next thread should be
1320 			 * runnable anywhere in the cpuset, don't treat as
1321 			 * an error if one is returned, may have perf issues
1322 			 * though...
1323 			 */
1324 
1325 			ret = _gnix_get_num_corespec_cpus(&num_corespec_cpus);
1326 			if (ret != FI_SUCCESS) {
1327 				GNIX_WARN(FI_LOG_EP_CTRL,
1328 				  "failed to get num corespec cpus\n");
1329 			}
1330 			if (num_corespec_cpus > 0) {
1331 				ret = _gnix_job_disable_affinity_apply();
1332 			} else {
1333 				ret = _gnix_job_enable_unassigned_cpus();
1334 			}
1335 			if (ret != 0)
1336 				GNIX_WARN(FI_LOG_EP_CTRL,
1337 				"job_disable/unassigned cpus returned %d\n",
1338 					 ret);
1339 
1340 			ret = pthread_create(&nic->progress_thread,
1341 					     NULL,
1342 					     __gnix_nic_prog_thread_fn,
1343 					     (void *)nic);
1344 			if (ret)
1345 				GNIX_WARN(FI_LOG_EP_CTRL,
1346 				"pthread_create call returned %d\n", ret);
1347 		}
1348 
1349 		dlist_insert_tail(&nic->gnix_nic_list, &gnix_nic_list);
1350 		dlist_insert_tail(&nic->ptag_nic_list,
1351 				  &gnix_nic_list_ptag[auth_key->ptag]);
1352 
1353 		nic->smsg_callbacks = gnix_ep_smsg_callbacks;
1354 
1355 		++gnix_nics_per_ptag[auth_key->ptag];
1356 
1357 		GNIX_INFO(FI_LOG_EP_CTRL, "Allocated NIC:%p\n", nic);
1358 	}
1359 
1360 	if (nic) {
1361 		nic->requires_lock = domain->thread_model != FI_THREAD_COMPLETION;
1362 		nic->using_vmdh = domain->using_vmdh;
1363 	}
1364 
1365 	*nic_ptr = nic;
1366 	goto out;
1367 
1368 err1:
1369 	ofi_atomic_dec32(&gnix_id_counter);
1370 err:
1371 	if (nic != NULL) {
1372 		__nic_teardown_irq_cq(nic);
1373 		if (nic->r_rdma_buf_hndl != NULL)
1374 			_gnix_mbox_allocator_destroy(nic->r_rdma_buf_hndl);
1375 		if (nic->s_rdma_buf_hndl != NULL)
1376 			_gnix_mbox_allocator_destroy(nic->s_rdma_buf_hndl);
1377 		if (nic->mbox_hndl != NULL)
1378 			_gnix_mbox_allocator_destroy(nic->mbox_hndl);
1379 		if (nic->rx_cq != NULL && nic->rx_cq != nic->rx_cq_blk)
1380 			GNI_CqDestroy(nic->rx_cq);
1381 		if (nic->rx_cq_blk != NULL)
1382 			GNI_CqDestroy(nic->rx_cq_blk);
1383 		if (nic->tx_cq != NULL && nic->tx_cq != nic->tx_cq_blk)
1384 			GNI_CqDestroy(nic->tx_cq);
1385 		if (nic->tx_cq_blk != NULL)
1386 			GNI_CqDestroy(nic->tx_cq_blk);
1387 		if ((nic->gni_cdm_hndl != NULL) && (nic->allocd_gni_res &
1388 		    GNIX_NIC_CDM_ALLOCD))
1389 			GNI_CdmDestroy(nic->gni_cdm_hndl);
1390 		if (free_list_inited == true)
1391 			_gnix_fl_destroy(&nic->vc_freelist);
1392 		free(nic);
1393 	}
1394 
1395 out:
1396 	pthread_mutex_unlock(&gnix_nic_list_lock);
1397 	return ret;
1398 }
1399 
_gnix_nic_init(void)1400 void _gnix_nic_init(void)
1401 {
1402 	int i, rc;
1403 
1404 	for (i = 0; i < GNI_PTAG_MAX; i++) {
1405 		dlist_init(&gnix_nic_list_ptag[i]);
1406 	}
1407 
1408 	rc = _gnix_nics_per_rank(&gnix_max_nics_per_ptag);
1409 	if (rc == FI_SUCCESS) {
1410 		GNIX_DEBUG(FI_LOG_FABRIC, "gnix_max_nics_per_ptag: %u\n",
1411 			   gnix_max_nics_per_ptag);
1412 	} else {
1413 		GNIX_WARN(FI_LOG_FABRIC, "_gnix_nics_per_rank failed: %d\n",
1414 			  rc);
1415 	}
1416 
1417 	if (getenv("GNIX_MAX_NICS") != NULL)
1418 		gnix_max_nics_per_ptag = atoi(getenv("GNIX_MAX_NICS"));
1419 
1420 	/*
1421 	 * Well if we didn't get 1 nic, that means we must really be doing
1422 	 * FMA sharing.
1423 	 */
1424 
1425 	if (gnix_max_nics_per_ptag == 0) {
1426 		gnix_max_nics_per_ptag = 1;
1427 		GNIX_WARN(FI_LOG_FABRIC, "Using inter-procss FMA sharing\n");
1428 	}
1429 }
1430 
1431