1 /*
2  * Copyright (c) 2015-2016 Cray Inc.  All rights reserved.
3  * Copyright (c) 2015-2017 Los Alamos National Security, LLC.
4  *                         All rights reserved.
5  * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved.
6  *
7  * This software is available to you under a choice of one of two
8  * licenses.  You may choose to be licensed under the terms of the GNU
9  * General Public License (GPL) Version 2, available from the file
10  * COPYING in the main directory of this source tree, or the
11  * BSD license below:
12  *
13  *     Redistribution and use in source and binary forms, with or
14  *     without modification, are permitted provided that the following
15  *     conditions are met:
16  *
17  *      - Redistributions of source code must retain the above
18  *        copyright notice, this list of conditions and the following
19  *        disclaimer.
20  *
21  *      - Redistributions in binary form must reproduce the above
22  *        copyright notice, this list of conditions and the following
23  *        disclaimer in the documentation and/or other materials
24  *        provided with the distribution.
25  *
26  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
27  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
28  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
29  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
30  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
31  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
32  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33  * SOFTWARE.
34  */
35 
36 #if HAVE_CONFIG_H
37 #  include <config.h>
38 #endif /* HAVE_CONFIG_H */
39 
40 #include <errno.h>
41 #include <fcntl.h>
42 #include <poll.h>
43 #include <stdlib.h>
44 #include <string.h>
45 #include <stdio.h>
46 #include <assert.h>
47 #include <pthread.h>
48 #include <signal.h>
49 
50 #include <rdma/fabric.h>
51 #include <rdma/fi_cm.h>
52 #include <rdma/fi_domain.h>
53 #include <rdma/fi_endpoint.h>
54 #include <rdma/fi_rma.h>
55 #include <rdma/fi_errno.h>
56 
57 #include <rdma/providers/fi_prov.h>
58 
59 #include "gnix.h"
60 #include "gnix_datagram.h"
61 #include "gnix_util.h"
62 #include "gnix_cm_nic.h"
63 #include "gnix_nic.h"
64 
65 
66 /*******************************************************************************
67  * Helper functions.
68  ******************************************************************************/
69 
70 /*
71  * this function is intended to be invoked as an argument to pthread_create,
72  */
_gnix_dgram_prog_thread_fn(void * the_arg)73 static void *_gnix_dgram_prog_thread_fn(void *the_arg)
74 {
75 	int ret = FI_SUCCESS, prev_state;
76 	struct gnix_dgram_hndl *the_hndl = (struct gnix_dgram_hndl *)the_arg;
77 	sigset_t  sigmask;
78 
79 	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");
80 
81 	/*
82 	 * temporarily disable cancelability while we set up
83 	 * some stuff
84 	 */
85 
86 	pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &prev_state);
87 
88 	/*
89 	 * help out Cray core-spec, say we're not an app thread
90 	 * and can be run on core-spec cpus.
91 	 */
92 
93 	ret = _gnix_task_is_not_app();
94 	if (ret)
95 		GNIX_WARN(FI_LOG_EP_CTRL,
96 		"_gnix_task_is_not_app call returned %d\n", ret);
97 
98 	/*
99 	 * block all signals, don't want this thread to catch
100 	 * signals that may be for app threads
101 	 */
102 
103 	memset(&sigmask, 0, sizeof(sigset_t));
104 	ret = sigfillset(&sigmask);
105 	if (ret) {
106 		GNIX_WARN(FI_LOG_EP_CTRL,
107 		"sigfillset call returned %d\n", ret);
108 	} else {
109 
110 		ret = pthread_sigmask(SIG_SETMASK,
111 					&sigmask, NULL);
112 		if (ret)
113 			GNIX_WARN(FI_LOG_EP_CTRL,
114 			"pthread_sigmask call returned %d\n", ret);
115 	}
116 
117 	/*
118 	 * okay now we're ready to be cancelable.
119 	 */
120 
121 	pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &prev_state);
122 
123 	pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
124 
125 retry:
126 	ret = _gnix_dgram_poll(the_hndl, GNIX_DGRAM_BLOCK);
127 	if ((ret == -FI_ETIMEDOUT) || (ret == FI_SUCCESS))
128 		goto retry;
129 
130 	GNIX_WARN(FI_LOG_EP_CTRL,
131 		"_gnix_dgram_poll returned %s\n", fi_strerror(-ret));
132 
133 	/*
134 	 * TODO: need to be able to enqueue events on to the
135 	 * ep associated with the cm_nic.
136 	 */
137 	return NULL;
138 }
139 
140 /*******************************************************************************
141  * API function implementations.
142  ******************************************************************************/
143 
144 /*
145  * function to pack data into datagram in/out buffers.
146  * On success, returns number of bytes packed in to the buffer,
147  * otherwise -FI errno.
148  */
_gnix_dgram_pack_buf(struct gnix_datagram * d,enum gnix_dgram_buf buf,void * data,uint32_t nbytes)149 ssize_t _gnix_dgram_pack_buf(struct gnix_datagram *d, enum gnix_dgram_buf buf,
150 			 void *data, uint32_t nbytes)
151 {
152 	char *dptr;
153 	uint32_t index;
154 
155 	assert(d != NULL);
156 	if (buf == GNIX_DGRAM_IN_BUF) {
157 		index = d->w_index_in_buf;
158 		dptr = &d->dgram_in_buf[index];
159 	} else {
160 		index = d->w_index_out_buf;
161 		dptr = &d->dgram_out_buf[index];
162 	}
163 
164 	/*
165 	 * make sure there's room
166 	 */
167 	if ((index + nbytes) > GNI_DATAGRAM_MAXSIZE)
168 		return -FI_ENOSPC;
169 
170 	memcpy(dptr, data, nbytes);
171 
172 	if (buf == GNIX_DGRAM_IN_BUF)
173 		d->w_index_in_buf += nbytes;
174 	else
175 		d->w_index_out_buf += nbytes;
176 
177 	return nbytes;
178 }
179 
180 
181 /*
182  * function to unpack data from datagram in/out buffers.
183  * On success, returns number of bytes unpacked,
184  * otherwise -FI errno.
185  */
_gnix_dgram_unpack_buf(struct gnix_datagram * d,enum gnix_dgram_buf buf,void * data,uint32_t nbytes)186 ssize_t _gnix_dgram_unpack_buf(struct gnix_datagram *d, enum gnix_dgram_buf buf,
187 			   void *data, uint32_t nbytes)
188 {
189 	char *dptr;
190 	uint32_t index, bytes_left;
191 
192 	assert(d != NULL);
193 	if (buf == GNIX_DGRAM_IN_BUF) {
194 		index = d->r_index_in_buf;
195 		dptr = &d->dgram_in_buf[index];
196 	} else {
197 		index = d->r_index_out_buf;
198 		dptr = &d->dgram_out_buf[index];
199 	}
200 
201 	/*
202 	 * only copy out up to GNI_DATAGRAM_MAXSIZE
203 	 */
204 
205 	bytes_left = GNI_DATAGRAM_MAXSIZE - index;
206 
207 	nbytes = (nbytes > bytes_left) ? bytes_left : nbytes;
208 
209 	memcpy(data, dptr, nbytes);
210 
211 	if (buf == GNIX_DGRAM_IN_BUF)
212 		d->r_index_in_buf += nbytes;
213 	else
214 		d->r_index_out_buf += nbytes;
215 
216 	return nbytes;
217 }
218 
219 /*
220  * function to rewind the internal pointers to
221  * datagram in/out buffers.
222  */
_gnix_dgram_rewind_buf(struct gnix_datagram * d,enum gnix_dgram_buf buf)223 int _gnix_dgram_rewind_buf(struct gnix_datagram *d, enum gnix_dgram_buf buf)
224 {
225 	assert(d != NULL);
226 	if (buf == GNIX_DGRAM_IN_BUF) {
227 		d->r_index_in_buf = 0;
228 		d->w_index_in_buf = 0;
229 	} else {
230 		d->r_index_out_buf = 0;
231 		d->w_index_out_buf = 0;
232 	}
233 	return FI_SUCCESS;
234 }
235 
_gnix_dgram_alloc(struct gnix_dgram_hndl * hndl,enum gnix_dgram_type type,struct gnix_datagram ** d_ptr)236 int _gnix_dgram_alloc(struct gnix_dgram_hndl *hndl, enum gnix_dgram_type type,
237 			struct gnix_datagram **d_ptr)
238 {
239 	int ret = -FI_EAGAIN;
240 	struct gnix_datagram *d = NULL;
241 	struct dlist_entry *the_free_list;
242 	struct dlist_entry *the_active_list;
243 
244 	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");
245 
246 	fastlock_acquire(&hndl->lock);
247 
248 	if (type == GNIX_DGRAM_WC) {
249 		the_free_list = &hndl->wc_dgram_free_list;
250 		the_active_list = &hndl->wc_dgram_active_list;
251 	} else {
252 		the_free_list = &hndl->bnd_dgram_free_list;
253 		the_active_list = &hndl->bnd_dgram_active_list;
254 	}
255 
256 	if (!dlist_empty(the_free_list)) {
257 		d = dlist_first_entry(the_free_list, struct gnix_datagram,
258 				      list);
259 		if (d != NULL) {
260 			dlist_remove_init(&d->list);
261 			dlist_insert_head(&d->list, the_active_list);
262 			d->type = type;
263 			ret = FI_SUCCESS;
264 		}
265 
266 	}
267 
268 	fastlock_release(&hndl->lock);
269 
270 	if (d != NULL) {
271 		d->r_index_in_buf = 0;
272 		d->w_index_in_buf = 0;
273 		d->w_index_in_buf = 0;
274 		d->w_index_out_buf = 0;
275 	}
276 
277 	*d_ptr = d;
278 	return ret;
279 }
280 
_gnix_dgram_free(struct gnix_datagram * d)281 int _gnix_dgram_free(struct gnix_datagram *d)
282 {
283 	int ret = FI_SUCCESS;
284 	gni_return_t status;
285 
286 	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");
287 
288 	if (d->type == GNIX_DGRAM_BND) {
289 		status = GNI_EpUnbind(d->gni_ep);
290 		if (status != GNI_RC_SUCCESS) {
291 			/* TODO: have to handle this */
292 			GNIX_FATAL(FI_LOG_EP_CTRL,
293 				   "GNI_EpUnbind returned %s (ep=%p)\n",
294 				   gni_err_str[status], d->gni_ep);
295 		}
296 	}
297 
298 	fastlock_acquire(&d->d_hndl->lock);
299 	dlist_remove_init(&d->list);
300 	d->state = GNIX_DGRAM_STATE_FREE;
301 	dlist_insert_head(&d->list, d->free_list_head);
302 	fastlock_release(&d->d_hndl->lock);
303 	return ret;
304 }
305 
_gnix_dgram_wc_post(struct gnix_datagram * d)306 int _gnix_dgram_wc_post(struct gnix_datagram *d)
307 {
308 	int ret = FI_SUCCESS;
309 	gni_return_t status;
310 	struct gnix_nic *nic = d->cm_nic->nic;
311 
312 	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");
313 
314 	COND_ACQUIRE(nic->requires_lock, &nic->lock);
315 	status = GNI_EpPostDataWId(d->gni_ep,
316 				   d->dgram_in_buf,
317 				   GNI_DATAGRAM_MAXSIZE,
318 				   d->dgram_out_buf,
319 				   GNI_DATAGRAM_MAXSIZE,
320 				   (uint64_t)d);
321 	if (status != GNI_RC_SUCCESS) {
322 		ret = gnixu_to_fi_errno(status);
323 	} else {
324 		/*
325 		 * datagram is active now, listening
326 		 */
327 		d->state = GNIX_DGRAM_STATE_ACTIVE;
328 	}
329 	COND_RELEASE(nic->requires_lock, &nic->lock);
330 
331 	return ret;
332 }
333 
_gnix_dgram_bnd_post(struct gnix_datagram * d)334 int _gnix_dgram_bnd_post(struct gnix_datagram *d)
335 {
336 	gni_return_t status = GNI_RC_SUCCESS;
337 	int ret = FI_SUCCESS;
338 	struct gnix_nic *nic = d->cm_nic->nic;
339 	int post = 1;
340 
341 	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");
342 
343 	/*
344 	 * bind the datagram ep
345 	 */
346 
347 	status = GNI_EpBind(d->gni_ep,
348 			    d->target_addr.device_addr,
349 			    d->target_addr.cdm_id);
350 	if (status != GNI_RC_SUCCESS) {
351 		GNIX_WARN(FI_LOG_EP_CTRL,
352 			"GNI_EpBind returned %s\n", gni_err_str[status]);
353 		ret = gnixu_to_fi_errno(status);
354 		goto err;
355 	}
356 
357 	COND_ACQUIRE(nic->requires_lock, &nic->lock);
358 	if (d->pre_post_clbk_fn != NULL) {
359 		ret = d->pre_post_clbk_fn(d, &post);
360 		if (ret != FI_SUCCESS)
361 			GNIX_WARN(FI_LOG_EP_CTRL,
362 				"pre_post_callback_fn: %d\n",
363 				ret);
364 	}
365 
366 	if (post) {
367 		/*
368 		 * if we get GNI_RC_ERROR_RESOURCE status return from
369 		 * GNI_EpPostDataWId  that means that either a previously posted
370 		 * wildcard datagram has matched up with an incoming
371 		 * bound datagram or we have a previously posted bound
372 		 * datagram whose transfer to the target node has
373 		 * not yet completed.  Don't treat this case as an error.
374 		 */
375 		status = GNI_EpPostDataWId(d->gni_ep,
376 					   d->dgram_in_buf,
377 					   GNI_DATAGRAM_MAXSIZE,
378 					   d->dgram_out_buf,
379 					   GNI_DATAGRAM_MAXSIZE,
380 					   (uint64_t)d);
381 		if (d->post_post_clbk_fn != NULL) {
382 			ret = d->post_post_clbk_fn(d, status);
383 			if (ret != FI_SUCCESS)
384 				GNIX_WARN(FI_LOG_EP_CTRL,
385 				"post_post_callback_fn: %d\n",
386 				ret);
387 		}
388 	}
389 
390 	COND_RELEASE(nic->requires_lock, &nic->lock);
391 
392 	if (post) {
393 		if ((status != GNI_RC_SUCCESS) &&
394 			(status != GNI_RC_ERROR_RESOURCE)) {
395 				GNIX_WARN(FI_LOG_EP_CTRL,
396 				    "GNI_EpPostDataWId returned %s\n",
397 				     gni_err_str[status]);
398 				ret = gnixu_to_fi_errno(status);
399 				goto err;
400 		}
401 
402 		if (status == GNI_RC_SUCCESS) {
403 			/*
404 			 * datagram is active now, connecting
405 			 */
406 			d->state = GNIX_DGRAM_STATE_ACTIVE;
407 		} else {
408 			ret = -FI_EBUSY;
409 		}
410 	}
411 
412 err:
413 	return ret;
414 }
415 
_gnix_dgram_poll(struct gnix_dgram_hndl * hndl,enum gnix_dgram_poll_type type)416 int  _gnix_dgram_poll(struct gnix_dgram_hndl *hndl,
417 			enum gnix_dgram_poll_type type)
418 {
419 	int ret = FI_SUCCESS;
420 	gni_return_t status;
421 	gni_post_state_t post_state = GNI_POST_PENDING;
422 	uint32_t responding_remote_id;
423 	uint32_t timeout = -1;
424 	unsigned int responding_remote_addr;
425 	struct gnix_datagram *dg_ptr;
426 	uint64_t datagram_id = 0UL;
427 	struct gnix_cm_nic *cm_nic = NULL;
428 	struct gnix_nic *nic = NULL;
429 	struct gnix_address responding_addr;
430 
431 	cm_nic = hndl->cm_nic;
432 	assert(cm_nic != NULL);
433 	nic = cm_nic->nic;
434 	assert(nic != NULL);
435 
436 	if (type == GNIX_DGRAM_BLOCK) {
437 		if (hndl->timeout_needed &&
438 			(hndl->timeout_needed(hndl->timeout_data) == true))
439 				timeout = hndl->timeout;
440 
441 		status = GNI_PostdataProbeWaitById(nic->gni_nic_hndl,
442 						   timeout,
443 						   &datagram_id);
444 		if ((status != GNI_RC_SUCCESS) &&
445 			(status  != GNI_RC_TIMEOUT)) {
446 			GNIX_WARN(FI_LOG_EP_CTRL,
447 				"GNI_PostdataProbeWaitById returned %s\n",
448 					gni_err_str[status]);
449 			ret = gnixu_to_fi_errno(status);
450 			goto err;
451 		}
452 	} else {
453 		status = GNI_PostDataProbeById(nic->gni_nic_hndl,
454 						   &datagram_id);
455 		if ((status != GNI_RC_SUCCESS) &&
456 			(status  != GNI_RC_NO_MATCH)) {
457 			GNIX_WARN(FI_LOG_EP_CTRL,
458 				"GNI_PostdataProbeById returned %s\n",
459 					gni_err_str[status]);
460 			ret = gnixu_to_fi_errno(status);
461 			goto err;
462 		}
463 	}
464 
465 	switch (status) {
466 	case GNI_RC_SUCCESS:
467 		dg_ptr = (struct gnix_datagram *)datagram_id;
468 		assert(dg_ptr != NULL);
469 
470 		/*
471 		 * do need to take lock here
472 		 */
473 		COND_ACQUIRE(nic->requires_lock, &nic->lock);
474 
475 		status = GNI_EpPostDataTestById(dg_ptr->gni_ep,
476 						datagram_id,
477 						&post_state,
478 						&responding_remote_addr,
479 						&responding_remote_id);
480 		if ((status != GNI_RC_SUCCESS) &&
481 			(status !=GNI_RC_NO_MATCH)) {
482 			GNIX_WARN(FI_LOG_EP_CTRL,
483 				"GNI_EpPostDataTestById:  %s\n",
484 					gni_err_str[status]);
485 			ret = gnixu_to_fi_errno(status);
486 			COND_RELEASE(nic->requires_lock, &nic->lock);
487 			goto err;
488 		} else {
489 			if ((status == GNI_RC_SUCCESS) &&
490 			     (dg_ptr->state != GNIX_DGRAM_STATE_ACTIVE)) {
491 				GNIX_DEBUG(FI_LOG_EP_CTRL,
492 					"GNI_EpPostDataTestById ",
493 					"returned success but dgram not active\n");
494 			}
495 		}
496 
497 		COND_RELEASE(nic->requires_lock, &nic->lock);
498 
499 		/*
500 		 * no match is okay, it means another thread
501 		 * won the race to get this datagram
502 		 */
503 
504 		if (status == GNI_RC_NO_MATCH) {
505 			ret = FI_SUCCESS;
506 			goto err;
507 		}
508 
509 		/*
510 		 * pass COMPLETED and error post state cases to
511 		 * callback function if present.  If a callback funciton
512 		 * is not present, the error states set ret to -FI_EIO.
513 		 *
514 		 * TODO should we also pass pending,remote_data states to
515 		 * the callback?  maybe useful for debugging weird
516 		 * datagram problems?
517 		 */
518 		switch (post_state) {
519 		case GNI_POST_TIMEOUT:
520 		case GNI_POST_TERMINATED:
521 		case GNI_POST_ERROR:
522 			ret = -FI_EIO;
523 			break;
524 		case GNI_POST_COMPLETED:
525 			if (dg_ptr->callback_fn != NULL) {
526 				responding_addr.device_addr =
527 					responding_remote_addr;
528 				responding_addr.cdm_id =
529 					responding_remote_id;
530 				ret = dg_ptr->callback_fn((void *)datagram_id,
531 							responding_addr,
532 							post_state);
533 			}
534 			break;
535 		case GNI_POST_PENDING:
536 		case GNI_POST_REMOTE_DATA:
537 			break;
538 		default:
539 			GNIX_FATAL(FI_LOG_EP_CTRL, "Invalid post_state: %d\n",
540 				   post_state);
541 			break;
542 		}
543 		break;
544 	case GNI_RC_TIMEOUT:
545 		/* call progress function */
546 		if (hndl->timeout_progress)
547 			hndl->timeout_progress(hndl->timeout_data);
548 		break;
549 	case GNI_RC_NO_MATCH:
550 		break;
551 	default:
552 		/* an error */
553 		break;
554 	}
555 
556 err:
557 	return ret;
558 }
559 
_gnix_dgram_hndl_alloc(struct gnix_cm_nic * cm_nic,enum fi_progress progress,const struct gnix_dgram_hndl_attr * attr,struct gnix_dgram_hndl ** hndl_ptr)560 int _gnix_dgram_hndl_alloc(struct gnix_cm_nic *cm_nic,
561 			   enum fi_progress progress,
562 			   const struct gnix_dgram_hndl_attr *attr,
563 			   struct gnix_dgram_hndl **hndl_ptr)
564 {
565 	int i, ret = FI_SUCCESS;
566 	int n_dgrams_tot;
567 	struct gnix_datagram *dgram_base = NULL, *dg_ptr;
568 	struct gnix_dgram_hndl *the_hndl = NULL;
569 	struct gnix_fid_domain *dom = cm_nic->domain;
570 	struct gnix_fid_fabric *fabric = NULL;
571 	struct gnix_nic *nic;
572 	gni_return_t status;
573 	uint32_t num_corespec_cpus = 0;
574 
575 	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");
576 
577 	nic = cm_nic->nic;
578 
579 	if (dom == NULL)
580 		return -FI_EINVAL;
581 
582 	fabric = dom->fabric;
583 
584 	the_hndl = calloc(1, sizeof(struct gnix_dgram_hndl));
585 	if (the_hndl == NULL) {
586 		ret = -FI_ENOMEM;
587 		goto err;
588 	}
589 
590 	the_hndl->cm_nic = cm_nic;
591 
592 	dlist_init(&the_hndl->bnd_dgram_free_list);
593 	dlist_init(&the_hndl->bnd_dgram_active_list);
594 
595 	dlist_init(&the_hndl->wc_dgram_free_list);
596 	dlist_init(&the_hndl->wc_dgram_active_list);
597 
598 	the_hndl->timeout = -1;
599 
600 	/*
601 	 * inherit some stuff from the fabric object being
602 	 * used to open the domain which will use this cm nic.
603 	 */
604 
605 	the_hndl->n_dgrams = fabric->n_bnd_dgrams;
606 	the_hndl->n_wc_dgrams = fabric->n_wc_dgrams;
607 	fastlock_init(&the_hndl->lock);
608 
609 	n_dgrams_tot = the_hndl->n_dgrams + the_hndl->n_wc_dgrams;
610 
611 	/*
612 	 * set up the free lists for datagrams
613 	 */
614 
615 	dgram_base = calloc(n_dgrams_tot,
616 			    sizeof(struct gnix_datagram));
617 	if (dgram_base == NULL) {
618 		ret = -FI_ENOMEM;
619 		goto err;
620 	}
621 
622 	dg_ptr = dgram_base;
623 
624 	/*
625 	 * first build up the list for connection requests
626 	 */
627 
628 	for (i = 0; i < fabric->n_bnd_dgrams; i++, dg_ptr++) {
629 		dg_ptr->d_hndl = the_hndl;
630 		dg_ptr->cm_nic = cm_nic;
631 		status = GNI_EpCreate(nic->gni_nic_hndl,
632 					NULL,
633 					&dg_ptr->gni_ep);
634 		if (status != GNI_RC_SUCCESS) {
635 			ret = gnixu_to_fi_errno(status);
636 			goto err;
637 		}
638 		dlist_node_init(&dg_ptr->list);
639 		dlist_insert_head(&dg_ptr->list,
640 				  &the_hndl->bnd_dgram_free_list);
641 		dg_ptr->free_list_head = &the_hndl->bnd_dgram_free_list;
642 	}
643 
644 	/*
645 	 * now the wild card (WC) dgrams
646 	 */
647 
648 	for (i = 0; i < fabric->n_wc_dgrams; i++, dg_ptr++) {
649 		dg_ptr->d_hndl = the_hndl;
650 		dg_ptr->cm_nic = cm_nic;
651 		status = GNI_EpCreate(nic->gni_nic_hndl,
652 					NULL,
653 					&dg_ptr->gni_ep);
654 		if (status != GNI_RC_SUCCESS) {
655 			ret = gnixu_to_fi_errno(status);
656 			goto err;
657 		}
658 		dlist_node_init(&dg_ptr->list);
659 		dlist_insert_head(&dg_ptr->list, &the_hndl->wc_dgram_free_list);
660 		dg_ptr->free_list_head = &the_hndl->wc_dgram_free_list;
661 	}
662 
663 	/*
664 	 * check the progress model, if FI_PROGRESS_AUTO, fire off
665 	 * a progress thread
666 	 */
667 
668 	if (progress == FI_PROGRESS_AUTO) {
669 
670 		if (attr != NULL) {
671 			the_hndl->timeout_needed = attr->timeout_needed;
672 			the_hndl->timeout_progress = attr->timeout_progress;
673 			the_hndl->timeout_data = attr->timeout_data;
674 			the_hndl->timeout = attr->timeout;
675 		}
676 
677 		/*
678 		 * tell CLE job container that next thread should be
679 		 * runnable anywhere in the cpuset, don't treat as
680 		 * an error if one is returned, may have perf issues
681 		 * though...
682 		 */
683 
684 		ret = _gnix_get_num_corespec_cpus(&num_corespec_cpus);
685 		if (ret != FI_SUCCESS) {
686 			GNIX_WARN(FI_LOG_EP_CTRL,
687 				  "failed to get num corespec cpus\n");
688 		}
689 
690 		if (num_corespec_cpus > 0) {
691 			ret = _gnix_job_disable_affinity_apply();
692 		} else {
693 			ret = _gnix_job_enable_unassigned_cpus();
694 		}
695 		if (ret != 0)
696 			GNIX_WARN(FI_LOG_EP_CTRL,
697 			"disable_affinity/unassigned_cpus call returned %d\n",
698 			ret);
699 
700 		ret = pthread_create(&the_hndl->progress_thread,
701 				     NULL,
702 				     _gnix_dgram_prog_thread_fn,
703 				     (void *)the_hndl);
704 		if (ret) {
705 			GNIX_WARN(FI_LOG_EP_CTRL,
706 			"pthread_ceate  call returned %d\n", ret);
707 			goto err1;
708 		}
709 	}
710 
711 	the_hndl->dgram_base = dgram_base;
712 
713 	*hndl_ptr = the_hndl;
714 
715 	return ret;
716 
717 err1:
718 
719 err:
720 	dg_ptr = dgram_base;
721 	if (dg_ptr) {
722 
723 		for (i = 0; i < n_dgrams_tot; i++, dg_ptr++) {
724 			if (dg_ptr->gni_ep != NULL)
725 				GNI_EpDestroy(dg_ptr->gni_ep);
726 		}
727 		free(dgram_base);
728 	}
729 	if (the_hndl)
730 		free(the_hndl);
731 	return ret;
732 }
733 
_gnix_dgram_hndl_free(struct gnix_dgram_hndl * the_hndl)734 int _gnix_dgram_hndl_free(struct gnix_dgram_hndl *the_hndl)
735 {
736 	int i;
737 	int n_dgrams;
738 	int ret = FI_SUCCESS;
739 	struct gnix_datagram *p, *next, *dg_ptr;
740 	gni_return_t status;
741 
742 	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");
743 
744 	if (the_hndl->dgram_base == NULL) {
745 		ret = -FI_EINVAL;
746 		goto err;
747 	}
748 
749 	/*
750 	 * cancel any active datagrams - GNI_RC_NO_MATCH is okay.
751 	 */
752 	dlist_for_each_safe(&the_hndl->bnd_dgram_active_list, p, next, list) {
753 		dg_ptr = p;
754 		if (dg_ptr->state != GNIX_DGRAM_STATE_FREE) {
755 			status = GNI_EpPostDataCancel(dg_ptr->gni_ep);
756 			if ((status != GNI_RC_SUCCESS) &&
757 					(status != GNI_RC_NO_MATCH)) {
758 				ret = gnixu_to_fi_errno(status);
759 				goto err;
760 			}
761 		}
762 		dlist_remove_init(&dg_ptr->list);
763 	}
764 
765 	dlist_for_each_safe(&the_hndl->wc_dgram_active_list, p, next, list) {
766 		dg_ptr = p;
767 		if (dg_ptr->state == GNIX_DGRAM_STATE_FREE) {
768 			status = GNI_EpPostDataCancel(dg_ptr->gni_ep);
769 			if ((status != GNI_RC_SUCCESS) &&
770 					(status != GNI_RC_NO_MATCH)) {
771 				ret = gnixu_to_fi_errno(status);
772 				goto err;
773 			}
774 		}
775 		dlist_remove_init(&dg_ptr->list);
776 	}
777 
778 	/*
779 	 * destroy all the endpoints
780 	 */
781 
782 	n_dgrams = the_hndl->n_dgrams + the_hndl->n_wc_dgrams;
783 	dg_ptr = the_hndl->dgram_base;
784 
785 	for (i = 0; i < n_dgrams; i++, dg_ptr++) {
786 		if (dg_ptr->gni_ep != NULL)
787 			GNI_EpDestroy(dg_ptr->gni_ep);
788 	}
789 
790 	/*
791 	 * cancel the progress thread, if any
792 	 */
793 
794 	if (the_hndl->progress_thread) {
795 
796 		ret = pthread_cancel(the_hndl->progress_thread);
797 		if ((ret != 0) && (ret != ESRCH)) {
798 			GNIX_WARN(FI_LOG_EP_CTRL,
799 			"pthread_cancel returned %d\n", ret);
800 			goto err;
801 		}
802 
803 		ret = pthread_join(the_hndl->progress_thread,
804 				   NULL);
805 		if ((ret != 0) && (ret != ESRCH)) {
806 			GNIX_WARN(FI_LOG_EP_CTRL,
807 			"pthread_join returned %d\n", ret);
808 			goto err;
809 		}
810 
811 		GNIX_INFO(FI_LOG_EP_CTRL, "pthread_join returned %d\n", ret);
812 	}
813 err:
814 	if (ret != FI_SUCCESS)
815 		GNIX_INFO(FI_LOG_EP_CTRL, "returning error %d\n", ret);
816 	free(the_hndl->dgram_base);
817 	free(the_hndl);
818 
819 	return ret;
820 }
821