1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3  * Copyright (c) 2018      Los Alamos National Security, LLC. All rights
4  *                         reserved.
5  * Copyright (c) 2018      Research Organization for Information Science
6  *                         and Technology (RIST). All rights reserved.
7  * Copyright (c) 2018      Triad National Security, LLC. All rights
8  *                         reserved.
9  * Copyright (c) 2019      Google, LLC. All rights reserved.
10  * $COPYRIGHT$
11  *
12  * Additional copyrights may follow
13  *
14  * $HEADER$
15  */
16 
17 #include "btl_uct_device_context.h"
18 #include "btl_uct_am.h"
19 #include "opal/util/bit_ops.h"
20 #include "opal/util/argv.h"
21 
/* Callback flag used when installing the active-message handler on additional
 * contexts: UCT_CB_FLAG_SYNC when the UCT headers declare it, otherwise 0. */
#if HAVE_DECL_UCT_CB_FLAG_SYNC
#define MCA_BTL_UCT_CB_FLAG_SYNC UCT_CB_FLAG_SYNC
#else
#define MCA_BTL_UCT_CB_FLAG_SYNC 0
#endif
27 
28 /**
29  * @brief Convert UCT capabilities to BTL flags
30  */
31 static uint64_t mca_btl_uct_cap_to_btl_flag[][2] = {
32     {UCT_IFACE_FLAG_AM_SHORT, MCA_BTL_FLAGS_SEND},
33     {UCT_IFACE_FLAG_PUT_ZCOPY, MCA_BTL_FLAGS_PUT},
34     {UCT_IFACE_FLAG_GET_ZCOPY, MCA_BTL_FLAGS_GET},
35     {0,0},
36 };
37 
38 /**
39  * @brief Convert UCT capability flags to BTL flags
40  *
41  * @param[in] cap_flags  UCT capability flags
42  *
43  * @returns equivalent BTL flags
44  */
mca_btl_uct_module_flags(uint64_t cap_flags)45 static int32_t mca_btl_uct_module_flags (uint64_t cap_flags)
46 {
47     uint32_t flags = 0;
48 
49     for (int i = 0 ; mca_btl_uct_cap_to_btl_flag[i][0] > 0 ; ++i) {
50         if (cap_flags & mca_btl_uct_cap_to_btl_flag[i][0]) {
51             flags |= (uint32_t) mca_btl_uct_cap_to_btl_flag[i][1];
52         }
53     }
54     return flags;
55 }
56 
57 #if OPAL_HAVE_UCT_EP_ATOMIC64_POST
58 /**
59  * @brief Convert UCT capabilities to BTL atomic flags
60  */
61 static uint64_t mca_btl_uct_cap_to_btl_atomic_flag[][2] = {
62     {UCS_BIT(UCT_ATOMIC_OP_ADD), MCA_BTL_ATOMIC_SUPPORTS_ADD},
63     {UCS_BIT(UCT_ATOMIC_OP_AND), MCA_BTL_ATOMIC_SUPPORTS_AND},
64     {UCS_BIT(UCT_ATOMIC_OP_OR), MCA_BTL_ATOMIC_SUPPORTS_OR},
65     {UCS_BIT(UCT_ATOMIC_OP_XOR), MCA_BTL_ATOMIC_SUPPORTS_XOR},
66     {UCS_BIT(UCT_ATOMIC_OP_SWAP), MCA_BTL_ATOMIC_SUPPORTS_SWAP},
67     {UCS_BIT(UCT_ATOMIC_OP_CSWAP), MCA_BTL_ATOMIC_SUPPORTS_CSWAP},
68     {0, },
69 };
70 
/**
 * @brief Set the atomic capabilities of a module from a transport layer
 *
 * Overwrites module->super.btl_atomic_flags based on the attributes obtained with
 * MCA_BTL_UCT_TL_ATTR(tl, 0) and, if any atomic is supported, adds the atomic
 * operation bits to module->super.btl_flags.
 *
 * @param[in,out] module  module whose flags are updated
 * @param[in]     tl      transport layer providing the attributes
 */
static void mca_btl_uct_module_set_atomic_flags (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl)
{
    uint64_t iface_caps = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags;
    /* NTH: only the fetching atomics are used for now */
    uint64_t fop_flags32 = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.atomic32.fop_flags;
    uint64_t fop_flags64 = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.atomic64.fop_flags;
    /* NTH: there is no way to separate 32-bit and 64-bit support right now so only keep
     * the operations available at both widths */
    uint64_t common_ops = fop_flags32 & fop_flags64;
    int row = 0;

    module->super.btl_atomic_flags = 0;
    if (iface_caps & UCT_IFACE_FLAG_ATOMIC_CPU) {
        module->super.btl_atomic_flags |= MCA_BTL_ATOMIC_SUPPORTS_GLOB;
    }

    while (mca_btl_uct_cap_to_btl_atomic_flag[row][0]) {
        if (common_ops & mca_btl_uct_cap_to_btl_atomic_flag[row][0]) {
            module->super.btl_atomic_flags |= mca_btl_uct_cap_to_btl_atomic_flag[row][1];
        }
        ++row;
    }

    if (0 == module->super.btl_atomic_flags) {
        /* no atomics are supported */
        return;
    }

    module->super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS;
}
99 
100 #else
101 /**
102  * @brief Convert UCT capabilities to BTL atomic flags
103  */
104 static uint64_t mca_btl_uct_cap_to_btl_atomic_flag[][2] = {
105     {UCT_IFACE_FLAG_ATOMIC_ADD64, MCA_BTL_ATOMIC_SUPPORTS_ADD},
106     {UCT_IFACE_FLAG_ATOMIC_ADD32, MCA_BTL_ATOMIC_SUPPORTS_32BIT},
107     {UCT_IFACE_FLAG_ATOMIC_CSWAP64, MCA_BTL_ATOMIC_SUPPORTS_CSWAP},
108     {UCT_IFACE_FLAG_ATOMIC_SWAP64, MCA_BTL_ATOMIC_SUPPORTS_SWAP},
109     {UCT_IFACE_FLAG_ATOMIC_CPU, MCA_BTL_ATOMIC_SUPPORTS_GLOB},
110     {0, },
111 };
112 
113 /**
114  * @brief Convert UCT capability flags to BTL atomic flags
115  *
116  * @param[in] cap_flags  UCT capability flags
117  *
118  * @returns equivalent BTL atomic flags
119  */
mca_btl_uct_module_set_atomic_flags(mca_btl_uct_module_t * module,mca_btl_uct_tl_t * tl)120 static void mca_btl_uct_module_set_atomic_flags (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl)
121 {
122     uint64_t cap_flags = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags;
123 
124     module->super.btl_atomic_flags = 0;
125 
126     for (int i = 0 ; mca_btl_uct_cap_to_btl_atomic_flag[i][0] > 0 ; ++i) {
127         if (cap_flags & mca_btl_uct_cap_to_btl_atomic_flag[i][0]) {
128             module->super.btl_atomic_flags |= (uint32_t) mca_btl_uct_cap_to_btl_atomic_flag[i][1];
129         }
130     }
131 
132     if (0 != module->super.btl_atomic_flags) {
133         /* some atomics are supported */
134         module->super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS;
135     }
136 }
137 
138 #endif
139 
/**
 * @brief Object constructor for mca_btl_uct_tl_t
 *
 * Zeroes every byte of the structure located after the base object (tl->super)
 * and then constructs the tl lock.
 */
static void mca_btl_uct_tl_constructor (mca_btl_uct_tl_t *tl)
{
    void *members = (void *) ((uintptr_t) tl + sizeof (tl->super));
    size_t members_size = sizeof (*tl) - sizeof (tl->super);

    memset (members, 0, members_size);
    OBJ_CONSTRUCT(&tl->tl_lock, opal_mutex_t);
}
145 
/**
 * @brief Object destructor for mca_btl_uct_tl_t
 *
 * Destroys every device context that was created, drops the reference on the
 * memory domain, frees the name strings and context array, releases the UCT
 * configuration (if one was read) and destructs the tl lock.
 */
static void mca_btl_uct_tl_destructor (mca_btl_uct_tl_t *tl)
{
    assert (((opal_object_t *) tl)->obj_reference_count == 0);

    /* the context array is NULL when the tl is released because allocating the array
     * failed (see mca_btl_uct_create_tl) so it must not be indexed unconditionally */
    if (NULL != tl->uct_dev_contexts) {
        for (int context_id = 0 ; context_id < MCA_BTL_UCT_MAX_WORKERS ; ++context_id) {
            if (NULL != tl->uct_dev_contexts[context_id]) {
                mca_btl_uct_context_destroy (tl->uct_dev_contexts[context_id]);
            }
        }
    }

    if (tl->uct_md) {
        OBJ_RELEASE(tl->uct_md);
    }

    free (tl->uct_dev_contexts);
    free (tl->uct_tl_name);
    free (tl->uct_dev_name);

    if (NULL != tl->uct_tl_config) {
        uct_config_release (tl->uct_tl_config);
    }

    OBJ_DESTRUCT(&tl->tl_lock);
}
170 
/* Class instance for mca_btl_uct_tl_t: parent class opal_list_item_t, using the
 * constructor and destructor named in the macro arguments. */
OBJ_CLASS_INSTANCE(mca_btl_uct_tl_t, opal_list_item_t, mca_btl_uct_tl_constructor, mca_btl_uct_tl_destructor);
172 
mca_btl_uct_conn_req_cb(void * arg,void * data,size_t length,unsigned flags)173 static ucs_status_t mca_btl_uct_conn_req_cb (void *arg, void *data, size_t length, unsigned flags)
174 {
175     mca_btl_uct_module_t *module = (mca_btl_uct_module_t *) arg;
176     mca_btl_uct_pending_connection_request_t *request = calloc (1, length + sizeof (request->super));
177 
178     /* it is not safe to process the connection request from the callback so just save it for
179      * later processing */
180     OBJ_CONSTRUCT(request, mca_btl_uct_pending_connection_request_t);
181     memcpy (&request->request_data, (void *) ((intptr_t) data + 8), length);
182     opal_fifo_push_atomic (&module->pending_connection_reqs, &request->super);
183 
184     return UCS_OK;
185 }
186 
/* Class instance for mca_btl_uct_pending_connection_request_t: parent class
 * opal_list_item_t, with no constructor or destructor (both NULL). */
OBJ_CLASS_INSTANCE(mca_btl_uct_pending_connection_request_t, opal_list_item_t, NULL, NULL);
188 
/**
 * @brief Process a connection request that was queued by the connection callback
 *
 * Looks up the endpoint for the requesting process, connects the matching tl endpoint
 * the first time a request is seen for it and, when the request type is 1 (remote
 * side ready), flags the endpoint as ready and marks the matching pending fragments
 * as ready to send.
 *
 * @param[in] module  btl uct module
 * @param[in] req     connection request to process
 *
 * @returns OPAL_SUCCESS on success, UCS_ERR_UNREACHABLE if no endpoint could be obtained,
 *          or the error returned by mca_btl_uct_endpoint_connect()
 */
int mca_btl_uct_process_connection_request (mca_btl_uct_module_t *module, mca_btl_uct_conn_req_t *req)
{
    struct opal_proc_t *remote_proc = opal_proc_for_name (req->proc_name);
    mca_btl_base_endpoint_t *endpoint = mca_btl_uct_get_ep (&module->super, remote_proc);
    mca_btl_uct_tl_endpoint_t *tl_endpoint;
    int32_t ep_flags;
    int rc;

    BTL_VERBOSE(("got connection request for endpoint %p. type = %d. context id = %d",
                 (void *) endpoint, req->type, req->context_id));

    if (NULL == endpoint) {
        BTL_ERROR(("could not create endpoint for connection request"));
        /* NOTE(review): this is a UCS status code while the other returns are OPAL codes */
        return UCS_ERR_UNREACHABLE;
    }

    assert (req->type < 2);

    /* the endpoint must only be dereferenced once it is known to be valid */
    tl_endpoint = endpoint->uct_eps[req->context_id] + req->tl_index;

    ep_flags = opal_atomic_fetch_or_32 (&tl_endpoint->flags, MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REC);

    if (!(ep_flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REC)) {
        /* create any necessary resources */
        rc = mca_btl_uct_endpoint_connect (module, endpoint, req->context_id, req->ep_addr, req->tl_index);
        if (OPAL_SUCCESS != rc && OPAL_ERR_OUT_OF_RESOURCE != rc) {
            BTL_ERROR(("could not setup rdma endpoint. rc = %d", rc));
            return rc;
        }
    }

    /* the connection is ready once we have received the connection data and also a connection ready
     * message. this might be overkill but there is little documentation at the UCT level on when
     * an endpoint can be used. */
    if (req->type == 1) {
        /* remote side is ready */
        mca_btl_uct_base_frag_t *frag;

        /* to avoid a race with send adding pending frags grab the lock here */
        OPAL_THREAD_SCOPED_LOCK(&endpoint->ep_lock,{
                BTL_VERBOSE(("connection ready. sending %" PRIsize_t " frags", opal_list_get_size (&module->pending_frags)));
                (void) opal_atomic_or_fetch_32 (&tl_endpoint->flags, MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY);
                opal_atomic_wmb ();

                OPAL_LIST_FOREACH(frag, &module->pending_frags, mca_btl_uct_base_frag_t) {
                    if (frag->context->context_id == req->context_id && endpoint == frag->endpoint) {
                        frag->ready = true;
                    }
                }
            });
    }

    return OPAL_SUCCESS;
}
241 
/**
 * @brief Install the connection request handler on the module's connection tl
 *
 * @param[in] module  btl uct module
 *
 * @returns OPAL_ERR_NOT_SUPPORTED if the module has no connection tl, OPAL_ERROR if
 *          the handler could not be installed, OPAL_SUCCESS otherwise
 */
static int mca_btl_uct_setup_connection_tl (mca_btl_uct_module_t *module)
{
    mca_btl_uct_tl_t *conn_tl = module->conn_tl;
    ucs_status_t ucs_status;

    if (NULL == conn_tl) {
        return OPAL_ERR_NOT_SUPPORTED;
    }

    /* connection requests are received on context 0 of the connection tl */
    ucs_status = uct_iface_set_am_handler (conn_tl->uct_dev_contexts[0]->uct_iface, MCA_BTL_UCT_CONNECT_RDMA,
                                           mca_btl_uct_conn_req_cb, module, UCT_CB_FLAG_ASYNC);
    if (UCS_OK == ucs_status) {
        return OPAL_SUCCESS;
    }

    BTL_ERROR(("could not set active message handler for uct tl"));

    return OPAL_ERROR;
}
258 
/**
 * @brief Enable send and receive progress on a device context (at most once)
 *
 * Does nothing if progress was already enabled on the context.
 */
static void mca_btl_uct_context_enable_progress (mca_btl_uct_device_context_t *context)
{
    unsigned progress_flags = UCT_PROGRESS_SEND | UCT_PROGRESS_RECV;

    if (context->progress_enabled) {
        return;
    }

#if HAVE_DECL_UCT_PROGRESS_THREAD_SAFE
    progress_flags |= UCT_PROGRESS_THREAD_SAFE;
#endif

    uct_iface_progress_enable (context->uct_iface, progress_flags);
    context->progress_enabled = true;
}
271 
mca_btl_uct_context_create(mca_btl_uct_module_t * module,mca_btl_uct_tl_t * tl,int context_id,bool enable_progress)272 mca_btl_uct_device_context_t *mca_btl_uct_context_create (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl, int context_id, bool enable_progress)
273 {
274 #if UCT_API >= UCT_VERSION(1, 6)
275     uct_iface_params_t iface_params = {.field_mask = UCT_IFACE_PARAM_FIELD_OPEN_MODE |
276                                                      UCT_IFACE_PARAM_FIELD_DEVICE,
277                                        .open_mode = UCT_IFACE_OPEN_MODE_DEVICE,
278                                        .mode = {.device = {.tl_name = tl->uct_tl_name,
279                                                            .dev_name = tl->uct_dev_name}}};
280 #else
281     uct_iface_params_t iface_params = {.rndv_cb = NULL, .eager_cb = NULL, .stats_root = NULL,
282                                        .rx_headroom = 0, .open_mode = UCT_IFACE_OPEN_MODE_DEVICE,
283                                        .mode = {.device = {.tl_name = tl->uct_tl_name,
284                                                            .dev_name = tl->uct_dev_name}}};
285 #endif
286     mca_btl_uct_device_context_t *context;
287     ucs_status_t ucs_status;
288     int rc;
289 
290     context = calloc (1, sizeof (*context));
291     if (OPAL_UNLIKELY(NULL == context)) {
292         return NULL;
293     }
294 
295     context->context_id = context_id;
296     context->uct_btl = module;
297     OBJ_CONSTRUCT(&context->completion_fifo, opal_fifo_t);
298     OBJ_CONSTRUCT(&context->mutex, opal_recursive_mutex_t);
299     OBJ_CONSTRUCT(&context->rdma_completions, opal_free_list_t);
300 
301     rc = opal_free_list_init (&context->rdma_completions, sizeof (mca_btl_uct_uct_completion_t),
302                               opal_cache_line_size, OBJ_CLASS(mca_btl_uct_uct_completion_t),
303                               0, opal_cache_line_size, 0, 4096, 128, NULL, 0, NULL, NULL,
304                               NULL);
305     if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
306         mca_btl_uct_context_destroy (context);
307         return NULL;
308     }
309 
310     /* apparently (in contradiction to the spec) UCT is *not* thread safe. because we have to
311      * use our own locks just go ahead and use UCS_THREAD_MODE_SINGLE. if they ever fix their
312      * api then change this back to UCS_THREAD_MODE_MULTI and remove the locks around the
313      * various UCT calls. */
314     ucs_status = uct_worker_create (module->ucs_async, UCS_THREAD_MODE_SINGLE, &context->uct_worker);
315     if (OPAL_UNLIKELY(UCS_OK != ucs_status)) {
316         BTL_VERBOSE(("could not create a UCT worker"));
317         mca_btl_uct_context_destroy (context);
318         return NULL;
319     }
320 
321     ucs_status = uct_iface_open (tl->uct_md->uct_md, context->uct_worker, &iface_params,
322                                  tl->uct_tl_config, &context->uct_iface);
323     if (OPAL_UNLIKELY(UCS_OK != ucs_status)) {
324         BTL_VERBOSE(("could not open UCT interface. error code: %d", ucs_status));
325         mca_btl_uct_context_destroy (context);
326         return NULL;
327     }
328 
329     /* only need to query one of the interfaces to get the attributes */
330     ucs_status = uct_iface_query (context->uct_iface, &context->uct_iface_attr);
331     if (UCS_OK != ucs_status) {
332         BTL_VERBOSE(("Error querying UCT interface"));
333         mca_btl_uct_context_destroy (context);
334         return NULL;
335     }
336 
337     if (context_id > 0 && tl == module->am_tl) {
338       BTL_VERBOSE(("installing AM handler for tl %p context id %d", (void *) tl, context_id));
339       uct_iface_set_am_handler (context->uct_iface, MCA_BTL_UCT_FRAG, mca_btl_uct_am_handler,
340 				context, MCA_BTL_UCT_CB_FLAG_SYNC);
341     }
342 
343     if (enable_progress) {
344         BTL_VERBOSE(("enabling progress for tl %p context id %d", (void *) tl, context_id));
345         mca_btl_uct_context_enable_progress (context);
346     }
347 
348     return context;
349 }
350 
/**
 * @brief Tear down and free a device context
 *
 * Closes the interface and destroys the worker if they exist, destructs the OPAL
 * members constructed by mca_btl_uct_context_create() and frees the context.
 *
 * @param[in] context  context to destroy. Its completion fifo, mutex and completion
 *                     free list must have been constructed.
 */
void mca_btl_uct_context_destroy (mca_btl_uct_device_context_t *context)
{
    if (context->uct_iface) {
        uct_iface_close (context->uct_iface);
        context->uct_iface = NULL;
    }

    if (context->uct_worker) {
        uct_worker_destroy (context->uct_worker);
        context->uct_worker = NULL;
    }

    OBJ_DESTRUCT(&context->completion_fifo);
    OBJ_DESTRUCT(&context->rdma_completions);
    /* the mutex is constructed alongside the other members so it must be destructed as well */
    OBJ_DESTRUCT(&context->mutex);
    free (context);
}
367 
/**
 * @brief List sort comparator ordering transport layers by ascending priority value
 *
 * @returns a negative, zero or positive value when a sorts before, equal to or
 *          after b
 */
static int tl_compare (opal_list_item_t **a, opal_list_item_t **b)
{
    mca_btl_uct_tl_t *tl_a = (mca_btl_uct_tl_t *) *a;
    mca_btl_uct_tl_t *tl_b = (mca_btl_uct_tl_t *) *b;

    /* three-way comparison. subtracting the priorities could overflow since a priority
     * may be as large as INT_MAX */
    return (tl_a->priority > tl_b->priority) - (tl_a->priority < tl_b->priority);
}
375 
mca_btl_uct_create_tl(mca_btl_uct_module_t * module,mca_btl_uct_md_t * md,uct_tl_resource_desc_t * tl_desc,int priority)376 static mca_btl_uct_tl_t *mca_btl_uct_create_tl (mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, uct_tl_resource_desc_t *tl_desc, int priority)
377 {
378     mca_btl_uct_tl_t *tl = OBJ_NEW(mca_btl_uct_tl_t);
379 
380     if (OPAL_UNLIKELY(NULL == tl)) {
381         return NULL;
382     }
383 
384     /* initialize btl tl structure */
385     tl->uct_md = md;
386     OBJ_RETAIN(md);
387 
388     tl->uct_tl_name = strdup (tl_desc->tl_name);
389     tl->uct_dev_name = strdup (tl_desc->dev_name);
390     tl->priority = priority;
391 
392     tl->uct_dev_contexts = calloc (MCA_BTL_UCT_MAX_WORKERS, sizeof (tl->uct_dev_contexts[0]));
393     if (NULL == tl->uct_dev_contexts) {
394         OBJ_RELEASE(tl);
395         return NULL;
396     }
397 
398     (void) uct_md_iface_config_read (md->uct_md, tl_desc->tl_name, NULL, NULL, &tl->uct_tl_config);
399 
400     /* always create a 0 context (needed to query) */
401     tl->uct_dev_contexts[0] = mca_btl_uct_context_create (module, tl, 0, false);
402     if (NULL == tl->uct_dev_contexts[0]) {
403         BTL_VERBOSE(("could not create a uct device context"));
404         OBJ_RELEASE(tl);
405         return NULL;
406     }
407 
408     BTL_VERBOSE(("Interface CAPS for tl %s::%s: 0x%lx", module->md_name, tl_desc->tl_name,
409                  (unsigned long) MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags));
410 
411     return tl;
412 }
413 
/**
 * @brief Select a transport layer as the module's RDMA tl
 *
 * Sets the module's atomic flags and put/get limits from the tl attributes,
 * retains the tl, records it in module->rdma_tl and module->comm_tls and raises
 * the tl's maximum context count to the component setting when it is <= 1.
 */
static void mca_btl_uct_set_tl_rdma (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl)
{
    BTL_VERBOSE(("tl %s is suitable for RDMA", tl->uct_tl_name));

    mca_btl_uct_module_set_atomic_flags (module, tl);

    /* get limits */
    module->super.btl_get_limit = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.get.max_zcopy;
    if (0 == MCA_BTL_UCT_TL_ATTR(tl, 0).cap.get.max_bcopy) {
        /* this is overkill in terms of alignment but we have no way to enforce a minimum get size */
        module->super.btl_get_alignment = opal_next_poweroftwo_inclusive (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.get.min_zcopy);
    } else {
        module->super.btl_get_alignment = 0;
        module->super.btl_get_local_registration_threshold = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.get.max_bcopy;
    }

    /* put limits. no registration is needed when using short/bcopy put */
    module->super.btl_put_limit = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.put.max_zcopy;
    module->super.btl_put_alignment = 0;
    module->super.btl_put_local_registration_threshold = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.put.max_bcopy;

    OBJ_RETAIN(tl);
    module->rdma_tl = tl;

    /* index 1 is used when a different tl already handles active messages */
    if (module->am_tl && tl != module->am_tl) {
        tl->tl_index = 1;
    } else {
        tl->tl_index = 0;
    }
    module->comm_tls[tl->tl_index] = tl;

    if (tl->max_device_contexts <= 1) {
        tl->max_device_contexts = mca_btl_uct_component.num_contexts_per_module;
    }
}
444 
/**
 * @brief Select a transport layer as the module's active-message tl
 *
 * Retains the tl, records it in module->am_tl and module->comm_tls, installs the
 * fragment handler on context 0, raises the tl's maximum context count to the
 * component setting when it is <= 1 and derives the send size limits from the tl
 * attributes.
 */
static void mca_btl_uct_set_tl_am (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl)
{
    mca_btl_uct_device_context_t *context0 = tl->uct_dev_contexts[0];

    BTL_VERBOSE(("tl %s is suitable for active-messaging", tl->uct_tl_name));

    /* the same tl serving both rdma and active messages means endpoints are shared */
    if (tl == module->rdma_tl) {
        module->shared_endpoints = true;
    }

    OBJ_RETAIN(tl);
    module->am_tl = tl;

    uct_iface_set_am_handler (context0->uct_iface, MCA_BTL_UCT_FRAG,
                              mca_btl_uct_am_handler, context0, UCT_CB_FLAG_ASYNC);

    /* index 1 is used when a different tl already handles rdma */
    if (module->rdma_tl && tl != module->rdma_tl) {
        tl->tl_index = 1;
    } else {
        tl->tl_index = 0;
    }
    module->comm_tls[tl->tl_index] = tl;

    if (tl->max_device_contexts <= 1) {
        tl->max_device_contexts = mca_btl_uct_component.num_contexts_per_module;
    }

    /* both limits leave room for the active-message header */
    module->super.btl_eager_limit = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.am.max_bcopy - sizeof (mca_btl_uct_am_header_t);
    module->super.btl_max_send_size = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.am.max_zcopy - sizeof (mca_btl_uct_am_header_t);
}
467 
/**
 * @brief Select a transport layer as the module's connection tl
 *
 * Installs the connection request handler on the tl and, on success, retains the
 * tl and makes sure it uses at least one device context.
 *
 * @param[in,out] module  btl uct module
 * @param[in]     tl      transport layer to use for connections
 *
 * @returns OPAL_SUCCESS, or the error from mca_btl_uct_setup_connection_tl() in
 *          which case module->conn_tl is reset to NULL
 */
static int mca_btl_uct_set_tl_conn (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl)
{
    int rc;

    BTL_VERBOSE(("tl %s is suitable for making connections", tl->uct_tl_name));

    /* mca_btl_uct_setup_connection_tl() reads the tl from module->conn_tl */
    module->conn_tl = tl;
    rc = mca_btl_uct_setup_connection_tl (module);
    if (OPAL_SUCCESS != rc) {
        /* the tl was not retained. do not leave a pointer to it in the module or it could
         * be released (or used after being freed) later */
        module->conn_tl = NULL;
        return rc;
    }

    OBJ_RETAIN(tl);

    if (!tl->max_device_contexts) {
        /* if a tl is only being used to create connections do not bother with multiple
         * contexts */
        tl->max_device_contexts = 1;
    }

    return OPAL_SUCCESS;
}
490 
mca_btl_uct_evaluate_tl(mca_btl_uct_module_t * module,mca_btl_uct_tl_t * tl)491 static int mca_btl_uct_evaluate_tl (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl)
492 {
493     int rc;
494 
495     BTL_VERBOSE(("evaluating tl %s", tl->uct_tl_name));
496     if (NULL == module->rdma_tl && mca_btl_uct_tl_supports_rdma (tl)) {
497 	mca_btl_uct_set_tl_rdma (module, tl);
498     }
499 
500     if (NULL == module->am_tl && mca_btl_uct_tl_support_am (tl)) {
501 	mca_btl_uct_set_tl_am (module, tl);
502     }
503 
504     if (NULL == module->conn_tl && mca_btl_uct_tl_supports_conn (tl)) {
505 	rc = mca_btl_uct_set_tl_conn (module, tl);
506         if (OPAL_SUCCESS != rc) {
507             return rc;
508         }
509     }
510 
511     if (tl == module->rdma_tl || tl == module->am_tl) {
512         BTL_VERBOSE(("tl has flags 0x%" PRIx64, MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags));
513         module->super.btl_flags |= mca_btl_uct_module_flags (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags);
514 
515 	/* the bandwidth and latency numbers relate to both rdma and active messages. need to
516 	 * come up with a better estimate. */
517 
518 	/* UCT bandwidth is in bytes/sec, BTL is in MB/sec */
519 #if UCT_API >= UCT_VERSION(1, 7)
520 	module->super.btl_bandwidth = (uint32_t) ((MCA_BTL_UCT_TL_ATTR(tl, 0).bandwidth.dedicated +
521                                                    MCA_BTL_UCT_TL_ATTR(tl, 0).bandwidth.shared /
522                                                    (opal_process_info.num_local_peers + 1)) / 1048576.0);
523 #else
524 	module->super.btl_bandwidth = (uint32_t) (MCA_BTL_UCT_TL_ATTR(tl, 0).bandwidth / 1048576.0);
525 #endif
526 	/* TODO -- figure out how to translate UCT latency to us */
527 	module->super.btl_latency = 1;
528     }
529 
530     if (tl == module->rdma_tl || tl == module->am_tl || tl == module->conn_tl) {
531         /* make sure progress is enabled on the default context now that we know this TL will be used */
532         mca_btl_uct_context_enable_progress (tl->uct_dev_contexts[0]);
533     }
534 
535     return OPAL_SUCCESS;
536 }
537 
/**
 * @brief Select the transport layers a module will use from a memory domain's tls
 *
 * Filters the tl descriptions with the component's allowed_transports list (a
 * comma-delimited list, optionally negated with a leading '^', in which "any"
 * matches every tl), creates a tl object for each match, evaluates them in priority
 * order and disables the module functions for which no tl was found.
 *
 * @param[in,out] module    btl uct module to set up
 * @param[in]     md        memory domain the tls belong to
 * @param[in]     tl_descs  array of tl descriptions
 * @param[in]     tl_count  number of entries in tl_descs
 *
 * @returns OPAL_SUCCESS on success, OPAL_ERR_NOT_AVAILABLE if the filter is empty,
 *          is a negated "any", or matches no usable tl, OPAL_ERROR if a connection
 *          tl is required but none was found
 */
int mca_btl_uct_query_tls (mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, uct_tl_resource_desc_t *tl_descs, unsigned tl_count)
{
    bool include = true, any = false;
    mca_btl_uct_tl_t *tl;
    opal_list_t tl_list;
    char **tl_filter;
    int any_priority = 0;

    tl_filter = opal_argv_split (mca_btl_uct_component.allowed_transports, ',');
    if (NULL == tl_filter || NULL == tl_filter[0]) {
        /* an empty filter can not match anything and must not be dereferenced */
        BTL_VERBOSE(("the tl filter is empty. no tls can be selected"));
        if (NULL != tl_filter) {
            opal_argv_free (tl_filter);
        }
        return OPAL_ERR_NOT_AVAILABLE;
    }

    if ('^' == tl_filter[0][0]) {
        /* user has negated the include list. drop the '^' in place (the copied length
         * includes the terminating NUL) */
        memmove (tl_filter[0], tl_filter[0] + 1, strlen (tl_filter[0]));
        include = false;
    }

    /* check for the any keyword */
    for (unsigned j = 0 ; tl_filter[j] ; ++j) {
        if (0 == strcmp (tl_filter[j], "any")) {
            any_priority = j;
            any = true;
            break;
        }
    }

    if (any && !include) {
        opal_argv_free (tl_filter);
        return OPAL_ERR_NOT_AVAILABLE;
    }

    /* the list is constructed only once no early return without a destruct remains */
    OBJ_CONSTRUCT(&tl_list, opal_list_t);

    for (unsigned i = 0 ; i < tl_count ; ++i) {
        bool try_tl = any;
        int priority = any_priority;

        /* the position in the filter list determines the priority */
        for (unsigned j = 0 ; tl_filter[j] ; ++j) {
            if (0 == strcmp (tl_filter[j], tl_descs[i].tl_name)) {
                try_tl = include;
                priority = j;
                break;
            }
        }

        BTL_VERBOSE(("tl filter: tl_name = %s, use = %d, priority = %d", tl_descs[i].tl_name, try_tl, priority));

        if (!try_tl) {
            continue;
        }

        if (0 == strcmp (tl_descs[i].tl_name, "ud")) {
            /* ud looks like any normal transport but we do not want to use it for anything other
             * than connection management so ensure it gets evaluated last */
            priority = INT_MAX;
        }

        tl = mca_btl_uct_create_tl (module, md, tl_descs + i, priority);

        if (tl) {
            opal_list_append (&tl_list, &tl->super);
        }
    }

    opal_argv_free (tl_filter);

    if (0 == opal_list_get_size (&tl_list)) {
        BTL_VERBOSE(("no suitable tls match filter: %s", mca_btl_uct_component.allowed_transports));
        OBJ_DESTRUCT(&tl_list);
        return OPAL_ERR_NOT_AVAILABLE;
    }

    opal_list_sort (&tl_list, tl_compare);

    OPAL_LIST_FOREACH(tl, &tl_list, mca_btl_uct_tl_t) {
        /* a tl that fails evaluation is skipped. the checks below catch missing roles */
        (void) mca_btl_uct_evaluate_tl (module, tl);
        if (NULL != module->am_tl && NULL != module->rdma_tl &&
            (NULL != module->conn_tl || !(mca_btl_uct_tl_requires_connection_tl (module->am_tl) ||
                                          mca_btl_uct_tl_requires_connection_tl (module->rdma_tl)))) {
            /* all done */
            break;
        }
    }

    if (NULL == module->rdma_tl) {
        /* no rdma tls */
        BTL_VERBOSE(("no rdma tl matched supplied filter. disabling RDMA support"));

        module->super.btl_flags &= ~MCA_BTL_FLAGS_RDMA;
        module->super.btl_put = NULL;
        module->super.btl_get = NULL;
        module->super.btl_atomic_fop = NULL;
        module->super.btl_atomic_op = NULL;
    }

    if (NULL == module->am_tl) {
        /* no active message tls == no send/recv */
        BTL_VERBOSE(("no active message tl matched supplied filter. disabling send/recv support"));

        module->super.btl_send = NULL;
        module->super.btl_sendi = NULL;
        module->super.btl_alloc = NULL;
        module->super.btl_free = NULL;
    }

    /* drops the list's reference on every tl. tls in use were retained when selected */
    OPAL_LIST_DESTRUCT(&tl_list);

    if (!(NULL != module->am_tl && mca_btl_uct_tl_requires_connection_tl (module->am_tl)) &&
        !(NULL != module->rdma_tl && mca_btl_uct_tl_requires_connection_tl (module->rdma_tl)) &&
        module->conn_tl) {
        /* no connection tl needed for selected transports */
        OBJ_RELEASE(module->conn_tl);
        module->conn_tl = NULL;
    } else if (NULL == module->conn_tl) {
        BTL_VERBOSE(("a connection tl is required but no tls match the filter %s",
                     mca_btl_uct_component.allowed_transports));
        return OPAL_ERROR;
    }

    return OPAL_SUCCESS;
}
661