1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3 * Copyright (c) 2018 Los Alamos National Security, LLC. All rights
4 * reserved.
5 * Copyright (c) 2018 Research Organization for Information Science
6 * and Technology (RIST). All rights reserved.
7 * Copyright (c) 2018 Triad National Security, LLC. All rights
8 * reserved.
9 * Copyright (c) 2019 Google, LLC. All rights reserved.
10 * $COPYRIGHT$
11 *
12 * Additional copyrights may follow
13 *
14 * $HEADER$
15 */
16
17 #include "btl_uct_device_context.h"
18 #include "btl_uct_am.h"
19 #include "opal/util/bit_ops.h"
20 #include "opal/util/argv.h"
21
22 #if HAVE_DECL_UCT_CB_FLAG_SYNC
23 #define MCA_BTL_UCT_CB_FLAG_SYNC UCT_CB_FLAG_SYNC
24 #else
25 #define MCA_BTL_UCT_CB_FLAG_SYNC 0
26 #endif
27
28 /**
29 * @brief Convert UCT capabilities to BTL flags
30 */
31 static uint64_t mca_btl_uct_cap_to_btl_flag[][2] = {
32 {UCT_IFACE_FLAG_AM_SHORT, MCA_BTL_FLAGS_SEND},
33 {UCT_IFACE_FLAG_PUT_ZCOPY, MCA_BTL_FLAGS_PUT},
34 {UCT_IFACE_FLAG_GET_ZCOPY, MCA_BTL_FLAGS_GET},
35 {0,0},
36 };
37
38 /**
39 * @brief Convert UCT capability flags to BTL flags
40 *
41 * @param[in] cap_flags UCT capability flags
42 *
43 * @returns equivalent BTL flags
44 */
mca_btl_uct_module_flags(uint64_t cap_flags)45 static int32_t mca_btl_uct_module_flags (uint64_t cap_flags)
46 {
47 uint32_t flags = 0;
48
49 for (int i = 0 ; mca_btl_uct_cap_to_btl_flag[i][0] > 0 ; ++i) {
50 if (cap_flags & mca_btl_uct_cap_to_btl_flag[i][0]) {
51 flags |= (uint32_t) mca_btl_uct_cap_to_btl_flag[i][1];
52 }
53 }
54 return flags;
55 }
56
57 #if OPAL_HAVE_UCT_EP_ATOMIC64_POST
58 /**
59 * @brief Convert UCT capabilities to BTL atomic flags
60 */
61 static uint64_t mca_btl_uct_cap_to_btl_atomic_flag[][2] = {
62 {UCS_BIT(UCT_ATOMIC_OP_ADD), MCA_BTL_ATOMIC_SUPPORTS_ADD},
63 {UCS_BIT(UCT_ATOMIC_OP_AND), MCA_BTL_ATOMIC_SUPPORTS_AND},
64 {UCS_BIT(UCT_ATOMIC_OP_OR), MCA_BTL_ATOMIC_SUPPORTS_OR},
65 {UCS_BIT(UCT_ATOMIC_OP_XOR), MCA_BTL_ATOMIC_SUPPORTS_XOR},
66 {UCS_BIT(UCT_ATOMIC_OP_SWAP), MCA_BTL_ATOMIC_SUPPORTS_SWAP},
67 {UCS_BIT(UCT_ATOMIC_OP_CSWAP), MCA_BTL_ATOMIC_SUPPORTS_CSWAP},
68 {0, },
69 };
70
/**
 * @brief Set the BTL atomic flags of a module from a transport's UCT capabilities
 *
 * @param[in,out] module  BTL module whose btl_atomic_flags (and btl_flags) are updated
 * @param[in]     tl      transport whose attributes are read through MCA_BTL_UCT_TL_ATTR(tl, 0)
 */
static void mca_btl_uct_module_set_atomic_flags (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl)
{
    const uint64_t iface_caps = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags;

    /* NTH: only use the fetching atomics for now */
    const uint64_t fop32 = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.atomic32.fop_flags;
    const uint64_t fop64 = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.atomic64.fop_flags;

    /* NTH: don't really have a way to separate 32-bit and 64-bit right now so only
     * operations reported for both widths are considered */
    const uint64_t common_ops = fop32 & fop64;

    module->super.btl_atomic_flags = 0;
    if (iface_caps & UCT_IFACE_FLAG_ATOMIC_CPU) {
        module->super.btl_atomic_flags |= MCA_BTL_ATOMIC_SUPPORTS_GLOB;
    }

    /* walk the translation table until the zero sentinel row is reached */
    for (int row = 0 ; 0 != mca_btl_uct_cap_to_btl_atomic_flag[row][0] ; ++row) {
        if (common_ops & mca_btl_uct_cap_to_btl_atomic_flag[row][0]) {
            module->super.btl_atomic_flags |= mca_btl_uct_cap_to_btl_atomic_flag[row][1];
        }
    }

    if (0 == module->super.btl_atomic_flags) {
        /* no atomics are supported */
        return;
    }

    /* some atomics are supported */
    module->super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS;
}
99
100 #else
101 /**
102 * @brief Convert UCT capabilities to BTL atomic flags
103 */
104 static uint64_t mca_btl_uct_cap_to_btl_atomic_flag[][2] = {
105 {UCT_IFACE_FLAG_ATOMIC_ADD64, MCA_BTL_ATOMIC_SUPPORTS_ADD},
106 {UCT_IFACE_FLAG_ATOMIC_ADD32, MCA_BTL_ATOMIC_SUPPORTS_32BIT},
107 {UCT_IFACE_FLAG_ATOMIC_CSWAP64, MCA_BTL_ATOMIC_SUPPORTS_CSWAP},
108 {UCT_IFACE_FLAG_ATOMIC_SWAP64, MCA_BTL_ATOMIC_SUPPORTS_SWAP},
109 {UCT_IFACE_FLAG_ATOMIC_CPU, MCA_BTL_ATOMIC_SUPPORTS_GLOB},
110 {0, },
111 };
112
113 /**
114 * @brief Convert UCT capability flags to BTL atomic flags
115 *
116 * @param[in] cap_flags UCT capability flags
117 *
118 * @returns equivalent BTL atomic flags
119 */
mca_btl_uct_module_set_atomic_flags(mca_btl_uct_module_t * module,mca_btl_uct_tl_t * tl)120 static void mca_btl_uct_module_set_atomic_flags (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl)
121 {
122 uint64_t cap_flags = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags;
123
124 module->super.btl_atomic_flags = 0;
125
126 for (int i = 0 ; mca_btl_uct_cap_to_btl_atomic_flag[i][0] > 0 ; ++i) {
127 if (cap_flags & mca_btl_uct_cap_to_btl_atomic_flag[i][0]) {
128 module->super.btl_atomic_flags |= (uint32_t) mca_btl_uct_cap_to_btl_atomic_flag[i][1];
129 }
130 }
131
132 if (0 != module->super.btl_atomic_flags) {
133 /* some atomics are supported */
134 module->super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS;
135 }
136 }
137
138 #endif
139
/**
 * @brief Object constructor for mca_btl_uct_tl_t
 *
 * Zeroes every byte of the structure that follows the embedded super object and
 * then constructs the tl lock.
 *
 * @param[in,out] tl  transport layer object being constructed
 */
static void mca_btl_uct_tl_constructor (mca_btl_uct_tl_t *tl)
{
    /* skip over the parent object: it is not ours to clear */
    unsigned char *own_fields = (unsigned char *) tl + sizeof (tl->super);
    const size_t own_size = sizeof (*tl) - sizeof (tl->super);

    memset (own_fields, 0, own_size);
    OBJ_CONSTRUCT(&tl->tl_lock, opal_mutex_t);
}
145
/**
 * @brief Object destructor for mca_btl_uct_tl_t
 *
 * Destroys every device context that was created, drops the reference on the
 * memory domain and frees the names, the context array and the UCT tl config.
 *
 * @param[in,out] tl  transport layer object being destroyed
 */
static void mca_btl_uct_tl_destructor (mca_btl_uct_tl_t *tl)
{
    assert (((opal_object_t *) tl)->obj_reference_count == 0);

    /* the context array may never have been allocated: a tl is released on the
     * allocation failure path of its creation so it must not be indexed blindly */
    if (NULL != tl->uct_dev_contexts) {
        for (int context_id = 0 ; context_id < MCA_BTL_UCT_MAX_WORKERS ; ++context_id) {
            if (NULL != tl->uct_dev_contexts[context_id]) {
                mca_btl_uct_context_destroy (tl->uct_dev_contexts[context_id]);
                tl->uct_dev_contexts[context_id] = NULL;
            }
        }
    }

    if (tl->uct_md) {
        OBJ_RELEASE(tl->uct_md);
    }

    free (tl->uct_dev_contexts);
    free (tl->uct_tl_name);
    free (tl->uct_dev_name);

    if (NULL != tl->uct_tl_config) {
        uct_config_release (tl->uct_tl_config);
    }

    OBJ_DESTRUCT(&tl->tl_lock);
}
170
/* class instance for mca_btl_uct_tl_t: parent class is opal_list_item_t and the
 * constructor/destructor are the two functions defined above */
OBJ_CLASS_INSTANCE(mca_btl_uct_tl_t, opal_list_item_t, mca_btl_uct_tl_constructor, mca_btl_uct_tl_destructor);
172
mca_btl_uct_conn_req_cb(void * arg,void * data,size_t length,unsigned flags)173 static ucs_status_t mca_btl_uct_conn_req_cb (void *arg, void *data, size_t length, unsigned flags)
174 {
175 mca_btl_uct_module_t *module = (mca_btl_uct_module_t *) arg;
176 mca_btl_uct_pending_connection_request_t *request = calloc (1, length + sizeof (request->super));
177
178 /* it is not safe to process the connection request from the callback so just save it for
179 * later processing */
180 OBJ_CONSTRUCT(request, mca_btl_uct_pending_connection_request_t);
181 memcpy (&request->request_data, (void *) ((intptr_t) data + 8), length);
182 opal_fifo_push_atomic (&module->pending_connection_reqs, &request->super);
183
184 return UCS_OK;
185 }
186
/* class instance for pending connection requests: parent class is opal_list_item_t
 * and no constructor or destructor is needed */
OBJ_CLASS_INSTANCE(mca_btl_uct_pending_connection_request_t, opal_list_item_t, NULL, NULL);
188
/**
 * @brief Process a previously queued connection request
 *
 * Looks up the endpoint of the requesting process, connects the tl endpoint the
 * first time a request is seen for it and, when the request reports that the
 * remote side is ready (type 1), marks the tl endpoint as ready and flags the
 * matching pending fragments.
 *
 * @param[in] module  BTL module the request arrived on
 * @param[in] req     connection request data
 *
 * @returns OPAL_SUCCESS on success, the error from mca_btl_uct_endpoint_connect()
 *          when the endpoint could not be set up, or UCS_ERR_UNREACHABLE when no
 *          endpoint exists for the requesting process
 */
int mca_btl_uct_process_connection_request (mca_btl_uct_module_t *module, mca_btl_uct_conn_req_t *req)
{
    struct opal_proc_t *remote_proc = opal_proc_for_name (req->proc_name);
    mca_btl_base_endpoint_t *endpoint = mca_btl_uct_get_ep (&module->super, remote_proc);
    mca_btl_uct_tl_endpoint_t *tl_endpoint;
    int32_t ep_flags;
    int rc;

    BTL_VERBOSE(("got connection request for endpoint %p. type = %d. context id = %d",
                 (void *) endpoint, req->type, req->context_id));

    if (NULL == endpoint) {
        BTL_ERROR(("could not create endpoint for connection request"));
        /* NOTE(review): this is a UCS status returned from a function that otherwise
         * returns OPAL codes; kept as-is for callers, consider an OPAL error code */
        return UCS_ERR_UNREACHABLE;
    }

    assert (req->type < 2);

    /* only touch the endpoint once it is known to be non-NULL */
    tl_endpoint = endpoint->uct_eps[req->context_id] + req->tl_index;

    ep_flags = opal_atomic_fetch_or_32 (&tl_endpoint->flags, MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REC);

    if (!(ep_flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REC)) {
        /* first request seen for this tl endpoint: create any necessary resources */
        rc = mca_btl_uct_endpoint_connect (module, endpoint, req->context_id, req->ep_addr, req->tl_index);
        if (OPAL_SUCCESS != rc && OPAL_ERR_OUT_OF_RESOURCE != rc) {
            BTL_ERROR(("could not setup rdma endpoint. rc = %d", rc));
            return rc;
        }
    }

    /* the connection is ready once we have received the connection data and also a connection ready
     * message. this might be overkill but there is little documentation at the UCT level on when
     * an endpoint can be used. */
    if (req->type == 1) {
        /* remote side is ready */
        mca_btl_uct_base_frag_t *frag;

        /* to avoid a race with send adding pending frags grab the lock here */
        OPAL_THREAD_SCOPED_LOCK(&endpoint->ep_lock,{
                BTL_VERBOSE(("connection ready. sending %" PRIsize_t " frags", opal_list_get_size (&module->pending_frags)));
                (void) opal_atomic_or_fetch_32 (&tl_endpoint->flags, MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY);
                opal_atomic_wmb ();

                OPAL_LIST_FOREACH(frag, &module->pending_frags, mca_btl_uct_base_frag_t) {
                    if (frag->context->context_id == req->context_id && endpoint == frag->endpoint) {
                        frag->ready = true;
                    }
                }
            });
    }

    return OPAL_SUCCESS;
}
241
/**
 * @brief Install the connection request handler on the module's connection tl
 *
 * @param[in] module  BTL module whose conn_tl is set up
 *
 * @returns OPAL_SUCCESS when the handler was installed, OPAL_ERR_NOT_SUPPORTED when
 *          the module has no connection tl, OPAL_ERROR when UCT rejected the handler
 */
static int mca_btl_uct_setup_connection_tl (mca_btl_uct_module_t *module)
{
    mca_btl_uct_tl_t *conn_tl = module->conn_tl;
    ucs_status_t status;

    if (NULL == conn_tl) {
        return OPAL_ERR_NOT_SUPPORTED;
    }

    /* connection requests arrive as active messages on context 0 of the connection tl */
    status = uct_iface_set_am_handler (conn_tl->uct_dev_contexts[0]->uct_iface, MCA_BTL_UCT_CONNECT_RDMA,
                                       mca_btl_uct_conn_req_cb, module, UCT_CB_FLAG_ASYNC);
    if (UCS_OK == status) {
        return OPAL_SUCCESS;
    }

    BTL_ERROR(("could not set active message handler for uct tl"));
    return OPAL_ERROR;
}
258
/**
 * @brief Enable send and receive progress on a device context (at most once)
 *
 * Does nothing if progress was already enabled on the context.
 *
 * @param[in,out] context  device context to enable progress on
 */
static void mca_btl_uct_context_enable_progress (mca_btl_uct_device_context_t *context)
{
    unsigned progress_flags = UCT_PROGRESS_SEND | UCT_PROGRESS_RECV;

    if (context->progress_enabled) {
        return;
    }

#if HAVE_DECL_UCT_PROGRESS_THREAD_SAFE
    /* request the thread-safe variant whenever this UCT declares it */
    progress_flags |= UCT_PROGRESS_THREAD_SAFE;
#endif

    uct_iface_progress_enable (context->uct_iface, progress_flags);
    context->progress_enabled = true;
}
271
mca_btl_uct_context_create(mca_btl_uct_module_t * module,mca_btl_uct_tl_t * tl,int context_id,bool enable_progress)272 mca_btl_uct_device_context_t *mca_btl_uct_context_create (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl, int context_id, bool enable_progress)
273 {
274 #if UCT_API >= UCT_VERSION(1, 6)
275 uct_iface_params_t iface_params = {.field_mask = UCT_IFACE_PARAM_FIELD_OPEN_MODE |
276 UCT_IFACE_PARAM_FIELD_DEVICE,
277 .open_mode = UCT_IFACE_OPEN_MODE_DEVICE,
278 .mode = {.device = {.tl_name = tl->uct_tl_name,
279 .dev_name = tl->uct_dev_name}}};
280 #else
281 uct_iface_params_t iface_params = {.rndv_cb = NULL, .eager_cb = NULL, .stats_root = NULL,
282 .rx_headroom = 0, .open_mode = UCT_IFACE_OPEN_MODE_DEVICE,
283 .mode = {.device = {.tl_name = tl->uct_tl_name,
284 .dev_name = tl->uct_dev_name}}};
285 #endif
286 mca_btl_uct_device_context_t *context;
287 ucs_status_t ucs_status;
288 int rc;
289
290 context = calloc (1, sizeof (*context));
291 if (OPAL_UNLIKELY(NULL == context)) {
292 return NULL;
293 }
294
295 context->context_id = context_id;
296 context->uct_btl = module;
297 OBJ_CONSTRUCT(&context->completion_fifo, opal_fifo_t);
298 OBJ_CONSTRUCT(&context->mutex, opal_recursive_mutex_t);
299 OBJ_CONSTRUCT(&context->rdma_completions, opal_free_list_t);
300
301 rc = opal_free_list_init (&context->rdma_completions, sizeof (mca_btl_uct_uct_completion_t),
302 opal_cache_line_size, OBJ_CLASS(mca_btl_uct_uct_completion_t),
303 0, opal_cache_line_size, 0, 4096, 128, NULL, 0, NULL, NULL,
304 NULL);
305 if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
306 mca_btl_uct_context_destroy (context);
307 return NULL;
308 }
309
310 /* apparently (in contradiction to the spec) UCT is *not* thread safe. because we have to
311 * use our own locks just go ahead and use UCS_THREAD_MODE_SINGLE. if they ever fix their
312 * api then change this back to UCS_THREAD_MODE_MULTI and remove the locks around the
313 * various UCT calls. */
314 ucs_status = uct_worker_create (module->ucs_async, UCS_THREAD_MODE_SINGLE, &context->uct_worker);
315 if (OPAL_UNLIKELY(UCS_OK != ucs_status)) {
316 BTL_VERBOSE(("could not create a UCT worker"));
317 mca_btl_uct_context_destroy (context);
318 return NULL;
319 }
320
321 ucs_status = uct_iface_open (tl->uct_md->uct_md, context->uct_worker, &iface_params,
322 tl->uct_tl_config, &context->uct_iface);
323 if (OPAL_UNLIKELY(UCS_OK != ucs_status)) {
324 BTL_VERBOSE(("could not open UCT interface. error code: %d", ucs_status));
325 mca_btl_uct_context_destroy (context);
326 return NULL;
327 }
328
329 /* only need to query one of the interfaces to get the attributes */
330 ucs_status = uct_iface_query (context->uct_iface, &context->uct_iface_attr);
331 if (UCS_OK != ucs_status) {
332 BTL_VERBOSE(("Error querying UCT interface"));
333 mca_btl_uct_context_destroy (context);
334 return NULL;
335 }
336
337 if (context_id > 0 && tl == module->am_tl) {
338 BTL_VERBOSE(("installing AM handler for tl %p context id %d", (void *) tl, context_id));
339 uct_iface_set_am_handler (context->uct_iface, MCA_BTL_UCT_FRAG, mca_btl_uct_am_handler,
340 context, MCA_BTL_UCT_CB_FLAG_SYNC);
341 }
342
343 if (enable_progress) {
344 BTL_VERBOSE(("enabling progress for tl %p context id %d", (void *) tl, context_id));
345 mca_btl_uct_context_enable_progress (context);
346 }
347
348 return context;
349 }
350
/**
 * @brief Tear down a device context and free it
 *
 * Closes the UCT interface and destroys the UCT worker if they exist, destructs
 * the objects constructed by mca_btl_uct_context_create() and frees the context.
 * The context must not be used after this call.
 *
 * @param[in] context  device context to destroy
 */
void mca_btl_uct_context_destroy (mca_btl_uct_device_context_t *context)
{
    /* the interface was opened on the worker so it is closed first */
    if (context->uct_iface) {
        uct_iface_close (context->uct_iface);
        context->uct_iface = NULL;
    }

    if (context->uct_worker) {
        uct_worker_destroy (context->uct_worker);
        context->uct_worker = NULL;
    }

    OBJ_DESTRUCT(&context->completion_fifo);
    OBJ_DESTRUCT(&context->rdma_completions);
    /* the mutex is constructed together with the two objects above when the
     * context is created so it must be destructed here as well */
    OBJ_DESTRUCT(&context->mutex);
    free (context);
}
367
/**
 * @brief List sort comparator ordering transport layers by ascending priority
 *
 * @param[in] a  pointer to the first list item (a mca_btl_uct_tl_t)
 * @param[in] b  pointer to the second list item (a mca_btl_uct_tl_t)
 *
 * @returns a negative, zero or positive value when the priority of a is less
 *          than, equal to or greater than the priority of b
 */
static int tl_compare (opal_list_item_t **a, opal_list_item_t **b)
{
    const mca_btl_uct_tl_t *tl_a = (const mca_btl_uct_tl_t *) *a;
    const mca_btl_uct_tl_t *tl_b = (const mca_btl_uct_tl_t *) *b;

    /* three-way compare instead of a subtraction: priorities go up to INT_MAX so
     * a difference could overflow if a priority were ever negative */
    return (tl_a->priority > tl_b->priority) - (tl_a->priority < tl_b->priority);
}
375
mca_btl_uct_create_tl(mca_btl_uct_module_t * module,mca_btl_uct_md_t * md,uct_tl_resource_desc_t * tl_desc,int priority)376 static mca_btl_uct_tl_t *mca_btl_uct_create_tl (mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, uct_tl_resource_desc_t *tl_desc, int priority)
377 {
378 mca_btl_uct_tl_t *tl = OBJ_NEW(mca_btl_uct_tl_t);
379
380 if (OPAL_UNLIKELY(NULL == tl)) {
381 return NULL;
382 }
383
384 /* initialize btl tl structure */
385 tl->uct_md = md;
386 OBJ_RETAIN(md);
387
388 tl->uct_tl_name = strdup (tl_desc->tl_name);
389 tl->uct_dev_name = strdup (tl_desc->dev_name);
390 tl->priority = priority;
391
392 tl->uct_dev_contexts = calloc (MCA_BTL_UCT_MAX_WORKERS, sizeof (tl->uct_dev_contexts[0]));
393 if (NULL == tl->uct_dev_contexts) {
394 OBJ_RELEASE(tl);
395 return NULL;
396 }
397
398 (void) uct_md_iface_config_read (md->uct_md, tl_desc->tl_name, NULL, NULL, &tl->uct_tl_config);
399
400 /* always create a 0 context (needed to query) */
401 tl->uct_dev_contexts[0] = mca_btl_uct_context_create (module, tl, 0, false);
402 if (NULL == tl->uct_dev_contexts[0]) {
403 BTL_VERBOSE(("could not create a uct device context"));
404 OBJ_RELEASE(tl);
405 return NULL;
406 }
407
408 BTL_VERBOSE(("Interface CAPS for tl %s::%s: 0x%lx", module->md_name, tl_desc->tl_name,
409 (unsigned long) MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags));
410
411 return tl;
412 }
413
/**
 * @brief Select a transport layer as the module's RDMA transport
 *
 * Sets the module's atomic flags and its get/put limits, alignments and
 * registration thresholds from the tl's attributes, records (and retains) the tl
 * as rdma_tl and assigns its index in comm_tls.
 *
 * @param[in,out] module  BTL module being configured
 * @param[in,out] tl      transport layer to use for RDMA
 */
static void mca_btl_uct_set_tl_rdma (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl)
{
    BTL_VERBOSE(("tl %s is suitable for RDMA", tl->uct_tl_name));

    mca_btl_uct_module_set_atomic_flags (module, tl);

    /* get */
    module->super.btl_get_limit = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.get.max_zcopy;
    if (0 == MCA_BTL_UCT_TL_ATTR(tl, 0).cap.get.max_bcopy) {
        /* this is overkill in terms of alignment but we have no way to enforce a minimum get size */
        module->super.btl_get_alignment = opal_next_poweroftwo_inclusive (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.get.min_zcopy);
    } else {
        module->super.btl_get_alignment = 0;
        module->super.btl_get_local_registration_threshold = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.get.max_bcopy;
    }

    /* put: no registration needed when using short/bcopy put */
    module->super.btl_put_limit = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.put.max_zcopy;
    module->super.btl_put_alignment = 0;
    module->super.btl_put_local_registration_threshold = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.put.max_bcopy;

    module->rdma_tl = tl;
    OBJ_RETAIN(tl);

    /* slot 1 is used only when a different tl already handles active messages */
    if (module->am_tl && tl != module->am_tl) {
        tl->tl_index = 1;
    } else {
        tl->tl_index = 0;
    }
    module->comm_tls[tl->tl_index] = tl;

    if (tl->max_device_contexts <= 1) {
        tl->max_device_contexts = mca_btl_uct_component.num_contexts_per_module;
    }
}
444
/**
 * @brief Select a transport layer as the module's active-message transport
 *
 * Records (and retains) the tl as am_tl, installs the fragment handler on its
 * context 0, assigns its index in comm_tls and derives the module's send size
 * limits from the tl's active-message attributes.
 *
 * @param[in,out] module  BTL module being configured
 * @param[in,out] tl      transport layer to use for active messages
 */
static void mca_btl_uct_set_tl_am (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl)
{
    mca_btl_uct_device_context_t *context0 = tl->uct_dev_contexts[0];

    BTL_VERBOSE(("tl %s is suitable for active-messaging", tl->uct_tl_name));

    /* the same tl serves both RDMA and active messages */
    if (tl == module->rdma_tl) {
        module->shared_endpoints = true;
    }

    module->am_tl = tl;
    OBJ_RETAIN(tl);

    uct_iface_set_am_handler (context0->uct_iface, MCA_BTL_UCT_FRAG,
                              mca_btl_uct_am_handler, context0, UCT_CB_FLAG_ASYNC);

    /* slot 1 is used only when a different tl already handles RDMA */
    if (module->rdma_tl && tl != module->rdma_tl) {
        tl->tl_index = 1;
    } else {
        tl->tl_index = 0;
    }
    module->comm_tls[tl->tl_index] = tl;

    if (tl->max_device_contexts <= 1) {
        tl->max_device_contexts = mca_btl_uct_component.num_contexts_per_module;
    }

    /* every fragment carries a mca_btl_uct_am_header_t in front of the payload */
    module->super.btl_max_send_size = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.am.max_zcopy - sizeof (mca_btl_uct_am_header_t);
    module->super.btl_eager_limit = MCA_BTL_UCT_TL_ATTR(tl, 0).cap.am.max_bcopy - sizeof (mca_btl_uct_am_header_t);
}
467
/**
 * @brief Select a transport layer as the module's connection transport
 *
 * Installs the connection request handler on the tl and, on success, retains the
 * tl as the module's conn_tl. On failure the module is left without a connection
 * tl so another transport can still be selected.
 *
 * @param[in,out] module  BTL module being configured
 * @param[in,out] tl      transport layer to use for connection management
 *
 * @returns OPAL_SUCCESS or the error from mca_btl_uct_setup_connection_tl()
 */
static int mca_btl_uct_set_tl_conn (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl)
{
    int rc;

    BTL_VERBOSE(("tl %s is suitable for making connections", tl->uct_tl_name));

    /* the setup function reads the tl from module->conn_tl so it has to be set first */
    module->conn_tl = tl;
    rc = mca_btl_uct_setup_connection_tl (module);
    if (OPAL_SUCCESS != rc) {
        /* no reference was taken yet: do not leave an unretained pointer behind or it
         * will dangle (or be released one time too many) once the tl list is destroyed */
        module->conn_tl = NULL;
        return rc;
    }

    OBJ_RETAIN(tl);

    if (!tl->max_device_contexts) {
        /* if a tl is only being used to create connections do not bother with multiple
         * contexts */
        tl->max_device_contexts = 1;
    }

    return OPAL_SUCCESS;
}
490
mca_btl_uct_evaluate_tl(mca_btl_uct_module_t * module,mca_btl_uct_tl_t * tl)491 static int mca_btl_uct_evaluate_tl (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl)
492 {
493 int rc;
494
495 BTL_VERBOSE(("evaluating tl %s", tl->uct_tl_name));
496 if (NULL == module->rdma_tl && mca_btl_uct_tl_supports_rdma (tl)) {
497 mca_btl_uct_set_tl_rdma (module, tl);
498 }
499
500 if (NULL == module->am_tl && mca_btl_uct_tl_support_am (tl)) {
501 mca_btl_uct_set_tl_am (module, tl);
502 }
503
504 if (NULL == module->conn_tl && mca_btl_uct_tl_supports_conn (tl)) {
505 rc = mca_btl_uct_set_tl_conn (module, tl);
506 if (OPAL_SUCCESS != rc) {
507 return rc;
508 }
509 }
510
511 if (tl == module->rdma_tl || tl == module->am_tl) {
512 BTL_VERBOSE(("tl has flags 0x%" PRIx64, MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags));
513 module->super.btl_flags |= mca_btl_uct_module_flags (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags);
514
515 /* the bandwidth and latency numbers relate to both rdma and active messages. need to
516 * come up with a better estimate. */
517
518 /* UCT bandwidth is in bytes/sec, BTL is in MB/sec */
519 #if UCT_API >= UCT_VERSION(1, 7)
520 module->super.btl_bandwidth = (uint32_t) ((MCA_BTL_UCT_TL_ATTR(tl, 0).bandwidth.dedicated +
521 MCA_BTL_UCT_TL_ATTR(tl, 0).bandwidth.shared /
522 (opal_process_info.num_local_peers + 1)) / 1048576.0);
523 #else
524 module->super.btl_bandwidth = (uint32_t) (MCA_BTL_UCT_TL_ATTR(tl, 0).bandwidth / 1048576.0);
525 #endif
526 /* TODO -- figure out how to translate UCT latency to us */
527 module->super.btl_latency = 1;
528 }
529
530 if (tl == module->rdma_tl || tl == module->am_tl || tl == module->conn_tl) {
531 /* make sure progress is enabled on the default context now that we know this TL will be used */
532 mca_btl_uct_context_enable_progress (tl->uct_dev_contexts[0]);
533 }
534
535 return OPAL_SUCCESS;
536 }
537
/**
 * @brief Select the transports a module will use from the tls of a memory domain
 *
 * The comma-delimited allowed_transports component parameter is used as a filter:
 * a tl is tried when its name is listed (or "any" is listed), or, when the list
 * starts with '^', when its name is *not* listed. The position of the matching
 * entry becomes the priority of the tl ("ud" is always evaluated last). The tls
 * are then evaluated in priority order until the module has an RDMA tl, an
 * active-message tl and, if either of them requires one, a connection tl.
 *
 * @param[in,out] module    BTL module being configured
 * @param[in]     md        memory domain the tls belong to
 * @param[in]     tl_descs  tl resource descriptions
 * @param[in]     tl_count  number of entries in tl_descs
 *
 * @returns OPAL_SUCCESS if usable transports were selected,
 *          OPAL_ERR_NOT_AVAILABLE if the filter is empty, negates "any", matches
 *          no tl or no matching tl supports RDMA or active messages,
 *          OPAL_ERROR if a connection tl is required but none was found
 */
int mca_btl_uct_query_tls (mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, uct_tl_resource_desc_t *tl_descs, unsigned tl_count)
{
    bool include = true, any = false, conn_required;
    mca_btl_uct_tl_t *tl;
    opal_list_t tl_list;
    char **tl_filter;
    int any_priority = 0;

    tl_filter = opal_argv_split (mca_btl_uct_component.allowed_transports, ',');
    if (NULL == tl_filter || NULL == tl_filter[0]) {
        /* an empty filter yields no entries: there is nothing to index below */
        BTL_VERBOSE(("the list of allowed transports is empty"));
        if (NULL != tl_filter) {
            opal_argv_free (tl_filter);
        }
        return OPAL_ERR_NOT_AVAILABLE;
    }

    if ('^' == tl_filter[0][0]) {
        /* user has negated the include list. drop the '^' in place: strlen() of the
         * original string covers the remaining characters plus the terminator */
        memmove (tl_filter[0], tl_filter[0] + 1, strlen (tl_filter[0]));
        include = false;
    }

    /* check for the any keyword */
    for (unsigned j = 0 ; tl_filter[j] ; ++j) {
        if (0 == strcmp (tl_filter[j], "any")) {
            any_priority = (int) j;
            any = true;
            break;
        }
    }

    if (any && !include) {
        /* "^any" excludes everything */
        opal_argv_free (tl_filter);
        return OPAL_ERR_NOT_AVAILABLE;
    }

    OBJ_CONSTRUCT(&tl_list, opal_list_t);

    for (unsigned i = 0 ; i < tl_count ; ++i) {
        bool try_tl = any;
        int priority = any_priority;

        for (unsigned j = 0 ; tl_filter[j] ; ++j) {
            if (0 == strcmp (tl_filter[j], tl_descs[i].tl_name)) {
                try_tl = include;
                priority = (int) j;
                break;
            }
        }

        BTL_VERBOSE(("tl filter: tl_name = %s, use = %d, priority = %d", tl_descs[i].tl_name, try_tl, priority));

        if (!try_tl) {
            continue;
        }

        if (0 == strcmp (tl_descs[i].tl_name, "ud")) {
            /* ud looks like any normal transport but we do not want to use it for anything other
             * than connection management so ensure it gets evaluated last */
            priority = INT_MAX;
        }

        tl = mca_btl_uct_create_tl (module, md, tl_descs + i, priority);

        if (tl) {
            opal_list_append (&tl_list, &tl->super);
        }
    }

    opal_argv_free (tl_filter);

    if (0 == opal_list_get_size (&tl_list)) {
        BTL_VERBOSE(("no suitable tls match filter: %s", mca_btl_uct_component.allowed_transports));
        OBJ_DESTRUCT(&tl_list);
        return OPAL_ERR_NOT_AVAILABLE;
    }

    opal_list_sort (&tl_list, tl_compare);

    OPAL_LIST_FOREACH(tl, &tl_list, mca_btl_uct_tl_t) {
        if (OPAL_SUCCESS != mca_btl_uct_evaluate_tl (module, tl)) {
            /* best effort: a later tl may still be able to fill the open roles */
            BTL_VERBOSE(("could not completely set up tl %s", tl->uct_tl_name));
        }

        if (NULL != module->am_tl && NULL != module->rdma_tl &&
            (NULL != module->conn_tl || !(mca_btl_uct_tl_requires_connection_tl (module->am_tl) ||
                                          mca_btl_uct_tl_requires_connection_tl (module->rdma_tl)))) {
            /* all done */
            break;
        }
    }

    if (NULL == module->rdma_tl) {
        /* no rdma tls */
        BTL_VERBOSE(("no rdma tl matched supplied filter. disabling RDMA support"));

        module->super.btl_flags &= ~MCA_BTL_FLAGS_RDMA;
        module->super.btl_put = NULL;
        module->super.btl_get = NULL;
        module->super.btl_atomic_fop = NULL;
        module->super.btl_atomic_op = NULL;
    }

    if (NULL == module->am_tl) {
        /* no active message tls == no send/recv */
        BTL_VERBOSE(("no active message tl matched supplied filter. disabling send/recv support"));

        module->super.btl_send = NULL;
        module->super.btl_sendi = NULL;
        module->super.btl_alloc = NULL;
        module->super.btl_free = NULL;
    }

    /* the selected tls hold their own references: drop the ones held by the list */
    OPAL_LIST_DESTRUCT(&tl_list);

    if (NULL == module->am_tl && NULL == module->rdma_tl) {
        /* nothing can be sent with this module. a connection tl on its own is useless */
        BTL_VERBOSE(("no tl matching the filter %s supports RDMA or active messages",
                     mca_btl_uct_component.allowed_transports));
        if (NULL != module->conn_tl) {
            OBJ_RELEASE(module->conn_tl);
            module->conn_tl = NULL;
        }
        return OPAL_ERR_NOT_AVAILABLE;
    }

    conn_required = (NULL != module->am_tl && mca_btl_uct_tl_requires_connection_tl (module->am_tl)) ||
        (NULL != module->rdma_tl && mca_btl_uct_tl_requires_connection_tl (module->rdma_tl));

    if (!conn_required) {
        /* no connection tl needed for selected transports. a missing connection tl is
         * only an error when one of the selected transports needs it */
        if (NULL != module->conn_tl) {
            OBJ_RELEASE(module->conn_tl);
            module->conn_tl = NULL;
        }
    } else if (NULL == module->conn_tl) {
        BTL_VERBOSE(("a connection tl is required but no tls match the filter %s",
                     mca_btl_uct_component.allowed_transports));
        return OPAL_ERROR;
    }

    return OPAL_SUCCESS;
}
661