/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation. All rights reserved.
 * Copyright (c) 2004-2011 The University of Tennessee and The University
 *                         of Tennessee Research Foundation. All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006      Sandia National Laboratories. All rights
 *                         reserved.
 * Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
 * Copyright (c) 2013-2014 Intel, Inc. All rights reserved
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
#include "opal_config.h"

#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>

#include "opal_stdint.h"
#include "opal/util/arch.h"
#include "opal/util/show_help.h"
#include "opal/constants.h"
#include "opal/util/bipartite_graph.h"

#include "btl_usnic_compat.h"
#include "btl_usnic.h"
#include "btl_usnic_proc.h"
#include "btl_usnic_endpoint.h"
#include "btl_usnic_module.h"
#include "btl_usnic_util.h"

/* larger weight values are more desirable (i.e., worth, not cost) */
enum {
    WEIGHT_UNREACHABLE = -1
};

/* Helper macros for "match_modex" and friends for translating between array
 * indices and vertex IDs.  Module vertices always come first in the graph,
 * followed by proc (endpoint) vertices. */
#define PROC_VERTEX(modex_idx) (mca_btl_usnic_component.num_modules + modex_idx)
#define MODULE_VERTEX(module_idx) (module_idx)
#define PROC_INDEX(proc_vertex) ((proc_vertex) - mca_btl_usnic_component.num_modules)
#define MODULE_INDEX(module_vertex) (module_vertex)
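
/* Illustrative example (not from the original source), just to show the
 * vertex numbering these macros produce: with
 * mca_btl_usnic_component.num_modules == 2 and 2 remote modex entries,
 *
 *     MODULE_VERTEX(0) == 0, MODULE_VERTEX(1) == 1,
 *     PROC_VERTEX(0)   == 2, PROC_VERTEX(1)   == 3,
 *
 * and the inverse macros recover the array indices, e.g.
 * PROC_INDEX(3) == 1 and MODULE_INDEX(1) == 1. */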

static void proc_construct(opal_btl_usnic_proc_t* proc)
{
    proc->proc_opal = 0;
    proc->proc_modex = NULL;
    proc->proc_modex_count = 0;
    proc->proc_modex_claimed = NULL;
    proc->proc_endpoints = NULL;
    proc->proc_endpoint_count = 0;
    proc->proc_ep_match_table = NULL;
    proc->proc_match_exists = false;

    /* add to list of all proc instances */
    opal_list_append(&mca_btl_usnic_component.usnic_procs, &proc->super);
}


static void proc_destruct(opal_btl_usnic_proc_t* proc)
{
    /* remove from list of all proc instances */
    opal_list_remove_item(&mca_btl_usnic_component.usnic_procs, &proc->super);

    /* release resources */
    if (NULL != proc->proc_modex) {
        free(proc->proc_modex);
        proc->proc_modex = NULL;
    }

    if (NULL != proc->proc_modex_claimed) {
        free(proc->proc_modex_claimed);
        proc->proc_modex_claimed = NULL;
    }

    if (NULL != proc->proc_ep_match_table) {
        free(proc->proc_ep_match_table);
        proc->proc_ep_match_table = NULL;
    }

    /* Release all endpoints associated with this proc */
    if (NULL != proc->proc_endpoints) {
        free(proc->proc_endpoints);
        proc->proc_endpoints = NULL;
    }
}


OBJ_CLASS_INSTANCE(opal_btl_usnic_proc_t,
                   opal_list_item_t,
                   proc_construct,
                   proc_destruct);

/*
 * Look for an existing usnic process instance based on the
 * associated opal_proc_t instance.
 */
opal_btl_usnic_proc_t *
opal_btl_usnic_proc_lookup_ompi(opal_proc_t* opal_proc)
{
    opal_btl_usnic_proc_t* usnic_proc;

    for (usnic_proc = (opal_btl_usnic_proc_t*)
             opal_list_get_first(&mca_btl_usnic_component.usnic_procs);
         usnic_proc != (opal_btl_usnic_proc_t*)
             opal_list_get_end(&mca_btl_usnic_component.usnic_procs);
         usnic_proc = (opal_btl_usnic_proc_t*)
             opal_list_get_next(usnic_proc)) {
        if (usnic_proc->proc_opal == opal_proc) {
            return usnic_proc;
        }
    }

    return NULL;
}


/*
 * Look up an existing usnic endpoint on the given module based on a
 * hashed RTE process name.
 */
opal_btl_usnic_endpoint_t *
opal_btl_usnic_proc_lookup_endpoint(opal_btl_usnic_module_t *receiver,
                                    uint64_t sender_proc_name)
{
    opal_btl_usnic_proc_t *proc;
    opal_btl_usnic_endpoint_t *endpoint;
    opal_list_item_t *item;

    MSGDEBUG1_OUT("lookup_endpoint: recvmodule=%p sendhash=0x%" PRIx64,
                  (void *)receiver, sender_proc_name);

    opal_mutex_lock(&receiver->all_endpoints_lock);
    for (item = opal_list_get_first(&receiver->all_endpoints);
         item != opal_list_get_end(&receiver->all_endpoints);
         item = opal_list_get_next(item)) {
        endpoint = container_of(item, opal_btl_usnic_endpoint_t,
                                endpoint_endpoint_li);
        proc = endpoint->endpoint_proc;
        /* Note that this works today because opal_proc_t->proc_name
           is unique across the universe.  George is potentially
           working to give handles instead of proc names, and then
           have a function pointer to perform comparisons.  This would
           be bad here in the critical path, though... */
        if (usnic_compat_rte_hash_name(&(proc->proc_opal->proc_name)) ==
            sender_proc_name) {
            MSGDEBUG1_OUT("lookup_endpoint: matched endpoint=%p",
                          (void *)endpoint);
            opal_mutex_unlock(&receiver->all_endpoints_lock);
            return endpoint;
        }
    }
    opal_mutex_unlock(&receiver->all_endpoints_lock);

    /* Didn't find it */
    return NULL;
}

/*
 * Create an opal_btl_usnic_proc_t and initialize it with modex info
 * and an empty array of endpoints.
 *
 * Returns OPAL_ERR_UNREACH if we can't reach the peer (i.e., we can't
 * find their modex data).
 */
static int create_proc(opal_proc_t *opal_proc,
                       opal_btl_usnic_proc_t **usnic_proc)
{
    opal_btl_usnic_proc_t *proc = NULL;
    size_t size;
    int rc;

    *usnic_proc = NULL;

    /* Create the proc if it doesn't already exist */
    proc = OBJ_NEW(opal_btl_usnic_proc_t);
    if (NULL == proc) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Initialize number of peers */
    proc->proc_endpoint_count = 0;
    proc->proc_opal = opal_proc;

    /* query for the peer address info */
    usnic_compat_modex_recv(&rc, &mca_btl_usnic_component.super.btl_version,
                            opal_proc, &proc->proc_modex, &size);

    /* If this proc simply doesn't have this key, then they're not
       running the usnic BTL -- just ignore them.  Otherwise, show an
       error message. */
    if (OPAL_ERR_NOT_FOUND == rc) {
        OBJ_RELEASE(proc);
        return OPAL_ERR_UNREACH;
    } else if (OPAL_SUCCESS != rc) {
        opal_show_help("help-mpi-btl-usnic.txt",
                       "internal error during init",
                       true,
                       opal_process_info.nodename,
                       "<none>", "<none>",
                       "opal_modex_recv() failed", __FILE__, __LINE__,
                       opal_strerror(rc));
        OBJ_RELEASE(proc);
        return OPAL_ERROR;
    }

    if ((size % sizeof(opal_btl_usnic_modex_t)) != 0) {
        char msg[1024];

        snprintf(msg, sizeof(msg),
                 "sizeof(modex for peer %s data) == %d, expected multiple of %d",
                 usnic_compat_proc_name_print(&opal_proc->proc_name),
                 (int) size, (int) sizeof(opal_btl_usnic_modex_t));
        opal_show_help("help-mpi-btl-usnic.txt", "internal error during init",
                       true,
                       opal_process_info.nodename,
                       "<none>", 0,
                       "invalid modex data", __FILE__, __LINE__,
                       msg);

        OBJ_RELEASE(proc);
        return OPAL_ERR_VALUE_OUT_OF_BOUNDS;
    }

    /* See if the peer has the same underlying wire protocol as me.
       If not, then print an error and ignore this peer. */
    // RFXXX - things are weird when i force this to fail
    if (mca_btl_usnic_component.transport_protocol !=
        proc->proc_modex->protocol) {
        uint64_t proto;
        char protostr[32];
        proto = mca_btl_usnic_component.transport_protocol;
        memset(protostr, 0, sizeof(protostr));
        strncpy(protostr, fi_tostr(&proto, FI_TYPE_PROTOCOL),
                sizeof(protostr) - 1);
        proto = proc->proc_modex->protocol;
        opal_show_help("help-mpi-btl-usnic.txt",
                       "transport mismatch",
                       true,
                       opal_process_info.nodename,
                       protostr,
                       "peer",
                       fi_tostr(&proto, FI_TYPE_PROTOCOL));

        OBJ_RELEASE(proc);
        return OPAL_ERR_UNREACH;
    }

    proc->proc_modex_count = size / sizeof(opal_btl_usnic_modex_t);
    if (0 == proc->proc_modex_count) {
        proc->proc_endpoints = NULL;
        OBJ_RELEASE(proc);
        return OPAL_ERR_UNREACH;
    }

    proc->proc_modex_claimed = (bool*)
        calloc(proc->proc_modex_count, sizeof(bool));
    if (NULL == proc->proc_modex_claimed) {
        OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
        OBJ_RELEASE(proc);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    proc->proc_endpoints = (mca_btl_base_endpoint_t**)
        calloc(proc->proc_modex_count, sizeof(mca_btl_base_endpoint_t*));
    if (NULL == proc->proc_endpoints) {
        OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
        OBJ_RELEASE(proc);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    *usnic_proc = proc;
    return OPAL_SUCCESS;
}

/* Compare the addresses of the local interface corresponding to module and the
 * remote interface corresponding to proc_modex_addr.  Returns a weight value
 * (higher values indicate more desirable connections). */
static uint64_t compute_weight(
    opal_btl_usnic_module_t *module,
    opal_btl_usnic_modex_t *proc_modex_addr)
{
    char my_ip_string[INET_ADDRSTRLEN], peer_ip_string[INET_ADDRSTRLEN];
    struct sockaddr_in sin;
    struct sockaddr_in *sinp;
    struct fi_usnic_info *uip;
    uint32_t mynet, peernet;
    int err;
    int metric;
    uint32_t min_link_speed_gbps;

    uip = &module->usnic_info;
    sinp = module->fabric_info->src_addr;
    inet_ntop(AF_INET, &sinp->sin_addr,
              my_ip_string, sizeof(my_ip_string));
    inet_ntop(AF_INET, &proc_modex_addr->ipv4_addr,
              peer_ip_string, sizeof(peer_ip_string));

    /* Just compare the CIDR-masked IP address to see if they're on
       the same network.  If so, we're good. */
    mynet = sinp->sin_addr.s_addr & uip->ui.v1.ui_netmask_be;
    peernet = proc_modex_addr->ipv4_addr & proc_modex_addr->netmask;
    opal_output_verbose(5, USNIC_OUT,
                        "btl:usnic:%s: checking my IP address/subnet (%s/%d) vs. peer (%s/%d): %s",
                        __func__, my_ip_string,
                        usnic_netmask_to_cidrlen(uip->ui.v1.ui_netmask_be),
                        peer_ip_string,
                        usnic_netmask_to_cidrlen(proc_modex_addr->netmask),
                        (mynet == peernet ? "match" : "DO NOT match"));

    min_link_speed_gbps = MIN(module->super.btl_bandwidth,
                              proc_modex_addr->link_speed_mbps) / 1000;

    /* Returned metric is:
     *    0 - same VLAN
     *    1..MAXINT - relative distance metric
     *    -1 - unreachable
     */
    metric = 0;
    memset(&sin, 0, sizeof(sin));
    sin.sin_family = AF_INET;
    sin.sin_addr.s_addr = proc_modex_addr->ipv4_addr;
    err = module->usnic_av_ops->get_distance(module->av, &sin, &metric);
    if (0 != err || (0 == err && -1 == metric)) {
        return 0; /* no connectivity */
    }
    else {
        /* Format in binary    MSB                             LSB
         * most sig. 32-bits:  00000000 0000000A BBBBBBBB 00000001
         * least sig. 32-bits: CCCCCCCC CCCCCCCC CCCCCCCC CCCCCCCC
         *
         * A = 1 iff same subnet
         * B = min link speed (in Gbps) between iface pair
         * C = metric from routing table
         *
         * That is, this prioritizes interfaces in the same subnet first,
         * followed by having the same link speed.  The extra literal "1" is in
         * there to help prioritize over any zero-cost links that might
         * otherwise make their way into the graph.  It is not strictly
         * necessary and could be eliminated if the extra byte is needed.
         *
         * TODO add an MCA parameter to optionally swap the offsets of A and
         * B, thereby prioritizing link speed over same subnet reachability.
         */
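        /* Worked example (illustrative only, derived from the encoding
         * described above): a peer on the same subnet (A = 1) with a 10 Gbps
         * minimum link speed (B = 0x0a) and a zeroed routing metric (C = 0)
         * yields
         *     (1ULL << 48) | (0x0aULL << 40) | (1ULL << 32) | 0
         *   == 0x00010a0100000000
         */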
        /* FIXME how can we check that the metric is the same before we have
         * communication with this host?  Mismatched metrics could cause the
         * remote peer to make a different pairing decision... */
        if (min_link_speed_gbps > 0xff) {
            opal_output_verbose(20, USNIC_OUT, "clamping min_link_speed_gbps=%u to 255",
                                min_link_speed_gbps);
            min_link_speed_gbps = 0xff;
        }
        return ((uint64_t)(mynet == peernet) << 48) |
               ((uint64_t)(min_link_speed_gbps & 0xff) << 40) |
               ((uint64_t)0x1 << 32) |
               (/*metric=*/0);
    }
}

/* Populate the given proc's match table from an array of (u,v) edge pairs.
 *
 * (DJG: this unfortunately knows a bit too much about the internals of
 * "match_modex")
 */
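/* Illustrative example (not part of the original comments): with two local
 * modules and proc_is_left == false, an edge list of
 *     nme = 2, me = { 0, 2,   1, 3 }
 * pairs module vertex 0 with proc vertex 2 and module vertex 1 with proc
 * vertex 3, so proc_ep_match_table[0] = PROC_INDEX(2) = 0 and
 * proc_ep_match_table[1] = PROC_INDEX(3) = 1. */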
static void edge_pairs_to_match_table(
    opal_btl_usnic_proc_t *proc,
    bool proc_is_left,
    int nme,
    int *me)
{
    int i;
    int left, right;
    int module_idx, proc_idx;
    int num_modules;

    num_modules = (int)mca_btl_usnic_component.num_modules;

    assert(nme >= 0);
    for (i = 0; i < nme; ++i) {
        left = me[2*i+0];
        right = me[2*i+1];

        if (proc_is_left) {
            proc_idx = PROC_INDEX(left);
            module_idx = MODULE_INDEX(right);
        } else {
            module_idx = MODULE_INDEX(left);
            proc_idx = PROC_INDEX(right);
        }
        assert(module_idx >= 0 && module_idx < num_modules);
        assert(proc_idx >= 0 && proc_idx < (int)proc->proc_modex_count);
        proc->proc_ep_match_table[module_idx] = proc_idx;
        proc->proc_match_exists = true;
    }

    /* emit match summary for debugging purposes */
    for (i = 0; i < num_modules; ++i) {
        if (-1 != proc->proc_ep_match_table[i]) {
            opal_output_verbose(5, USNIC_OUT,
                                "btl:usnic:%s: module[%d] (%p) should claim endpoint[%d] on proc %p",
                                __func__, i,
                                (void *)mca_btl_usnic_component.usnic_active_modules[i],
                                proc->proc_ep_match_table[i], (void *)proc);
        } else {
            opal_output_verbose(5, USNIC_OUT,
                                "btl:usnic:%s: module[%d] (%p) will NOT claim an endpoint on proc %p",
                                __func__, i,
                                (void *)mca_btl_usnic_component.usnic_active_modules[i],
                                (void *)proc);
        }
    }
}

/**
 * Constructs an interface graph from all local modules and the given proc's
 * remote interfaces.  The resulting vertices will always have the module
 * vertices appear before the proc vertices.
 */
static int create_proc_module_graph(
    opal_btl_usnic_proc_t *proc,
    bool proc_is_left,
    opal_bp_graph_t **g_out)
{
    int err;
    int i, j;
    int u, v;
    int num_modules;
    opal_bp_graph_t *g = NULL;

    if (NULL == g_out) {
        return OPAL_ERR_BAD_PARAM;
    }
    *g_out = NULL;

    num_modules = (int)mca_btl_usnic_component.num_modules;

    /* Construct a bipartite graph with remote interfaces on the one side and
     * local interfaces (modules) on the other. */
    err = opal_bp_graph_create(NULL, NULL, &g);
    if (OPAL_SUCCESS != err) {
        OPAL_ERROR_LOG(err);
        goto out;
    }

    /* create vertices for each interface (local and remote) */
    for (i = 0; i < num_modules; ++i) {
        int idx = -1;
        err = opal_bp_graph_add_vertex(g,
                                       mca_btl_usnic_component.usnic_active_modules[i],
                                       &idx);
        if (OPAL_SUCCESS != err) {
            OPAL_ERROR_LOG(err);
            goto out_free_graph;
        }
        assert(idx == MODULE_VERTEX(i));
    }
    for (i = 0; i < (int)proc->proc_modex_count; ++i) {
        int idx = -1;
        err = opal_bp_graph_add_vertex(g, &proc->proc_modex[i], &idx);
        if (OPAL_SUCCESS != err) {
            OPAL_ERROR_LOG(err);
            goto out_free_graph;
        }
        assert(idx == (int)PROC_VERTEX(i));
    }

    /* now add edges between interfaces that can communicate */
    for (i = 0; i < num_modules; ++i) {
        for (j = 0; j < (int)proc->proc_modex_count; ++j) {
            int64_t weight, cost;

            /* assumption: compute_weight returns the same weight on the
             * remote process with these arguments (effectively) transposed */
            weight = compute_weight(mca_btl_usnic_component.usnic_active_modules[i],
                                    &proc->proc_modex[j]);

            opal_output_verbose(20, USNIC_OUT,
                                "btl:usnic:%s: weight=0x%016" PRIx64 " for edge module[%d] (%p) <--> endpoint[%d] on proc %p",
                                __func__,
                                weight, i,
                                (void *)mca_btl_usnic_component.usnic_active_modules[i],
                                j, (void *)proc);

            if (WEIGHT_UNREACHABLE == weight) {
                continue;
            } else {
                /* the graph code optimizes for minimum *cost*, but we have
                 * been computing weights (negative costs) */
                cost = -weight;
            }
            assert(INT64_MAX != cost);
            assert(INT64_MIN != cost);

            if (proc_is_left) {
                u = PROC_VERTEX(j);
                v = MODULE_VERTEX(i);
            } else {
                u = MODULE_VERTEX(i);
                v = PROC_VERTEX(j);
            }
            opal_output_verbose(20, USNIC_OUT,
                                "btl:usnic:%s: adding edge (%d,%d) with cost=%" PRIi64 " for edge module[%d] <--> endpoint[%d]",
                                __func__, u, v, cost, i, j);
            err = opal_bp_graph_add_edge(g, u, v, cost,
                                         /*capacity=*/1,
                                         /*e_data=*/NULL);
            if (OPAL_SUCCESS != err) {
                OPAL_ERROR_LOG(err);
                goto out_free_graph;
            }
        }
    }

    *g_out = g;
    return OPAL_SUCCESS;

out_free_graph:
    opal_bp_graph_free(g);
out:
    return err;
}

/*
 * For a specific module, see if this proc has matching address/modex
 * info.  If so, create an endpoint and return it.
 *
 * Implementation note: This code relies on the order of modules on a local
 * side matching the order of the modex entries that we send around, otherwise
 * both sides may not agree on a bidirectional connection.  It also assumes
 * that add_procs will be invoked on the local modules in that same order, for
 * the same reason.  If those assumptions do not hold, we will need to
 * canonicalize this match ordering somehow, probably by (jobid,vpid) pair or
 * by the interface MAC or IP address.
 */
static int match_modex(opal_btl_usnic_module_t *module,
                       opal_btl_usnic_proc_t *proc,
                       int *index_out)
{
    int err = OPAL_SUCCESS;
    size_t i;
    uint32_t num_modules;
    opal_bp_graph_t *g = NULL;
    bool proc_is_left;

    if (NULL == index_out) {
        return OPAL_ERR_BAD_PARAM;
    }
    *index_out = -1;

    num_modules = mca_btl_usnic_component.num_modules;

    opal_output_verbose(20, USNIC_OUT, "btl:usnic:%s: module=%p proc=%p with dimensions %d x %d",
                        __func__, (void *)module, (void *)proc,
                        num_modules, (int)proc->proc_modex_count);

    /* We compute an interface match-up table once for each (module,proc) pair
     * and cache it in the proc.  Store per-proc instead of per-module, since
     * MPI dynamic process routines can add procs but not new modules. */
    if (NULL == proc->proc_ep_match_table) {
        proc->proc_ep_match_table = malloc(num_modules *
                                           sizeof(*proc->proc_ep_match_table));
        if (NULL == proc->proc_ep_match_table) {
            OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
            return OPAL_ERR_OUT_OF_RESOURCE;
        }

        /* initialize to "no matches" */
        for (i = 0; i < num_modules; ++i) {
            proc->proc_ep_match_table[i] = -1;
        }

        /* For graphs where all edges are equal (and even for some other
         * graphs), two peers making matching calculations with "mirror image"
         * graphs might not end up with the same matching.  Ensure that both
         * sides are always setting up the exact same graph by always putting
         * the process with the lower (jobid,vpid) on the "left".
         */
#if 0
        proc_is_left = (proc->proc_opal->proc_name <
                        opal_proc_local_get()->proc_name);
#else
        proc_is_left =
            usnic_compat_proc_name_compare(proc->proc_opal->proc_name,
                                           opal_proc_local_get()->proc_name);
#endif

        err = create_proc_module_graph(proc, proc_is_left, &g);
        if (OPAL_SUCCESS != err) {
            goto out_free_table;
        }

        int nme = 0;
        int *me = NULL;
        err = opal_bp_graph_solve_bipartite_assignment(g, &nme, &me);
        if (OPAL_SUCCESS != err) {
            OPAL_ERROR_LOG(err);
            goto out_free_graph;
        }

        edge_pairs_to_match_table(proc, proc_is_left, nme, me);
        free(me);

        err = opal_bp_graph_free(g);
        if (OPAL_SUCCESS != err) {
            OPAL_ERROR_LOG(err);
            return err;
        }
    }

    if (!proc->proc_match_exists) {
        opal_output_verbose(5, USNIC_OUT, "btl:usnic:%s: unable to find any valid interface pairs for proc %s",
                            __func__,
                            usnic_compat_proc_name_print(&proc->proc_opal->proc_name));
        return OPAL_ERR_NOT_FOUND;
    }

    /* assuming no strange failure cases, this should always be present */
    if (NULL != proc->proc_ep_match_table && proc->proc_match_exists) {
        for (i = 0; i < num_modules; ++i) {
            if (module == mca_btl_usnic_component.usnic_active_modules[i]) {
                *index_out = proc->proc_ep_match_table[i];
                break;
            }
        }
    }

    /* If MTU does not match, throw an error */
    /* TODO with UDP, do we still want to enforce this restriction or just take
     * the min of the two MTUs?  Another choice is to disqualify this pairing
     * before running the matching algorithm on it. */
    if (*index_out >= 0 &&
        proc->proc_modex[*index_out].max_msg_size !=
        (uint16_t) module->fabric_info->ep_attr->max_msg_size) {
        opal_show_help("help-mpi-btl-usnic.txt", "MTU mismatch",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       module->fabric_info->ep_attr->max_msg_size,
                       (NULL == proc->proc_opal->proc_hostname) ?
                       "unknown" : proc->proc_opal->proc_hostname,
                       proc->proc_modex[*index_out].max_msg_size);
        *index_out = -1;
        return OPAL_ERR_UNREACH;
    }

    return (*index_out == -1 ? OPAL_ERR_NOT_FOUND : OPAL_SUCCESS);

out_free_graph:
    opal_bp_graph_free(g);
out_free_table:
    free(proc->proc_ep_match_table);
    proc->proc_ep_match_table = NULL;
    proc->proc_match_exists = false;
    return err;
}

/*
 * Initiate the process to create a USD dest.
 * It will be polled for completion later.
 */
static int start_av_insert(opal_btl_usnic_module_t *module,
                           opal_btl_usnic_endpoint_t *endpoint,
                           int channel)
{
    int ret;
    opal_btl_usnic_modex_t *modex = &endpoint->endpoint_remote_modex;
    opal_btl_usnic_addr_context_t *context;
    struct sockaddr_in sin;

    context = calloc(1, sizeof(*context));
    if (NULL == context) {
        OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    context->endpoint = endpoint;
    context->channel_id = channel;

    char str[IPV4STRADDRLEN];
    opal_btl_usnic_snprintf_ipv4_addr(str, sizeof(str), modex->ipv4_addr,
                                      modex->netmask);
    opal_output_verbose(5, USNIC_OUT,
                        "btl:usnic:start_av_insert: to channel %d at %s:%d",
                        channel, str, modex->ports[channel]);

    /* build remote address */
    memset(&sin, 0, sizeof(sin));
    sin.sin_family = AF_INET;
    sin.sin_port = htons(modex->ports[channel]);
    sin.sin_addr.s_addr = modex->ipv4_addr;

    ret = fi_av_insert(module->av, &sin, 1,
                       &endpoint->endpoint_remote_addrs[channel], 0, context);
    /* Did an error occur? */
    if (0 != ret) {
        opal_show_help("help-mpi-btl-usnic.txt", "libfabric API failed",
                       true,
                       opal_process_info.nodename,
                       module->linux_device_name,
                       "fi_av_insert()", __FILE__, __LINE__,
                       ret,
                       "Failed to initiate AV insert");
        free(context);
        return OPAL_ERROR;
    }

    return OPAL_SUCCESS;
}

/*
 * Create an endpoint and claim the matched modex slot
 */
int
opal_btl_usnic_create_endpoint(opal_btl_usnic_module_t *module,
                               opal_btl_usnic_proc_t *proc,
                               opal_btl_usnic_endpoint_t **endpoint_o)
{
    int rc;
    int modex_index;
    opal_btl_usnic_endpoint_t *endpoint;

    /* look for matching modex info */
    rc = match_modex(module, proc, &modex_index);
    if (OPAL_SUCCESS != rc) {
        opal_output_verbose(5, USNIC_OUT,
                            "btl:usnic:create_endpoint: did not match usnic modex info for peer %s",
                            usnic_compat_proc_name_print(&proc->proc_opal->proc_name));
        return rc;
    }

    endpoint = OBJ_NEW(opal_btl_usnic_endpoint_t);
    if (NULL == endpoint) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Initialize the endpoint */
    endpoint->endpoint_module = module;
    assert(modex_index >= 0 && modex_index < (int)proc->proc_modex_count);
    endpoint->endpoint_remote_modex = proc->proc_modex[modex_index];
    endpoint->endpoint_send_credits = module->sd_num;

    /* Start creating destinations; one for each channel.  These
       progress in the background. */
    for (int i = 0; i < USNIC_NUM_CHANNELS; ++i) {
        rc = start_av_insert(module, endpoint, i);
        if (OPAL_SUCCESS != rc) {
            OBJ_RELEASE(endpoint);
            return rc;
        }
    }

    /* Initialize endpoint sequence number info */
    endpoint->endpoint_next_seq_to_send = module->local_modex.isn;
    endpoint->endpoint_ack_seq_rcvd = endpoint->endpoint_next_seq_to_send - 1;
    endpoint->endpoint_next_contig_seq_to_recv =
        endpoint->endpoint_remote_modex.isn;
    endpoint->endpoint_highest_seq_rcvd =
        endpoint->endpoint_next_contig_seq_to_recv - 1;
    endpoint->endpoint_rfstart = WINDOW_SIZE_MOD(endpoint->endpoint_next_contig_seq_to_recv);
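    /* Illustrative example (an assumption about the arithmetic above, not
     * taken from the original comments): if our local ISN is 100 and the
     * peer's ISN is 500, we send sequence 100 first and treat 99 as already
     * ACKed, and we expect to receive sequence 500 first and treat 499 as
     * already received. */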

    /* Now claim that modex slot */
    proc->proc_modex_claimed[modex_index] = true;
    MSGDEBUG1_OUT("create_endpoint: module=%p claimed endpoint=%p on proc=%p (hash=0x%" PRIx64 ")\n",
                  (void *)module, (void *)endpoint, (void *)proc,
                  proc->proc_opal->proc_name);

    /* Save the endpoint on this proc's array of endpoints */
    proc->proc_endpoints[proc->proc_endpoint_count] = endpoint;
    endpoint->endpoint_proc_index = proc->proc_endpoint_count;
    endpoint->endpoint_proc = proc;
    ++proc->proc_endpoint_count;
    OBJ_RETAIN(proc);

    /* also add endpoint to module's list of endpoints (done here and
       not in the endpoint constructor because we aren't able to pass
       the module as a constructor argument -- doh!). */
    opal_mutex_lock(&module->all_endpoints_lock);
    opal_list_append(&(module->all_endpoints),
                     &(endpoint->endpoint_endpoint_li));
    endpoint->endpoint_on_all_endpoints = true;
    opal_mutex_unlock(&module->all_endpoints_lock);

    *endpoint_o = endpoint;
    return OPAL_SUCCESS;
}

/*
 * If we haven't done so already, receive the modex info for the
 * specified opal_proc.  Search that proc's modex info; if we can find
 * matching address info, then create an endpoint.
 *
 * If we don't find a match, it's not an error: just return "not
 * found".
 *
 * This routine transfers ownership of an object reference to the caller, who
 * is eventually responsible for transferring or releasing that reference.
 *
 * There is a one-to-one correspondence between an opal_proc_t and an
 * opal_btl_usnic_proc_t instance.  We cache additional data on the
 * opal_btl_usnic_proc_t: specifically, the list of
 * opal_btl_usnic_endpoint_t instances, and published addresses/modex
 * info.
 */
int opal_btl_usnic_proc_match(opal_proc_t *opal_proc,
                              opal_btl_usnic_module_t *module,
                              opal_btl_usnic_proc_t **proc)
{
    /* Check if we have already created a proc structure for this peer
       ompi process */
    *proc = opal_btl_usnic_proc_lookup_ompi(opal_proc);
    if (*proc != NULL) {
        OBJ_RETAIN(*proc);
        return OPAL_SUCCESS;
    } else {
        /* If not, go make one */
        return create_proc(opal_proc, proc);
    }
}