1 /*
2  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3  *                         University Research and Technology
4  *                         Corporation.  All rights reserved.
5  * Copyright (c) 2004-2011 The University of Tennessee and The University
6  *                         of Tennessee Research Foundation.  All rights
7  *                         reserved.
8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9  *                         University of Stuttgart.  All rights reserved.
10  * Copyright (c) 2004-2005 The Regents of the University of California.
11  *                         All rights reserved.
12  * Copyright (c) 2006      Sandia National Laboratories. All rights
13  *                         reserved.
14  * Copyright (c) 2013-2016 Cisco Systems, Inc.  All rights reserved.
15  * Copyright (c) 2013-2014 Intel, Inc. All rights reserved
16  * $COPYRIGHT$
17  *
18  * Additional copyrights may follow
19  *
20  * $HEADER$
21  */
22 #include <netinet/in.h>
23 
24 #include "opal_config.h"
25 
26 #include "opal_stdint.h"
27 #include "opal/util/arch.h"
28 #include "opal/util/show_help.h"
29 #include "opal/constants.h"
30 #include "opal/util/bipartite_graph.h"
31 
32 #include "btl_usnic_compat.h"
33 #include "btl_usnic.h"
34 #include "btl_usnic_proc.h"
35 #include "btl_usnic_endpoint.h"
36 #include "btl_usnic_module.h"
37 #include "btl_usnic_util.h"
38 
/* Edge weights express desirability: a larger weight is a better pairing
 * (i.e., worth, not cost).  WEIGHT_UNREACHABLE is the sentinel that
 * create_proc_module_graph() compares a computed weight against in order
 * to skip adding an edge for that interface pair. */
enum { WEIGHT_UNREACHABLE = -1 };
43 
/* Helper macros for "match_modex" and friends for translating between array
 * indices and vertex IDs.  Module vertices always come first in the graph,
 * followed by proc (endpoint) vertices.
 *
 * Every macro argument is parenthesized so that an expression argument
 * (e.g., "i << 1" or "a ? b : c") is evaluated as a unit before the offset
 * is applied. */
#define PROC_VERTEX(modex_idx) (mca_btl_usnic_component.num_modules + (modex_idx))
#define MODULE_VERTEX(module_idx) (module_idx)
#define PROC_INDEX(proc_vertex) ((proc_vertex) - mca_btl_usnic_component.num_modules)
#define MODULE_INDEX(module_vertex) (module_vertex)
51 
/*
 * Constructor for opal_btl_usnic_proc_t.  Resets every field of the new
 * instance to its "empty" value and then appends the instance to the
 * component-wide list of usnic procs.
 */
static void proc_construct(opal_btl_usnic_proc_t *proc)
{
    /* not yet bound to an opal_proc_t */
    proc->proc_opal = NULL;

    /* no modex data received, nothing claimed */
    proc->proc_modex_count = 0;
    proc->proc_modex = NULL;
    proc->proc_modex_claimed = NULL;

    /* no endpoints created */
    proc->proc_endpoint_count = 0;
    proc->proc_endpoints = NULL;

    /* no interface matching computed */
    proc->proc_match_exists = false;
    proc->proc_ep_match_table = NULL;

    /* make the instance discoverable via the list of all procs */
    opal_list_append(&mca_btl_usnic_component.usnic_procs, &proc->super);
}
66 
67 
/*
 * Destructor for opal_btl_usnic_proc_t.  Takes the instance off the
 * component-wide list of usnic procs and frees the arrays it owns.
 *
 * Note that only the *array* of endpoint pointers is freed here; the
 * endpoint objects it pointed to are not released by this function.
 */
static void proc_destruct(opal_btl_usnic_proc_t *proc)
{
    /* no longer discoverable via the list of all procs */
    opal_list_remove_item(&mca_btl_usnic_component.usnic_procs, &proc->super);

    /* free(NULL) is a no-op, so no guards are needed; the pointers are
       reset so that stale values cannot be reused */
    free(proc->proc_modex);
    proc->proc_modex = NULL;

    free(proc->proc_modex_claimed);
    proc->proc_modex_claimed = NULL;

    free(proc->proc_ep_match_table);
    proc->proc_ep_match_table = NULL;

    free(proc->proc_endpoints);
    proc->proc_endpoints = NULL;
}
95 
96 
/* Class instance for opal_btl_usnic_proc_t: derived from opal_list_item_t
 * (instances are kept on mca_btl_usnic_component.usnic_procs), using
 * proc_construct / proc_destruct as constructor and destructor. */
OBJ_CLASS_INSTANCE(opal_btl_usnic_proc_t, opal_list_item_t,
                   proc_construct, proc_destruct);
101 
102 /*
103  * Look for an existing usnic process instance based on the
104  * associated opal_proc_t instance.
105  */
106 opal_btl_usnic_proc_t *
opal_btl_usnic_proc_lookup_ompi(opal_proc_t * opal_proc)107 opal_btl_usnic_proc_lookup_ompi(opal_proc_t* opal_proc)
108 {
109     opal_btl_usnic_proc_t* usnic_proc;
110 
111     for (usnic_proc = (opal_btl_usnic_proc_t*)
112              opal_list_get_first(&mca_btl_usnic_component.usnic_procs);
113          usnic_proc != (opal_btl_usnic_proc_t*)
114              opal_list_get_end(&mca_btl_usnic_component.usnic_procs);
115          usnic_proc  = (opal_btl_usnic_proc_t*)
116              opal_list_get_next(usnic_proc)) {
117         if (usnic_proc->proc_opal == opal_proc) {
118             return usnic_proc;
119         }
120     }
121 
122     return NULL;
123 }
124 
125 
126 /*
127  * Look for an existing usnic proc based on a hashed RTE process
128  * name.
129  */
130 opal_btl_usnic_endpoint_t *
opal_btl_usnic_proc_lookup_endpoint(opal_btl_usnic_module_t * receiver,uint64_t sender_proc_name)131 opal_btl_usnic_proc_lookup_endpoint(opal_btl_usnic_module_t *receiver,
132                                     uint64_t sender_proc_name)
133 {
134     opal_btl_usnic_proc_t *proc;
135     opal_btl_usnic_endpoint_t *endpoint;
136     opal_list_item_t *item;
137 
138     MSGDEBUG1_OUT("lookup_endpoint: recvmodule=%p sendhash=0x%" PRIx64,
139                   (void *)receiver, sender_proc_name);
140 
141     opal_mutex_lock(&receiver->all_endpoints_lock);
142     for (item = opal_list_get_first(&receiver->all_endpoints);
143          item != opal_list_get_end(&receiver->all_endpoints);
144          item = opal_list_get_next(item)) {
145         endpoint = container_of(item, opal_btl_usnic_endpoint_t,
146                                 endpoint_endpoint_li);
147         proc = endpoint->endpoint_proc;
148         /* Note that this works today because opal_proc_t->proc_name
149            is unique across the universe.  George is potentially
150            working to give handles instead of proc names, and then
151            have a function pointer to perform comparisons.  This would
152            be bad here in the critical path, though... */
153         if (usnic_compat_rte_hash_name(&(proc->proc_opal->proc_name)) ==
154             sender_proc_name) {
155             MSGDEBUG1_OUT("lookup_endpoint: matched endpoint=%p",
156                           (void *)endpoint);
157             opal_mutex_unlock(&receiver->all_endpoints_lock);
158             return endpoint;
159         }
160     }
161     opal_mutex_unlock(&receiver->all_endpoints_lock);
162 
163     /* Didn't find it */
164     return NULL;
165 }
166 
167 /*
168  * Create an opal_btl_usnic_proc_t and initialize it with modex info
169  * and an empty array of endpoints.
170  *
171  * Returns OPAL_ERR_UNREACH if we can't reach the peer (i.e., we can't
172  * find their modex data).
173  */
create_proc(opal_proc_t * opal_proc,opal_btl_usnic_proc_t ** usnic_proc)174 static int create_proc(opal_proc_t *opal_proc,
175                        opal_btl_usnic_proc_t **usnic_proc)
176 {
177     opal_btl_usnic_proc_t *proc = NULL;
178     size_t size;
179     int rc;
180 
181     *usnic_proc = NULL;
182 
183     /* Create the proc if it doesn't already exist */
184     proc = OBJ_NEW(opal_btl_usnic_proc_t);
185     if (NULL == proc) {
186         return OPAL_ERR_OUT_OF_RESOURCE;
187     }
188 
189     /* Initialize number of peers */
190     proc->proc_endpoint_count = 0;
191     proc->proc_opal = opal_proc;
192 
193     /* query for the peer address info */
194     usnic_compat_modex_recv(&rc, &mca_btl_usnic_component.super.btl_version,
195                             opal_proc, &proc->proc_modex, &size);
196 
197     /* If this proc simply doesn't have this key, then they're not
198        running the usnic BTL -- just ignore them.  Otherwise, show an
199        error message. */
200     if (OPAL_ERR_NOT_FOUND == rc) {
201         OBJ_RELEASE(proc);
202         return OPAL_ERR_UNREACH;
203     } else if (OPAL_SUCCESS != rc) {
204         opal_show_help("help-mpi-btl-usnic.txt",
205                        "internal error during init",
206                        true,
207                        opal_process_info.nodename,
208                        "<none>", "<none>",
209                        "opal_modex_recv() failed", __FILE__, __LINE__,
210                        opal_strerror(rc));
211         OBJ_RELEASE(proc);
212         return OPAL_ERROR;
213     }
214 
215     if ((size % sizeof(opal_btl_usnic_modex_t)) != 0) {
216         char msg[1024];
217 
218         snprintf(msg, sizeof(msg),
219                  "sizeof(modex for peer %s data) == %d, expected multiple of %d",
220                  usnic_compat_proc_name_print(&opal_proc->proc_name),
221                  (int) size, (int) sizeof(opal_btl_usnic_modex_t));
222         opal_show_help("help-mpi-btl-usnic.txt", "internal error during init",
223                        true,
224                        opal_process_info.nodename,
225                        "<none>", 0,
226                        "invalid modex data", __FILE__, __LINE__,
227                        msg);
228 
229         OBJ_RELEASE(proc);
230         return OPAL_ERR_VALUE_OUT_OF_BOUNDS;
231     }
232 
233     /* See if the peer has the same underlying wire protocol as me.
234        If not, then print an error and ignore this peer. */
235 // RFXXX - things are weird when i force this to fail
236     if (mca_btl_usnic_component.transport_protocol !=
237         proc->proc_modex->protocol) {
238         uint64_t proto;
239         char protostr[32];
240         proto = mca_btl_usnic_component.transport_protocol;
241         memset(protostr, 0, sizeof(protostr));
242         strncpy(protostr, fi_tostr(&proto, FI_TYPE_PROTOCOL),
243                 sizeof(protostr) - 1);
244         proto = proc->proc_modex->protocol;
245         opal_show_help("help-mpi-btl-usnic.txt",
246                        "transport mismatch",
247                        true,
248                        opal_process_info.nodename,
249                        protostr,
250                        "peer",
251                        fi_tostr(&proto, FI_TYPE_PROTOCOL));
252 
253         OBJ_RELEASE(proc);
254         return OPAL_ERR_UNREACH;
255     }
256 
257     proc->proc_modex_count = size / sizeof(opal_btl_usnic_modex_t);
258     if (0 == proc->proc_modex_count) {
259         proc->proc_endpoints = NULL;
260         OBJ_RELEASE(proc);
261         return OPAL_ERR_UNREACH;
262     }
263 
264     proc->proc_modex_claimed = (bool*)
265         calloc(proc->proc_modex_count, sizeof(bool));
266     if (NULL == proc->proc_modex_claimed) {
267         OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
268         OBJ_RELEASE(proc);
269         return OPAL_ERR_OUT_OF_RESOURCE;
270     }
271 
272     proc->proc_endpoints = (mca_btl_base_endpoint_t**)
273         calloc(proc->proc_modex_count, sizeof(mca_btl_base_endpoint_t*));
274     if (NULL == proc->proc_endpoints) {
275         OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
276         OBJ_RELEASE(proc);
277         return OPAL_ERR_OUT_OF_RESOURCE;
278     }
279 
280     *usnic_proc = proc;
281     return OPAL_SUCCESS;
282 }
283 
284 /* Compare the addresses of the local interface corresponding to module and the
285  * remote interface corresponding to proc_modex_addr.  Returns a weight value
286  * (higher values indicate more desirable connections). */
compute_weight(opal_btl_usnic_module_t * module,opal_btl_usnic_modex_t * proc_modex_addr)287 static uint64_t compute_weight(
288     opal_btl_usnic_module_t *module,
289     opal_btl_usnic_modex_t *proc_modex_addr)
290 {
291     char my_ip_string[INET_ADDRSTRLEN], peer_ip_string[INET_ADDRSTRLEN];
292     struct sockaddr_in sin;
293     struct sockaddr_in *sinp;
294     struct fi_usnic_info *uip;
295     uint32_t mynet, peernet;
296     int err;
297     int metric;
298     uint32_t min_link_speed_gbps;
299 
300     uip = &module->usnic_info;
301     sinp = module->fabric_info->src_addr;
302     inet_ntop(AF_INET, &sinp->sin_addr,
303               my_ip_string, sizeof(my_ip_string));
304     inet_ntop(AF_INET, &proc_modex_addr->ipv4_addr,
305               peer_ip_string, sizeof(peer_ip_string));
306 
307     /* Just compare the CIDR-masked IP address to see if they're on
308        the same network.  If so, we're good. */
309     mynet = sinp->sin_addr.s_addr & uip->ui.v1.ui_netmask_be;
310     peernet = proc_modex_addr->ipv4_addr & proc_modex_addr->netmask;
311     opal_output_verbose(5, USNIC_OUT,
312                         "btl:usnic:%s: checking my IP address/subnet (%s/%d) vs. peer (%s/%d): %s",
313                         __func__, my_ip_string,
314                         usnic_netmask_to_cidrlen(uip->ui.v1.ui_netmask_be),
315                         peer_ip_string,
316                         usnic_netmask_to_cidrlen(proc_modex_addr->netmask),
317                         (mynet == peernet ? "match" : "DO NOT match"));
318 
319     min_link_speed_gbps = MIN(module->super.btl_bandwidth,
320                               proc_modex_addr->link_speed_mbps) / 1000;
321 
322     /* Returned metric is:
323      *    0 - same VLAN
324      *    1..MAXINT - relative distance metric
325      *    -1 - unreachable
326      */
327     metric = 0;
328     memset(&sin, 0, sizeof(sin));
329     sin.sin_family = AF_INET;
330     sin.sin_addr.s_addr = proc_modex_addr->ipv4_addr;
331     err = module->usnic_av_ops->get_distance(module->av, &sin, &metric);
332     if (0 != err || (0 == err && -1 == metric)) {
333         return 0; /* no connectivity */
334     }
335     else {
336         /* Format in binary    MSB                             LSB
337          * most sig. 32-bits:  00000000 0000000A BBBBBBBB 00000001
338          * least sig. 32-bits: CCCCCCCC CCCCCCCC CCCCCCCC CCCCCCCC
339          *
340          * A = 1 iff same subnet
341          * B = min link speed (in Gbps) between iface pair
342          * C = metric from routing table
343          *
344          * That is, this prioritizes interfaces in the same subnet first,
345          * followed by having the same link speed.  The extra literal "1" is in
346          * there to help prioritize over any zero-cost links that might
347          * otherwise make their way into the graph.  It is not strictly
348          * necessary and could be eliminated if the extra byte is needed.
349          *
350          * TODO add an MCA parameter to optionally swap the offsets of A and
351          * B, thereby prioritizing link speed over same subnet reachability.
352          */
353         /* FIXME how can we check that the metric is the same before we have
354          * communication with this host?  Mismatched metrics could cause the
355          * remote peer to make a different pairing decision... */
356         if (min_link_speed_gbps > 0xff) {
357             opal_output_verbose(20, USNIC_OUT, "clamping min_link_speed_gbps=%u to 255",
358                                 min_link_speed_gbps);
359             min_link_speed_gbps = 0xff;
360         }
361         return ((uint64_t)(mynet == peernet) << 48) |
362                ((uint64_t)(min_link_speed_gbps & 0xff) << 40) |
363                ((uint64_t)0x1 << 32) |
364                (/*metric=*/0);
365     }
366 }
367 
368 /* Populate the given proc's match table from an array of (u,v) edge pairs.
369  *
370  * (DJG: this unfortunately knows a bit too much about the internals of
371  * "match_modex")
372  */
edge_pairs_to_match_table(opal_btl_usnic_proc_t * proc,bool proc_is_left,int nme,int * me)373 static void edge_pairs_to_match_table(
374     opal_btl_usnic_proc_t *proc,
375     bool proc_is_left,
376     int nme,
377     int *me)
378 {
379     int i;
380     int left, right;
381     int module_idx, proc_idx;
382     int num_modules;
383 
384     num_modules = (int)mca_btl_usnic_component.num_modules;
385 
386     assert(nme >= 0);
387     for (i = 0; i < nme; ++i) {
388         left  = me[2*i+0];
389         right = me[2*i+1];
390 
391         if (proc_is_left) {
392             proc_idx = PROC_INDEX(left);
393             module_idx = MODULE_INDEX(right);
394         } else {
395             module_idx = MODULE_INDEX(left);
396             proc_idx = PROC_INDEX(right);
397         }
398         assert(module_idx >= 0 && module_idx < num_modules);
399         assert(proc_idx >= 0 && proc_idx < (int)proc->proc_modex_count);
400         proc->proc_ep_match_table[module_idx] = proc_idx;
401         proc->proc_match_exists = true;
402     }
403 
404     /* emit match summary for debugging purposes */
405     for (i = 0; i < num_modules; ++i) {
406         if (-1 != proc->proc_ep_match_table[i]) {
407             opal_output_verbose(5, USNIC_OUT,
408                                 "btl:usnic:%s: module[%d] (%p) should claim endpoint[%d] on proc %p",
409                                 __func__, i,
410                                 (void *)mca_btl_usnic_component.usnic_active_modules[i],
411                                 proc->proc_ep_match_table[i], (void *)proc);
412         } else {
413             opal_output_verbose(5, USNIC_OUT,
414                                 "btl:usnic:%s: module[%d] (%p) will NOT claim an endpoint on proc %p",
415                                 __func__, i,
416                                 (void *)mca_btl_usnic_component.usnic_active_modules[i],
417                                 (void *)proc);
418         }
419     }
420 }
421 
422 /**
423  * Constructs an interface graph from all local modules and the given proc's
424  * remote interfaces.  The resulting vertices will always have the module
425  * vertices appear before the proc vertices.
426  */
create_proc_module_graph(opal_btl_usnic_proc_t * proc,bool proc_is_left,opal_bp_graph_t ** g_out)427 static int create_proc_module_graph(
428     opal_btl_usnic_proc_t *proc,
429     bool proc_is_left,
430     opal_bp_graph_t **g_out)
431 {
432     int err;
433     int i, j;
434     int u, v;
435     int num_modules;
436     opal_bp_graph_t *g = NULL;
437 
438     if (NULL == g_out) {
439         return OPAL_ERR_BAD_PARAM;
440     }
441     *g_out = NULL;
442 
443     num_modules = (int)mca_btl_usnic_component.num_modules;
444 
445     /* Construct a bipartite graph with remote interfaces on the one side and
446      * local interfaces (modules) on the other. */
447     err = opal_bp_graph_create(NULL, NULL, &g);
448     if (OPAL_SUCCESS != err) {
449         OPAL_ERROR_LOG(err);
450         goto out;
451     }
452 
453     /* create vertices for each interface (local and remote) */
454     for (i = 0; i < num_modules; ++i) {
455         int idx = -1;
456         err = opal_bp_graph_add_vertex(g,
457 				       mca_btl_usnic_component.usnic_active_modules[i],
458 				       &idx);
459         if (OPAL_SUCCESS != err) {
460             OPAL_ERROR_LOG(err);
461             goto out_free_graph;
462         }
463         assert(idx == MODULE_VERTEX(i));
464     }
465     for (i = 0; i < (int)proc->proc_modex_count; ++i) {
466         int idx = -1;
467         err = opal_bp_graph_add_vertex(g, &proc->proc_modex[i], &idx);
468         if (OPAL_SUCCESS != err) {
469             OPAL_ERROR_LOG(err);
470             goto out_free_graph;
471         }
472         assert(idx == (int)PROC_VERTEX(i));
473     }
474 
475     /* now add edges between interfaces that can communicate */
476     for (i = 0; i < num_modules; ++i) {
477         for (j = 0; j < (int)proc->proc_modex_count; ++j) {
478             int64_t weight, cost;
479 
480             /* assumption: compute_weight returns the same weight on the
481              * remote process with these arguments (effectively) transposed */
482             weight = compute_weight(mca_btl_usnic_component.usnic_active_modules[i],
483                                     &proc->proc_modex[j]);
484 
485             opal_output_verbose(20, USNIC_OUT,
486                                 "btl:usnic:%s: weight=0x%016" PRIx64 " for edge module[%d] (%p) <--> endpoint[%d] on proc %p",
487                                 __func__,
488                                 weight, i,
489                                 (void *)mca_btl_usnic_component.usnic_active_modules[i],
490                                 j, (void *)proc);
491 
492             if (WEIGHT_UNREACHABLE == weight) {
493                 continue;
494             } else {
495                 /* the graph code optimizes for minimum *cost*, but we have
496                  * been computing weights (negative costs) */
497                 cost = -weight;
498             }
499             assert(INT64_MAX != cost);
500             assert(INT64_MIN != cost);
501 
502             if (proc_is_left) {
503                 u = PROC_VERTEX(j);
504                 v = MODULE_VERTEX(i);
505             } else {
506                 u = MODULE_VERTEX(i);
507                 v = PROC_VERTEX(j);
508             }
509             opal_output_verbose(20, USNIC_OUT,
510                                 "btl:usnic:%s: adding edge (%d,%d) with cost=%" PRIi64 " for edge module[%d] <--> endpoint[%d]",
511                                 __func__, u, v, cost, i, j);
512             err = opal_bp_graph_add_edge(g, u, v, cost,
513 					 /*capacity=*/1,
514 					 /*e_data=*/NULL);
515             if (OPAL_SUCCESS != err) {
516                 OPAL_ERROR_LOG(err);
517                 goto out_free_graph;
518             }
519         }
520     }
521 
522     *g_out = g;
523     return OPAL_SUCCESS;
524 
525 out_free_graph:
526     opal_bp_graph_free(g);
527 out:
528     return err;
529 }
530 
531 /*
532  * For a specific module, see if this proc has matching address/modex
533  * info.  If so, create an endpoint and return it.
534  *
535  * Implementation note: This code relies on the order of modules on a local
536  * side matching the order of the modex entries that we send around, otherwise
537  * both sides may not agree on a bidirectional connection.  It also assumes
538  * that add_procs will be invoked on the local modules in that same order, for
539  * the same reason.  If those assumptions do not hold, we will need to
540  * canonicalize this match ordering somehow, probably by (jobid,vpid) pair or
541  * by the interface MAC or IP address.
542  */
match_modex(opal_btl_usnic_module_t * module,opal_btl_usnic_proc_t * proc,int * index_out)543 static int match_modex(opal_btl_usnic_module_t *module,
544                        opal_btl_usnic_proc_t *proc,
545                        int *index_out)
546 {
547     int err = OPAL_SUCCESS;
548     size_t i;
549     uint32_t num_modules;
550     opal_bp_graph_t *g = NULL;
551     bool proc_is_left;
552 
553     if (NULL == index_out) {
554         return OPAL_ERR_BAD_PARAM;
555     }
556     *index_out = -1;
557 
558     num_modules = mca_btl_usnic_component.num_modules;
559 
560     opal_output_verbose(20, USNIC_OUT, "btl:usnic:%s: module=%p proc=%p with dimensions %d x %d",
561                         __func__, (void *)module, (void *)proc,
562                         num_modules, (int)proc->proc_modex_count);
563 
564     /* We compute an interface match-up table once for each (module,proc) pair
565      * and cache it in the proc.  Store per-proc instead of per-module, since
566      * MPI dynamic process routines can add procs but not new modules. */
567     if (NULL == proc->proc_ep_match_table) {
568         proc->proc_ep_match_table = malloc(num_modules *
569                                        sizeof(*proc->proc_ep_match_table));
570         if (NULL == proc->proc_ep_match_table) {
571             OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
572             return OPAL_ERR_OUT_OF_RESOURCE;
573         }
574 
575         /* initialize to "no matches" */
576         for (i = 0; i < num_modules; ++i) {
577             proc->proc_ep_match_table[i] = -1;
578         }
579 
580         /* For graphs where all edges are equal (and even for some other
581          * graphs), two peers making matching calculations with "mirror image"
582          * graphs might not end up with the same matching.  Ensure that both
583          * sides are always setting up the exact same graph by always putting
584          * the process with the lower (jobid,vpid) on the "left".
585          */
586 #if 0
587         proc_is_left = (proc->proc_opal->proc_name <
588                         opal_proc_local_get()->proc_name);
589 #else
590         proc_is_left =
591             usnic_compat_proc_name_compare(proc->proc_opal->proc_name,
592                                            opal_proc_local_get()->proc_name);
593 #endif
594 
595         err = create_proc_module_graph(proc, proc_is_left, &g);
596         if (OPAL_SUCCESS != err) {
597             goto out_free_table;
598         }
599 
600         int nme = 0;
601         int *me = NULL;
602         err = opal_bp_graph_solve_bipartite_assignment(g, &nme, &me);
603         if (OPAL_SUCCESS != err) {
604             OPAL_ERROR_LOG(err);
605             goto out_free_graph;
606         }
607 
608         edge_pairs_to_match_table(proc, proc_is_left, nme, me);
609         free(me);
610 
611         err = opal_bp_graph_free(g);
612         if (OPAL_SUCCESS != err) {
613             OPAL_ERROR_LOG(err);
614             return err;
615         }
616     }
617 
618 
619     if (!proc->proc_match_exists) {
620         opal_output_verbose(5, USNIC_OUT, "btl:usnic:%s: unable to find any valid interface pairs for proc %s",
621                             __func__,
622                             usnic_compat_proc_name_print(&proc->proc_opal->proc_name));
623         return OPAL_ERR_NOT_FOUND;
624     }
625 
626     /* assuming no strange failure cases, this should always be present */
627     if (NULL != proc->proc_ep_match_table && proc->proc_match_exists) {
628         for (i = 0; i < num_modules; ++i) {
629             if (module == mca_btl_usnic_component.usnic_active_modules[i]) {
630                 *index_out = proc->proc_ep_match_table[i];
631                 break;
632             }
633         }
634     }
635 
636     /* If MTU does not match, throw an error */
637     /* TODO with UDP, do we still want to enforce this restriction or just take
638      * the min of the two MTUs?  Another choice is to disqualify this pairing
639      * before running the matching algorithm on it. */
640     if (*index_out >= 0 &&
641         proc->proc_modex[*index_out].max_msg_size !=
642         (uint16_t) module->fabric_info->ep_attr->max_msg_size) {
643         opal_show_help("help-mpi-btl-usnic.txt", "MTU mismatch",
644                        true,
645                        opal_process_info.nodename,
646                        module->linux_device_name,
647                        module->fabric_info->ep_attr->max_msg_size,
648                        (NULL == proc->proc_opal->proc_hostname) ?
649                        "unknown" : proc->proc_opal->proc_hostname,
650                        proc->proc_modex[*index_out].max_msg_size);
651         *index_out = -1;
652         return OPAL_ERR_UNREACH;
653     }
654 
655     return (*index_out == -1 ? OPAL_ERR_NOT_FOUND : OPAL_SUCCESS);
656 
657 out_free_graph:
658     opal_bp_graph_free(g);
659 out_free_table:
660     free(proc->proc_ep_match_table);
661     proc->proc_ep_match_table = NULL;
662     proc->proc_match_exists = false;
663     return err;
664 }
665 
666 /*
667  * Initiate the process to create a USD dest.
668  * It will be polled for completion later.
669  */
start_av_insert(opal_btl_usnic_module_t * module,opal_btl_usnic_endpoint_t * endpoint,int channel)670 static int start_av_insert(opal_btl_usnic_module_t *module,
671                                   opal_btl_usnic_endpoint_t *endpoint,
672                                   int channel)
673 {
674     int ret;
675     opal_btl_usnic_modex_t *modex = &endpoint->endpoint_remote_modex;
676     opal_btl_usnic_addr_context_t *context;
677     struct sockaddr_in sin;
678 
679     context = calloc(1, sizeof(*context));
680     context->endpoint = endpoint;
681     context->channel_id = channel;
682 
683     char str[IPV4STRADDRLEN];
684     opal_btl_usnic_snprintf_ipv4_addr(str, sizeof(str), modex->ipv4_addr,
685                                       modex->netmask);
686     opal_output_verbose(5, USNIC_OUT,
687                         "btl:usnic:start_av_insert: to channel %d at %s:%d",
688                         channel, str, modex->ports[channel]);
689 
690     /* build remote address */
691     memset(&sin, 0, sizeof(sin));
692     sin.sin_family = AF_INET;
693     sin.sin_port = htons(modex->ports[channel]);
694     sin.sin_addr.s_addr = modex->ipv4_addr;
695 
696     ret = fi_av_insert(module->av, &sin, 1,
697             &endpoint->endpoint_remote_addrs[channel], 0, context);
698     /* Did an error occur? */
699     if (0 != ret) {
700         opal_show_help("help-mpi-btl-usnic.txt", "libfabric API failed",
701                        true,
702                        opal_process_info.nodename,
703                        module->linux_device_name,
704                        "fi_av_insert()", __FILE__, __LINE__,
705                        ret,
706                        "Failed to initiate AV insert");
707         free(context);
708         return OPAL_ERROR;
709     }
710 
711     return OPAL_SUCCESS;
712 }
713 
714 /*
715  * Create an endpoint and claim the matched modex slot
716  */
717 int
opal_btl_usnic_create_endpoint(opal_btl_usnic_module_t * module,opal_btl_usnic_proc_t * proc,opal_btl_usnic_endpoint_t ** endpoint_o)718 opal_btl_usnic_create_endpoint(opal_btl_usnic_module_t *module,
719                 opal_btl_usnic_proc_t *proc,
720                 opal_btl_usnic_endpoint_t **endpoint_o)
721 {
722     int rc;
723     int modex_index;
724     opal_btl_usnic_endpoint_t *endpoint;
725 
726     /* look for matching modex info */
727     rc = match_modex(module, proc, &modex_index);
728     if (OPAL_SUCCESS != rc) {
729         opal_output_verbose(5, USNIC_OUT,
730                             "btl:usnic:create_endpoint: did not match usnic modex info for peer %s",
731                             usnic_compat_proc_name_print(&proc->proc_opal->proc_name));
732         return rc;
733     }
734 
735     endpoint = OBJ_NEW(opal_btl_usnic_endpoint_t);
736     if (NULL == endpoint) {
737         return OPAL_ERR_OUT_OF_RESOURCE;
738     }
739 
740     /* Initalize the endpoint */
741     endpoint->endpoint_module = module;
742     assert(modex_index >= 0 && modex_index < (int)proc->proc_modex_count);
743     endpoint->endpoint_remote_modex = proc->proc_modex[modex_index];
744     endpoint->endpoint_send_credits = module->sd_num;
745 
746     /* Start creating destinations; one for each channel.  These
747        progress in the background.a */
748     for (int i = 0; i < USNIC_NUM_CHANNELS; ++i)  {
749         rc = start_av_insert(module, endpoint, i);
750         if (OPAL_SUCCESS != rc) {
751             OBJ_RELEASE(endpoint);
752             return rc;
753         }
754     }
755 
756     /* Initialize endpoint sequence number info */
757     endpoint->endpoint_next_seq_to_send = module->local_modex.isn;
758     endpoint->endpoint_ack_seq_rcvd = endpoint->endpoint_next_seq_to_send - 1;
759     endpoint->endpoint_next_contig_seq_to_recv =
760         endpoint->endpoint_remote_modex.isn;
761     endpoint->endpoint_highest_seq_rcvd =
762         endpoint->endpoint_next_contig_seq_to_recv - 1;
763     endpoint->endpoint_rfstart = WINDOW_SIZE_MOD(endpoint->endpoint_next_contig_seq_to_recv);
764 
765     /* Now claim that modex slot */
766     proc->proc_modex_claimed[modex_index] = true;
767     MSGDEBUG1_OUT("create_endpoint: module=%p claimed endpoint=%p on proc=%p (hash=0x%" PRIx64 ")\n",
768                   (void *)module, (void *)endpoint, (void *)proc,
769                   proc->proc_opal->proc_name);
770 
771     /* Save the endpoint on this proc's array of endpoints */
772     proc->proc_endpoints[proc->proc_endpoint_count] = endpoint;
773     endpoint->endpoint_proc_index = proc->proc_endpoint_count;
774     endpoint->endpoint_proc = proc;
775     ++proc->proc_endpoint_count;
776     OBJ_RETAIN(proc);
777 
778     /* also add endpoint to module's list of endpoints (done here and
779        not in the endpoint constructor because we aren't able to pass
780        the module as a constructor argument -- doh!). */
781     opal_mutex_lock(&module->all_endpoints_lock);
782     opal_list_append(&(module->all_endpoints),
783             &(endpoint->endpoint_endpoint_li));
784     endpoint->endpoint_on_all_endpoints = true;
785     opal_mutex_unlock(&module->all_endpoints_lock);
786 
787     *endpoint_o = endpoint;
788     return OPAL_SUCCESS;
789 }
790 
791 /*
792  * If we haven't done so already, receive the modex info for the
793  * specified opal_proc.  Search that proc's modex info; if we can find
794  * matching address info, then create an endpoint.
795  *
796  * If we don't find a match, it's not an error: just return "not
797  * found".
798  *
799  * This routine transfers ownership of an object reference to the caller, who
800  * is eventually responsible for transferring or releasing that reference.
801  *
802  * There is a one-to-one correspondence between a opal_proc_t and a
803  * opal_btl_usnic_proc_t instance.  We cache additional data on the
804  * opal_btl_usnic_proc_t: specifically, the list of
805  * opal_btl_usnic_endpoint_t instances, and published addresses/modex
806  * info.
807  */
opal_btl_usnic_proc_match(opal_proc_t * opal_proc,opal_btl_usnic_module_t * module,opal_btl_usnic_proc_t ** proc)808 int opal_btl_usnic_proc_match(opal_proc_t *opal_proc,
809                               opal_btl_usnic_module_t *module,
810                               opal_btl_usnic_proc_t **proc)
811 {
812     /* Check if we have already created a proc structure for this peer
813        ompi process */
814     *proc = opal_btl_usnic_proc_lookup_ompi(opal_proc);
815     if (*proc != NULL) {
816         OBJ_RETAIN(*proc);
817         return OPAL_SUCCESS;
818     } else {
819         /* If not, go make one */
820         return create_proc(opal_proc, proc);
821     }
822 }
823