1 /*
2 * Copyright (c) 2009 Simula Research Laboratory. All rights reserved.
3 * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
4 * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved.
5 * Copyright (c) 2002-2011 Mellanox Technologies LTD. All rights reserved.
6 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
7 *
8 * This software is available to you under a choice of one of two
9 * licenses. You may choose to be licensed under the terms of the GNU
10 * General Public License (GPL) Version 2, available from the file
11 * COPYING in the main directory of this source tree, or the
12 * OpenIB.org BSD license below:
13 *
14 * Redistribution and use in source and binary forms, with or
15 * without modification, are permitted provided that the following
16 * conditions are met:
17 *
18 * - Redistributions of source code must retain the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer.
21 *
22 * - Redistributions in binary form must reproduce the above
23 * copyright notice, this list of conditions and the following
24 * disclaimer in the documentation and/or other materials
25 * provided with the distribution.
26 *
27 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
28 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
29 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
30 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
31 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
32 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
33 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
34 * SOFTWARE.
35 *
36 */
37
38 /*
39 * Abstract:
40 * Implementation of OpenSM FatTree routing
41 */
42
43 #if HAVE_CONFIG_H
44 # include <config.h>
45 #endif
46
47 #include <stdlib.h>
48 #include <string.h>
49 #include <ctype.h>
50 #include <errno.h>
51 #include <iba/ib_types.h>
52 #include <complib/cl_qmap.h>
53 #include <complib/cl_debug.h>
54 #include <opensm/osm_file_ids.h>
55 #define FILE_ID OSM_FILE_UCAST_FTREE_C
56 #include <opensm/osm_opensm.h>
57 #include <opensm/osm_switch.h>
58
/*
 * FatTree rank is bounded between 2 and 8:
 *  - A tree of rank 1 has only trivial routing paths,
 *    so there is no need to use FatTree routing.
 *  - Why the maximum rank is 8:
 *    Each node (switch) is assigned a unique tuple.
 *    Switches are stored in two cl_qmaps - one is
 *    ordered by guid, and the other by a key that is
 *    generated from the tuple. Since cl_qmap supports only
 *    a 64-bit key, the maximal tuple length is 8 bytes,
 *    which means that the maximal tree rank is 8.
 *    Note that the above also implies that each switch
 *    can have at most 255 up/down ports.
 */
73
/* Lowest tree rank handled by FatTree routing. */
#define FAT_TREE_MIN_RANK 2
/* Highest tree rank handled by FatTree routing. */
#define FAT_TREE_MAX_RANK 8
76
/*
 * Direction of a port group as seen from the node that owns it.
 * FTREE_DIRECTION_SAME is the value used for sibling port groups.
 */
typedef enum {
	FTREE_DIRECTION_DOWN = -1,
	FTREE_DIRECTION_SAME = 0,
	FTREE_DIRECTION_UP = 1
} ftree_direction_t;
82
83 /***************************************************
84 **
85 ** Forward references
86 **
87 ***************************************************/
/* Incomplete struct tags, so the definitions below can refer to each other */
struct ftree_fabric_t_;
struct ftree_port_group_t_;
struct ftree_port_t_;
struct ftree_hca_t_;
struct ftree_sw_t_;
93
94 /***************************************************
95 **
96 ** ftree_tuple_t definition
97 **
98 ***************************************************/
99
/* Size in bytes of each text buffer used to render a tuple as a string */
#define FTREE_TUPLE_BUFF_LEN 1024
/* Number of bytes in a tuple */
#define FTREE_TUPLE_LEN 8

/* Switch tuple: FTREE_TUPLE_LEN bytes; 0xFF marks a position that is not set */
typedef uint8_t ftree_tuple_t[FTREE_TUPLE_LEN];
/* 64-bit map key holding the raw bytes of a tuple */
typedef uint64_t ftree_tuple_key_t;
105
106 /***************************************************
107 **
108 ** ftree_sw_table_element_t definition
109 **
110 ***************************************************/
111
112 typedef struct {
113 cl_map_item_t map_item;
114 struct ftree_sw_t_ *p_sw;
115 } ftree_sw_tbl_element_t;
116
117 /***************************************************
118 **
119 ** ftree_port_t definition
120 **
121 ***************************************************/
122
123 typedef struct ftree_port_t_ {
124 cl_map_item_t map_item;
125 uint8_t port_num; /* port number on the current node */
126 uint8_t remote_port_num; /* port number on the remote node */
127 uint32_t counter_up; /* number of allocated routes upwards */
128 uint32_t counter_down; /* number of allocated routes downwards */
129 } ftree_port_t;
130
131 /***************************************************
132 **
133 ** ftree_port_group_t definition
134 **
135 ***************************************************/
136
/*
 * Pointer to either an HCA or a switch.  Which member is valid is
 * decided by the node type stored next to the union.
 */
typedef union ftree_hca_or_sw_ {
	/* valid when the node is a CA */
	struct ftree_hca_t_ *p_hca;
	/* valid when the node is a switch */
	struct ftree_sw_t_ *p_sw;
} ftree_hca_or_sw;
141
142 typedef struct ftree_port_group_t_ {
143 cl_map_item_t map_item;
144 uint16_t lid; /* lid of the current node */
145 uint16_t remote_lid; /* lid of the remote node */
146 ib_net64_t port_guid; /* port guid of this port */
147 ib_net64_t node_guid; /* this node's guid */
148 uint8_t node_type; /* this node's type */
149 ib_net64_t remote_port_guid; /* port guid of the remote port */
150 ib_net64_t remote_node_guid; /* node guid of the remote node */
151 uint8_t remote_node_type; /* IB_NODE_TYPE_{CA,SWITCH,ROUTER,...} */
152 ftree_hca_or_sw hca_or_sw; /* pointer to this hca/switch */
153 ftree_hca_or_sw remote_hca_or_sw; /* pointer to remote hca/switch */
154 cl_ptr_vector_t ports; /* vector of ports to the same lid */
155 boolean_t is_cn; /* whether this port is a compute node */
156 boolean_t is_io; /* whether this port is an I/O node */
157 uint32_t counter_down; /* number of allocated routes downwards */
158 uint32_t counter_up; /* number of allocated routes upwards */
159 } ftree_port_group_t;
160
161 /***************************************************
162 **
163 ** ftree_sw_t definition
164 **
165 ***************************************************/
166
167 typedef struct ftree_sw_t_ {
168 cl_map_item_t map_item;
169 osm_switch_t *p_osm_sw;
170 uint32_t rank;
171 ftree_tuple_t tuple;
172 uint16_t lid;
173 ftree_port_group_t **down_port_groups;
174 uint8_t down_port_groups_num;
175 ftree_port_group_t **sibling_port_groups;
176 uint8_t sibling_port_groups_num;
177 ftree_port_group_t **up_port_groups;
178 uint8_t up_port_groups_num;
179 boolean_t is_leaf;
180 unsigned down_port_groups_idx;
181 uint8_t *hops;
182 uint32_t min_counter_down;
183 boolean_t counter_up_changed;
184 } ftree_sw_t;
185
186 /***************************************************
187 **
188 ** ftree_hca_t definition
189 **
190 ***************************************************/
191
192 typedef struct ftree_hca_t_ {
193 cl_map_item_t map_item;
194 osm_node_t *p_osm_node;
195 ftree_port_group_t **up_port_groups;
196 uint8_t *disconnected_ports;
197 uint16_t up_port_groups_num;
198 unsigned cn_num;
199 } ftree_hca_t;
200
201 /***************************************************
202 **
203 ** ftree_fabric_t definition
204 **
205 ***************************************************/
206
207 typedef struct ftree_fabric_t_ {
208 osm_opensm_t *p_osm;
209 osm_subn_t *p_subn;
210 cl_qmap_t hca_tbl;
211 cl_qmap_t sw_tbl;
212 cl_qmap_t sw_by_tuple_tbl;
213 cl_qmap_t cn_guid_tbl;
214 cl_qmap_t io_guid_tbl;
215 unsigned cn_num;
216 unsigned ca_ports;
217 uint8_t leaf_switch_rank;
218 uint8_t max_switch_rank;
219 ftree_sw_t **leaf_switches;
220 uint32_t leaf_switches_num;
221 uint16_t max_cn_per_leaf;
222 uint16_t lft_max_lid;
223 boolean_t fabric_built;
224 } ftree_fabric_t;
225
ftree_get_subnet(IN ftree_fabric_t * p_ftree)226 static inline osm_subn_t *ftree_get_subnet(IN ftree_fabric_t * p_ftree)
227 {
228 return p_ftree->p_subn;
229 }
230
231 /***************************************************
232 **
233 ** comparators
234 **
235 ***************************************************/
236
/*
 * Comparator over pointers to ftree_sw_t pointers.
 * Compares the two switch tuples byte by byte, starting at position 0,
 * and returns 1, -1 or 0 for greater, smaller or equal.
 */
static int compare_switches_by_index(IN const void *p1, IN const void *p2)
{
	ftree_sw_t *p_lhs = *(ftree_sw_t **) p1;
	ftree_sw_t *p_rhs = *(ftree_sw_t **) p2;
	uint16_t pos = 0;

	while (pos < FTREE_TUPLE_LEN) {
		uint8_t lhs_byte = p_lhs->tuple[pos];
		uint8_t rhs_byte = p_rhs->tuple[pos];

		if (lhs_byte != rhs_byte)
			return (lhs_byte > rhs_byte) ? 1 : -1;
		pos++;
	}
	return 0;
}
251
252 /***************************************************/
253
254 static int
compare_port_groups_by_remote_switch_index(IN const void * p1,IN const void * p2)255 compare_port_groups_by_remote_switch_index(IN const void *p1, IN const void *p2)
256 {
257 ftree_port_group_t **pp_g1 = (ftree_port_group_t **) p1;
258 ftree_port_group_t **pp_g2 = (ftree_port_group_t **) p2;
259
260 return
261 compare_switches_by_index(&((*pp_g1)->remote_hca_or_sw.p_sw),
262 &((*pp_g2)->remote_hca_or_sw.p_sw));
263 }
264
265 /***************************************************
266 **
267 ** ftree_tuple_t functions
268 **
269 ***************************************************/
270
/* Put a tuple into the "not assigned" state: every byte becomes 0xFF. */
static void tuple_init(IN ftree_tuple_t tuple)
{
	unsigned pos;

	for (pos = 0; pos < FTREE_TUPLE_LEN; pos++)
		tuple[pos] = 0xFF;
}
275
276 /***************************************************/
277
tuple_assigned(IN ftree_tuple_t tuple)278 static inline boolean_t tuple_assigned(IN ftree_tuple_t tuple)
279 {
280 return (tuple[0] != 0xFF);
281 }
282
283 /***************************************************/
284
285 #define FTREE_TUPLE_BUFFERS_NUM 6
286
tuple_to_str(IN ftree_tuple_t tuple)287 static const char *tuple_to_str(IN ftree_tuple_t tuple)
288 {
289 static char buffer[FTREE_TUPLE_BUFFERS_NUM][FTREE_TUPLE_BUFF_LEN];
290 static uint8_t ind = 0;
291 char *ret_buffer;
292 uint32_t i;
293
294 if (!tuple_assigned(tuple))
295 return "INDEX.NOT.ASSIGNED";
296
297 buffer[ind][0] = '\0';
298
299 for (i = 0; (i < FTREE_TUPLE_LEN) && (tuple[i] != 0xFF); i++) {
300 if ((strlen(buffer[ind]) + 10) > FTREE_TUPLE_BUFF_LEN)
301 return "INDEX.TOO.LONG";
302 if (i != 0)
303 strcat(buffer[ind], ".");
304 sprintf(&buffer[ind][strlen(buffer[ind])], "%u", tuple[i]);
305 }
306
307 ret_buffer = buffer[ind];
308 ind = (ind + 1) % FTREE_TUPLE_BUFFERS_NUM;
309 return ret_buffer;
310 } /* tuple_to_str() */
311
312 /***************************************************/
313
tuple_to_key(IN ftree_tuple_t tuple)314 static inline ftree_tuple_key_t tuple_to_key(IN ftree_tuple_t tuple)
315 {
316 ftree_tuple_key_t key;
317 memcpy(&key, tuple, FTREE_TUPLE_LEN);
318 return key;
319 }
320
321 /***************************************************/
322
/* Inverse of tuple_to_key(): copy the bytes of the key back into a tuple. */
static inline void tuple_from_key(IN ftree_tuple_t tuple,
				  IN ftree_tuple_key_t key)
{
	const uint8_t *p_bytes = (const uint8_t *)&key;
	unsigned pos;

	for (pos = 0; pos < FTREE_TUPLE_LEN; pos++)
		tuple[pos] = p_bytes[pos];
}
328
329 /***************************************************
330 **
331 ** ftree_sw_tbl_element_t functions
332 **
333 ***************************************************/
334
sw_tbl_element_create(IN ftree_sw_t * p_sw)335 static ftree_sw_tbl_element_t *sw_tbl_element_create(IN ftree_sw_t * p_sw)
336 {
337 ftree_sw_tbl_element_t *p_element =
338 (ftree_sw_tbl_element_t *) malloc(sizeof(ftree_sw_tbl_element_t));
339 if (!p_element)
340 return NULL;
341 memset(p_element, 0, sizeof(ftree_sw_tbl_element_t));
342
343 p_element->p_sw = p_sw;
344 return p_element;
345 }
346
347 /***************************************************/
348
/*
 * Release a table element.  Only the element itself is freed;
 * the switch it points to is left untouched.
 */
static void sw_tbl_element_destroy(IN ftree_sw_tbl_element_t * p_element)
{
	free(p_element);
}
353
354 /***************************************************
355 **
356 ** ftree_port_t functions
357 **
358 ***************************************************/
359
port_create(IN uint8_t port_num,IN uint8_t remote_port_num)360 static ftree_port_t *port_create(IN uint8_t port_num,
361 IN uint8_t remote_port_num)
362 {
363 ftree_port_t *p_port = (ftree_port_t *) malloc(sizeof(ftree_port_t));
364 if (!p_port)
365 return NULL;
366 memset(p_port, 0, sizeof(ftree_port_t));
367
368 p_port->port_num = port_num;
369 p_port->remote_port_num = remote_port_num;
370
371 return p_port;
372 }
373
374 /***************************************************/
375
/* Release a port record obtained from port_create(). */
static void port_destroy(IN ftree_port_t * p_port)
{
	free(p_port);
}
380
381 /***************************************************
382 **
383 ** ftree_port_group_t functions
384 **
385 ***************************************************/
386
port_group_create(IN uint16_t lid,IN uint16_t remote_lid,IN ib_net64_t port_guid,IN ib_net64_t node_guid,IN uint8_t node_type,IN void * p_hca_or_sw,IN ib_net64_t remote_port_guid,IN ib_net64_t remote_node_guid,IN uint8_t remote_node_type,IN void * p_remote_hca_or_sw,IN boolean_t is_cn,IN boolean_t is_io)387 static ftree_port_group_t *port_group_create(IN uint16_t lid,
388 IN uint16_t remote_lid,
389 IN ib_net64_t port_guid,
390 IN ib_net64_t node_guid,
391 IN uint8_t node_type,
392 IN void *p_hca_or_sw,
393 IN ib_net64_t remote_port_guid,
394 IN ib_net64_t remote_node_guid,
395 IN uint8_t remote_node_type,
396 IN void *p_remote_hca_or_sw,
397 IN boolean_t is_cn,
398 IN boolean_t is_io)
399 {
400 ftree_port_group_t *p_group =
401 (ftree_port_group_t *) malloc(sizeof(ftree_port_group_t));
402 if (p_group == NULL)
403 return NULL;
404 memset(p_group, 0, sizeof(ftree_port_group_t));
405
406 p_group->lid = lid;
407 p_group->remote_lid = remote_lid;
408 memcpy(&p_group->port_guid, &port_guid, sizeof(ib_net64_t));
409 memcpy(&p_group->node_guid, &node_guid, sizeof(ib_net64_t));
410 memcpy(&p_group->remote_port_guid, &remote_port_guid,
411 sizeof(ib_net64_t));
412 memcpy(&p_group->remote_node_guid, &remote_node_guid,
413 sizeof(ib_net64_t));
414
415 p_group->node_type = node_type;
416 switch (node_type) {
417 case IB_NODE_TYPE_CA:
418 p_group->hca_or_sw.p_hca = (ftree_hca_t *) p_hca_or_sw;
419 break;
420 case IB_NODE_TYPE_SWITCH:
421 p_group->hca_or_sw.p_sw = (ftree_sw_t *) p_hca_or_sw;
422 break;
423 default:
424 /* we shouldn't get here - port is created only in hca or switch */
425 CL_ASSERT(0);
426 }
427
428 p_group->remote_node_type = remote_node_type;
429 switch (remote_node_type) {
430 case IB_NODE_TYPE_CA:
431 p_group->remote_hca_or_sw.p_hca =
432 (ftree_hca_t *) p_remote_hca_or_sw;
433 break;
434 case IB_NODE_TYPE_SWITCH:
435 p_group->remote_hca_or_sw.p_sw =
436 (ftree_sw_t *) p_remote_hca_or_sw;
437 break;
438 default:
439 /* we shouldn't get here - port is created only in hca or switch */
440 CL_ASSERT(0);
441 }
442
443 cl_ptr_vector_init(&p_group->ports, 0, /* min size */
444 8); /* grow size */
445 p_group->is_cn = is_cn;
446 p_group->is_io = is_io;
447 return p_group;
448 } /* port_group_create() */
449
450 /***************************************************/
451
/*
 * Free a port group together with every port stored in its vector.
 * A NULL group is accepted and ignored.
 */
static void port_group_destroy(IN ftree_port_group_t * p_group)
{
	ftree_port_t *p_port;
	uint32_t count;
	uint32_t idx = 0;

	if (p_group == NULL)
		return;

	/* release the ports first, the vector storage afterwards */
	count = cl_ptr_vector_get_size(&p_group->ports);
	while (idx < count) {
		cl_status_t status =
		    cl_ptr_vector_at(&p_group->ports, idx, (void *)&p_port);

		if (status == CL_SUCCESS)
			port_destroy(p_port);
		idx++;
	}

	cl_ptr_vector_destroy(&p_group->ports);
	free(p_group);
}				/* port_group_destroy() */
470
471 /***************************************************/
472
/*
 * Write one debug log entry describing a port group: its size, the
 * list of local port numbers, the direction name, and the GUID/LID of
 * both link ends.  Does nothing for a NULL group or when debug logging
 * is not active.
 */
static void port_group_dump(IN ftree_fabric_t * p_ftree,
			    IN ftree_port_group_t * p_group,
			    IN ftree_direction_t direction)
{
	const char *dir_name;
	ftree_port_t *p_port;
	uint32_t count;
	uint32_t idx;
	char *port_list;
	char *p_tail;

	if (!p_group ||
	    !OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
		return;

	count = cl_ptr_vector_get_size(&p_group->ports);

	/* zero-filled scratch buffer for the comma-separated port list */
	port_list = calloc(10, 1024);
	if (!port_list) {
		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB33: "
			"Failed to allocate buffer\n");
		return;
	}

	p_tail = port_list;
	for (idx = 0; idx < count; idx++) {
		cl_ptr_vector_at(&p_group->ports, idx, (void *)&p_port);
		CL_ASSERT(p_port);

		if (idx != 0)
			p_tail += sprintf(p_tail, ", ");
		p_tail += sprintf(p_tail, "%u", p_port->port_num);
	}

	switch (direction) {
	case FTREE_DIRECTION_DOWN:
		dir_name = "DOWN";
		break;
	case FTREE_DIRECTION_SAME:
		dir_name = "SIBLING";
		break;
	default:
		dir_name = "UP";
		break;
	}

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
		" Port Group of size %u, port(s): %s, direction: %s\n"
		" Local <--> Remote GUID (LID):"
		"0x%016" PRIx64 " (0x%04x) <--> 0x%016" PRIx64 " (0x%04x)\n",
		count, port_list, dir_name,
		cl_ntoh64(p_group->port_guid), p_group->lid,
		cl_ntoh64(p_group->remote_port_guid), p_group->remote_lid);

	free(port_list);

}				/* port_group_dump() */
520
521 /***************************************************/
522
/*
 * Add a port to a port group unless a port with the same local port
 * number is already present.
 *
 * The function has no way to report failure.  If the port record cannot
 * be allocated, or cannot be stored in the vector, the group is left
 * unchanged: no NULL entry is ever inserted (other code dereferences
 * the vector entries) and the new record is not leaked.
 */
static void port_group_add_port(IN ftree_port_group_t * p_group,
				IN uint8_t port_num, IN uint8_t remote_port_num)
{
	size_t i;
	size_t size = cl_ptr_vector_get_size(&p_group->ports);
	ftree_port_t *p_port;

	for (i = 0; i < size; i++) {
		cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port);
		if (p_port->port_num == port_num)
			return;
	}

	p_port = port_create(port_num, remote_port_num);
	CL_ASSERT(p_port);
	/* CL_ASSERT may be compiled out, so check explicitly as well */
	if (!p_port)
		return;

	if (cl_ptr_vector_insert(&p_group->ports, p_port, NULL) != CL_SUCCESS)
		port_destroy(p_port);
}
539
540 /***************************************************
541 **
542 ** ftree_sw_t functions
543 **
544 ***************************************************/
545
sw_create(IN osm_switch_t * p_osm_sw)546 static ftree_sw_t *sw_create(IN osm_switch_t * p_osm_sw)
547 {
548 ftree_sw_t *p_sw;
549 uint8_t ports_num;
550
551 /* make sure that the switch has ports */
552 if (p_osm_sw->num_ports == 1)
553 return NULL;
554
555 p_sw = (ftree_sw_t *) malloc(sizeof(ftree_sw_t));
556 if (p_sw == NULL)
557 return NULL;
558 memset(p_sw, 0, sizeof(ftree_sw_t));
559
560 p_sw->p_osm_sw = p_osm_sw;
561 p_sw->rank = 0xFFFFFFFF;
562 tuple_init(p_sw->tuple);
563
564 p_sw->lid =
565 cl_ntoh16(osm_node_get_base_lid(p_sw->p_osm_sw->p_node, 0));
566
567 ports_num = osm_node_get_num_physp(p_sw->p_osm_sw->p_node);
568 p_sw->down_port_groups =
569 (ftree_port_group_t **) malloc(ports_num *
570 sizeof(ftree_port_group_t *));
571 if (p_sw->down_port_groups == NULL)
572 goto FREE_P_SW;
573 memset(p_sw->down_port_groups, 0, ports_num * sizeof(ftree_port_group_t *));
574
575 p_sw->up_port_groups =
576 (ftree_port_group_t **) malloc(ports_num *
577 sizeof(ftree_port_group_t *));
578 if (p_sw->up_port_groups == NULL)
579 goto FREE_DOWN;
580 memset(p_sw->up_port_groups, 0, ports_num * sizeof(ftree_port_group_t *));
581
582 p_sw->sibling_port_groups =
583 (ftree_port_group_t **) malloc(ports_num *
584 sizeof(ftree_port_group_t *));
585 if (p_sw->sibling_port_groups == NULL)
586 goto FREE_UP;
587 memset(p_sw->sibling_port_groups, 0, ports_num * sizeof(ftree_port_group_t *));
588
589 /* initialize lft buffer */
590 memset(p_osm_sw->new_lft, OSM_NO_PATH, p_osm_sw->lft_size);
591 p_sw->hops = malloc((p_osm_sw->max_lid_ho + 1) * sizeof(*(p_sw->hops)));
592 if (p_sw->hops == NULL)
593 goto FREE_SIBLING;
594
595 memset(p_sw->hops, OSM_NO_PATH, p_osm_sw->max_lid_ho + 1);
596
597 return p_sw;
598
599 FREE_SIBLING:
600 free(p_sw->sibling_port_groups);
601 FREE_UP:
602 free(p_sw->up_port_groups);
603 FREE_DOWN:
604 free(p_sw->down_port_groups);
605 FREE_P_SW:
606 free(p_sw);
607 return NULL;
608 } /* sw_create() */
609
610 /***************************************************/
611
/*
 * Free a switch wrapper: its hop table, every port group in the down,
 * sibling and up arrays, the arrays themselves, and the structure.
 * A NULL pointer is accepted and ignored.
 */
static void sw_destroy(IN ftree_sw_t * p_sw)
{
	ftree_port_group_t **groups[3];
	uint8_t nums[3];
	unsigned set;
	uint8_t i;

	if (!p_sw)
		return;
	free(p_sw->hops);

	groups[0] = p_sw->down_port_groups;
	nums[0] = p_sw->down_port_groups_num;
	groups[1] = p_sw->sibling_port_groups;
	nums[1] = p_sw->sibling_port_groups_num;
	groups[2] = p_sw->up_port_groups;
	nums[2] = p_sw->up_port_groups_num;

	for (set = 0; set < 3; set++)
		for (i = 0; i < nums[set]; i++)
			port_group_destroy(groups[set][i]);

	for (set = 0; set < 3; set++)
		free(groups[set]);

	free(p_sw);
}				/* sw_destroy() */
632
633 /***************************************************/
634
/*
 * Node GUID of the switch as returned by osm_node_get_node_guid()
 * (the "_no" variant; sw_get_guid_ho() converts it with cl_ntoh64).
 * Returns 0 for a NULL switch.
 */
static uint64_t sw_get_guid_no(IN ftree_sw_t * p_sw)
{
	return p_sw ? osm_node_get_node_guid(p_sw->p_osm_sw->p_node) : 0;
}
641
642 /***************************************************/
643
/* Node GUID of the switch after conversion with cl_ntoh64(). */
static uint64_t sw_get_guid_ho(IN ftree_sw_t * p_sw)
{
	uint64_t guid_no = sw_get_guid_no(p_sw);

	return cl_ntoh64(guid_no);
}
648
649 /***************************************************/
650
/*
 * Debug-log a switch: one summary line followed by a dump of each of
 * its down, sibling and up port groups.  Does nothing for a NULL
 * switch or when debug logging is not active.
 */
static void sw_dump(IN ftree_fabric_t * p_ftree, IN ftree_sw_t * p_sw)
{
	uint32_t idx;

	if (!p_sw ||
	    !OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
		return;

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
		"Switch index: %s, GUID: 0x%016" PRIx64
		", Ports: %u DOWN, %u SIBLINGS, %u UP\n",
		tuple_to_str(p_sw->tuple), sw_get_guid_ho(p_sw),
		p_sw->down_port_groups_num, p_sw->sibling_port_groups_num,
		p_sw->up_port_groups_num);

	idx = 0;
	while (idx < p_sw->down_port_groups_num) {
		port_group_dump(p_ftree, p_sw->down_port_groups[idx],
				FTREE_DIRECTION_DOWN);
		idx++;
	}

	idx = 0;
	while (idx < p_sw->sibling_port_groups_num) {
		port_group_dump(p_ftree, p_sw->sibling_port_groups[idx],
				FTREE_DIRECTION_SAME);
		idx++;
	}

	idx = 0;
	while (idx < p_sw->up_port_groups_num) {
		port_group_dump(p_ftree, p_sw->up_port_groups[idx],
				FTREE_DIRECTION_UP);
		idx++;
	}

}				/* sw_dump() */
679
680 /***************************************************/
681
sw_ranked(IN ftree_sw_t * p_sw)682 static boolean_t sw_ranked(IN ftree_sw_t * p_sw)
683 {
684 return (p_sw->rank != 0xFFFFFFFF);
685 }
686
687 /***************************************************/
688
sw_get_port_group_by_remote_lid(IN ftree_sw_t * p_sw,IN uint16_t remote_lid,IN ftree_direction_t direction)689 static ftree_port_group_t *sw_get_port_group_by_remote_lid(IN ftree_sw_t * p_sw,
690 IN uint16_t
691 remote_lid,
692 IN ftree_direction_t
693 direction)
694 {
695 uint32_t i;
696 uint32_t size;
697 ftree_port_group_t **port_groups;
698
699 if (direction == FTREE_DIRECTION_UP) {
700 port_groups = p_sw->up_port_groups;
701 size = p_sw->up_port_groups_num;
702 } else if (direction == FTREE_DIRECTION_SAME) {
703 port_groups = p_sw->sibling_port_groups;
704 size = p_sw->sibling_port_groups_num;
705 } else {
706 port_groups = p_sw->down_port_groups;
707 size = p_sw->down_port_groups_num;
708 }
709
710 for (i = 0; i < size; i++)
711 if (remote_lid == port_groups[i]->remote_lid)
712 return port_groups[i];
713
714 return NULL;
715 } /* sw_get_port_group_by_remote_lid() */
716
717 /***************************************************/
718
/*
 * Register a switch port with the port group that leads to remote_lid
 * in the given direction, creating and storing that group first if it
 * does not exist yet.  Any direction other than UP or SAME is treated
 * as DOWN.
 *
 * The function has no way to report failure.  If a new group cannot be
 * allocated the switch is left unchanged, so that no NULL entry ends up
 * in (and is counted by) the port-group arrays.
 */
static void sw_add_port(IN ftree_sw_t * p_sw, IN uint8_t port_num,
			IN uint8_t remote_port_num, IN uint16_t lid,
			IN uint16_t remote_lid, IN ib_net64_t port_guid,
			IN ib_net64_t remote_port_guid,
			IN ib_net64_t remote_node_guid,
			IN uint8_t remote_node_type,
			IN void *p_remote_hca_or_sw,
			IN ftree_direction_t direction)
{
	ftree_port_group_t *p_group =
	    sw_get_port_group_by_remote_lid(p_sw, remote_lid, direction);

	if (!p_group) {
		p_group = port_group_create(lid, remote_lid,
					    port_guid, sw_get_guid_no(p_sw),
					    IB_NODE_TYPE_SWITCH, p_sw,
					    remote_port_guid, remote_node_guid,
					    remote_node_type,
					    p_remote_hca_or_sw, FALSE, FALSE);
		CL_ASSERT(p_group);
		/* CL_ASSERT may be compiled out, so check explicitly as well */
		if (!p_group)
			return;

		if (direction == FTREE_DIRECTION_UP) {
			p_sw->up_port_groups[p_sw->up_port_groups_num++] =
			    p_group;
		} else if (direction == FTREE_DIRECTION_SAME) {
			p_sw->
			    sibling_port_groups[p_sw->sibling_port_groups_num++]
			    = p_group;
		} else
			p_sw->down_port_groups[p_sw->down_port_groups_num++] =
			    p_group;
	}
	port_group_add_port(p_group, port_num, remote_port_num);

}				/* sw_add_port() */
754
755 /***************************************************/
756
sw_set_hops(IN ftree_sw_t * p_sw,IN uint16_t lid,IN uint8_t port_num,IN uint8_t hops,IN boolean_t is_target_sw)757 static inline cl_status_t sw_set_hops(IN ftree_sw_t * p_sw, IN uint16_t lid,
758 IN uint8_t port_num, IN uint8_t hops,
759 IN boolean_t is_target_sw)
760 {
761 /* set local min hop table(LID) */
762 p_sw->hops[lid] = hops;
763 if (is_target_sw)
764 return osm_switch_set_hops(p_sw->p_osm_sw, lid, port_num, hops);
765 return 0;
766 }
767
768 /***************************************************/
769
/*
 * Record the hop count towards target_lid on the switch at the remote
 * end of p_group.
 *
 * The remote switch's local hop table is always updated.  When
 * is_target_sw is set, sw_set_hops() is additionally invoked once per
 * port of the group, using the port number on the remote side.
 * Returns 0 on success, -1 as soon as one sw_set_hops() call fails.
 */
static int set_hops_on_remote_sw(IN ftree_port_group_t * p_group,
				 IN uint16_t target_lid, IN uint8_t hops,
				 IN boolean_t is_target_sw)
{
	ftree_port_t *p_port;
	size_t i, ports_num;
	ftree_sw_t *p_remote_sw = p_group->remote_hca_or_sw.p_sw;

	/* the remote end of the group has to be a switch */
	CL_ASSERT(p_group->remote_node_type == IB_NODE_TYPE_SWITCH);
	p_remote_sw->hops[target_lid] = hops;

	/* per-port hop values are only set when the target LID is a switch */
	if (!is_target_sw)
		return 0;

	/* keep the vector size in full width; no narrowing to uint8_t */
	ports_num = cl_ptr_vector_get_size(&p_group->ports);
	for (i = 0; i < ports_num; i++) {
		cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port);
		if (sw_set_hops(p_remote_sw, target_lid,
				p_port->remote_port_num, hops, is_target_sw))
			return -1;
	}
	return 0;
}
796
797 /***************************************************/
798
799 static inline uint8_t
sw_get_least_hops(IN ftree_sw_t * p_sw,IN uint16_t target_lid)800 sw_get_least_hops(IN ftree_sw_t * p_sw, IN uint16_t target_lid)
801 {
802 CL_ASSERT(p_sw->hops != NULL);
803 return p_sw->hops[target_lid];
804 }
805
806 /***************************************************
807 **
808 ** ftree_hca_t functions
809 **
810 ***************************************************/
811
hca_create(IN osm_node_t * p_osm_node)812 static ftree_hca_t *hca_create(IN osm_node_t * p_osm_node)
813 {
814 ftree_hca_t *p_hca = (ftree_hca_t *) malloc(sizeof(ftree_hca_t));
815 if (p_hca == NULL)
816 return NULL;
817 memset(p_hca, 0, sizeof(ftree_hca_t));
818
819 p_hca->p_osm_node = p_osm_node;
820 p_hca->up_port_groups = (ftree_port_group_t **)
821 malloc(osm_node_get_num_physp(p_hca->p_osm_node) *
822 sizeof(ftree_port_group_t *));
823 if (!p_hca->up_port_groups) {
824 free(p_hca);
825 return NULL;
826 }
827 memset(p_hca->up_port_groups, 0, osm_node_get_num_physp(p_hca->p_osm_node) *
828 sizeof(ftree_port_group_t *));
829
830 p_hca->disconnected_ports = (uint8_t *)
831 calloc(osm_node_get_num_physp(p_hca->p_osm_node) + 1, sizeof(uint8_t));
832 if (!p_hca->disconnected_ports) {
833 free(p_hca->up_port_groups);
834 free(p_hca);
835 return NULL;
836 }
837 p_hca->up_port_groups_num = 0;
838 return p_hca;
839 }
840
841 /***************************************************/
842
/*
 * Free an HCA wrapper: each of its up port groups, both arrays, and the
 * structure itself.  A NULL pointer is accepted and ignored.
 */
static void hca_destroy(IN ftree_hca_t * p_hca)
{
	uint32_t idx = 0;

	if (p_hca == NULL)
		return;

	while (idx < p_hca->up_port_groups_num) {
		port_group_destroy(p_hca->up_port_groups[idx]);
		idx++;
	}

	free(p_hca->up_port_groups);
	free(p_hca->disconnected_ports);

	free(p_hca);
}
858
859 /***************************************************/
860
/*
 * Node GUID of the HCA as returned by osm_node_get_node_guid()
 * (the "_no" variant; hca_get_guid_ho() converts it with cl_ntoh64).
 * Returns 0 for a NULL HCA.
 */
static uint64_t hca_get_guid_no(IN ftree_hca_t * p_hca)
{
	return p_hca ? osm_node_get_node_guid(p_hca->p_osm_node) : 0;
}
867
868 /***************************************************/
869
/* Node GUID of the HCA after conversion with cl_ntoh64(). */
static uint64_t hca_get_guid_ho(IN ftree_hca_t * p_hca)
{
	uint64_t guid_no = hca_get_guid_no(p_hca);

	return cl_ntoh64(guid_no);
}
874
875 /***************************************************/
876
/*
 * Debug-log an HCA: one summary line followed by a dump of each of its
 * up port groups.  Does nothing for a NULL HCA or when debug logging
 * is not active.
 */
static void hca_dump(IN ftree_fabric_t * p_ftree, IN ftree_hca_t * p_hca)
{
	uint32_t idx;

	if (!p_hca ||
	    !OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
		return;

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
		"CA GUID: 0x%016" PRIx64 ", Ports: %u UP\n",
		hca_get_guid_ho(p_hca), p_hca->up_port_groups_num);

	idx = 0;
	while (idx < p_hca->up_port_groups_num) {
		port_group_dump(p_ftree, p_hca->up_port_groups[idx],
				FTREE_DIRECTION_UP);
		idx++;
	}
}
895
hca_get_port_group_by_lid(IN ftree_hca_t * p_hca,IN uint16_t lid)896 static ftree_port_group_t *hca_get_port_group_by_lid(IN ftree_hca_t *
897 p_hca,
898 IN uint16_t
899 lid)
900 {
901 uint32_t i;
902 for (i = 0; i < p_hca->up_port_groups_num; i++)
903 if (lid ==
904 p_hca->up_port_groups[i]->lid)
905 return p_hca->up_port_groups[i];
906
907 return NULL;
908 }
909 /***************************************************/
910
/*
 * Add an HCA port that is connected to a switch.
 *
 * A new up port group is created for the port's LID and the port is
 * added to it.  If the HCA already has a group with that LID, nothing
 * is added and error AB32 is logged instead.
 *
 * The function has no way to report failure.  If the group cannot be
 * allocated the HCA is left unchanged, so that no NULL entry ends up in
 * (and is counted by) up_port_groups.
 */
static void hca_add_port(IN ftree_fabric_t * p_ftree,
			 IN ftree_hca_t * p_hca, IN uint8_t port_num,
			 IN uint8_t remote_port_num, IN uint16_t lid,
			 IN uint16_t remote_lid, IN ib_net64_t port_guid,
			 IN ib_net64_t remote_port_guid,
			 IN ib_net64_t remote_node_guid,
			 IN uint8_t remote_node_type,
			 IN void *p_remote_hca_or_sw, IN boolean_t is_cn,
			 IN boolean_t is_io)
{
	ftree_port_group_t *p_group;

	/* this function is supposed to be called only for adding ports
	   in hca's that lead to switches */
	CL_ASSERT(remote_node_type == IB_NODE_TYPE_SWITCH);

	if (hca_get_port_group_by_lid(p_hca, lid)) {
		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
			"ERR AB32: Duplicated LID for CA GUID: 0x%016" PRIx64 "\n",
			cl_ntoh64(port_guid));
		return;
	}

	p_group = port_group_create(lid, remote_lid,
				    port_guid, hca_get_guid_no(p_hca),
				    IB_NODE_TYPE_CA, p_hca,
				    remote_port_guid, remote_node_guid,
				    remote_node_type,
				    p_remote_hca_or_sw, is_cn, is_io);
	CL_ASSERT(p_group);
	/* CL_ASSERT may be compiled out, so check explicitly as well */
	if (!p_group)
		return;

	p_hca->up_port_groups[p_hca->up_port_groups_num++] = p_group;
	port_group_add_port(p_group, port_num, remote_port_num);
}				/* hca_add_port() */
944
945 /***************************************************
946 **
947 ** ftree_fabric_t functions
948 **
949 ***************************************************/
950
fabric_create()951 static ftree_fabric_t *fabric_create()
952 {
953 ftree_fabric_t *p_ftree =
954 (ftree_fabric_t *) malloc(sizeof(ftree_fabric_t));
955 if (p_ftree == NULL)
956 return NULL;
957
958 memset(p_ftree, 0, sizeof(ftree_fabric_t));
959
960 cl_qmap_init(&p_ftree->hca_tbl);
961 cl_qmap_init(&p_ftree->sw_tbl);
962 cl_qmap_init(&p_ftree->sw_by_tuple_tbl);
963 cl_qmap_init(&p_ftree->cn_guid_tbl);
964 cl_qmap_init(&p_ftree->io_guid_tbl);
965
966 return p_ftree;
967 }
968
969 /***************************************************/
970
fabric_clear(ftree_fabric_t * p_ftree)971 static void fabric_clear(ftree_fabric_t * p_ftree)
972 {
973 ftree_hca_t *p_hca;
974 ftree_hca_t *p_next_hca;
975 ftree_sw_t *p_sw;
976 ftree_sw_t *p_next_sw;
977 ftree_sw_tbl_element_t *p_element;
978 ftree_sw_tbl_element_t *p_next_element;
979 name_map_item_t *p_guid_element, *p_next_guid_element;
980
981 if (!p_ftree)
982 return;
983
984 /* remove all the elements of hca_tbl */
985
986 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
987 while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
988 p_hca = p_next_hca;
989 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
990 hca_destroy(p_hca);
991 }
992 cl_qmap_remove_all(&p_ftree->hca_tbl);
993
994 /* remove all the elements of sw_tbl */
995
996 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
997 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
998 p_sw = p_next_sw;
999 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
1000 sw_destroy(p_sw);
1001 }
1002 cl_qmap_remove_all(&p_ftree->sw_tbl);
1003
1004 /* remove all the elements of sw_by_tuple_tbl */
1005
1006 p_next_element =
1007 (ftree_sw_tbl_element_t *) cl_qmap_head(&p_ftree->sw_by_tuple_tbl);
1008 while (p_next_element != (ftree_sw_tbl_element_t *)
1009 cl_qmap_end(&p_ftree->sw_by_tuple_tbl)) {
1010 p_element = p_next_element;
1011 p_next_element = (ftree_sw_tbl_element_t *)
1012 cl_qmap_next(&p_element->map_item);
1013 sw_tbl_element_destroy(p_element);
1014 }
1015 cl_qmap_remove_all(&p_ftree->sw_by_tuple_tbl);
1016
1017 /* remove all the elements of cn_guid_tbl */
1018 p_next_guid_element =
1019 (name_map_item_t *) cl_qmap_head(&p_ftree->cn_guid_tbl);
1020 while (p_next_guid_element !=
1021 (name_map_item_t *) cl_qmap_end(&p_ftree->cn_guid_tbl)) {
1022 p_guid_element = p_next_guid_element;
1023 p_next_guid_element =
1024 (name_map_item_t *) cl_qmap_next(&p_guid_element->item);
1025 free(p_guid_element);
1026 }
1027 cl_qmap_remove_all(&p_ftree->cn_guid_tbl);
1028
1029 /* remove all the elements of io_guid_tbl */
1030 p_next_guid_element =
1031 (name_map_item_t *) cl_qmap_head(&p_ftree->io_guid_tbl);
1032 while (p_next_guid_element !=
1033 (name_map_item_t *) cl_qmap_end(&p_ftree->io_guid_tbl)) {
1034 p_guid_element = p_next_guid_element;
1035 p_next_guid_element =
1036 (name_map_item_t *) cl_qmap_next(&p_guid_element->item);
1037 free(p_guid_element);
1038 }
1039 cl_qmap_remove_all(&p_ftree->io_guid_tbl);
1040
1041 /* free the leaf switches array */
1042 if ((p_ftree->leaf_switches_num > 0) && (p_ftree->leaf_switches))
1043 free(p_ftree->leaf_switches);
1044
1045 p_ftree->leaf_switches_num = 0;
1046 p_ftree->cn_num = 0;
1047 p_ftree->ca_ports = 0;
1048 p_ftree->leaf_switch_rank = 0;
1049 p_ftree->max_switch_rank = 0;
1050 p_ftree->max_cn_per_leaf = 0;
1051 p_ftree->lft_max_lid = 0;
1052 p_ftree->leaf_switches = NULL;
1053 p_ftree->fabric_built = FALSE;
1054
1055 } /* fabric_destroy() */
1056
1057 /***************************************************/
1058
/*
 * Tear down a fabric completely: clear its contents with fabric_clear()
 * and then free the fabric object itself.  A NULL pointer is ignored.
 */
static void fabric_destroy(ftree_fabric_t * p_ftree)
{
	if (p_ftree) {
		fabric_clear(p_ftree);
		free(p_ftree);
	}
}
1066
1067 /***************************************************/
1068
/*
 * Return the rank of the tree: the leaf switch rank plus one
 * (ranks are counted from 0 at the roots).
 */
static uint8_t fabric_get_rank(ftree_fabric_t * p_ftree)
{
	uint8_t tree_rank = p_ftree->leaf_switch_rank + 1;

	return tree_rank;
}
1073
1074 /***************************************************/
1075
/*
 * Wrap an OpenSM CA node in an ftree_hca_t and register it in the
 * fabric's hca_tbl, keyed by the node GUID taken from node_info.
 * Nothing is added when hca_create() returns NULL.
 */
static void fabric_add_hca(ftree_fabric_t * p_ftree, osm_node_t * p_osm_node)
{
	ftree_hca_t *p_new_hca;

	CL_ASSERT(osm_node_get_type(p_osm_node) == IB_NODE_TYPE_CA);

	p_new_hca = hca_create(p_osm_node);
	if (p_new_hca != NULL)
		cl_qmap_insert(&p_ftree->hca_tbl,
			       p_osm_node->node_info.node_guid,
			       &p_new_hca->map_item);
}
1089
1090 /***************************************************/
1091
/*
 * Wrap an OpenSM switch in an ftree_sw_t and register it in the fabric's
 * sw_tbl, keyed by the node GUID.  Also raises lft_max_lid when the new
 * switch has a larger LID.  Nothing is added when sw_create() returns NULL.
 */
static void fabric_add_sw(ftree_fabric_t * p_ftree, osm_switch_t * p_osm_sw)
{
	ftree_sw_t *p_new_sw;

	CL_ASSERT(osm_node_get_type(p_osm_sw->p_node) == IB_NODE_TYPE_SWITCH);

	p_new_sw = sw_create(p_osm_sw);
	if (p_new_sw == NULL)
		return;

	cl_qmap_insert(&p_ftree->sw_tbl,
		       p_osm_sw->p_node->node_info.node_guid,
		       &p_new_sw->map_item);

	/* remember the largest LID (host order) seen in the fabric so far */
	if (p_ftree->lft_max_lid < p_new_sw->lid)
		p_ftree->lft_max_lid = p_new_sw->lid;
}
1109
1110 /***************************************************/
1111
/*
 * Register a switch in the fabric's sw_by_tuple_tbl, keyed by the switch
 * tuple (which must already be assigned).
 *
 * The table element is created with sw_tbl_element_create(); when that
 * returns NULL (presumably on allocation failure - verify against its
 * definition) nothing is inserted, mirroring how fabric_add_hca() and
 * fabric_add_sw() treat a failed create call.
 */
static void fabric_add_sw_by_tuple(IN ftree_fabric_t * p_ftree,
				   IN ftree_sw_t * p_sw)
{
	ftree_sw_tbl_element_t *p_element;

	CL_ASSERT(tuple_assigned(p_sw->tuple));

	p_element = sw_tbl_element_create(p_sw);
	if (!p_element)
		return;	/* do not dereference a failed creation */

	cl_qmap_insert(&p_ftree->sw_by_tuple_tbl, tuple_to_key(p_sw->tuple),
		       &p_element->map_item);
}
1120
1121 /***************************************************/
1122
fabric_get_sw_by_tuple(IN ftree_fabric_t * p_ftree,IN ftree_tuple_t tuple)1123 static ftree_sw_t *fabric_get_sw_by_tuple(IN ftree_fabric_t * p_ftree,
1124 IN ftree_tuple_t tuple)
1125 {
1126 ftree_sw_tbl_element_t *p_element;
1127
1128 CL_ASSERT(tuple_assigned(tuple));
1129
1130 tuple_to_key(tuple);
1131
1132 p_element =
1133 (ftree_sw_tbl_element_t *) cl_qmap_get(&p_ftree->sw_by_tuple_tbl,
1134 tuple_to_key(tuple));
1135 if (p_element ==
1136 (ftree_sw_tbl_element_t *) cl_qmap_end(&p_ftree->sw_by_tuple_tbl))
1137 return NULL;
1138
1139 return p_element->p_sw;
1140 }
1141
1142 /***************************************************/
1143
fabric_get_sw_by_guid(IN ftree_fabric_t * p_ftree,IN uint64_t guid)1144 static ftree_sw_t *fabric_get_sw_by_guid(IN ftree_fabric_t * p_ftree,
1145 IN uint64_t guid)
1146 {
1147 ftree_sw_t *p_sw;
1148 p_sw = (ftree_sw_t *) cl_qmap_get(&p_ftree->sw_tbl, guid);
1149 if (p_sw == (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl))
1150 return NULL;
1151 return p_sw;
1152 }
1153
1154 /***************************************************/
1155
fabric_get_hca_by_guid(IN ftree_fabric_t * p_ftree,IN uint64_t guid)1156 static ftree_hca_t *fabric_get_hca_by_guid(IN ftree_fabric_t * p_ftree,
1157 IN uint64_t guid)
1158 {
1159 ftree_hca_t *p_hca;
1160 p_hca = (ftree_hca_t *) cl_qmap_get(&p_ftree->hca_tbl, guid);
1161 if (p_hca == (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl))
1162 return NULL;
1163 return p_hca;
1164 }
1165
1166 /***************************************************/
1167
/*
 * Write the full fabric topology to the log at DEBUG level: first every
 * CA, then the switches grouped by rank, from rank 0 up to
 * max_switch_rank.  Does nothing when DEBUG logging is not active.
 */
static void fabric_dump(ftree_fabric_t * p_ftree)
{
	uint32_t rank;
	ftree_hca_t *p_cur_hca;
	ftree_sw_t *p_cur_sw;

	if (!OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
		return;

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "\n"
		" |-------------------------------|\n"
		" |- Full fabric topology dump -|\n"
		" |-------------------------------|\n\n");

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "-- CAs:\n");

	p_cur_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
	while (p_cur_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
		hca_dump(p_ftree, p_cur_hca);
		p_cur_hca = (ftree_hca_t *) cl_qmap_next(&p_cur_hca->map_item);
	}

	/* one pass over the switch table per rank keeps the output grouped */
	for (rank = 0; rank <= p_ftree->max_switch_rank; rank++) {
		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
			"-- Rank %u switches\n", rank);

		p_cur_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
		while (p_cur_sw !=
		       (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
			if (p_cur_sw->rank == rank)
				sw_dump(p_ftree, p_cur_sw);
			p_cur_sw =
			    (ftree_sw_t *) cl_qmap_next(&p_cur_sw->map_item);
		}
	}

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "\n"
		" |---------------------------------------|\n"
		" |- Full fabric topology dump completed -|\n"
		" |---------------------------------------|\n\n");
}				/* fabric_dump() */
1206
1207 /***************************************************/
1208
/*
 * Log a summary of the fabric at INFO level: tree rank, maximal switch
 * rank, CA / CA port / CN / switch counts and the number of switches at
 * each rank.  When VERBOSE logging is active, the root switches and the
 * leaf switch array are listed as well.
 */
static void fabric_dump_general_info(IN ftree_fabric_t * p_ftree)
{
	uint32_t rank, sw_at_rank;
	ftree_sw_t *p_cur_sw;

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
		"General fabric topology info\n");
	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
		"============================\n");

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
		" - FatTree rank (roots to leaf switches): %u\n",
		p_ftree->leaf_switch_rank + 1);
	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
		" - FatTree max switch rank: %u\n", p_ftree->max_switch_rank);
	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
		" - Fabric has %u CAs, %u CA ports (%u of them CNs), %u switches\n",
		cl_qmap_count(&p_ftree->hca_tbl), p_ftree->ca_ports,
		p_ftree->cn_num, cl_qmap_count(&p_ftree->sw_tbl));

	CL_ASSERT(p_ftree->ca_ports >= p_ftree->cn_num);

	for (rank = 0; rank <= p_ftree->max_switch_rank; rank++) {
		/* count the switches sitting at this rank */
		sw_at_rank = 0;
		p_cur_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
		while (p_cur_sw !=
		       (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
			if (p_cur_sw->rank == rank)
				sw_at_rank++;
			p_cur_sw =
			    (ftree_sw_t *) cl_qmap_next(&p_cur_sw->map_item);
		}

		/* the root check comes first, so rank 0 is always
		   reported as "roots" */
		if (rank == 0) {
			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
				" - Fabric has %u switches at rank %u (roots)\n",
				sw_at_rank, rank);
		} else if (rank == p_ftree->leaf_switch_rank) {
			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
				" - Fabric has %u switches at rank %u (%u of them leafs)\n",
				sw_at_rank, rank, p_ftree->leaf_switches_num);
		} else {
			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
				" - Fabric has %u switches at rank %u\n",
				sw_at_rank, rank);
		}
	}

	/* the detailed listings below are produced for VERBOSE only */
	if (!OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_VERBOSE))
		return;

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, " - Root switches:\n");
	p_cur_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
	while (p_cur_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
		if (p_cur_sw->rank == 0) {
			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
				" GUID: 0x%016" PRIx64
				", LID: %u, Index %s\n",
				sw_get_guid_ho(p_cur_sw),
				p_cur_sw->lid, tuple_to_str(p_cur_sw->tuple));
		}
		p_cur_sw = (ftree_sw_t *) cl_qmap_next(&p_cur_sw->map_item);
	}

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
		" - Leaf switches (sorted by index):\n");
	for (rank = 0; rank < p_ftree->leaf_switches_num; rank++) {
		/* 'rank' is reused here as a plain index into the leaf array */
		p_cur_sw = p_ftree->leaf_switches[rank];
		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
			" GUID: 0x%016" PRIx64
			", LID: %u, Index %s\n",
			sw_get_guid_ho(p_cur_sw),
			p_cur_sw->lid, tuple_to_str(p_cur_sw->tuple));
	}
}				/* fabric_dump_general_info() */
1280
1281 /***************************************************/
1282
/*
 * Write the CA ordering file "opensm-ftree-ca-order.dump" into the
 * configured dump_files_dir.
 *
 * For every leaf switch, in leaf array order, one "LID<TAB>description"
 * line is written per compute-node CA port attached to it; the leaf is
 * then padded with "0xFFFF<TAB>DUMMY" lines up to max_cn_per_leaf
 * entries.  The data is written to "<path>.tmp" first and renamed onto
 * the final name afterwards.  Failures to open or rename the file are
 * logged (ERR AB01 / ERR AB03) and are otherwise not reported to the
 * caller.
 */
static void fabric_dump_hca_ordering(IN ftree_fabric_t * p_ftree)
{
	ftree_hca_t *p_hca;
	ftree_sw_t *p_sw;
	ftree_port_group_t *p_group_on_sw;
	ftree_port_group_t *p_group_on_hca;
	uint32_t i;
	uint32_t j;
	unsigned printed_hcas_on_leaf;

	char path[1024], path_tmp[1032];
	FILE *p_hca_ordering_file;
	const char *filename = "opensm-ftree-ca-order.dump";

	snprintf(path, sizeof(path), "%s/%s",
		 p_ftree->p_osm->subn.opt.dump_files_dir, filename);

	snprintf(path_tmp, sizeof(path_tmp), "%s.tmp", path);

	p_hca_ordering_file = fopen(path_tmp, "w");
	if (!p_hca_ordering_file) {
		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB01: "
			"cannot open file \'%s\': %s\n", path_tmp,
			strerror(errno));
		return;
	}

	/* for each leaf switch (in indexing order) */
	for (i = 0; i < p_ftree->leaf_switches_num; i++) {
		p_sw = p_ftree->leaf_switches[i];
		printed_hcas_on_leaf = 0;

		/* for each real CA (CNs and not) connected to this switch */
		for (j = 0; j < p_sw->down_port_groups_num; j++) {
			p_group_on_sw = p_sw->down_port_groups[j];

			if (p_group_on_sw->remote_node_type != IB_NODE_TYPE_CA)
				continue;

			p_hca = p_group_on_sw->remote_hca_or_sw.p_hca;
			p_group_on_hca =
			    hca_get_port_group_by_lid(p_hca,
						      p_group_on_sw->
						      remote_lid);

			/* the lookup can come back empty; such a port is
			   skipped, just like a non-compute node (both end up
			   as dummies) */
			if (!p_group_on_hca || !p_group_on_hca->is_cn)
				continue;

			fprintf(p_hca_ordering_file, "0x%04x\t%s\n",
				p_group_on_hca->lid,
				p_hca->p_osm_node->print_desc);

			printed_hcas_on_leaf++;
		}

		/* now print missing HCAs; counting upwards from the number
		   already printed cannot wrap around, unlike an unsigned
		   "max - printed" bound when printed exceeds max */
		for (j = printed_hcas_on_leaf;
		     j < p_ftree->max_cn_per_leaf; j++)
			fprintf(p_hca_ordering_file, "0xFFFF\tDUMMY\n");

	}
	/* done going through all the leaf switches */

	fclose(p_hca_ordering_file);

	if (rename(path_tmp, path)) {
		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB03: "
			"cannot rename file \'%s\': %s\n", path_tmp,
			strerror(errno));
	}
}				/* fabric_dump_hca_ordering() */
1357
1358 /***************************************************/
1359
/*
 * Give a switch its tuple: copy FTREE_TUPLE_LEN bytes of new_tuple into
 * the switch and then register the switch in the by-tuple table.
 */
static void fabric_assign_tuple(IN ftree_fabric_t * p_ftree,
				IN ftree_sw_t * p_sw,
				IN ftree_tuple_t new_tuple)
{
	/* the tuple must be in place before the switch is registered:
	   fabric_add_sw_by_tuple() derives the map key from it */
	memcpy(p_sw->tuple, new_tuple, FTREE_TUPLE_LEN);

	fabric_add_sw_by_tuple(p_ftree, p_sw);
}
1367
1368 /***************************************************/
1369
/*
 * Assign the initial tuple to the switch that starts a BFS indexing run.
 *
 * Element 0 holds the switch rank and elements 1..leaf_switch_rank are
 * zeroed.  For a rank-0 switch, the subtree number is stored in element
 * leaf_switch_rank (only when leaf_switch_rank > 1) and element 1 is set
 * to the first value in 0..254 that gives a tuple not yet in the
 * by-tuple table.
 *
 * Nothing is assigned when leaf_switch_rank does not fit into a tuple,
 * or when no free value is found for a rank-0 switch.
 */
static void fabric_assign_first_tuple(IN ftree_fabric_t * p_ftree,
				      IN ftree_sw_t * p_sw,
				      IN unsigned int subtree)
{
	uint8_t idx;
	ftree_tuple_t first_tuple;

	if (p_ftree->leaf_switch_rank >= FTREE_TUPLE_LEN)
		return;

	tuple_init(first_tuple);
	first_tuple[0] = (uint8_t) p_sw->rank;

	idx = 1;
	while (idx <= p_ftree->leaf_switch_rank) {
		first_tuple[idx] = 0;
		idx++;
	}

	if (p_sw->rank == 0) {
		if (p_ftree->leaf_switch_rank > 1)
			first_tuple[p_ftree->leaf_switch_rank] = subtree;

		/* probe values for element 1 until the tuple is unused */
		idx = 0;
		while (idx < 0xFF) {
			first_tuple[1] = idx;
			if (!fabric_get_sw_by_tuple(p_ftree, first_tuple))
				break;
			idx++;
		}

		/* new tuple not found - there are more than 255 ports
		   in one direction */
		if (idx == 0xFF)
			return;
	}

	fabric_assign_tuple(p_ftree, p_sw, first_tuple);
}
1402
1403 /***************************************************/
1404
/*
 * Derive a free tuple for a switch adjacent to the one holding
 * from_tuple.
 *
 * Element 0 (the rank) is incremented when going down and decremented
 * when going up.  The element at index from_tuple[0] + 1 (down) or
 * from_tuple[0] (up) is then set to the first value in 0..254 for which
 * no switch with the resulting tuple exists yet.
 *
 * On success the result is copied to new_tuple.  When all 255 values are
 * taken, new_tuple is left as tuple_init() set it.
 */
static void fabric_get_new_tuple(IN ftree_fabric_t * p_ftree,
				 OUT ftree_tuple_t new_tuple,
				 IN ftree_tuple_t from_tuple,
				 IN ftree_direction_t direction)
{
	ftree_tuple_t candidate;
	uint8_t var_index;
	uint8_t digit;

	tuple_init(new_tuple);
	memcpy(candidate, from_tuple, FTREE_TUPLE_LEN);

	if (direction != FTREE_DIRECTION_DOWN) {
		candidate[0]--;
		var_index = from_tuple[0];
	} else {
		candidate[0]++;
		var_index = from_tuple[0] + 1;
	}

	for (digit = 0; digit < 0xFF; digit++) {
		candidate[var_index] = digit;
		if (fabric_get_sw_by_tuple(p_ftree, candidate) == NULL) {
			/* found free tuple */
			memcpy(new_tuple, candidate, FTREE_TUPLE_LEN);
			return;
		}
	}

	/* new tuple not found - there are more than 255 ports in one
	   direction; new_tuple keeps its initialized value */
}				/* fabric_get_new_tuple() */
1440
1441 /***************************************************/
1442
fabric_roots_provided(IN ftree_fabric_t * p_ftree)1443 static inline boolean_t fabric_roots_provided(IN ftree_fabric_t * p_ftree)
1444 {
1445 return (p_ftree->p_osm->subn.opt.root_guid_file != NULL);
1446 }
1447
1448 /***************************************************/
1449
fabric_cns_provided(IN ftree_fabric_t * p_ftree)1450 static inline boolean_t fabric_cns_provided(IN ftree_fabric_t * p_ftree)
1451 {
1452 return (p_ftree->p_osm->subn.opt.cn_guid_file != NULL);
1453 }
1454
1455 /***************************************************/
1456
fabric_ios_provided(IN ftree_fabric_t * p_ftree)1457 static inline boolean_t fabric_ios_provided(IN ftree_fabric_t * p_ftree)
1458 {
1459 return (p_ftree->p_osm->subn.opt.io_guid_file != NULL);
1460 }
1461
1462 /***************************************************/
1463
/*
 * Mark as leaf every switch that a compute-node port is attached to.
 *
 * Walks all CAs that have CNs; for each CN port group the remote switch
 * gets is_leaf set.  Each newly marked switch must sit at
 * leaf_switch_rank: if it does not, ERR AB26 is logged and the scan
 * stops immediately.
 *
 * Returns 0 on success, -1 when a leaf switch was found at a wrong rank.
 */
static int fabric_mark_leaf_switches(IN ftree_fabric_t * p_ftree)
{
	ftree_hca_t *p_hca;
	ftree_port_group_t *p_group;
	ftree_sw_t *p_leaf;
	unsigned grp;
	int status = 0;

	OSM_LOG_ENTER(&p_ftree->p_osm->log);

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
		"Marking leaf switches in fabric\n");

	for (p_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
	     p_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl);
	     p_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item)) {
		/* CAs without compute nodes do not define leaves */
		if (p_hca->cn_num == 0)
			continue;

		for (grp = 0; grp < p_hca->up_port_groups_num; grp++) {
			p_group = p_hca->up_port_groups[grp];
			if (!p_group->is_cn)
				continue;

			/* a CN port group of a CA leads to a switch */
			CL_ASSERT(p_group->remote_node_type ==
				  IB_NODE_TYPE_SWITCH);
			p_leaf = p_group->remote_hca_or_sw.p_sw;

			/* already handled through another CN port */
			if (p_leaf->is_leaf)
				continue;
			p_leaf->is_leaf = TRUE;

			/* the leaf must be at the expected tree level */
			if (p_leaf->rank == p_ftree->leaf_switch_rank)
				continue;

			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
				"ERR AB26: CN port 0x%" PRIx64
				" is connected to switch 0x%" PRIx64
				" with rank %u, "
				"while FatTree leaf rank is %u\n",
				cl_ntoh64(p_group->port_guid),
				sw_get_guid_ho(p_leaf), p_leaf->rank,
				p_ftree->leaf_switch_rank);
			status = -1;
			goto Exit;
		}
	}

Exit:
	OSM_LOG_EXIT(&p_ftree->p_osm->log);
	return status;
}				/* fabric_mark_leaf_switches() */
1524
1525 /***************************************************/
/*
 * Index (assign tuples to) the switches reachable from p_first_sw using
 * a breadth-first traversal.
 *
 *   - put the first switch into the queue
 *   - while the queue is not empty:
 *       + take the switch at the head of the queue
 *       + for every not yet indexed switch behind its downward ports
 *         (only when the switch is above the leaf rank) and behind its
 *         upward ports: derive a new tuple, assign it and queue the
 *         remote switch
 *       + sort the scanned port group array by remote switch index
 *
 * Upward ports are not followed from rank-0 switches, nor from leaf
 * switches when the traversal itself started at a rank-0 switch.
 */
static void bfs_fabric_indexing(IN ftree_fabric_t * p_ftree,
				IN ftree_sw_t *p_first_sw)
{
	ftree_sw_t *p_cur_sw = NULL;
	ftree_sw_t *p_neighbor;
	ftree_port_group_t *p_group;
	ftree_tuple_t next_tuple;
	uint32_t grp;
	cl_list_t queue;

	OSM_LOG_ENTER(&p_ftree->p_osm->log);
	cl_list_init(&queue, cl_qmap_count(&p_ftree->sw_tbl));

	cl_list_insert_tail(&queue, p_first_sw);

	while (!cl_is_list_empty(&queue)) {
		p_cur_sw = (ftree_sw_t *) cl_list_remove_head(&queue);

		/* Downward ports: switches at or below the leaf rank are
		   not explored downwards, so nothing below the leaves
		   ever gets an index. */
		if (p_cur_sw->rank < p_ftree->leaf_switch_rank) {
			for (grp = 0; grp < p_cur_sw->down_port_groups_num;
			     grp++) {
				p_group = p_cur_sw->down_port_groups[grp];

				/* HCAs are not indexed - switches only */
				if (p_group->remote_node_type !=
				    IB_NODE_TYPE_SWITCH)
					continue;

				p_neighbor = p_group->remote_hca_or_sw.p_sw;
				if (tuple_assigned(p_neighbor->tuple))
					continue;	/* indexed already */

				fabric_get_new_tuple(p_ftree, next_tuple,
						     p_cur_sw->tuple,
						     FTREE_DIRECTION_DOWN);
				/* assigning the tuple also registers the
				   switch in the by-tuple table */
				fabric_assign_tuple(p_ftree, p_neighbor,
						    next_tuple);
				cl_list_insert_tail(&queue, p_neighbor);
			}

			/* all downward neighbors have indexes now: order
			   the port groups by remote index */
			qsort(p_cur_sw->down_port_groups,
			      p_cur_sw->down_port_groups_num,
			      sizeof(ftree_port_group_t *),
			      compare_port_groups_by_remote_switch_index);
		}

		/* Upward ports: skipped for roots; when the traversal
		   started from a root (rank == 0), a leaf switch is a BFS
		   termination point as well. */
		if (p_cur_sw->rank == 0)
			continue;
		if (p_first_sw->rank == 0 && p_cur_sw->is_leaf)
			continue;

		/* not a root, so every upward port leads to another switch */
		for (grp = 0; grp < p_cur_sw->up_port_groups_num; grp++) {
			p_neighbor =
			    p_cur_sw->up_port_groups[grp]->
			    remote_hca_or_sw.p_sw;
			if (tuple_assigned(p_neighbor->tuple))
				continue;

			fabric_get_new_tuple(p_ftree, next_tuple,
					     p_cur_sw->tuple,
					     FTREE_DIRECTION_UP);
			fabric_assign_tuple(p_ftree, p_neighbor, next_tuple);
			cl_list_insert_tail(&queue, p_neighbor);
		}

		/* all upward neighbors have indexes now: order the port
		   groups by remote index */
		qsort(p_cur_sw->up_port_groups,
		      p_cur_sw->up_port_groups_num,
		      sizeof(ftree_port_group_t *),
		      compare_port_groups_by_remote_switch_index);
	}
	cl_list_destroy(&queue);

	OSM_LOG_EXIT(&p_ftree->p_osm->log);
}
1640
/*
 * Run the fat-tree indexing over the whole fabric.
 *
 * With the quasi_ftree_indexing option set, a BFS indexing run is started
 * from every rank-0 switch, and afterwards the upward port groups of all
 * leaf switches are sorted by remote switch index.  Without the option, a
 * single run is started from the first leaf switch found and the function
 * returns right after it.
 *
 * A starting switch that has no tuple yet receives the first tuple of its
 * subtree: [rank].0...0.subtree.
 */
static void fabric_make_indexing(IN ftree_fabric_t * p_ftree)
{
	ftree_sw_t *p_start_sw = NULL;
	ftree_sw_t *p_leaf_sw;
	unsigned int next_subtree = 0;

	OSM_LOG_ENTER(&p_ftree->p_osm->log);

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
		"Starting FatTree indexing\n");

	for (p_start_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
	     p_start_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
	     p_start_sw = (ftree_sw_t *) cl_qmap_next(&p_start_sw->map_item)) {
		/* quasi indexing starts at root switches, regular
		   indexing at a leaf switch */
		if (ftree_get_subnet(p_ftree)->opt.quasi_ftree_indexing ?
		    (p_start_sw->rank != 0) : !p_start_sw->is_leaf)
			continue;

		/* assigning the first tuple also registers the switch in
		   the by-tuple table */
		if (!tuple_assigned(p_start_sw->tuple)) {
			fabric_assign_first_tuple(p_ftree, p_start_sw,
						  next_subtree++);
			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
				"Indexing starting point:\n"
				" - Switch rank : %u\n"
				" - Switch index : %s\n"
				" - Node LID : %u\n"
				" - Node GUID : 0x%016"
				PRIx64 "\n", p_start_sw->rank,
				tuple_to_str(p_start_sw->tuple),
				p_start_sw->lid, sw_get_guid_ho(p_start_sw));
		}

		bfs_fabric_indexing(p_ftree, p_start_sw);

		/* regular indexing needs exactly one traversal */
		if (!ftree_get_subnet(p_ftree)->opt.quasi_ftree_indexing)
			goto Exit;
	}

	/* sort the upward port groups of every leaf switch by remote index */
	for (p_leaf_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
	     p_leaf_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
	     p_leaf_sw = (ftree_sw_t *) cl_qmap_next(&p_leaf_sw->map_item)) {
		if (!p_leaf_sw->is_leaf)
			continue;
		qsort(p_leaf_sw->up_port_groups,
		      p_leaf_sw->up_port_groups_num,
		      sizeof(ftree_port_group_t *),
		      compare_port_groups_by_remote_switch_index);
	}

Exit:
	OSM_LOG_EXIT(&p_ftree->p_osm->log);
}				/* fabric_make_indexing() */
1698 /***************************************************/
1699
/*
 * Build p_ftree->leaf_switches: the index-sorted array of leaf switches.
 *
 * All switches at leaf_switch_rank are collected and sorted by index.
 * The slice between the first and the last switch marked is_leaf is then
 * copied into a newly allocated array; it may contain switches at the
 * same rank without CNs when they are indexed between real leaves.
 *
 * Returns 0 on success.  Returns -1 when memory cannot be allocated or
 * when the first marked leaf index is not below the last one (reported
 * as "topology is not fat-tree").  On failure neither leaf_switches nor
 * leaf_switches_num is modified, so the fabric never holds a non-zero
 * count together with a missing array.
 */
static int fabric_create_leaf_switch_array(IN ftree_fabric_t * p_ftree)
{
	ftree_sw_t *p_sw;
	ftree_sw_t *p_next_sw;
	ftree_sw_t **all_switches_at_leaf_level;
	ftree_sw_t **leaf_switches;
	unsigned i;
	unsigned all_leaf_idx = 0;
	unsigned first_leaf_idx;
	unsigned last_leaf_idx;
	unsigned leaf_switches_num;
	int res = 0;

	OSM_LOG_ENTER(&p_ftree->p_osm->log);

	/* create array of ALL the switches that have leaf rank;
	   calloc zero-fills it and guards the size multiplication */
	all_switches_at_leaf_level = (ftree_sw_t **)
	    calloc(cl_qmap_count(&p_ftree->sw_tbl), sizeof(ftree_sw_t *));
	if (!all_switches_at_leaf_level) {
		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_SYS, FILE_ID,
			   "Fat-tree routing: Memory allocation failed\n");
		res = -1;
		goto Exit;
	}

	p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
	while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
		p_sw = p_next_sw;
		p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
		if (p_sw->rank == p_ftree->leaf_switch_rank) {
			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
				"Adding switch 0x%" PRIx64
				" to full leaf switch array\n",
				sw_get_guid_ho(p_sw));
			all_switches_at_leaf_level[all_leaf_idx++] = p_sw;
		}
	}

	/* quick-sort array of leaf switches by index */
	qsort(all_switches_at_leaf_level,	/* array */
	      all_leaf_idx,	/* number of elements */
	      sizeof(ftree_sw_t *),	/* size of each element */
	      compare_switches_by_index);	/* comparator */

	/* check the first and the last REAL leaf (the one
	   that has CNs) in the array of all the leafs */

	first_leaf_idx = all_leaf_idx;
	last_leaf_idx = 0;
	for (i = 0; i < all_leaf_idx; i++) {
		if (all_switches_at_leaf_level[i]->is_leaf) {
			if (i < first_leaf_idx)
				first_leaf_idx = i;
			last_leaf_idx = i;
		}
	}

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
		"Full leaf array info: first_leaf_idx = %u, last_leaf_idx = %u\n",
		first_leaf_idx, last_leaf_idx);

	if (first_leaf_idx >= last_leaf_idx) {
		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
			   "Failed to find leaf switches - topology is not "
			   "fat-tree\n");
		res = -1;
		goto Exit;
	}

	/* Create array of REAL leaf switches, sorted by index.
	   This array may contain switches at the same rank w/o CNs,
	   in case this is the order of indexing. */
	leaf_switches_num = last_leaf_idx - first_leaf_idx + 1;
	leaf_switches = (ftree_sw_t **)
	    calloc(leaf_switches_num, sizeof(ftree_sw_t *));
	if (!leaf_switches) {
		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_SYS, FILE_ID,
			   "Fat-tree routing: Memory allocation failed\n");
		res = -1;
		goto Exit;
	}

	memcpy(leaf_switches,
	       &(all_switches_at_leaf_level[first_leaf_idx]),
	       leaf_switches_num * sizeof(ftree_sw_t *));

	/* publish array and count together, only once both are valid */
	p_ftree->leaf_switches = leaf_switches;
	p_ftree->leaf_switches_num = leaf_switches_num;

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
		"Created array of %u leaf switches\n",
		p_ftree->leaf_switches_num);

Exit:
	/* free(NULL) is a no-op, so this is safe on every path */
	free(all_switches_at_leaf_level);
	OSM_LOG_EXIT(&p_ftree->p_osm->log);
	return res;
}				/* fabric_create_leaf_switch_array() */
1795
1796 /***************************************************/
1797
/*
 * Raise p_ftree->max_cn_per_leaf to the largest number of compute-node
 * ports found on any single leaf switch.
 *
 * For each leaf switch, every downward port group that leads to a CA is
 * matched to the CA's own port group (looked up by the remote LID), and
 * that group is counted when it is flagged as CN.  The stored maximum is
 * only ever increased here.
 */
static void fabric_set_max_cn_per_leaf(IN ftree_fabric_t * p_ftree)
{
	unsigned leaf;
	unsigned grp;
	unsigned cn_count;
	ftree_sw_t *p_leaf;
	ftree_port_group_t *p_down_group, *p_hca_group;

	for (leaf = 0; leaf < p_ftree->leaf_switches_num; leaf++) {
		p_leaf = p_ftree->leaf_switches[leaf];
		cn_count = 0;

		for (grp = 0; grp < p_leaf->down_port_groups_num; grp++) {
			p_down_group = p_leaf->down_port_groups[grp];
			if (p_down_group->remote_node_type == IB_NODE_TYPE_CA) {
				/* the CA-side port group that owns the LID
				   of the remote CA port */
				p_hca_group =
				    hca_get_port_group_by_lid(p_down_group->
							      remote_hca_or_sw.
							      p_hca,
							      p_down_group->
							      remote_lid);

				CL_ASSERT(p_hca_group);

				if (p_hca_group->is_cn)
					cn_count++;
			}
		}

		if (p_ftree->max_cn_per_leaf < cn_count)
			p_ftree->max_cn_per_leaf = cn_count;
	}
}				/* fabric_set_max_cn_per_leaf() */
1831
1832 /***************************************************/
1833
fabric_validate_topology(IN ftree_fabric_t * p_ftree)1834 static boolean_t fabric_validate_topology(IN ftree_fabric_t * p_ftree)
1835 {
1836 ftree_port_group_t *p_group;
1837 ftree_port_group_t *p_ref_group;
1838 ftree_sw_t *p_sw;
1839 ftree_sw_t *p_next_sw;
1840 ftree_sw_t **reference_sw_arr;
1841 uint16_t tree_rank = fabric_get_rank(p_ftree);
1842 boolean_t res = TRUE;
1843 uint8_t i;
1844
1845 OSM_LOG_ENTER(&p_ftree->p_osm->log);
1846
1847 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1848 "Validating fabric topology\n");
1849
1850 reference_sw_arr =
1851 (ftree_sw_t **) malloc(tree_rank * sizeof(ftree_sw_t *));
1852 if (reference_sw_arr == NULL) {
1853 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_SYS, FILE_ID,
1854 "Fat-tree routing: Memory allocation failed\n");
1855 return FALSE;
1856 }
1857 memset(reference_sw_arr, 0, tree_rank * sizeof(ftree_sw_t *));
1858
1859 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1860 while (res && p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
1861 p_sw = p_next_sw;
1862 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
1863
1864 if (!reference_sw_arr[p_sw->rank])
1865 /* This is the first switch in the current level that
1866 we're checking - use it as a reference */
1867 reference_sw_arr[p_sw->rank] = p_sw;
1868 else {
1869 /* compare this switch properties to the reference switch */
1870
1871 if (reference_sw_arr[p_sw->rank]->up_port_groups_num !=
1872 p_sw->up_port_groups_num) {
1873 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
1874 "ERR AB09: Different number of upward port groups on switches:\n"
1875 " GUID 0x%016" PRIx64
1876 ", LID %u, Index %s - %u groups\n"
1877 " GUID 0x%016" PRIx64
1878 ", LID %u, Index %s - %u groups\n",
1879 sw_get_guid_ho
1880 (reference_sw_arr[p_sw->rank]),
1881 reference_sw_arr[p_sw->rank]->lid,
1882 tuple_to_str
1883 (reference_sw_arr[p_sw->rank]->tuple),
1884 reference_sw_arr[p_sw->
1885 rank]->
1886 up_port_groups_num,
1887 sw_get_guid_ho(p_sw), p_sw->lid,
1888 tuple_to_str(p_sw->tuple),
1889 p_sw->up_port_groups_num);
1890 res = FALSE;
1891 break;
1892 }
1893
1894 if (p_sw->rank != (tree_rank - 1) &&
1895 reference_sw_arr[p_sw->
1896 rank]->down_port_groups_num !=
1897 p_sw->down_port_groups_num) {
1898 /* we're allowing some hca's to be missing */
1899 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
1900 "ERR AB0A: Different number of downward port groups on switches:\n"
1901 " GUID 0x%016" PRIx64
1902 ", LID %u, Index %s - %u port groups\n"
1903 " GUID 0x%016" PRIx64
1904 ", LID %u, Index %s - %u port groups\n",
1905 sw_get_guid_ho
1906 (reference_sw_arr[p_sw->rank]),
1907 reference_sw_arr[p_sw->rank]->lid,
1908 tuple_to_str
1909 (reference_sw_arr[p_sw->rank]->tuple),
1910 reference_sw_arr[p_sw->
1911 rank]->
1912 down_port_groups_num,
1913 sw_get_guid_ho(p_sw), p_sw->lid,
1914 tuple_to_str(p_sw->tuple),
1915 p_sw->down_port_groups_num);
1916 res = FALSE;
1917 break;
1918 }
1919
1920 if (reference_sw_arr[p_sw->rank]->up_port_groups_num !=
1921 0) {
1922 p_ref_group =
1923 reference_sw_arr[p_sw->
1924 rank]->up_port_groups[0];
1925 for (i = 0; i < p_sw->up_port_groups_num; i++) {
1926 p_group = p_sw->up_port_groups[i];
1927 if (cl_ptr_vector_get_size
1928 (&p_ref_group->ports) !=
1929 cl_ptr_vector_get_size
1930 (&p_group->ports)) {
1931 OSM_LOG(&p_ftree->p_osm->log,
1932 OSM_LOG_ERROR,
1933 "ERR AB0B: Different number of ports in an upward port group on switches:\n"
1934 " GUID 0x%016"
1935 PRIx64
1936 ", LID %u, Index %s - %u ports\n"
1937 " GUID 0x%016"
1938 PRIx64
1939 ", LID %u, Index %s - %u ports\n",
1940 sw_get_guid_ho
1941 (reference_sw_arr
1942 [p_sw->rank]),
1943 reference_sw_arr[p_sw->
1944 rank]->
1945 lid,
1946 tuple_to_str
1947 (reference_sw_arr
1948 [p_sw->rank]->tuple),
1949 cl_ptr_vector_get_size
1950 (&p_ref_group->ports),
1951 sw_get_guid_ho(p_sw),
1952 p_sw->lid,
1953 tuple_to_str(p_sw->
1954 tuple),
1955 cl_ptr_vector_get_size
1956 (&p_group->ports));
1957 res = FALSE;
1958 break;
1959 }
1960 }
1961 }
1962 if (reference_sw_arr[p_sw->rank]->down_port_groups_num
1963 != 0 && p_sw->rank != (tree_rank - 1)) {
1964 /* we're allowing some hca's to be missing */
1965 p_ref_group =
1966 reference_sw_arr[p_sw->
1967 rank]->down_port_groups[0];
1968 for (i = 0; i < p_sw->down_port_groups_num; i++) {
1969 p_group = p_sw->down_port_groups[0];
1970 if (cl_ptr_vector_get_size
1971 (&p_ref_group->ports) !=
1972 cl_ptr_vector_get_size
1973 (&p_group->ports)) {
1974 OSM_LOG(&p_ftree->p_osm->log,
1975 OSM_LOG_ERROR,
1976 "ERR AB0C: Different number of ports in an downward port group on switches:\n"
1977 " GUID 0x%016"
1978 PRIx64
1979 ", LID %u, Index %s - %u ports\n"
1980 " GUID 0x%016"
1981 PRIx64
1982 ", LID %u, Index %s - %u ports\n",
1983 sw_get_guid_ho
1984 (reference_sw_arr
1985 [p_sw->rank]),
1986 reference_sw_arr[p_sw->
1987 rank]->
1988 lid,
1989 tuple_to_str
1990 (reference_sw_arr
1991 [p_sw->rank]->tuple),
1992 cl_ptr_vector_get_size
1993 (&p_ref_group->ports),
1994 sw_get_guid_ho(p_sw),
1995 p_sw->lid,
1996 tuple_to_str(p_sw->
1997 tuple),
1998 cl_ptr_vector_get_size
1999 (&p_group->ports));
2000 res = FALSE;
2001 break;
2002 }
2003 }
2004 }
2005 } /* end of else */
2006 } /* end of while */
2007
2008 if (res == TRUE)
2009 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
2010 "Fabric topology has been identified as FatTree\n");
2011 else
2012 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
2013 "ERR AB0D: Fabric topology hasn't been identified as FatTree\n");
2014
2015 free(reference_sw_arr);
2016 OSM_LOG_EXIT(&p_ftree->p_osm->log);
2017 return res;
2018 } /* fabric_validate_topology() */
2019
2020 /***************************************************
2021 ***************************************************/
2022
/*
 * Function: Copies the fabric's lft_max_lid into the max_lid_ho field of
 *           the OpenSM switch object attached to one fat-tree switch
 * Given   : p_map_item - map item that is reinterpreted as a ftree_sw_t
 *                        (assumes the map item is the leading member of
 *                        ftree_sw_t - TODO confirm against the struct)
 *           context    - the fat-tree fabric (ftree_fabric_t *)
 */
static void set_sw_fwd_table(IN cl_map_item_t * const p_map_item,
			     IN void *context)
{
	const ftree_fabric_t *p_fabric = context;
	ftree_sw_t *p_ftree_sw = (ftree_sw_t *) p_map_item;

	p_ftree_sw->p_osm_sw->max_lid_ho = p_fabric->lft_max_lid;
}
2031
2032 /***************************************************
2033 ***************************************************/
2034
2035 /*
2036 * Function: Finds the least loaded port group and stores its counter
2037 * Given : A switch
2038 */
recalculate_min_counter_down(ftree_sw_t * p_sw)2039 static inline void recalculate_min_counter_down(ftree_sw_t * p_sw)
2040 {
2041 uint32_t min = (1 << 30);
2042 uint32_t i;
2043 for (i = 0; i < p_sw->down_port_groups_num; i++) {
2044 if (p_sw->down_port_groups[i]->counter_down < min) {
2045 min = p_sw->down_port_groups[i]->counter_down;
2046 }
2047 }
2048 p_sw->min_counter_down = min;
2049 return;
2050 }
2051
2052 /*
2053 * Function: Return the counter value of the least loaded down port group
2054 * Given : A switch
2055 */
find_lowest_loaded_group_on_sw(ftree_sw_t * p_sw)2056 static inline uint32_t find_lowest_loaded_group_on_sw(ftree_sw_t * p_sw)
2057 {
2058 return p_sw->min_counter_down;
2059 }
2060
2061 /*
2062 * Function: Compare the load of two port groups and return which is the least loaded
2063 * Given : Two port groups with remote switch
2064 * When both port groups are equally loaded, it picks the one whom
2065 * remote switch down ports are least loaded.
2066 * This way, it prefers the switch from where it will be easier to go down (creating upward routes).
2067 * If both are equal, it picks the lowest INDEX to be deterministic.
2068 */
port_group_compare_load_down(const ftree_port_group_t * p1,const ftree_port_group_t * p2)2069 static inline int port_group_compare_load_down(const ftree_port_group_t * p1,
2070 const ftree_port_group_t * p2)
2071 {
2072 int temp = p1->counter_down - p2->counter_down;
2073 if (temp > 0)
2074 return 1;
2075 if (temp < 0)
2076 return -1;
2077
2078 /* Find the less loaded remote sw and choose this one */
2079 do {
2080 uint32_t load1 =
2081 find_lowest_loaded_group_on_sw(p1->remote_hca_or_sw.p_sw);
2082 uint32_t load2 =
2083 find_lowest_loaded_group_on_sw(p2->remote_hca_or_sw.p_sw);
2084 temp = load1 - load2;
2085 if (temp > 0)
2086 return 1;
2087 } while (0);
2088 /* If they are both equal, choose the lowest index */
2089 return compare_port_groups_by_remote_switch_index(&p1, &p2);
2090 }
2091
port_group_compare_load_up(const ftree_port_group_t * p1,const ftree_port_group_t * p2)2092 static inline int port_group_compare_load_up(const ftree_port_group_t * p1,
2093 const ftree_port_group_t * p2)
2094 {
2095 int temp = p1->counter_up - p2->counter_up;
2096 if (temp > 0)
2097 return 1;
2098 if (temp < 0)
2099 return -1;
2100
2101 /* If they are both equal, choose the lowest index */
2102 return compare_port_groups_by_remote_switch_index (&p1,&p2);
2103 }
2104
2105 /*
2106 * Function: Sorts an array of port group by up load order
2107 * Given : A port group array and its length
2108 * As the list is mostly sorted, we used a bubble sort instead of qsort
2109 * as it is much faster.
2110 *
2111 * Important note:
2112 * This function and bubble_sort_down must NOT be factorized.
2113 * Although most of the code is the same and a function pointer could be used
2114 * for the compareason function, it would prevent the compareason function to be inlined
2115 * and cost a great deal to performances.
2116 */
2117 static inline void
bubble_sort_up(ftree_port_group_t ** p_group_array,uint32_t nmemb)2118 bubble_sort_up(ftree_port_group_t ** p_group_array, uint32_t nmemb)
2119 {
2120 uint32_t i = 0;
2121 uint32_t j = 0;
2122 ftree_port_group_t *tmp = p_group_array[0];
2123
2124 /* As this function is a great number of times, we only go into the loop
2125 * if one of the port counters has changed, thus saving some tests */
2126 if (tmp->hca_or_sw.p_sw->counter_up_changed == FALSE) {
2127 return;
2128 }
2129 /* While we did modifications on the array order */
2130 /* i may grew above array length but next loop will fail and tmp will be null for the next time
2131 * this way we save a test i < nmemb for each pass through the loop */
2132 for (i = 0; tmp; i++) {
2133 /* Assume the array is orderd */
2134 tmp = NULL;
2135 /* Comparing elements j and j-1 */
2136 for (j = 1; j < (nmemb - i); j++) {
2137 /* If they are the wrong way around */
2138 if (port_group_compare_load_up(p_group_array[j],
2139 p_group_array[j - 1]) < 0) {
2140 /* We invert them */
2141 tmp = p_group_array[j - 1];
2142 p_group_array[j - 1] = p_group_array[j];
2143 p_group_array[j] = tmp;
2144 /* This sets tmp != NULL so the main loop will make another pass */
2145 }
2146 }
2147 }
2148
2149 /* We have reordered the array so as long noone changes the counter
2150 * it's not necessary to do it again */
2151 p_group_array[0]->hca_or_sw.p_sw->counter_up_changed = FALSE;
2152 }
2153
2154 static inline void
bubble_sort_siblings(ftree_port_group_t ** p_group_array,uint32_t nmemb)2155 bubble_sort_siblings(ftree_port_group_t ** p_group_array, uint32_t nmemb)
2156 {
2157 uint32_t i = 0;
2158 uint32_t j = 0;
2159 ftree_port_group_t *tmp = p_group_array[0];
2160
2161 /* While we did modifications on the array order */
2162 /* i may grew above array length but next loop will fail and tmp will be null for the next time
2163 * this way we save a test i < nmemb for each pass through the loop */
2164 for (i = 0; tmp != NULL; i++) {
2165 /* Assume the array is orderd */
2166 tmp = NULL;
2167 /* Comparing elements j and j-1 */
2168 for (j = 1; j < (nmemb - i); j++) {
2169 /* If they are the wrong way around */
2170 if (port_group_compare_load_up(p_group_array[j],
2171 p_group_array[j - 1]) < 0) {
2172 /* We invert them */
2173 tmp = p_group_array[j - 1];
2174 p_group_array[j - 1] = p_group_array[j];
2175 p_group_array[j] = tmp;
2176 }
2177 }
2178 }
2179 }
2180
2181 /*
2182 * Function: Sorts an array of port group. Order is decide through
2183 * port_group_compare_load_down ( up counters, least load remote switch, biggest GUID)
2184 * Given : A port group array and its length. Each port group points to a remote switch (not a HCA)
2185 * As the list is mostly sorted, we used a bubble sort instead of qsort
2186 * as it is much faster.
2187 *
2188 * Important note:
2189 * This function and bubble_sort_up must NOT be factorized.
2190 * Although most of the code is the same and a function pointer could be used
2191 * for the compareason function, it would prevent the compareason function to be inlined
2192 * and cost a great deal to performances.
2193 */
2194 static inline void
bubble_sort_down(ftree_port_group_t ** p_group_array,uint32_t nmemb)2195 bubble_sort_down(ftree_port_group_t ** p_group_array, uint32_t nmemb)
2196 {
2197 uint32_t i = 0;
2198 uint32_t j = 0;
2199 ftree_port_group_t *tmp = p_group_array[0];
2200
2201 /* While we did modifications on the array order */
2202 /* i may grew above array length but next loop will fail and tmp will be null for the next time
2203 * this way we save a test i < nmemb for each pass through the loop */
2204 for (i = 0; tmp; i++) {
2205 /* Assume the array is orderd */
2206 tmp = NULL;
2207 /* Comparing elements j and j-1 */
2208 for (j = 1; j < (nmemb - i); j++) {
2209 /* If they are the wrong way around */
2210 if (port_group_compare_load_down
2211 (p_group_array[j], p_group_array[j - 1]) < 0) {
2212 /* We invert them */
2213 tmp = p_group_array[j - 1];
2214 p_group_array[j - 1] = p_group_array[j];
2215 p_group_array[j] = tmp;
2216
2217 }
2218 }
2219 }
2220 }
2221
2222 /***************************************************
2223 ***************************************************/
2224
2225 /*
2226 * Function: assign-up-going-port-by-descending-down
2227 * Given : a switch and a LID
2228 * Pseudo code:
2229 * foreach down-going-port-group (in indexing order)
2230 * skip this group if the LFT(LID) port is part of this group
2231 * find the least loaded port of the group (scan in indexing order)
2232 * r-port is the remote port connected to it
2233 * assign the remote switch node LFT(LID) to r-port
2234 * increase r-port usage counter
2235 * assign-up-going-port-by-descending-down to r-port node (recursion)
2236 */
2237
2238 static boolean_t
fabric_route_upgoing_by_going_down(IN ftree_fabric_t * p_ftree,IN ftree_sw_t * p_sw,IN ftree_sw_t * p_prev_sw,IN uint16_t target_lid,IN boolean_t is_main_path,IN boolean_t is_target_a_sw,IN uint8_t current_hops)2239 fabric_route_upgoing_by_going_down(IN ftree_fabric_t * p_ftree,
2240 IN ftree_sw_t * p_sw,
2241 IN ftree_sw_t * p_prev_sw,
2242 IN uint16_t target_lid,
2243 IN boolean_t is_main_path,
2244 IN boolean_t is_target_a_sw,
2245 IN uint8_t current_hops)
2246 {
2247 ftree_sw_t *p_remote_sw;
2248 uint16_t ports_num;
2249 ftree_port_group_t *p_group;
2250 ftree_port_t *p_port;
2251 ftree_port_t *p_min_port;
2252 uint16_t j;
2253 uint16_t k;
2254 boolean_t created_route = FALSE;
2255 boolean_t routed = 0;
2256 uint8_t least_hops;
2257
2258 /* if there is no down-going ports */
2259 if (p_sw->down_port_groups_num == 0)
2260 return FALSE;
2261
2262 /* foreach down-going port group (in load order) */
2263 bubble_sort_up(p_sw->down_port_groups, p_sw->down_port_groups_num);
2264
2265 if (p_sw->sibling_port_groups_num > 0)
2266 bubble_sort_siblings(p_sw->sibling_port_groups,
2267 p_sw->sibling_port_groups_num);
2268
2269 for (k = 0;
2270 k <
2271 (p_sw->down_port_groups_num +
2272 ((target_lid != 0) ? p_sw->sibling_port_groups_num : 0)); k++) {
2273
2274 if (k < p_sw->down_port_groups_num) {
2275 p_group = p_sw->down_port_groups[k];
2276 } else {
2277 p_group =
2278 p_sw->sibling_port_groups[k -
2279 p_sw->
2280 down_port_groups_num];
2281 }
2282
2283 /* If this port group doesn't point to a switch, mark
2284 that the route was created and skip to the next group */
2285 if (p_group->remote_node_type != IB_NODE_TYPE_SWITCH) {
2286 created_route = TRUE;
2287 continue;
2288 }
2289
2290 if (p_prev_sw
2291 && p_group->remote_lid == p_prev_sw->lid) {
2292 /* This port group has a port that was used when we entered this switch,
2293 which means that the current group points to the switch where we were
2294 at the previous step of the algorithm (before going up).
2295 Skipping this group. */
2296 continue;
2297 }
2298
2299 /* find the least loaded port of the group (in indexing order) */
2300 p_min_port = NULL;
2301 ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports);
2302 if(ports_num == 0)
2303 continue;
2304
2305 for (j = 0; j < ports_num; j++) {
2306 cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port);
2307 /* first port that we're checking - set as port with the lowest load */
2308 /* or this port is less loaded - use it as min */
2309 if (!p_min_port ||
2310 p_port->counter_up < p_min_port->counter_up)
2311 p_min_port = p_port;
2312 }
2313 /* At this point we have selected a port in this group with the
2314 lowest load of upgoing routes.
2315 Set on the remote switch how to get to the target_lid -
2316 set LFT(target_lid) on the remote switch to the remote port */
2317 p_remote_sw = p_group->remote_hca_or_sw.p_sw;
2318 least_hops = sw_get_least_hops(p_remote_sw, target_lid);
2319
2320 if (least_hops != OSM_NO_PATH) {
2321 /* Loop in the fabric - we already routed the remote switch
2322 on our way UP, and now we see it again on our way DOWN */
2323 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2324 "Loop of length %d in the fabric:\n "
2325 "Switch %s (LID %u) closes loop through switch %s (LID %u)\n",
2326 current_hops,
2327 tuple_to_str(p_remote_sw->tuple),
2328 p_group->lid,
2329 tuple_to_str(p_sw->tuple),
2330 p_group->remote_lid);
2331 /* We skip only if we have come through a longer path */
2332 if (current_hops + 1 >= least_hops)
2333 continue;
2334 }
2335
2336 /* Four possible cases:
2337 *
2338 * 1. is_main_path == TRUE:
2339 * - going DOWN(TRUE,TRUE) through ALL the groups
2340 * + promoting port counter
2341 * + setting path in remote switch fwd tbl
2342 * + setting hops in remote switch on all the ports of each group
2343 *
2344 * 2. is_main_path == FALSE:
2345 * - going DOWN(TRUE,FALSE) through ALL the groups but only if
2346 * the remote (lower) switch hasn't been already configured
2347 * for this target LID (or with a longer path)
2348 * + promoting port counter
2349 * + setting path in remote switch fwd tbl if it hasn't been set yet
2350 * + setting hops in remote switch on all the ports of each group
2351 * if it hasn't been set yet
2352 */
2353
2354 /* setting fwd tbl port only */
2355 p_remote_sw->p_osm_sw->new_lft[target_lid] =
2356 p_min_port->remote_port_num;
2357 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2358 "Switch %s: set path to CA LID %u through port %u\n",
2359 tuple_to_str(p_remote_sw->tuple),
2360 target_lid, p_min_port->remote_port_num);
2361
2362 /* On the remote switch that is pointed by the p_group,
2363 set hops for ALL the ports in the remote group. */
2364
2365 set_hops_on_remote_sw(p_group, target_lid,
2366 current_hops + 1, is_target_a_sw);
2367
2368 /* Recursion step:
2369 Assign upgoing ports by stepping down, starting on REMOTE switch */
2370 routed = fabric_route_upgoing_by_going_down(p_ftree, p_remote_sw, /* remote switch - used as a route-upgoing alg. start point */
2371 NULL, /* prev. position - NULL to mark that we went down and not up */
2372 target_lid, /* LID that we're routing to */
2373 is_main_path, /* whether this is path to HCA that should by tracked by counters */
2374 is_target_a_sw, /* Whether target lid is a switch or not */
2375 current_hops + 1); /* Number of hops done to this point */
2376 created_route |= routed;
2377 /* Counters are promoted only if a route toward a node is created */
2378 if (routed) {
2379 p_min_port->counter_up++;
2380 p_group->counter_up++;
2381 p_group->hca_or_sw.p_sw->counter_up_changed = TRUE;
2382 }
2383 }
2384 /* done scanning all the down-going port groups */
2385
2386 /* if the route was created, promote the index that
2387 indicates which group should we start with when
2388 going through all the downgoing groups */
2389 if (created_route)
2390 p_sw->down_port_groups_idx = (p_sw->down_port_groups_idx + 1)
2391 % p_sw->down_port_groups_num;
2392
2393 return created_route;
2394 } /* fabric_route_upgoing_by_going_down() */
2395
2396 /***************************************************/
2397
2398 /*
2399 * Function: assign-down-going-port-by-ascending-up
2400 * Given : a switch and a LID
2401 * Pseudo code:
2402 * find the least loaded port of all the upgoing groups (scan in indexing order)
2403 * assign the LFT(LID) of remote switch to that port
2404 * track that port usage
2405 * assign-up-going-port-by-descending-down on CURRENT switch
2406 * assign-down-going-port-by-ascending-up on REMOTE switch (recursion)
2407 */
2408
2409 static boolean_t
fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,IN ftree_sw_t * p_sw,IN ftree_sw_t * p_prev_sw,IN uint16_t target_lid,IN boolean_t is_main_path,IN boolean_t is_target_a_sw,IN uint16_t reverse_hop_credit,IN uint16_t reverse_hops,IN uint8_t current_hops)2410 fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
2411 IN ftree_sw_t * p_sw,
2412 IN ftree_sw_t * p_prev_sw,
2413 IN uint16_t target_lid,
2414 IN boolean_t is_main_path,
2415 IN boolean_t is_target_a_sw,
2416 IN uint16_t reverse_hop_credit,
2417 IN uint16_t reverse_hops,
2418 IN uint8_t current_hops)
2419 {
2420 ftree_sw_t *p_remote_sw;
2421 uint16_t ports_num;
2422 ftree_port_group_t *p_group;
2423 ftree_port_t *p_port;
2424 ftree_port_group_t *p_min_group;
2425 ftree_port_t *p_min_port;
2426 uint16_t i;
2427 uint16_t j;
2428 boolean_t created_route = FALSE;
2429 boolean_t routed = FALSE;
2430
2431
2432 /* Assign upgoing ports by stepping down, starting on THIS switch */
2433 created_route = fabric_route_upgoing_by_going_down(p_ftree, p_sw, /* local switch - used as a route-upgoing alg. start point */
2434 p_prev_sw, /* switch that we went up from (NULL means that we went down) */
2435 target_lid, /* LID that we're routing to */
2436 is_main_path, /* whether this path to HCA should by tracked by counters */
2437 is_target_a_sw, /* Whether target lid is a switch or not */
2438 current_hops); /* Number of hops done up to this point */
2439
2440 /* recursion stop condition - if it's a root switch, */
2441 if (p_sw->rank == 0) {
2442 if (reverse_hop_credit > 0) {
2443 /* We go up by going down as we have some reverse_hop_credit left */
2444 /* We use the index to scatter a bit the reverse up routes */
2445 p_sw->down_port_groups_idx =
2446 (p_sw->down_port_groups_idx +
2447 1) % p_sw->down_port_groups_num;
2448 i = p_sw->down_port_groups_idx;
2449 for (j = 0; j < p_sw->down_port_groups_num; j++) {
2450
2451 p_group = p_sw->down_port_groups[i];
2452 i = (i + 1) % p_sw->down_port_groups_num;
2453
2454 /* Skip this port group unless it points to a switch */
2455 if (p_group->remote_node_type !=
2456 IB_NODE_TYPE_SWITCH)
2457 continue;
2458 p_remote_sw = p_group->remote_hca_or_sw.p_sw;
2459
2460 created_route |= fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */
2461 p_sw, /* this switch - prev. position switch for the function */
2462 target_lid, /* LID that we're routing to */
2463 is_main_path, /* whether this is path to HCA that should by tracked by counters */
2464 is_target_a_sw, /* Whether target lid is a switch or not */
2465 reverse_hop_credit - 1, /* Remaining reverse_hops allowed */
2466 reverse_hops + 1, /* Number of reverse_hops done up to this point */
2467 current_hops
2468 +
2469 1);
2470 }
2471
2472 }
2473 return created_route;
2474 }
2475
2476 /* We should generate a list of port sorted by load so we can find easily the least
2477 * going port and explore the other pots on secondary routes more easily (and quickly) */
2478 bubble_sort_down(p_sw->up_port_groups, p_sw->up_port_groups_num);
2479
2480 p_min_group = p_sw->up_port_groups[0];
2481 /* Find the least loaded upgoing port in the selected group */
2482 p_min_port = NULL;
2483 ports_num = (uint16_t) cl_ptr_vector_get_size(&p_min_group->ports);
2484 for (j = 0; j < ports_num; j++) {
2485 cl_ptr_vector_at(&p_min_group->ports, j, (void *)&p_port);
2486 if (!p_min_port) {
2487 /* first port that we're checking - use
2488 it as a port with the lowest load */
2489 p_min_port = p_port;
2490 } else if (p_port->counter_down < p_min_port->counter_down) {
2491 /* this port is less loaded - use it as min */
2492 p_min_port = p_port;
2493 }
2494 }
2495
2496 /* At this point we have selected a group and port with the
2497 lowest load of downgoing routes.
2498 Set on the remote switch how to get to the target_lid -
2499 set LFT(target_lid) on the remote switch to the remote port */
2500 p_remote_sw = p_min_group->remote_hca_or_sw.p_sw;
2501
2502 /* Four possible cases:
2503 *
2504 * 1. is_main_path == TRUE:
2505 * - going UP(TRUE,TRUE) on selected min_group and min_port
2506 * + promoting port counter
2507 * + setting path in remote switch fwd tbl
2508 * + setting hops in remote switch on all the ports of selected group
2509 * - going UP(TRUE,FALSE) on rest of the groups, each time on port 0
2510 * + NOT promoting port counter
2511 * + setting path in remote switch fwd tbl if it hasn't been set yet
2512 * + setting hops in remote switch on all the ports of each group
2513 * if it hasn't been set yet
2514 *
2515 * 2. is_main_path == FALSE:
2516 * - going UP(TRUE,FALSE) on ALL the groups, each time on port 0,
2517 * but only if the remote (upper) switch hasn't been already
2518 * configured for this target LID
2519 * + NOT promoting port counter
2520 * + setting path in remote switch fwd tbl if it hasn't been set yet
2521 * + setting hops in remote switch on all the ports of each group
2522 * if it hasn't been set yet
2523 */
2524
2525 /* covering first half of case 1, and case 3 */
2526 if (is_main_path) {
2527 if (p_sw->is_leaf) {
2528 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2529 " - Routing MAIN path for %s CA LID %u: %s --> %s\n",
2530 (target_lid != 0) ? "real" : "DUMMY",
2531 target_lid,
2532 tuple_to_str(p_sw->tuple),
2533 tuple_to_str(p_remote_sw->tuple));
2534 }
2535 /* The number of downgoing routes is tracked in the
2536 p_group->counter_down p_port->counter_down counters of the
2537 group and port that belong to the lower side of the link
2538 (on switch with higher rank) */
2539 p_min_group->counter_down++;
2540 p_min_port->counter_down++;
2541 if (p_min_group->counter_down ==
2542 (p_min_group->remote_hca_or_sw.p_sw->min_counter_down +
2543 1)) {
2544 recalculate_min_counter_down
2545 (p_min_group->remote_hca_or_sw.p_sw);
2546 }
2547
2548 /* This LID may already be in the LFT in the reverse_hop feature is used */
2549 /* We update the LFT only if this LID isn't already present. */
2550
2551 /* skip if target lid has been already set on remote switch fwd tbl (with a bigger hop count) */
2552 if ((p_remote_sw->p_osm_sw->new_lft[target_lid] == OSM_NO_PATH)
2553 ||
2554 (current_hops + 1 <
2555 sw_get_least_hops(p_remote_sw, target_lid))) {
2556
2557 p_remote_sw->p_osm_sw->new_lft[target_lid] =
2558 p_min_port->remote_port_num;
2559 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2560 "Switch %s: set path to CA LID %u through port %u\n",
2561 tuple_to_str(p_remote_sw->tuple),
2562 target_lid,
2563 p_min_port->remote_port_num);
2564
2565 /* On the remote switch that is pointed by the min_group,
2566 set hops for ALL the ports in the remote group. */
2567
2568 set_hops_on_remote_sw(p_min_group, target_lid,
2569 current_hops + 1,
2570 is_target_a_sw);
2571 }
2572 /* Recursion step: Assign downgoing ports by stepping up, starting on REMOTE switch. */
2573 created_route |= fabric_route_downgoing_by_going_up(p_ftree,
2574 p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */
2575 p_sw, /* this switch - prev. position switch for the function */
2576 target_lid, /* LID that we're routing to */
2577 is_main_path, /* whether this is path to HCA that should by tracked by counters */
2578 is_target_a_sw, /* Whether target lid is a switch or not */
2579 reverse_hop_credit, /* Remaining reverse_hops allowed */
2580 reverse_hops, /* Number of reverse_hops done up to this point */
2581 current_hops + 1);
2582 }
2583
2584 /* What's left to do at this point:
2585 *
2586 * 1. is_main_path == TRUE:
2587 * - going UP(TRUE,FALSE) on rest of the groups, each time on port 0,
2588 * but only if the remote (upper) switch hasn't been already
2589 * configured for this target LID
2590 * + NOT promoting port counter
2591 * + setting path in remote switch fwd tbl if it hasn't been set yet
2592 * + setting hops in remote switch on all the ports of each group
2593 * if it hasn't been set yet
2594 *
2595 * 2. is_main_path == FALSE:
2596 * - going UP(TRUE,FALSE) on ALL the groups, each time on port 0,
2597 * but only if the remote (upper) switch hasn't been already
2598 * configured for this target LID
2599 * + NOT promoting port counter
2600 * + setting path in remote switch fwd tbl if it hasn't been set yet
2601 * + setting hops in remote switch on all the ports of each group
2602 * if it hasn't been set yet
2603 *
2604 * These two rules can be rephrased this way:
2605 * - foreach UP port group
2606 * + if remote switch has been set with the target LID
2607 * - skip this port group
2608 * + else
2609 * - select port 0
2610 * - do NOT promote port counter
2611 * - set path in remote switch fwd tbl
2612 * - set hops in remote switch on all the ports of this group
2613 * - go UP(TRUE,FALSE) to the remote switch
2614 */
2615
2616 for (i = is_main_path ? 1 : 0; i < p_sw->up_port_groups_num; i++) {
2617 p_group = p_sw->up_port_groups[i];
2618 p_remote_sw = p_group->remote_hca_or_sw.p_sw;
2619
2620 /* skip if target lid has been already set on remote switch fwd tbl (with a bigger hop count) */
2621 if (p_remote_sw->p_osm_sw->new_lft[target_lid] != OSM_NO_PATH)
2622 if (current_hops + 1 >=
2623 sw_get_least_hops(p_remote_sw, target_lid))
2624 continue;
2625
2626 if (p_sw->is_leaf) {
2627 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2628 " - Routing SECONDARY path for LID %u: %s --> %s\n",
2629 target_lid,
2630 tuple_to_str(p_sw->tuple),
2631 tuple_to_str(p_remote_sw->tuple));
2632 }
2633
2634 /* Routing REAL lids on SECONDARY path means routing
2635 switch-to-switch or switch-to-CA paths.
2636 We can safely assume that switch will initiate very
2637 few traffic, so there's no point wasting runtime on
2638 trying to balance these routes - always pick port 0. */
2639 p_min_port = NULL;
2640 ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports);
2641 if(ports_num == 0)
2642 continue;
2643 for (j = 0; j < ports_num; j++) {
2644 cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port);
2645 if (!p_min_port) {
2646 /* first port that we're checking - use
2647 it as a port with the lowest load */
2648 p_min_port = p_port;
2649 } else if (p_port->counter_down <
2650 p_min_port->counter_down) {
2651 /* this port is less loaded - use it as min */
2652 p_min_port = p_port;
2653 }
2654 }
2655
2656 p_port = p_min_port;
2657 p_remote_sw->p_osm_sw->new_lft[target_lid] =
2658 p_port->remote_port_num;
2659
2660 /* On the remote switch that is pointed by the p_group,
2661 set hops for ALL the ports in the remote group. */
2662
2663 set_hops_on_remote_sw(p_group, target_lid,
2664 current_hops + 1, is_target_a_sw);
2665
2666 /* Recursion step:
2667 Assign downgoing ports by stepping up, starting on REMOTE switch. */
2668 routed = fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */
2669 p_sw, /* this switch - prev. position switch for the function */
2670 target_lid, /* LID that we're routing to */
2671 FALSE, /* whether this is path to HCA that should by tracked by counters */
2672 is_target_a_sw, /* Whether target lid is a switch or not */
2673 reverse_hop_credit, /* Remaining reverse_hops allowed */
2674 reverse_hops, /* Number of reverse_hops done up to this point */
2675 current_hops + 1);
2676 created_route |= routed;
2677 }
2678
2679 /* Now doing the same thing with horizontal links */
2680 if (p_sw->sibling_port_groups_num > 0)
2681 bubble_sort_down(p_sw->sibling_port_groups,
2682 p_sw->sibling_port_groups_num);
2683
2684 for (i = 0; i < p_sw->sibling_port_groups_num; i++) {
2685 p_group = p_sw->sibling_port_groups[i];
2686 p_remote_sw = p_group->remote_hca_or_sw.p_sw;
2687
2688 /* skip if target lid has been already set on remote switch fwd tbl (with a bigger hop count) */
2689 if (p_remote_sw->p_osm_sw->new_lft[target_lid] != OSM_NO_PATH)
2690 if (current_hops + 1 >=
2691 sw_get_least_hops(p_remote_sw, target_lid))
2692 continue;
2693
2694 if (p_sw->is_leaf) {
2695 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2696 " - Routing SECONDARY path for LID %u: %s --> %s\n",
2697 target_lid,
2698 tuple_to_str(p_sw->tuple),
2699 tuple_to_str(p_remote_sw->tuple));
2700 }
2701
2702 /* Routing REAL lids on SECONDARY path means routing
2703 switch-to-switch or switch-to-CA paths.
2704 We can safely assume that switch will initiate very
2705 few traffic, so there's no point wasting runtime on
2706 trying to balance these routes - always pick port 0. */
2707
2708 p_min_port = NULL;
2709 ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports);
2710 for (j = 0; j < ports_num; j++) {
2711 cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port);
2712 if (!p_min_port) {
2713 /* first port that we're checking - use
2714 it as a port with the lowest load */
2715 p_min_port = p_port;
2716 } else if (p_port->counter_down <
2717 p_min_port->counter_down) {
2718 /* this port is less loaded - use it as min */
2719 p_min_port = p_port;
2720 }
2721 }
2722
2723 p_port = p_min_port;
2724 p_remote_sw->p_osm_sw->new_lft[target_lid] =
2725 p_port->remote_port_num;
2726
2727 /* On the remote switch that is pointed by the p_group,
2728 set hops for ALL the ports in the remote group. */
2729
2730 set_hops_on_remote_sw(p_group, target_lid,
2731 current_hops + 1, is_target_a_sw);
2732
2733 /* Recursion step:
2734 Assign downgoing ports by stepping up, starting on REMOTE switch. */
2735 routed = fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */
2736 p_sw, /* this switch - prev. position switch for the function */
2737 target_lid, /* LID that we're routing to */
2738 FALSE, /* whether this is path to HCA that should by tracked by counters */
2739 is_target_a_sw, /* Whether target lid is a switch or not */
2740 reverse_hop_credit, /* Remaining reverse_hops allowed */
2741 reverse_hops, /* Number of reverse_hops done up to this point */
2742 current_hops + 1);
2743 created_route |= routed;
2744 if (routed) {
2745 p_min_group->counter_down++;
2746 p_min_port->counter_down++;
2747 }
2748 }
2749
2750 /* If we don't have any reverse hop credits, we are done */
2751 if (reverse_hop_credit == 0)
2752 return created_route;
2753
2754 if (p_sw->is_leaf)
2755 return created_route;
2756
2757 /* We explore all the down group ports */
2758 /* We try to reverse jump for each of them */
2759 /* They already have a route to us from the upgoing_by_going_down started earlier */
2760 /* This is only so it'll continue exploring up, after this step backwards */
2761 for (i = 0; i < p_sw->down_port_groups_num; i++) {
2762 p_group = p_sw->down_port_groups[i];
2763 p_remote_sw = p_group->remote_hca_or_sw.p_sw;
2764
2765 /* Skip this port group unless it points to a switch */
2766 if (p_group->remote_node_type != IB_NODE_TYPE_SWITCH)
2767 continue;
2768
2769 /* Recursion step:
2770 Assign downgoing ports by stepping up, fter doing one step down starting on REMOTE switch. */
2771 created_route |= fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */
2772 p_sw, /* this switch - prev. position switch for the function */
2773 target_lid, /* LID that we're routing to */
2774 TRUE, /* whether this is path to HCA that should by tracked by counters */
2775 is_target_a_sw, /* Whether target lid is a switch or not */
2776 reverse_hop_credit - 1, /* Remaining reverse_hops allowed */
2777 reverse_hops + 1, /* Number of reverse_hops done up to this point */
2778 current_hops
2779 + 1);
2780 }
2781 return created_route;
2782
2783 } /* ftree_fabric_route_downgoing_by_going_up() */
2784
2785 /***************************************************/
2786
2787 /*
2788 * Pseudo code:
2789 * foreach leaf switch (in indexing order)
2790 * for each compute node (in indexing order)
2791 * obtain the LID of the compute node
2792 * set local LFT(LID) of the port connecting to compute node
2793 * call assign-down-going-port-by-ascending-up(TRUE,TRUE) on CURRENT switch
2794 * for each MISSING compute node
2795 * call assign-down-going-port-by-ascending-up(FALSE,TRUE) on CURRENT switch
2796 */
2797
/*
 * Route every compute-node (CN) LID, one leaf switch at a time.
 *
 * For each leaf switch in p_ftree->leaf_switches and each of its down port
 * groups whose remote side is a CA port marked as CN:
 *  - the leaf's new_lft entry for the CN LID is set to the first port of
 *    the port group and the hop count for that LID is set to 1;
 *  - fabric_route_downgoing_by_going_up() is started from the leaf with
 *    counter tracking enabled and no reverse hops.
 *
 * If fewer than p_ftree->max_cn_per_leaf targets were routed on a leaf, the
 * remainder is routed as dummy targets using LID 0. After each dummy pass
 * hops[0] and new_lft[0] of every switch in sw_tbl are reset to OSM_NO_PATH.
 */
static void fabric_route_to_cns(IN ftree_fabric_t * p_ftree)
{
	ftree_sw_t *p_sw;
	ftree_hca_t *p_hca;
	ftree_port_group_t *p_leaf_port_group;
	ftree_port_group_t *p_hca_port_group;
	ftree_port_t *p_port;
	unsigned int i, j;
	uint16_t hca_lid;
	unsigned routed_targets_on_leaf;

	OSM_LOG_ENTER(&p_ftree->p_osm->log);

	/* for each leaf switch (in indexing order) */
	for (i = 0; i < p_ftree->leaf_switches_num; i++) {
		p_sw = p_ftree->leaf_switches[i];
		routed_targets_on_leaf = 0;

		/* for each HCA connected to this switch */
		for (j = 0; j < p_sw->down_port_groups_num; j++) {
			p_leaf_port_group = p_sw->down_port_groups[j];

			/* work with this port group only if the remote node is CA */
			if (p_leaf_port_group->remote_node_type !=
			    IB_NODE_TYPE_CA)
				continue;

			p_hca = p_leaf_port_group->remote_hca_or_sw.p_hca;

			/* work with this port group only if remote HCA has CNs */
			if (!p_hca->cn_num)
				continue;

			p_hca_port_group =
			    hca_get_port_group_by_lid(p_hca,
						      p_leaf_port_group->
						      remote_lid);
			CL_ASSERT(p_hca_port_group);

			/* work with this port group only if remote port is CN */
			if (!p_hca_port_group->is_cn)
				continue;

			/* obtain the LID of HCA port */
			hca_lid = p_leaf_port_group->remote_lid;

			/* set local LFT(LID) to the port that is connected to HCA */
			cl_ptr_vector_at(&p_leaf_port_group->ports, 0,
					 (void *)&p_port);
			p_sw->p_osm_sw->new_lft[hca_lid] = p_port->port_num;

			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
				"Switch %s: set path to CN LID %u through port %u\n",
				tuple_to_str(p_sw->tuple),
				hca_lid, p_port->port_num);

			/* set local min hop table(LID) to route to the CA */
			sw_set_hops(p_sw, hca_lid, p_port->port_num, 1, FALSE);

			/* Assign downgoing ports by stepping up.
			   Since we're routing here only CNs, we're routing it as REAL
			   LID and updating fat-tree balancing counters. */
			fabric_route_downgoing_by_going_up(p_ftree, p_sw,	/* local switch - used as a route-downgoing alg. start point */
							   NULL,	/* prev. position switch */
							   hca_lid,	/* LID that we're routing to */
							   TRUE,	/* whether this path to HCA should by tracked by counters */
							   FALSE,	/* whether target lid is a switch or not */
							   0,	/* Number of reverse hops allowed */
							   0,	/* Number of reverse hops done yet */
							   1);	/* Number of hops done yet */

			/* count how many real targets have been routed from this leaf switch */
			routed_targets_on_leaf++;
		}

		/* We're done with the real targets (all CNs) of this leaf switch.
		   Now route the dummy HCAs that are missing or that are non-CNs.
		   When routing to dummy HCAs we don't fill lid matrices. */
		if (p_ftree->max_cn_per_leaf > routed_targets_on_leaf) {
			/* Report the same count that bounds the loop below;
			   the number of down port groups is unrelated to how
			   many CN targets were actually routed. */
			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
				"Routing %u dummy CAs\n",
				p_ftree->max_cn_per_leaf -
				routed_targets_on_leaf);
			for (j = 0; j <
			     p_ftree->max_cn_per_leaf - routed_targets_on_leaf;
			     j++) {
				ftree_sw_t *p_next_sw, *p_ftree_sw;

				/* dummy target: LID 0 behind port 0xFF, one hop away */
				sw_set_hops(p_sw, 0, 0xFF, 1, FALSE);
				/* assign downgoing ports by stepping up */
				fabric_route_downgoing_by_going_up(p_ftree, p_sw,	/* local switch - used as a route-downgoing alg. start point */
								   NULL,	/* prev. position switch */
								   0,	/* LID that we're routing to - ignored for dummy HCA */
								   TRUE,	/* whether this path to HCA should by tracked by counters */
								   FALSE,	/* Whether the target LID is a switch or not */
								   0,	/* Number of reverse hops allowed */
								   0,	/* Number of reverse hops done yet */
								   1);	/* Number of hops done yet */

				p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
				/* need to clean the LID 0 hops for dummy node */
				while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
					p_ftree_sw = p_next_sw;
					p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_ftree_sw->map_item);
					p_ftree_sw->hops[0] = OSM_NO_PATH;
					p_ftree_sw->p_osm_sw->new_lft[0] = OSM_NO_PATH;
				}

			}
		}
	}
	/* done going through all the leaf switches */
	OSM_LOG_EXIT(&p_ftree->p_osm->log);
}				/* fabric_route_to_cns() */
2911
2912 /***************************************************/
2913
2914 /*
2915 * Pseudo code:
2916 * foreach HCA non-CN port in fabric
2917 * obtain the LID of the HCA port
2918 * get switch that is connected to this HCA port
2919 * set switch LFT(LID) to the port connected to the HCA port
2920 * call assign-down-going-port-by-ascending-up(TRUE,TRUE) on the switch
2921 *
2922 * Routing to these HCAs is routing a REAL hca lid on MAIN path.
2923 * We want to allow load-leveling of the traffic to the non-CNs,
2924 * because such nodes may include IO nodes with heavy usage
2925 * - we should set fwd tables
2926 * - we should update port counters
2927 * Routing to non-CNs is done after routing to CNs, so updated port
2928 * counters will not affect CN-to-CN routing.
2929 */
2930
/*
 * Route to every HCA port that is not a compute node.
 *
 * Walks all HCAs in p_ftree->hca_tbl. For each up port group that is not
 * marked as CN and whose remote node is a switch, the attached switch gets
 * an LFT entry and a one-hop min-hop entry for the group's LID, and the
 * downgoing-by-going-up routing is started from that switch with counter
 * tracking enabled. Reverse hops (bounded by the max_reverse_hops subnet
 * option) are granted only to port groups flagged is_io.
 */
static void fabric_route_to_non_cns(IN ftree_fabric_t * p_ftree)
{
	ftree_hca_t *p_cur_hca;
	ftree_hca_t *p_following_hca;
	ftree_port_group_t *p_group;
	ftree_port_t *p_first_port;
	ftree_sw_t *p_attached_sw;
	uint16_t target_lid;
	unsigned sw_port_num;
	unsigned grp_idx;

	OSM_LOG_ENTER(&p_ftree->p_osm->log);

	for (p_cur_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
	     p_cur_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl);
	     p_cur_hca = p_following_hca) {
		/* pick up the successor before working on the current HCA */
		p_following_hca =
		    (ftree_hca_t *) cl_qmap_next(&p_cur_hca->map_item);

		for (grp_idx = 0; grp_idx < p_cur_hca->up_port_groups_num;
		     grp_idx++) {
			p_group = p_cur_hca->up_port_groups[grp_idx];

			/* CN ports were routed already; ports that do not
			   lead to a switch cannot be routed from here */
			if (p_group->is_cn ||
			    p_group->remote_node_type != IB_NODE_TYPE_SWITCH)
				continue;

			p_attached_sw = p_group->remote_hca_or_sw.p_sw;
			target_lid = p_group->lid;

			/* the switch reaches the HCA through the remote end
			   of the first port in the group */
			cl_ptr_vector_at(&p_group->ports, 0,
					 (void *)&p_first_port);
			sw_port_num = p_first_port->remote_port_num;
			p_attached_sw->p_osm_sw->new_lft[target_lid] =
			    sw_port_num;

			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
				"Switch %s: set path to non-CN HCA LID %u through port %u\n",
				tuple_to_str(p_attached_sw->tuple),
				target_lid, sw_port_num);

			/* the CA is one hop away from its switch */
			sw_set_hops(p_attached_sw, target_lid, sw_port_num, 1,
				    FALSE);

			/* These are REAL targets. They are not CNs, but they
			   are treated as MAIN path so that the balancing
			   counters get updated. */
			fabric_route_downgoing_by_going_up(p_ftree,
							   p_attached_sw,	/* start point of the algorithm */
							   NULL,	/* no previous switch */
							   target_lid,	/* LID being routed */
							   TRUE,	/* track this path with counters */
							   FALSE,	/* target is not a switch */
							   p_group->is_io ? p_ftree->p_osm->subn.opt.max_reverse_hops : 0,	/* reverse hops allowed */
							   0,	/* reverse hops done so far */
							   1);	/* hops done so far */
		}
		/* all port groups of this HCA handled - move to the next one */
	}

	OSM_LOG_EXIT(&p_ftree->p_osm->log);
}				/* fabric_route_to_non_cns() */
2997
2998 /***************************************************/
2999
3000 /*
3001 * Pseudo code:
3002 * foreach switch in fabric
3003 * obtain its LID
3004 * set local LFT(LID) to port 0
3005 * call assign-down-going-port-by-ascending-up(TRUE,FALSE) on CURRENT switch
3006 *
3007 * Routing to switch is similar to routing a REAL hca lid on SECONDARY path:
3008 * - we should set fwd tables
3009 * - we should NOT update port counters
3010 */
3011
/*
 * Route to the LID of every switch in p_ftree->sw_tbl.
 *
 * Each switch routes its own LID to port 0 with zero hops, then the
 * downgoing-by-going-up routing is started from it with counter tracking
 * disabled, the target flagged as a switch, and no reverse hops.
 */
static void fabric_route_to_switches(IN ftree_fabric_t * p_ftree)
{
	ftree_sw_t *p_cur_sw;
	ftree_sw_t *p_following_sw;

	OSM_LOG_ENTER(&p_ftree->p_osm->log);

	for (p_cur_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
	     p_cur_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
	     p_cur_sw = p_following_sw) {
		/* pick up the successor before working on the current switch */
		p_following_sw =
		    (ftree_sw_t *) cl_qmap_next(&p_cur_sw->map_item);

		/* a switch reaches itself through port 0 */
		p_cur_sw->p_osm_sw->new_lft[p_cur_sw->lid] = 0;

		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
			"Switch %s (LID %u): routing switch-to-switch paths\n",
			tuple_to_str(p_cur_sw->tuple), p_cur_sw->lid);

		/* min hop table: port 0, zero hops to itself */
		sw_set_hops(p_cur_sw, p_cur_sw->lid, 0, 0, TRUE);

		fabric_route_downgoing_by_going_up(p_ftree, p_cur_sw,	/* start point of the algorithm */
						   NULL,	/* no previous switch */
						   p_cur_sw->lid,	/* LID being routed */
						   FALSE,	/* do not track this path with counters */
						   TRUE,	/* target is a switch */
						   0,	/* reverse hops allowed */
						   0,	/* reverse hops done so far */
						   0);	/* hops done so far */
	}

	OSM_LOG_EXIT(&p_ftree->p_osm->log);
}				/* fabric_route_to_switches() */
3047
3048 /***************************************************
3049 ***************************************************/
3050
/*
 * Fill in switch LIDs that are still unrouted on switches whose rank is
 * lower than p_ftree->leaf_switch_rank.
 *
 * The last entry of p_ftree->leaf_switches is used as a relay. For every
 * LID up to the relay's max_lid_ho that
 *  - has no entry yet (OSM_NO_PATH) in the examined switch's new_lft,
 *  - is reachable from the relay leaf (its hops[] entry is set), and
 *  - belongs to a switch,
 * the examined switch forwards the LID through the same port it uses to
 * reach the relay leaf. The recorded hop count is the distance to the relay
 * plus the relay's distance to the LID.
 *
 * Does nothing when the fabric has no leaf switches.
 */
static void fabric_route_roots(IN ftree_fabric_t * p_ftree)
{
	uint16_t lid;
	uint8_t port_num;
	osm_port_t *p_port;
	ftree_sw_t *p_sw;
	ftree_sw_t *p_leaf_sw;

	OSM_LOG_ENTER(&p_ftree->p_osm->log);

	/*
	 * With zero leaf switches the index used below would fall outside
	 * leaf_switches[], so there is nothing that can be routed.
	 */
	if (!p_ftree->leaf_switches_num) {
		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
			"No leaf switches - skipping routing of missing LIDs\n");
		goto Exit;
	}

	/*
	 * We need a switch that will accommodate all the down/up turns in
	 * the fabric. Having these turns in a single place in the fabric
	 * will not create credit loops.
	 * So we need to select this switch.
	 * The idea here is to choose the leaf with the highest index. I don't
	 * have any theory to back me up on this. It's just a general thought
	 * that this way the switch that might be a bottleneck for many mcast
	 * groups will be far away from the OpenSM, so it will draw the
	 * multicast traffic away from the SM.
	 */

	p_leaf_sw = p_ftree->leaf_switches[p_ftree->leaf_switches_num - 1];

	/*
	 * Now go over all the switches in the fabric that
	 * have lower rank, and route the missing LIDs to
	 * the selected leaf switch.
	 * In short, this leaf switch now poses a target
	 * for all those missing LIDs.
	 */

	for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
	     p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
	     p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {

		if (p_sw->rank >= p_ftree->leaf_switch_rank)
			continue;

		for (lid = 1; lid <= p_leaf_sw->p_osm_sw->max_lid_ho; lid++) {

			/* skip LIDs that are already routed here or that the
			   selected leaf cannot reach itself */
			if (p_sw->p_osm_sw->new_lft[lid] != OSM_NO_PATH ||
			    p_leaf_sw->hops[lid] == OSM_NO_PATH)
				continue;

			p_port = osm_get_port_by_lid_ho(&p_ftree->p_osm->subn,
							lid);

			/* we're interested only in switches */
			if (!p_port || !p_port->p_node->sw)
				continue;

			/*
			 * the missing LID will be routed through the same
			 * port that routes to the selected leaf switch
			 */
			port_num = p_sw->p_osm_sw->new_lft[p_leaf_sw->lid];

			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
				"Switch %s: setting path to LID %u "
				"through port %u\n",
				tuple_to_str(p_sw->tuple), lid, port_num);

			/* set local lft */
			p_sw->p_osm_sw->new_lft[lid] = port_num;

			/*
			 * Set local min hop table.
			 * The distance to the target LID is a distance
			 * to the selected leaf switch plus the distance
			 * from the leaf to the target LID.
			 */
			sw_set_hops(p_sw, lid, port_num,
				    p_sw->hops[p_leaf_sw->lid] +
				    p_leaf_sw->hops[lid], TRUE);
		}
	}

Exit:
	OSM_LOG_EXIT(&p_ftree->p_osm->log);
}				/* fabric_route_roots() */
3131
3132 /***************************************************/
3133
/*
 * Register every node of the subnet's node_guid_tbl with the fat-tree:
 * CAs through fabric_add_hca(), switches through fabric_add_sw().
 * Routers are ignored.
 *
 * Returns 0 on success, or -1 as soon as a node of any other type is found
 * (ERR AB0E is logged).
 */
static int fabric_populate_nodes(IN ftree_fabric_t * p_ftree)
{
	cl_qmap_t *p_node_tbl = &p_ftree->p_osm->subn.node_guid_tbl;
	osm_node_t *p_cur_node;
	osm_node_t *p_following_node;
	uint8_t node_type;
	int status = 0;

	OSM_LOG_ENTER(&p_ftree->p_osm->log);

	for (p_cur_node = (osm_node_t *) cl_qmap_head(p_node_tbl);
	     p_cur_node != (osm_node_t *) cl_qmap_end(p_node_tbl);
	     p_cur_node = p_following_node) {
		/* pick up the successor before working on the current node */
		p_following_node =
		    (osm_node_t *) cl_qmap_next(&p_cur_node->map_item);

		node_type = osm_node_get_type(p_cur_node);

		if (node_type == IB_NODE_TYPE_CA) {
			fabric_add_hca(p_ftree, p_cur_node);
		} else if (node_type == IB_NODE_TYPE_SWITCH) {
			fabric_add_sw(p_ftree, p_cur_node->sw);
		} else if (node_type != IB_NODE_TYPE_ROUTER) {
			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
				"ERR AB0E: Node GUID 0x%016" PRIx64
				" - Unknown node type: %s\n",
				cl_ntoh64(osm_node_get_node_guid(p_cur_node)),
				ib_get_node_type_str(node_type));
			status = -1;
			break;
		}
	}

	OSM_LOG_EXIT(&p_ftree->p_osm->log);
	return status;
}				/* fabric_populate_nodes() */
3173
3174 /***************************************************
3175 ***************************************************/
3176
sw_update_rank(IN ftree_sw_t * p_sw,IN uint32_t new_rank)3177 static boolean_t sw_update_rank(IN ftree_sw_t * p_sw, IN uint32_t new_rank)
3178 {
3179 if (sw_ranked(p_sw) && p_sw->rank <= new_rank)
3180 return FALSE;
3181 p_sw->rank = new_rank;
3182 return TRUE;
3183
3184 }
3185
3186 /***************************************************/
3187
/*
 * Breadth-first ranking of switches, driven by the supplied list.
 *
 * Switches are removed from the head of p_ranking_bfs_list one by one. Each
 * neighbouring switch behind a healthy link (port 0 is not examined) is
 * offered the rank of the current switch plus one; neighbours whose rank
 * changes are appended to the list. When the list is drained,
 * p_ftree->max_switch_rank receives the rank given by the last successful
 * update (0 if no update happened).
 */
static void rank_switches_from_leafs(IN ftree_fabric_t * p_ftree,
				     IN cl_list_t * p_ranking_bfs_list)
{
	unsigned last_assigned_rank = 0;

	while (!cl_is_list_empty(p_ranking_bfs_list)) {
		ftree_sw_t *p_cur_sw =
		    (ftree_sw_t *) cl_list_remove_head(p_ranking_bfs_list);
		osm_node_t *p_cur_node = p_cur_sw->p_osm_sw->p_node;
		uint8_t port;

		/* note: skipping port 0 on switches */
		for (port = 1; port < osm_node_get_num_physp(p_cur_node);
		     port++) {
			osm_physp_t *p_physp =
			    osm_node_get_physp_ptr(p_cur_node, port);
			osm_node_t *p_peer_node;
			ftree_sw_t *p_peer_sw;

			if (!p_physp || !osm_link_is_healthy(p_physp))
				continue;

			/* only links that lead to another switch matter */
			p_peer_node =
			    osm_node_get_remote_node(p_cur_node, port, NULL);
			if (!p_peer_node ||
			    osm_node_get_type(p_peer_node) !=
			    IB_NODE_TYPE_SWITCH)
				continue;

			/* no fat-tree switch object behind this GUID */
			p_peer_sw =
			    fabric_get_sw_by_guid(p_ftree,
						  osm_node_get_node_guid
						  (p_peer_node));
			if (!p_peer_sw)
				continue;

			/* nothing to do unless the peer's rank changed */
			if (!sw_update_rank(p_peer_sw, p_cur_sw->rank + 1))
				continue;

			last_assigned_rank = p_peer_sw->rank;
			cl_list_insert_tail(p_ranking_bfs_list, p_peer_sw);
		}
	}

	/* set FatTree maximal switch rank */
	p_ftree->max_switch_rank = last_assigned_rank;

}				/* rank_switches_from_leafs() */
3238
3239 /***************************************************/
3240
/*
 * Give rank 0 to every switch that is directly attached to the given HCA
 * and append each switch whose rank changed to p_ranking_bfs_list.
 *
 * Ports without a healthy link, without a remote node, or connected to a
 * router are skipped. Switches for which sw_update_rank() reports no change
 * are not queued.
 *
 * Returns 0 on success, or -1 if the HCA is connected directly to another
 * CA (ERR AB0F) or to a node of unknown type (ERR AB10).
 */
static int rank_leaf_switches(IN ftree_fabric_t * p_ftree,
			      IN ftree_hca_t * p_hca,
			      IN cl_list_t * p_ranking_bfs_list)
{
	ftree_sw_t *p_sw;
	osm_node_t *p_osm_node = p_hca->p_osm_node;
	osm_node_t *p_remote_osm_node;
	osm_physp_t *p_osm_port;
	/* plain automatic loop index: nothing has to survive between calls */
	uint8_t i;
	int res = 0;

	OSM_LOG_ENTER(&p_ftree->p_osm->log);

	for (i = 0; i < osm_node_get_num_physp(p_osm_node); i++) {
		p_osm_port = osm_node_get_physp_ptr(p_osm_node, i);
		if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
			continue;

		p_remote_osm_node =
		    osm_node_get_remote_node(p_osm_node, i, NULL);
		if (!p_remote_osm_node)
			continue;

		switch (osm_node_get_type(p_remote_osm_node)) {
		case IB_NODE_TYPE_CA:
			/* HCA connected directly to another HCA - not FatTree */
			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
				"ERR AB0F: "
				"CA conected directly to another CA: " "0x%016"
				PRIx64 " <---> 0x%016" PRIx64 "\n",
				hca_get_guid_ho(p_hca),
				cl_ntoh64(osm_node_get_node_guid
					  (p_remote_osm_node)));
			res = -1;
			goto Exit;

		case IB_NODE_TYPE_ROUTER:
			/* leaving this port - proceeding to the next one */
			continue;

		case IB_NODE_TYPE_SWITCH:
			/* continue with this port */
			break;

		default:
			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
				"ERR AB10: Node GUID 0x%016" PRIx64
				" - Unknown node type: %s\n",
				cl_ntoh64(osm_node_get_node_guid
					  (p_remote_osm_node)),
				ib_get_node_type_str(osm_node_get_type
						     (p_remote_osm_node)));
			res = -1;
			goto Exit;
		}

		/* remote node is switch */

		p_sw = fabric_get_sw_by_guid(p_ftree,
					     osm_node_get_node_guid
					     (p_osm_port->p_remote_physp->
					      p_node));
		CL_ASSERT(p_sw);

		/* if needed, rank the remote switch and add it to the BFS list */

		if (!sw_update_rank(p_sw, 0))
			continue;
		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
			"Marking rank of switch that is directly connected to CA:\n"
			"                                            - CA guid    : 0x%016"
			PRIx64 "\n"
			"                                            - Switch guid: 0x%016"
			PRIx64 "\n"
			"                                            - Switch LID : %u\n",
			hca_get_guid_ho(p_hca),
			sw_get_guid_ho(p_sw), p_sw->lid);
		cl_list_insert_tail(p_ranking_bfs_list, p_sw);
	}

Exit:
	OSM_LOG_EXIT(&p_ftree->p_osm->log);
	return res;
}				/* rank_leaf_switches() */
3325
3326 /***************************************************/
3327
/*
 * Map callback: replace a switch's rank with
 * (p_ftree->max_switch_rank - rank).
 *
 * p_map_item is the map item of an ftree_sw_t and context is the
 * ftree_fabric_t. Switches whose rank is 0xFFFFFFFF are left unchanged.
 */
static void sw_reverse_rank(IN cl_map_item_t * const p_map_item,
			    IN void *context)
{
	ftree_sw_t *p_sw = (ftree_sw_t *) p_map_item;
	ftree_fabric_t *p_ftree = (ftree_fabric_t *) context;

	if (p_sw->rank == 0xFFFFFFFF)
		return;

	p_sw->rank = p_ftree->max_switch_rank - p_sw->rank;
}
3336
3337 /***************************************************
3338 ***************************************************/
3339
3340 static int
fabric_construct_hca_ports(IN ftree_fabric_t * p_ftree,IN ftree_hca_t * p_hca)3341 fabric_construct_hca_ports(IN ftree_fabric_t * p_ftree, IN ftree_hca_t * p_hca)
3342 {
3343 ftree_sw_t *p_remote_sw;
3344 osm_node_t *p_node = p_hca->p_osm_node;
3345 osm_node_t *p_remote_node;
3346 uint8_t remote_node_type;
3347 ib_net64_t remote_node_guid;
3348 osm_physp_t *p_remote_osm_port;
3349 uint8_t i;
3350 uint8_t remote_port_num;
3351 boolean_t is_cn;
3352 boolean_t is_in_cn_file;
3353 boolean_t is_io;
3354 boolean_t is_cns_file_provided = fabric_cns_provided(p_ftree);
3355 boolean_t is_ios_file_provided = fabric_ios_provided(p_ftree);
3356 int res = 0;
3357
3358 for (i = 0; i < osm_node_get_num_physp(p_node); i++) {
3359 osm_physp_t *p_osm_port = osm_node_get_physp_ptr(p_node, i);
3360 is_io = FALSE;
3361 is_cn = TRUE;
3362 is_in_cn_file = FALSE;
3363
3364 if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
3365 continue;
3366
3367 if (p_hca->disconnected_ports[i])
3368 continue;
3369
3370 p_remote_osm_port = osm_physp_get_remote(p_osm_port);
3371 p_remote_node =
3372 osm_node_get_remote_node(p_node, i, &remote_port_num);
3373
3374 if (!p_remote_osm_port || !p_remote_node)
3375 continue;
3376
3377 remote_node_type = osm_node_get_type(p_remote_node);
3378 remote_node_guid = osm_node_get_node_guid(p_remote_node);
3379
3380 switch (remote_node_type) {
3381 case IB_NODE_TYPE_ROUTER:
3382 /* leaving this port - proceeding to the next one */
3383 continue;
3384
3385 case IB_NODE_TYPE_CA:
3386 /* HCA connected directly to another HCA - not FatTree */
3387 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3388 "ERR AB11: "
3389 "CA conected directly to another CA: " "0x%016"
3390 PRIx64 " <---> 0x%016" PRIx64 "\n",
3391 cl_ntoh64(osm_node_get_node_guid(p_node)),
3392 cl_ntoh64(remote_node_guid));
3393 res = -1;
3394 goto Exit;
3395
3396 case IB_NODE_TYPE_SWITCH:
3397 /* continue with this port */
3398 break;
3399
3400 default:
3401 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3402 "ERR AB12: Node GUID 0x%016" PRIx64
3403 " - Unknown node type: %s\n",
3404 cl_ntoh64(remote_node_guid),
3405 ib_get_node_type_str(remote_node_type));
3406 res = -1;
3407 goto Exit;
3408 }
3409
3410 /* remote node is switch */
3411
3412 p_remote_sw = fabric_get_sw_by_guid(p_ftree, remote_node_guid);
3413 CL_ASSERT(p_remote_sw);
3414
3415 /* If CN file is not supplied, then all the CAs considered as Compute Nodes.
3416 Otherwise all the CAs are not CNs, and only guids that are present in the
3417 CN file will be marked as compute nodes. */
3418 if (is_cns_file_provided == TRUE) {
3419 name_map_item_t *p_elem = (name_map_item_t *)
3420 cl_qmap_get(&p_ftree->cn_guid_tbl,
3421 cl_ntoh64(osm_physp_get_port_guid
3422 (p_osm_port)));
3423 if (p_elem == (name_map_item_t *)
3424 cl_qmap_end(&p_ftree->cn_guid_tbl))
3425 is_cn = FALSE;
3426 else
3427 is_in_cn_file = TRUE;
3428 }
3429 if (is_in_cn_file == FALSE && is_ios_file_provided == TRUE) {
3430 name_map_item_t *p_elem = (name_map_item_t *)
3431 cl_qmap_get(&p_ftree->io_guid_tbl,
3432 cl_ntoh64(osm_physp_get_port_guid
3433 (p_osm_port)));
3434 if (p_elem != (name_map_item_t *)
3435 cl_qmap_end(&p_ftree->io_guid_tbl)) {
3436 is_io = TRUE;
3437 is_cn = FALSE;
3438 }
3439 }
3440
3441 if (is_cn) {
3442 p_ftree->cn_num++;
3443 p_hca->cn_num++;
3444 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3445 "Marking CN port GUID 0x%016" PRIx64 "\n",
3446 cl_ntoh64(osm_physp_get_port_guid(p_osm_port)));
3447 } else if (is_io) {
3448 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3449 "Marking I/O port GUID 0x%016" PRIx64 "\n",
3450 cl_ntoh64(osm_physp_get_port_guid(p_osm_port)));
3451 } else {
3452 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3453 "Marking non-CN port GUID 0x%016" PRIx64 "\n",
3454 cl_ntoh64(osm_physp_get_port_guid(p_osm_port)));
3455 }
3456 p_ftree->ca_ports++;
3457
3458 hca_add_port(p_ftree,
3459 p_hca, /* local ftree_hca object */
3460 i, /* local port number */
3461 remote_port_num, /* remote port number */
3462 cl_ntoh16(osm_node_get_base_lid(p_node, i)), /* local lid */
3463 cl_ntoh16(osm_node_get_base_lid(p_remote_node, 0)), /* remote lid */
3464 osm_physp_get_port_guid(p_osm_port), /* local port guid */
3465 osm_physp_get_port_guid(p_remote_osm_port), /* remote port guid */
3466 remote_node_guid, /* remote node guid */
3467 remote_node_type, /* remote node type */
3468 (void *)p_remote_sw, /* remote ftree_hca/sw object */
3469 is_cn, is_io); /* whether this port is compute node */
3470 }
3471
3472 Exit:
3473 return res;
3474 } /* fabric_construct_hca_ports() */
3475
3476 /***************************************************
3477 ***************************************************/
3478
/*
 * Create the fat-tree port objects of one switch.
 *
 * Every healthy external port (port 0 is not examined) with a remote port
 * and remote node is handed to sw_add_port(), except loopback links to the
 * same switch and links to routers. A link to a CA is a DOWN link. For a
 * link to another switch the direction is derived by comparing the two
 * ranks: UP if the local rank is higher than the remote one, SAME if they
 * are equal, DOWN otherwise. p_ftree->lft_max_lid is raised whenever a
 * larger remote LID is seen.
 *
 * Returns 0 on success, or -1 if a remote node of unknown type is found
 * (ERR AB13).
 */
static int fabric_construct_sw_ports(IN ftree_fabric_t * p_ftree,
				     IN ftree_sw_t * p_sw)
{
	osm_node_t *p_node = p_sw->p_osm_sw->p_node;
	uint8_t port;

	CL_ASSERT(osm_node_get_type(p_node) == IB_NODE_TYPE_SWITCH);

	for (port = 1; port < osm_node_get_num_physp(p_node); port++) {
		osm_physp_t *p_physp = osm_node_get_physp_ptr(p_node, port);
		osm_physp_t *p_peer_physp;
		osm_node_t *p_peer_node;
		ftree_hca_t *p_peer_hca;
		ftree_sw_t *p_peer_sw;
		void *p_peer_obj;
		ftree_direction_t direction;
		ib_net64_t peer_node_guid;
		uint16_t peer_lid;
		uint8_t peer_node_type;
		uint8_t peer_port_num;

		if (!p_physp || !osm_link_is_healthy(p_physp))
			continue;

		p_peer_physp = osm_physp_get_remote(p_physp);
		if (!p_peer_physp)
			continue;

		p_peer_node =
		    osm_node_get_remote_node(p_node, port, &peer_port_num);
		if (!p_peer_node)
			continue;

		/* ignore any loopback connection on switch */
		if (p_node == p_peer_node) {
			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
				"Ignoring loopback on switch GUID 0x%016" PRIx64
				", LID %u, rank %u\n",
				sw_get_guid_ho(p_sw),
				p_sw->lid, p_sw->rank);
			continue;
		}

		peer_node_type = osm_node_get_type(p_peer_node);
		peer_node_guid = osm_node_get_node_guid(p_peer_node);

		switch (peer_node_type) {
		case IB_NODE_TYPE_SWITCH:
			/* switch connected to another switch */
			p_peer_sw =
			    fabric_get_sw_by_guid(p_ftree, peer_node_guid);
			CL_ASSERT(p_peer_sw);

			p_peer_obj = (void *)p_peer_sw;

			if (p_sw->rank > p_peer_sw->rank)
				direction = FTREE_DIRECTION_UP;
			else if (p_sw->rank < p_peer_sw->rank)
				direction = FTREE_DIRECTION_DOWN;
			else
				direction = FTREE_DIRECTION_SAME;

			/* switch LID is only in port 0 port_info structure */
			peer_lid =
			    cl_ntoh16(osm_node_get_base_lid(p_peer_node, 0));
			break;

		case IB_NODE_TYPE_CA:
			/* switch connected to hca */
			p_peer_hca =
			    fabric_get_hca_by_guid(p_ftree, peer_node_guid);
			CL_ASSERT(p_peer_hca);

			p_peer_obj = (void *)p_peer_hca;
			direction = FTREE_DIRECTION_DOWN;

			peer_lid =
			    cl_ntoh16(osm_physp_get_base_lid(p_peer_physp));
			break;

		case IB_NODE_TYPE_ROUTER:
			/* leaving this port - proceeding to the next one */
			continue;

		default:
			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
				"ERR AB13: Node GUID 0x%016" PRIx64
				" - Unknown node type: %s\n",
				cl_ntoh64(peer_node_guid),
				ib_get_node_type_str(peer_node_type));
			return -1;
		}

		sw_add_port(p_sw,	/* local ftree_sw object */
			    port,	/* local port number */
			    peer_port_num,	/* remote port number */
			    p_sw->lid,	/* local lid */
			    peer_lid,	/* remote lid */
			    osm_physp_get_port_guid(p_physp),	/* local port guid */
			    osm_physp_get_port_guid(p_peer_physp),	/* remote port guid */
			    peer_node_guid,	/* remote node guid */
			    peer_node_type,	/* remote node type */
			    p_peer_obj,	/* remote ftree_hca/sw object */
			    direction);	/* port direction (up or down) */

		/* Track the max lid (in host order) that exists in the fabric */
		if (peer_lid > p_ftree->lft_max_lid)
			p_ftree->lft_max_lid = peer_lid;
	}

	return 0;
}				/* fabric_construct_sw_ports() */
3595
3596 /***************************************************
3597 ***************************************************/
/* Context passed to rank_root_sw_by_guid() through its opaque cxt argument */
struct rank_root_cxt {
	ftree_fabric_t *fabric;	/* fabric in which the root GUIDs are looked up */
	cl_list_t *list;	/* list that receives the root switches found */
};
3602 /***************************************************
3603 ***************************************************/
rank_root_sw_by_guid(void * cxt,uint64_t guid,char * p)3604 static int rank_root_sw_by_guid(void *cxt, uint64_t guid, char *p)
3605 {
3606 struct rank_root_cxt *c = cxt;
3607 ftree_sw_t *sw;
3608
3609 sw = fabric_get_sw_by_guid(c->fabric, cl_hton64(guid));
3610 if (!sw) {
3611 /* the specified root guid wasn't found in the fabric */
3612 OSM_LOG(&c->fabric->p_osm->log, OSM_LOG_ERROR, "ERR AB24: "
3613 "Root switch GUID 0x%" PRIx64 " not found\n", guid);
3614 return 0;
3615 }
3616
3617 OSM_LOG(&c->fabric->p_osm->log, OSM_LOG_DEBUG,
3618 "Ranking root switch with GUID 0x%" PRIx64 "\n", guid);
3619 sw->rank = 0;
3620 cl_list_insert_tail(c->list, sw);
3621
3622 return 0;
3623 }
3624 /***************************************************
3625 ***************************************************/
fabric_load_roots(IN ftree_fabric_t * p_ftree,IN cl_list_t * p_ranking_bfs_list)3626 static boolean_t fabric_load_roots(IN ftree_fabric_t * p_ftree,
3627 IN cl_list_t* p_ranking_bfs_list)
3628 {
3629 struct rank_root_cxt context;
3630 unsigned num_roots;
3631
3632 if (p_ranking_bfs_list) {
3633
3634 /* Rank all the roots and add them to list */
3635 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3636 "Fetching root nodes from file %s\n",
3637 p_ftree->p_osm->subn.opt.root_guid_file);
3638
3639 context.fabric = p_ftree;
3640 context.list = p_ranking_bfs_list;
3641 if (parse_node_map(p_ftree->p_osm->subn.opt.root_guid_file,
3642 rank_root_sw_by_guid, &context)) {
3643 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB2A: "
3644 "cannot parse root guids file \'%s\'\n",
3645 p_ftree->p_osm->subn.opt.root_guid_file);
3646 return FALSE;
3647 }
3648
3649 num_roots = cl_list_count(p_ranking_bfs_list);
3650 if (!num_roots) {
3651 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB25: "
3652 "No valid roots supplied\n");
3653 return FALSE;
3654 }
3655
3656 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3657 "Ranked %u valid root switches\n", num_roots);
3658 }
3659 return TRUE;
3660 }
3661 /***************************************************
3662 ***************************************************/
/*
 * Ranks the switches breadth-first, starting from the switches that
 * are already queued on p_ranking_bfs_list.
 *
 * Each switch taken from the head of the list has its ports 1..N-1
 * scanned. For every healthy link that leads to another switch,
 * sw_update_rank() is called with (rank of the current switch + 1);
 * when it reports an update, the remote switch is appended to the list.
 * The rank of the last updated switch (0 if there was none) is stored
 * in p_ftree->max_switch_rank.
 *
 * Returns 0 on success, -1 if p_ranking_bfs_list is NULL.
 */
static int fabric_rank_from_roots(IN ftree_fabric_t * p_ftree,
				  IN cl_list_t* p_ranking_bfs_list)
{
	unsigned highest_rank = 0;
	int rc = -1;

	OSM_LOG_ENTER(&p_ftree->p_osm->log);

	if (!p_ranking_bfs_list)
		goto Exit;

	while (!cl_is_list_empty(p_ranking_bfs_list)) {
		ftree_sw_t *p_cur_sw =
		    (ftree_sw_t *) cl_list_remove_head(p_ranking_bfs_list);
		osm_node_t *p_cur_node = p_cur_sw->p_osm_sw->p_node;
		unsigned port;

		/* port 0 of a switch is skipped */
		for (port = 1; port < osm_node_get_num_physp(p_cur_node);
		     port++) {
			osm_physp_t *p_physp =
			    osm_node_get_physp_ptr(p_cur_node, port);
			osm_node_t *p_peer_node;
			ftree_sw_t *p_peer_sw;

			if (!p_physp || !osm_link_is_healthy(p_physp))
				continue;

			/* only links towards other switches are of interest */
			p_peer_node =
			    osm_node_get_remote_node(p_cur_node, port, NULL);
			if (!p_peer_node ||
			    osm_node_get_type(p_peer_node) !=
			    IB_NODE_TYPE_SWITCH)
				continue;

			p_peer_sw =
			    fabric_get_sw_by_guid(p_ftree,
						  osm_node_get_node_guid
						  (p_peer_node));
			CL_ASSERT(p_peer_sw);

			/* nothing to do unless the peer's rank was updated */
			if (!sw_update_rank(p_peer_sw, p_cur_sw->rank + 1))
				continue;

			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
				"Ranking switch 0x%" PRIx64
				" with rank %u\n",
				sw_get_guid_ho(p_peer_sw), p_peer_sw->rank);
			highest_rank = p_peer_sw->rank;
			cl_list_insert_tail(p_ranking_bfs_list, p_peer_sw);
		}
		/* all ports visited - continue with the next queued switch */
	}

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
		"Subnet ranking completed. Max Node Rank = %u\n",
		highest_rank);

	/* remember the maximal switch rank of the fabric */
	p_ftree->max_switch_rank = highest_rank;
	rc = 0;

Exit:
	OSM_LOG_EXIT(&p_ftree->p_osm->log);
	return rc;
}				/* fabric_rank_from_roots() */
3730
3731 /***************************************************
3732 ***************************************************/
3733
/*
 * Ranks the switches of the fabric starting from the HCAs.
 *
 * rank_leaf_switches() is called for every HCA in hca_tbl with a local
 * BFS list, rank_switches_from_leafs() then continues from that list,
 * and finally sw_reverse_rank is applied to every switch in sw_tbl to
 * turn the reversed ranking around.
 *
 * Returns 0 on success, -1 if rank_leaf_switches() fails for any HCA
 * (ERR AB14). The local list is destroyed on both paths.
 */
static int fabric_rank_from_hcas(IN ftree_fabric_t * p_ftree)
{
	ftree_hca_t *p_hca;
	ftree_hca_t *p_next_hca;
	cl_list_t ranking_bfs_list;
	int res = 0;

	OSM_LOG_ENTER(&p_ftree->p_osm->log);

	cl_list_init(&ranking_bfs_list, 10);

	/* Mark REVERSED rank of all the switches in the subnet.
	   Start from switches that are connected to hca's, and
	   scan all the switches in the subnet. */
	p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
	while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
		p_hca = p_next_hca;
		p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
		if (rank_leaf_switches(p_ftree, p_hca, &ranking_bfs_list) != 0) {
			res = -1;
			/* terminate the message with a newline like every
			   other log entry, so that the following entry
			   does not end up on the same line */
			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
				"ERR AB14: "
				"Subnet ranking failed - subnet is not FatTree\n");
			goto Exit;
		}
	}

	/* Now rank rest of the switches in the fabric, while the
	   list already contains all the ranked leaf switches */
	rank_switches_from_leafs(p_ftree, &ranking_bfs_list);

	/* fix ranking of the switches by reversing the ranking direction */
	cl_qmap_apply_func(&p_ftree->sw_tbl, sw_reverse_rank, (void *)p_ftree);

Exit:
	cl_list_destroy(&ranking_bfs_list);
	OSM_LOG_EXIT(&p_ftree->p_osm->log);
	return res;
}				/* fabric_rank_from_hcas() */
3773
3774 /***************************************************
3775 * After ranking from HCA's we want to re-rank using
3776 * the roots
3777 ***************************************************/
/*
 * Walks sw_tbl once: every switch whose rank is 0 is appended to
 * p_ranking_bfs_list, every other switch has its rank reset to
 * 0xFFFFFFFF. The list is then passed to fabric_rank_from_roots(),
 * whose result is returned.
 */
static int fabric_rerank_using_root(IN ftree_fabric_t * p_ftree,
				    IN cl_list_t* p_ranking_bfs_list)
{
	ftree_sw_t *p_cur_sw;
	ftree_sw_t *p_following_sw;
	int rc;

	OSM_LOG_ENTER(&p_ftree->p_osm->log);

	for (p_cur_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
	     p_cur_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
	     p_cur_sw = p_following_sw) {
		p_following_sw =
		    (ftree_sw_t *) cl_qmap_next(&p_cur_sw->map_item);

		if (p_cur_sw->rank != 0) {
			/* not a root - forget the rank it had so far */
			p_cur_sw->rank = 0xFFFFFFFF;
			continue;
		}
		cl_list_insert_tail(p_ranking_bfs_list, p_cur_sw);
	}

	rc = fabric_rank_from_roots(p_ftree, p_ranking_bfs_list);

	OSM_LOG_EXIT(&p_ftree->p_osm->log);
	return rc;
}
3800 /***************************************************
3801 ***************************************************/
/*
 * Ranks the fabric switches.
 *
 * If roots are provided and fabric_load_roots() succeeds, ranking is
 * done by fabric_rank_from_roots(). In every other case the fabric is
 * ranked by fabric_rank_from_hcas() and, if that succeeds, re-ranked by
 * fabric_rerank_using_root().
 *
 * Returns the result of the last ranking step that was run
 * (0 on success).
 */
static int fabric_rank(IN ftree_fabric_t * p_ftree)
{
	cl_list_t ranking_bfs_list;
	int rc;

	OSM_LOG_ENTER(&p_ftree->p_osm->log);
	cl_list_init(&ranking_bfs_list, 10);

	if (!fabric_roots_provided(p_ftree) ||
	    !fabric_load_roots(p_ftree, &ranking_bfs_list)) {
		rc = fabric_rank_from_hcas(p_ftree);
		if (rc == 0)
			rc = fabric_rerank_using_root(p_ftree,
						      &ranking_bfs_list);
	} else {
		rc = fabric_rank_from_roots(p_ftree, &ranking_bfs_list);
	}

	/* the max rank is reported only when ranking succeeded */
	if (rc == 0)
		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
			"FatTree max switch rank is %u\n",
			p_ftree->max_switch_rank);

	cl_list_destroy(&ranking_bfs_list);
	OSM_LOG_EXIT(&p_ftree->p_osm->log);
	return rc;
}				/* fabric_rank() */
3830
3831 /***************************************************
3832 ***************************************************/
3833
/*
 * Sets p_ftree->leaf_switch_rank.
 *
 * When no roots are provided, the leaf rank is max_switch_rank.
 * Otherwise the first HCA in hca_tbl with a non-zero cn_num is taken,
 * its first up port group flagged is_cn is located, and the rank of the
 * switch on the remote side of that group becomes the leaf rank.
 * The CL_ASSERTs state the expectations that such an HCA and such a
 * port group exist and that the group leads to a switch.
 */
static void fabric_set_leaf_rank(IN ftree_fabric_t * p_ftree)
{
	unsigned grp;
	ftree_sw_t *p_leaf_sw;
	ftree_hca_t *p_hca = NULL;
	ftree_hca_t *p_iter;

	OSM_LOG_ENTER(&p_ftree->p_osm->log);

	if (fabric_roots_provided(p_ftree)) {
		/* Pick the first CN; the switch it hangs off defines the
		   leaf rank. That all leaf switches share this rank is
		   verified at a later stage. */
		for (p_iter = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
		     p_iter != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl);
		     p_iter = (ftree_hca_t *) cl_qmap_next(&p_iter->map_item)) {
			p_hca = p_iter;
			if (p_hca->cn_num)
				break;
		}
		/* the fabric is known to contain CNs, so the scan must have
		   stopped before reaching the end of the table */
		CL_ASSERT(p_iter !=
			  (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl));

		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
			"Selected CN port GUID 0x%" PRIx64 "\n",
			hca_get_guid_ho(p_hca));

		/* find the first up port group that belongs to a CN port */
		grp = 0;
		while ((grp < p_hca->up_port_groups_num)
		       && (!p_hca->up_port_groups[grp]->is_cn))
			grp++;
		CL_ASSERT(grp < p_hca->up_port_groups_num);
		CL_ASSERT(p_hca->up_port_groups[grp]->remote_node_type ==
			  IB_NODE_TYPE_SWITCH);

		p_leaf_sw = p_hca->up_port_groups[grp]->remote_hca_or_sw.p_sw;
		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
			"Selected leaf switch GUID 0x%" PRIx64 ", rank %u\n",
			sw_get_guid_ho(p_leaf_sw), p_leaf_sw->rank);
		p_ftree->leaf_switch_rank = p_leaf_sw->rank;
	} else {
		/* Without a root file the fabric has to be a pure fat-tree
		   in terms of ranking, hence leaf rank == max rank. */
		p_ftree->leaf_switch_rank = p_ftree->max_switch_rank;
	}

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
		"FatTree leaf switch rank is %u\n", p_ftree->leaf_switch_rank);
	OSM_LOG_EXIT(&p_ftree->p_osm->log);
}				/* fabric_set_leaf_rank() */
3886
3887 /***************************************************
3888 ***************************************************/
3889
/*
 * Constructs the ports of every HCA in hca_tbl
 * (fabric_construct_hca_ports()) and then of every switch in sw_tbl
 * (fabric_construct_sw_ports()).
 *
 * Returns 0 on success, -1 as soon as one of the construct calls
 * returns non-zero; the remaining nodes are not processed then.
 */
static int fabric_populate_ports(IN ftree_fabric_t * p_ftree)
{
	ftree_hca_t *p_cur_hca;
	ftree_hca_t *p_following_hca;
	ftree_sw_t *p_cur_sw;
	ftree_sw_t *p_following_sw;
	int rc = -1;

	OSM_LOG_ENTER(&p_ftree->p_osm->log);

	for (p_cur_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
	     p_cur_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl);
	     p_cur_hca = p_following_hca) {
		/* successor is fetched before the element is processed */
		p_following_hca =
		    (ftree_hca_t *) cl_qmap_next(&p_cur_hca->map_item);
		if (fabric_construct_hca_ports(p_ftree, p_cur_hca))
			goto Exit;
	}

	for (p_cur_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
	     p_cur_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
	     p_cur_sw = p_following_sw) {
		p_following_sw =
		    (ftree_sw_t *) cl_qmap_next(&p_cur_sw->map_item);
		if (fabric_construct_sw_ports(p_ftree, p_cur_sw))
			goto Exit;
	}

	rc = 0;

Exit:
	OSM_LOG_EXIT(&p_ftree->p_osm->log);
	return rc;
}				/* fabric_populate_ports() */
3923
3924 /***************************************************
3925 ***************************************************/
add_guid_item_to_map(void * cxt,uint64_t guid,char * p)3926 static int add_guid_item_to_map(void *cxt, uint64_t guid, char *p)
3927 {
3928 cl_qmap_t *map = cxt;
3929 name_map_item_t *item;
3930 name_map_item_t *inserted_item;
3931
3932 item = malloc(sizeof(*item));
3933 if (!item)
3934 return -1;
3935
3936 item->guid = guid;
3937 inserted_item = (name_map_item_t *) cl_qmap_insert(map, guid, &item->item);
3938 if (inserted_item != item)
3939 free(item);
3940
3941 return 0;
3942 }
3943
/*
 * Reads the optional compute node and I/O node GUID files into
 * cn_guid_tbl and io_guid_tbl respectively, using parse_node_map()
 * with add_guid_item_to_map() as the callback. A file is read only if
 * fabric_cns_provided() / fabric_ios_provided() says it was supplied.
 *
 * Returns 0 on success, -1 if a file cannot be parsed (ERR AB23 /
 * ERR AB28) or yields an empty table (ERR AB27 / ERR AB29).
 */
static int fabric_read_guid_files(IN ftree_fabric_t * p_ftree)
{
	/* stays -1 unless every requested file was read successfully */
	int status = -1;

	OSM_LOG_ENTER(&p_ftree->p_osm->log);

	/* compute nodes */
	if (fabric_cns_provided(p_ftree)) {
		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
			"Fetching compute nodes from file %s\n",
			p_ftree->p_osm->subn.opt.cn_guid_file);

		if (parse_node_map(p_ftree->p_osm->subn.opt.cn_guid_file,
				   add_guid_item_to_map,
				   &p_ftree->cn_guid_tbl) != 0) {
			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
				"ERR AB23: " "Problem parsing CN guid file\n");
			goto Exit;
		}

		if (cl_qmap_count(&p_ftree->cn_guid_tbl) == 0) {
			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
				"ERR AB27: "
				"Compute node guids file has no valid guids\n");
			goto Exit;
		}
	}

	/* I/O nodes */
	if (fabric_ios_provided(p_ftree)) {
		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
			"Fetching I/O nodes from file %s\n",
			p_ftree->p_osm->subn.opt.io_guid_file);

		if (parse_node_map(p_ftree->p_osm->subn.opt.io_guid_file,
				   add_guid_item_to_map,
				   &p_ftree->io_guid_tbl) != 0) {
			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
				"ERR AB28: Problem parsing I/O guid file\n");
			goto Exit;
		}

		if (cl_qmap_count(&p_ftree->io_guid_tbl) == 0) {
			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
				"ERR AB29: "
				"I/O node guids file has no valid guids\n");
			goto Exit;
		}
	}

	status = 0;

Exit:
	OSM_LOG_EXIT(&p_ftree->p_osm->log);
	return status;
}				/*fabric_read_guid_files() */
3999
4000 /***************************************************
4001 ***************************************************/
4002 /* Get a Sw and remove all depended HCA's, meaning all
4003 * HCA's which this is the only switch they are connected
4004 * to */
remove_depended_hca(IN ftree_fabric_t * p_ftree,IN ftree_sw_t * p_sw)4005 static int remove_depended_hca(IN ftree_fabric_t *p_ftree, IN ftree_sw_t *p_sw)
4006 {
4007 ftree_hca_t *p_hca;
4008 int counter = 0;
4009 int port_num;
4010 uint8_t remote_port_num;
4011 osm_physp_t* physp;
4012 osm_node_t* sw_node;
4013 uint64_t remote_hca_guid;
4014
4015 sw_node = p_sw->p_osm_sw->p_node;
4016 for (port_num = 0; port_num < sw_node->physp_tbl_size; port_num++) {
4017 physp = osm_node_get_physp_ptr(sw_node, port_num);
4018 if (physp && physp->p_remote_physp) {
4019 if (osm_node_get_type(physp->p_remote_physp->p_node) == IB_NODE_TYPE_CA) {
4020 remote_hca_guid =
4021 osm_node_get_node_guid(physp->p_remote_physp->p_node);
4022 p_hca = fabric_get_hca_by_guid(p_ftree, remote_hca_guid);
4023 if (!p_hca)
4024 continue;
4025
4026 remote_port_num =
4027 osm_physp_get_port_num(physp->p_remote_physp);
4028 p_hca->disconnected_ports[remote_port_num] = 1;
4029 }
4030 }
4031 }
4032 return counter;
4033 }
4034 /***************************************************
4035 ***************************************************/
/*
 * Drops every switch for which sw_ranked() is false: the switch is
 * taken out of sw_tbl, remove_depended_hca() is run for it, and the
 * switch object is destroyed. The number of dropped switches is logged.
 */
static void fabric_remove_unranked_sw(IN ftree_fabric_t *p_ftree)
{
	ftree_sw_t *p_cur_sw;
	ftree_sw_t *p_following_sw;
	int dependent_hcas;
	int removed_sw = 0;

	for (p_cur_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
	     p_cur_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
	     p_cur_sw = p_following_sw) {
		/* take the successor first - the current item may go away */
		p_following_sw =
		    (ftree_sw_t *) cl_qmap_next(&p_cur_sw->map_item);

		if (sw_ranked(p_cur_sw))
			continue;

		cl_qmap_remove_item(&p_ftree->sw_tbl, &p_cur_sw->map_item);
		dependent_hcas = remove_depended_hca(p_ftree, p_cur_sw);
		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
			"Removing Unranked sw 0x%" PRIx64 " (with %d dependent hca's)\n",
			sw_get_guid_ho(p_cur_sw), dependent_hcas);
		sw_destroy(p_cur_sw);
		removed_sw++;
	}

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
		"Removed %d invalid switches\n", removed_sw);
}
4060 /***************************************************
4061 ***************************************************/
/*
 * Builds the FatTree representation of the subnet. Installed as the
 * build_lid_matrices callback of the routing engine by
 * osm_ucast_ftree_setup().
 *
 * The previous fabric data is cleared first; then a sequence of checks
 * and construction steps is run (LMC and node count sanity checks,
 * node tables, GUID files, ranking, port construction, leaf rank,
 * leaf switch marking, indexing, leaf switch array, topology
 * validation when no roots were supplied). The first failing step
 * aborts the sequence.
 *
 * context - the ftree_fabric_t of the routing engine
 *
 * Returns 0 and sets fabric_built to TRUE on success. Returns -1 on
 * failure, after clearing the fabric data structures again.
 */
static int construct_fabric(IN void *context)
{
	ftree_fabric_t *p_ftree = context;
	/* pessimistic default - only the fall-through path reports success */
	int status = -1;

	OSM_LOG_ENTER(&p_ftree->p_osm->log);

	fabric_clear(p_ftree);

	if (p_ftree->p_osm->subn.opt.lmc > 0) {
		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
			   "LMC > 0 is not supported by fat-tree routing.\n"
			   "Falling back to default routing\n");
		goto Exit;
	}

	/* at least two switches are required */
	if (cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl) < 2) {
		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
			   "Fabric has %u switches - topology is not fat-tree.\n"
			   "Falling back to default routing\n",
			   cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl));
		goto Exit;
	}

	/* at least two nodes that are not switches are required */
	if ((cl_qmap_count(&p_ftree->p_osm->subn.node_guid_tbl) -
	     cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl)) < 2) {
		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
			   "Fabric has %u nodes (%u switches) - topology is not fat-tree.\n"
			   "Falling back to default routing\n",
			   cl_qmap_count(&p_ftree->p_osm->subn.node_guid_tbl),
			   cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl));
		goto Exit;
	}

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "\n"
		" |----------------------------------------|\n"
		" |- Starting FatTree fabric construction -|\n"
		" |----------------------------------------|\n\n");

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
		"Populating FatTree Switch and CA tables\n");
	if (fabric_populate_nodes(p_ftree)) {
		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
			   "Fabric topology is not fat-tree - "
			   "falling back to default routing\n");
		goto Exit;
	}

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
		"Reading guid files provided by user\n");
	if (fabric_read_guid_files(p_ftree)) {
		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
			   "Failed reading guid files - "
			   "falling back to default routing\n");
		goto Exit;
	}

	if (cl_qmap_count(&p_ftree->hca_tbl) < 2) {
		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
			   "Fabric has %u CAs - topology is not fat-tree.\n"
			   "Falling back to default routing\n",
			   cl_qmap_count(&p_ftree->hca_tbl));
		goto Exit;
	}

	/* Rank every switch of the fabric. At this point only the maximal
	   switch rank becomes known; the leaf switch rank and the rank of
	   the whole tree can be checked only once the ports are filled in
	   and the CNs are marked. */
	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "Ranking FatTree\n");
	if (fabric_rank(p_ftree)) {
		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
			   "Failed ranking the tree\n");
		goto Exit;
	}
	fabric_remove_unranked_sw(p_ftree);

	/* several switches that all have rank 0 cannot form a tree */
	if (p_ftree->max_switch_rank == 0 &&
	    cl_qmap_count(&p_ftree->sw_tbl) > 1) {
		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
			"ERR AB2B: Found more than one root on fabric with "
			"maximum rank 0\n");
		goto Exit;
	}

	/* Build the port arrays of every hca and switch. This has to wait
	   until the whole FatTree data structure exists, since the ports
	   point to ftree_{sw,hca}_t objects, and until the switches are
	   ranked, since the rank determines the port direction. */
	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
		"Populating CA & switch ports\n");
	if (fabric_populate_ports(p_ftree)) {
		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
			   "Fabric topology is not a fat-tree\n");
		goto Exit;
	}
	if (p_ftree->cn_num == 0) {
		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
			   "Fabric has no valid compute nodes\n");
		goto Exit;
	}

	/* With the CA ports created and the CNs marked, the ranking can be
	   completed by setting the leaf switch rank. */
	fabric_set_leaf_rank(p_ftree);

	if (fabric_get_rank(p_ftree) > FAT_TREE_MAX_RANK ||
	    fabric_get_rank(p_ftree) < FAT_TREE_MIN_RANK) {
		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
			   "Fabric rank is %u (should be between %u and %u)\n",
			   fabric_get_rank(p_ftree), FAT_TREE_MIN_RANK,
			   FAT_TREE_MAX_RANK);
		goto Exit;
	}

	/* Mark the switches whose rank equals p_ftree->leaf_switch_rank and
	   that are connected to CNs. As a by-product this performs a basic
	   topology validation: all the CNs have to be at the same rank. */
	if (fabric_mark_leaf_switches(p_ftree)) {
		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
			   "Fabric topology is not a fat-tree\n");
		goto Exit;
	}

	/* Index all the switches of the fabric. This also sorts the leaf
	   switch array by switch index, sorts the port arrays of the
	   indexed switches by remote switch index, and creates the
	   switch-by-tuple table (sw_by_tuple_tbl). */
	fabric_make_indexing(p_ftree);

	/* Create the leaf switch array, sorted by index. It holds the
	   switches at rank p_ftree->leaf_switch_rank that are connected to
	   CNs (REAL leafs), and may also hold switches of the same rank
	   without CNs if the indexing order puts them there. In any case
	   the first and the last entries of the array are REAL leafs. */
	if (fabric_create_leaf_switch_array(p_ftree)) {
		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
			   "Fabric topology is not a fat-tree\n");
		goto Exit;
	}

	/* compute the ftree.max_cn_per_leaf field */
	fabric_set_max_cn_per_leaf(p_ftree);

	/* general information about the fabric topology */
	fabric_dump_general_info(p_ftree);

	/* full topology dump, at debug level only */
	if (OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
		fabric_dump(p_ftree);

	/* a PURE fat-tree is required only when the user did not provide
	   a root guid file */
	if (!fabric_roots_provided(p_ftree) &&
	    !fabric_validate_topology(p_ftree)) {
		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
			   "Fabric topology is not a fat-tree\n");
		goto Exit;
	}

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
		"Max LID in switch LFTs: %u\n", p_ftree->lft_max_lid);

	/* the full lid matrices are needed for multicast routing */
	osm_ucast_mgr_build_lid_matrices(&p_ftree->p_osm->sm.ucast_mgr);

	/* every step passed */
	status = 0;

Exit:
	if (status == 0)
		p_ftree->fabric_built = TRUE;
	else {
		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
			"Clearing FatTree Fabric data structures\n");
		fabric_clear(p_ftree);
	}

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "\n"
		" |--------------------------------------------------|\n"
		" |- Done constructing FatTree fabric (status = %d) -|\n"
		" |--------------------------------------------------|\n\n",
		status);

	OSM_LOG_EXIT(&p_ftree->p_osm->log);
	return status;
}				/* construct_fabric() */
4260
4261 /***************************************************
4262 ***************************************************/
4263
/*
 * Fills the forwarding tables of the fabric. Installed as the
 * ucast_build_fwd_tables callback of the routing engine by
 * osm_ucast_ftree_setup().
 *
 * Routes are set up towards CNs, non-CN targets and switches, in that
 * order; fabric_route_roots() is run in addition when the
 * connect_roots option is set. Afterwards set_sw_fwd_table is applied
 * to every switch in sw_tbl and the hca ordering file is written.
 *
 * context - the ftree_fabric_t of the routing engine
 *
 * Returns 0 on success, -1 if fabric_built is not set (nothing is
 * routed in that case).
 */
static int do_routing(IN void *context)
{
	ftree_fabric_t *p_ftree = context;
	int status = -1;

	OSM_LOG_ENTER(&p_ftree->p_osm->log);

	if (p_ftree->fabric_built) {
		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
			"Starting FatTree routing\n");

		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
			"Filling switch forwarding tables for Compute Nodes\n");
		fabric_route_to_cns(p_ftree);

		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
			"Filling switch forwarding tables for non-CN targets\n");
		fabric_route_to_non_cns(p_ftree);

		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
			"Filling switch forwarding tables for switch-to-switch paths\n");
		fabric_route_to_switches(p_ftree);

		if (p_ftree->p_osm->subn.opt.connect_roots) {
			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
				"Connecting switches that are unreachable within "
				"Up/Down rules\n");
			fabric_route_roots(p_ftree);
		}

		/* push the computed forwarding table into each switch */
		cl_qmap_apply_func(&p_ftree->sw_tbl, set_sw_fwd_table,
				   (void *)p_ftree);

		/* dump the hca ordering file */
		fabric_dump_hca_ordering(p_ftree);

		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
			"FatTree routing is done\n");

		status = 0;
	}

	OSM_LOG_EXIT(&p_ftree->p_osm->log);
	return status;
}
4311
4312 /***************************************************
4313 ***************************************************/
4314
/*
 * Destroy callback of the routing engine (installed by
 * osm_ucast_ftree_setup()): passes a non-NULL context to
 * fabric_destroy(); a NULL context is ignored.
 */
static void delete(IN void *context)
{
	if (context)
		fabric_destroy(context);
}
4321
4322 /***************************************************
4323 ***************************************************/
4324
/*
 * Sets up the FatTree routing engine: creates a fabric object, binds
 * it to p_osm and its subnet, and installs the fabric together with
 * the construct_fabric / do_routing / delete callbacks in 'r'.
 *
 * r     - routing engine descriptor to fill in
 * p_osm - the OpenSM object the engine works for
 *
 * Returns 0 on success, -1 if fabric_create() returns NULL
 * ('r' is left untouched then).
 */
int osm_ucast_ftree_setup(struct osm_routing_engine *r, osm_opensm_t * p_osm)
{
	ftree_fabric_t *p_fabric = fabric_create();

	if (p_fabric == NULL)
		return -1;

	/* bind the new fabric to its OpenSM instance and subnet */
	p_fabric->p_subn = p_osm->sm.ucast_mgr.p_subn;
	p_fabric->p_osm = p_osm;

	/* publish the engine entry points */
	r->destroy = delete;
	r->ucast_build_fwd_tables = do_routing;
	r->build_lid_matrices = construct_fabric;
	r->context = p_fabric;

	return 0;
}
4341