1 /*
2  * Copyright (c) 2009 Simula Research Laboratory. All rights reserved.
3  * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
4  * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved.
5  * Copyright (c) 2002-2011 Mellanox Technologies LTD. All rights reserved.
6  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
7  *
8  * This software is available to you under a choice of one of two
9  * licenses.  You may choose to be licensed under the terms of the GNU
10  * General Public License (GPL) Version 2, available from the file
11  * COPYING in the main directory of this source tree, or the
12  * OpenIB.org BSD license below:
13  *
14  *     Redistribution and use in source and binary forms, with or
15  *     without modification, are permitted provided that the following
16  *     conditions are met:
17  *
18  *      - Redistributions of source code must retain the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer.
21  *
22  *      - Redistributions in binary form must reproduce the above
23  *        copyright notice, this list of conditions and the following
24  *        disclaimer in the documentation and/or other materials
25  *        provided with the distribution.
26  *
27  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
28  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
29  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
30  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
31  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
32  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
33  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
34  * SOFTWARE.
35  *
36  */
37 
38 /*
39  * Abstract:
40  *    Implementation of OpenSM FatTree routing
41  */
42 
43 #if HAVE_CONFIG_H
44 #  include <config.h>
45 #endif
46 
47 #include <stdlib.h>
48 #include <string.h>
49 #include <ctype.h>
50 #include <errno.h>
51 #include <iba/ib_types.h>
52 #include <complib/cl_qmap.h>
53 #include <complib/cl_debug.h>
54 #include <opensm/osm_file_ids.h>
55 #define FILE_ID OSM_FILE_UCAST_FTREE_C
56 #include <opensm/osm_opensm.h>
57 #include <opensm/osm_switch.h>
58 
59 /*
60  * FatTree rank is bounded between 2 and 8:
61  *  - Tree of rank 1 has only trivial routing paths,
62  *    so no need to use FatTree routing.
63  *  - Why maximum rank is 8:
64  *    Each node (switch) is assigned a unique tuple.
65  *    Switches are stored in two cl_qmaps - one is
66  *    ordered by guid, and the other by a key that is
67  *    generated from tuple. Since cl_qmap supports only
68  *    a 64-bit key, the maximal tuple length is 8 bytes.
69  *    which means that maximal tree rank is 8.
70  * Note that the above also implies that each switch
71  * can have at max 255 up/down ports.
72  */
73 
74 #define FAT_TREE_MIN_RANK 2
75 #define FAT_TREE_MAX_RANK 8
76 
77 typedef enum {
78 	FTREE_DIRECTION_DOWN = -1,
79 	FTREE_DIRECTION_SAME,
80 	FTREE_DIRECTION_UP
81 } ftree_direction_t;
82 
83 /***************************************************
84  **
85  **  Forward references
86  **
87  ***************************************************/
88 struct ftree_sw_t_;
89 struct ftree_hca_t_;
90 struct ftree_port_t_;
91 struct ftree_port_group_t_;
92 struct ftree_fabric_t_;
93 
94 /***************************************************
95  **
96  **  ftree_tuple_t definition
97  **
98  ***************************************************/
99 
100 #define FTREE_TUPLE_BUFF_LEN 1024
101 #define FTREE_TUPLE_LEN 8
102 
103 typedef uint8_t ftree_tuple_t[FTREE_TUPLE_LEN];
104 typedef uint64_t ftree_tuple_key_t;
105 
106 /***************************************************
107  **
108  **  ftree_sw_table_element_t definition
109  **
110  ***************************************************/
111 
112 typedef struct {
113 	cl_map_item_t map_item;
114 	struct ftree_sw_t_ *p_sw;
115 } ftree_sw_tbl_element_t;
116 
117 /***************************************************
118  **
119  **  ftree_port_t definition
120  **
121  ***************************************************/
122 
123 typedef struct ftree_port_t_ {
124 	cl_map_item_t map_item;
125 	uint8_t port_num;	/* port number on the current node */
126 	uint8_t remote_port_num;	/* port number on the remote node */
127 	uint32_t counter_up;	/* number of allocated routes upwards */
128 	uint32_t counter_down;	/* number of allocated routes downwards */
129 } ftree_port_t;
130 
131 /***************************************************
132  **
133  **  ftree_port_group_t definition
134  **
135  ***************************************************/
136 
137 typedef union ftree_hca_or_sw_ {
138 	struct ftree_hca_t_ *p_hca;
139 	struct ftree_sw_t_ *p_sw;
140 } ftree_hca_or_sw;
141 
142 typedef struct ftree_port_group_t_ {
143 	cl_map_item_t map_item;
144 	uint16_t lid;	/* lid of the current node */
145 	uint16_t remote_lid;	/* lid of the remote node */
146 	ib_net64_t port_guid;	/* port guid of this port */
147 	ib_net64_t node_guid;	/* this node's guid */
148 	uint8_t node_type;	/* this node's type */
149 	ib_net64_t remote_port_guid;	/* port guid of the remote port */
150 	ib_net64_t remote_node_guid;	/* node guid of the remote node */
151 	uint8_t remote_node_type;	/* IB_NODE_TYPE_{CA,SWITCH,ROUTER,...} */
152 	ftree_hca_or_sw hca_or_sw;	/* pointer to this hca/switch */
153 	ftree_hca_or_sw remote_hca_or_sw;	/* pointer to remote hca/switch */
154 	cl_ptr_vector_t ports;	/* vector of ports to the same lid */
155 	boolean_t is_cn;	/* whether this port is a compute node */
156 	boolean_t is_io;	/* whether this port is an I/O node */
157 	uint32_t counter_down;	/* number of allocated routes downwards */
158 	uint32_t counter_up;	/* number of allocated routes upwards */
159 } ftree_port_group_t;
160 
161 /***************************************************
162  **
163  **  ftree_sw_t definition
164  **
165  ***************************************************/
166 
167 typedef struct ftree_sw_t_ {
168 	cl_map_item_t map_item;
169 	osm_switch_t *p_osm_sw;
170 	uint32_t rank;
171 	ftree_tuple_t tuple;
172 	uint16_t lid;
173 	ftree_port_group_t **down_port_groups;
174 	uint8_t down_port_groups_num;
175 	ftree_port_group_t **sibling_port_groups;
176 	uint8_t sibling_port_groups_num;
177 	ftree_port_group_t **up_port_groups;
178 	uint8_t up_port_groups_num;
179 	boolean_t is_leaf;
180 	unsigned down_port_groups_idx;
181 	uint8_t *hops;
182 	uint32_t min_counter_down;
183 	boolean_t counter_up_changed;
184 } ftree_sw_t;
185 
186 /***************************************************
187  **
188  **  ftree_hca_t definition
189  **
190  ***************************************************/
191 
192 typedef struct ftree_hca_t_ {
193 	cl_map_item_t map_item;
194 	osm_node_t *p_osm_node;
195 	ftree_port_group_t **up_port_groups;
196 	uint8_t *disconnected_ports;
197 	uint16_t up_port_groups_num;
198 	unsigned cn_num;
199 } ftree_hca_t;
200 
201 /***************************************************
202  **
203  **  ftree_fabric_t definition
204  **
205  ***************************************************/
206 
207 typedef struct ftree_fabric_t_ {
208 	osm_opensm_t *p_osm;
209 	osm_subn_t *p_subn;
210 	cl_qmap_t hca_tbl;
211 	cl_qmap_t sw_tbl;
212 	cl_qmap_t sw_by_tuple_tbl;
213 	cl_qmap_t cn_guid_tbl;
214 	cl_qmap_t io_guid_tbl;
215 	unsigned cn_num;
216 	unsigned ca_ports;
217 	uint8_t leaf_switch_rank;
218 	uint8_t max_switch_rank;
219 	ftree_sw_t **leaf_switches;
220 	uint32_t leaf_switches_num;
221 	uint16_t max_cn_per_leaf;
222 	uint16_t lft_max_lid;
223 	boolean_t fabric_built;
224 } ftree_fabric_t;
225 
226 static inline osm_subn_t *ftree_get_subnet(IN ftree_fabric_t * p_ftree)
227 {
228 	return p_ftree->p_subn;
229 }
230 
231 /***************************************************
232  **
233  ** comparators
234  **
235  ***************************************************/
236 
237 static int compare_switches_by_index(IN const void *p1, IN const void *p2)
238 {
239 	ftree_sw_t **pp_sw1 = (ftree_sw_t **) p1;
240 	ftree_sw_t **pp_sw2 = (ftree_sw_t **) p2;
241 
242 	uint16_t i;
243 	for (i = 0; i < FTREE_TUPLE_LEN; i++) {
244 		if ((*pp_sw1)->tuple[i] > (*pp_sw2)->tuple[i])
245 			return 1;
246 		if ((*pp_sw1)->tuple[i] < (*pp_sw2)->tuple[i])
247 			return -1;
248 	}
249 	return 0;
250 }
251 
252 /***************************************************/
253 
254 static int
255 compare_port_groups_by_remote_switch_index(IN const void *p1, IN const void *p2)
256 {
257 	ftree_port_group_t **pp_g1 = (ftree_port_group_t **) p1;
258 	ftree_port_group_t **pp_g2 = (ftree_port_group_t **) p2;
259 
260 	return
261 	    compare_switches_by_index(&((*pp_g1)->remote_hca_or_sw.p_sw),
262 				      &((*pp_g2)->remote_hca_or_sw.p_sw));
263 }
264 
265 /***************************************************
266  **
267  ** ftree_tuple_t functions
268  **
269  ***************************************************/
270 
271 static void tuple_init(IN ftree_tuple_t tuple)
272 {
273 	memset(tuple, 0xFF, FTREE_TUPLE_LEN);
274 }
275 
276 /***************************************************/
277 
278 static inline boolean_t tuple_assigned(IN ftree_tuple_t tuple)
279 {
280 	return (tuple[0] != 0xFF);
281 }
282 
283 /***************************************************/
284 
285 #define FTREE_TUPLE_BUFFERS_NUM 6
286 
287 static const char *tuple_to_str(IN ftree_tuple_t tuple)
288 {
289 	static char buffer[FTREE_TUPLE_BUFFERS_NUM][FTREE_TUPLE_BUFF_LEN];
290 	static uint8_t ind = 0;
291 	char *ret_buffer;
292 	uint32_t i;
293 
294 	if (!tuple_assigned(tuple))
295 		return "INDEX.NOT.ASSIGNED";
296 
297 	buffer[ind][0] = '\0';
298 
299 	for (i = 0; (i < FTREE_TUPLE_LEN) && (tuple[i] != 0xFF); i++) {
300 		if ((strlen(buffer[ind]) + 10) > FTREE_TUPLE_BUFF_LEN)
301 			return "INDEX.TOO.LONG";
302 		if (i != 0)
303 			strcat(buffer[ind], ".");
304 		sprintf(&buffer[ind][strlen(buffer[ind])], "%u", tuple[i]);
305 	}
306 
307 	ret_buffer = buffer[ind];
308 	ind = (ind + 1) % FTREE_TUPLE_BUFFERS_NUM;
309 	return ret_buffer;
310 }				/* tuple_to_str() */
311 
312 /***************************************************/
313 
314 static inline ftree_tuple_key_t tuple_to_key(IN ftree_tuple_t tuple)
315 {
316 	ftree_tuple_key_t key;
317 	memcpy(&key, tuple, FTREE_TUPLE_LEN);
318 	return key;
319 }
320 
321 /***************************************************/
322 
323 static inline void tuple_from_key(IN ftree_tuple_t tuple,
324 				  IN ftree_tuple_key_t key)
325 {
326 	memcpy(tuple, &key, FTREE_TUPLE_LEN);
327 }
328 
329 /***************************************************
330  **
331  ** ftree_sw_tbl_element_t functions
332  **
333  ***************************************************/
334 
335 static ftree_sw_tbl_element_t *sw_tbl_element_create(IN ftree_sw_t * p_sw)
336 {
337 	ftree_sw_tbl_element_t *p_element =
338 	    (ftree_sw_tbl_element_t *) malloc(sizeof(ftree_sw_tbl_element_t));
339 	if (!p_element)
340 		return NULL;
341 	memset(p_element, 0, sizeof(ftree_sw_tbl_element_t));
342 
343 	p_element->p_sw = p_sw;
344 	return p_element;
345 }
346 
347 /***************************************************/
348 
349 static void sw_tbl_element_destroy(IN ftree_sw_tbl_element_t * p_element)
350 {
351 	free(p_element);
352 }
353 
354 /***************************************************
355  **
356  ** ftree_port_t functions
357  **
358  ***************************************************/
359 
360 static ftree_port_t *port_create(IN uint8_t port_num,
361 				 IN uint8_t remote_port_num)
362 {
363 	ftree_port_t *p_port = (ftree_port_t *) malloc(sizeof(ftree_port_t));
364 	if (!p_port)
365 		return NULL;
366 	memset(p_port, 0, sizeof(ftree_port_t));
367 
368 	p_port->port_num = port_num;
369 	p_port->remote_port_num = remote_port_num;
370 
371 	return p_port;
372 }
373 
374 /***************************************************/
375 
376 static void port_destroy(IN ftree_port_t * p_port)
377 {
378 	free(p_port);
379 }
380 
381 /***************************************************
382  **
383  ** ftree_port_group_t functions
384  **
385  ***************************************************/
386 
387 static ftree_port_group_t *port_group_create(IN uint16_t lid,
388 					     IN uint16_t remote_lid,
389 					     IN ib_net64_t port_guid,
390 					     IN ib_net64_t node_guid,
391 					     IN uint8_t node_type,
392 					     IN void *p_hca_or_sw,
393 					     IN ib_net64_t remote_port_guid,
394 					     IN ib_net64_t remote_node_guid,
395 					     IN uint8_t remote_node_type,
396 					     IN void *p_remote_hca_or_sw,
397 					     IN boolean_t is_cn,
398 					     IN boolean_t is_io)
399 {
400 	ftree_port_group_t *p_group =
401 	    (ftree_port_group_t *) malloc(sizeof(ftree_port_group_t));
402 	if (p_group == NULL)
403 		return NULL;
404 	memset(p_group, 0, sizeof(ftree_port_group_t));
405 
406 	p_group->lid = lid;
407 	p_group->remote_lid = remote_lid;
408 	memcpy(&p_group->port_guid, &port_guid, sizeof(ib_net64_t));
409 	memcpy(&p_group->node_guid, &node_guid, sizeof(ib_net64_t));
410 	memcpy(&p_group->remote_port_guid, &remote_port_guid,
411 	       sizeof(ib_net64_t));
412 	memcpy(&p_group->remote_node_guid, &remote_node_guid,
413 	       sizeof(ib_net64_t));
414 
415 	p_group->node_type = node_type;
416 	switch (node_type) {
417 	case IB_NODE_TYPE_CA:
418 		p_group->hca_or_sw.p_hca = (ftree_hca_t *) p_hca_or_sw;
419 		break;
420 	case IB_NODE_TYPE_SWITCH:
421 		p_group->hca_or_sw.p_sw = (ftree_sw_t *) p_hca_or_sw;
422 		break;
423 	default:
424 		/* we shouldn't get here - port is created only in hca or switch */
425 		CL_ASSERT(0);
426 	}
427 
428 	p_group->remote_node_type = remote_node_type;
429 	switch (remote_node_type) {
430 	case IB_NODE_TYPE_CA:
431 		p_group->remote_hca_or_sw.p_hca =
432 		    (ftree_hca_t *) p_remote_hca_or_sw;
433 		break;
434 	case IB_NODE_TYPE_SWITCH:
435 		p_group->remote_hca_or_sw.p_sw =
436 		    (ftree_sw_t *) p_remote_hca_or_sw;
437 		break;
438 	default:
439 		/* we shouldn't get here - port is created only in hca or switch */
440 		CL_ASSERT(0);
441 	}
442 
443 	cl_ptr_vector_init(&p_group->ports, 0,	/* min size */
444 			   8);	/* grow size */
445 	p_group->is_cn = is_cn;
446 	p_group->is_io = is_io;
447 	return p_group;
448 }				/* port_group_create() */
449 
450 /***************************************************/
451 
452 static void port_group_destroy(IN ftree_port_group_t * p_group)
453 {
454 	uint32_t i;
455 	uint32_t size;
456 	ftree_port_t *p_port;
457 
458 	if (!p_group)
459 		return;
460 
461 	/* remove all the elements of p_group->ports vector */
462 	size = cl_ptr_vector_get_size(&p_group->ports);
463 	for (i = 0; i < size; i++)
464 		if (cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port) == CL_SUCCESS)
465 			port_destroy(p_port);
466 
467 	cl_ptr_vector_destroy(&p_group->ports);
468 	free(p_group);
469 }				/* port_group_destroy() */
470 
471 /***************************************************/
472 
473 static void port_group_dump(IN ftree_fabric_t * p_ftree,
474 			    IN ftree_port_group_t * p_group,
475 			    IN ftree_direction_t direction)
476 {
477 	ftree_port_t *p_port;
478 	uint32_t size;
479 	uint32_t i;
480 	char *buff;
481 
482 	if (!p_group)
483 		return;
484 
485 	if (!OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
486 		return;
487 
488 	size = cl_ptr_vector_get_size(&p_group->ports);
489 
490 	buff = calloc(10, 1024);
491 	if (!buff) {
492 		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB33: "
493 			"Failed to allocate buffer\n");
494 		return;
495 	}
496 
497 	for (i = 0; i < size; i++) {
498 		cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port);
499 		CL_ASSERT(p_port);
500 
501 		if (i != 0)
502 			strcat(buff, ", ");
503 		sprintf(buff + strlen(buff), "%u", p_port->port_num);
504 	}
505 
506 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
507 		"    Port Group of size %u, port(s): %s, direction: %s\n"
508 		"                  Local <--> Remote GUID (LID):"
509 		"0x%016" PRIx64 " (0x%04x) <--> 0x%016" PRIx64 " (0x%04x)\n",
510 		size, buff,
511 		(direction == FTREE_DIRECTION_DOWN) ? "DOWN" : (direction ==
512 								FTREE_DIRECTION_SAME)
513 		? "SIBLING" : "UP", cl_ntoh64(p_group->port_guid),
514 		p_group->lid, cl_ntoh64(p_group->remote_port_guid),
515 		p_group->remote_lid);
516 
517 	free(buff);
518 
519 }				/* port_group_dump() */
520 
521 /***************************************************/
522 
523 static void port_group_add_port(IN ftree_port_group_t * p_group,
524 				IN uint8_t port_num, IN uint8_t remote_port_num)
525 {
526 	uint16_t i;
527 	ftree_port_t *p_port;
528 
529 	for (i = 0; i < cl_ptr_vector_get_size(&p_group->ports); i++) {
530 		cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port);
531 		if (p_port->port_num == port_num)
532 			return;
533 	}
534 
535 	p_port = port_create(port_num, remote_port_num);
536 	CL_ASSERT(p_port);
537 	cl_ptr_vector_insert(&p_group->ports, p_port, NULL);
538 }
539 
540 /***************************************************
541  **
542  ** ftree_sw_t functions
543  **
544  ***************************************************/
545 
546 static ftree_sw_t *sw_create(IN osm_switch_t * p_osm_sw)
547 {
548 	ftree_sw_t *p_sw;
549 	uint8_t ports_num;
550 
551 	/* make sure that the switch has ports */
552 	if (p_osm_sw->num_ports == 1)
553 		return NULL;
554 
555 	p_sw = (ftree_sw_t *) malloc(sizeof(ftree_sw_t));
556 	if (p_sw == NULL)
557 		return NULL;
558 	memset(p_sw, 0, sizeof(ftree_sw_t));
559 
560 	p_sw->p_osm_sw = p_osm_sw;
561 	p_sw->rank = 0xFFFFFFFF;
562 	tuple_init(p_sw->tuple);
563 
564 	p_sw->lid =
565 	    cl_ntoh16(osm_node_get_base_lid(p_sw->p_osm_sw->p_node, 0));
566 
567 	ports_num = osm_node_get_num_physp(p_sw->p_osm_sw->p_node);
568 	p_sw->down_port_groups =
569 	    (ftree_port_group_t **) malloc(ports_num *
570 					   sizeof(ftree_port_group_t *));
571 	if (p_sw->down_port_groups == NULL)
572 		goto FREE_P_SW;
573 	memset(p_sw->down_port_groups, 0, ports_num * sizeof(ftree_port_group_t *));
574 
575 	p_sw->up_port_groups =
576 	    (ftree_port_group_t **) malloc(ports_num *
577 					   sizeof(ftree_port_group_t *));
578 	if (p_sw->up_port_groups == NULL)
579 		goto FREE_DOWN;
580 	memset(p_sw->up_port_groups, 0, ports_num * sizeof(ftree_port_group_t *));
581 
582 	p_sw->sibling_port_groups =
583 	    (ftree_port_group_t **) malloc(ports_num *
584 					   sizeof(ftree_port_group_t *));
585 	if (p_sw->sibling_port_groups == NULL)
586 		goto FREE_UP;
587 	memset(p_sw->sibling_port_groups, 0, ports_num * sizeof(ftree_port_group_t *));
588 
589 	/* initialize lft buffer */
590 	memset(p_osm_sw->new_lft, OSM_NO_PATH, p_osm_sw->lft_size);
591 	p_sw->hops = malloc((p_osm_sw->max_lid_ho + 1) * sizeof(*(p_sw->hops)));
592 	if (p_sw->hops == NULL)
593 		goto FREE_SIBLING;
594 
595 	memset(p_sw->hops, OSM_NO_PATH, p_osm_sw->max_lid_ho + 1);
596 
597 	return p_sw;
598 
599 FREE_SIBLING:
600 	free(p_sw->sibling_port_groups);
601 FREE_UP:
602 	free(p_sw->up_port_groups);
603 FREE_DOWN:
604 	free(p_sw->down_port_groups);
605 FREE_P_SW:
606 	free(p_sw);
607 	return NULL;
608 }				/* sw_create() */
609 
610 /***************************************************/
611 
612 static void sw_destroy(IN ftree_sw_t * p_sw)
613 {
614 	uint8_t i;
615 
616 	if (!p_sw)
617 		return;
618 	free(p_sw->hops);
619 
620 	for (i = 0; i < p_sw->down_port_groups_num; i++)
621 		port_group_destroy(p_sw->down_port_groups[i]);
622 	for (i = 0; i < p_sw->sibling_port_groups_num; i++)
623 		port_group_destroy(p_sw->sibling_port_groups[i]);
624 	for (i = 0; i < p_sw->up_port_groups_num; i++)
625 		port_group_destroy(p_sw->up_port_groups[i]);
626 	free(p_sw->down_port_groups);
627 	free(p_sw->sibling_port_groups);
628 	free(p_sw->up_port_groups);
629 
630 	free(p_sw);
631 }				/* sw_destroy() */
632 
633 /***************************************************/
634 
635 static uint64_t sw_get_guid_no(IN ftree_sw_t * p_sw)
636 {
637 	if (!p_sw)
638 		return 0;
639 	return osm_node_get_node_guid(p_sw->p_osm_sw->p_node);
640 }
641 
642 /***************************************************/
643 
644 static uint64_t sw_get_guid_ho(IN ftree_sw_t * p_sw)
645 {
646 	return cl_ntoh64(sw_get_guid_no(p_sw));
647 }
648 
649 /***************************************************/
650 
651 static void sw_dump(IN ftree_fabric_t * p_ftree, IN ftree_sw_t * p_sw)
652 {
653 	uint32_t i;
654 
655 	if (!p_sw)
656 		return;
657 
658 	if (!OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
659 		return;
660 
661 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
662 		"Switch index: %s, GUID: 0x%016" PRIx64
663 		", Ports: %u DOWN, %u SIBLINGS, %u UP\n",
664 		tuple_to_str(p_sw->tuple), sw_get_guid_ho(p_sw),
665 		p_sw->down_port_groups_num, p_sw->sibling_port_groups_num,
666 		p_sw->up_port_groups_num);
667 
668 	for (i = 0; i < p_sw->down_port_groups_num; i++)
669 		port_group_dump(p_ftree, p_sw->down_port_groups[i],
670 				FTREE_DIRECTION_DOWN);
671 	for (i = 0; i < p_sw->sibling_port_groups_num; i++)
672 		port_group_dump(p_ftree, p_sw->sibling_port_groups[i],
673 				FTREE_DIRECTION_SAME);
674 	for (i = 0; i < p_sw->up_port_groups_num; i++)
675 		port_group_dump(p_ftree, p_sw->up_port_groups[i],
676 				FTREE_DIRECTION_UP);
677 
678 }				/* sw_dump() */
679 
680 /***************************************************/
681 
682 static boolean_t sw_ranked(IN ftree_sw_t * p_sw)
683 {
684 	return (p_sw->rank != 0xFFFFFFFF);
685 }
686 
687 /***************************************************/
688 
689 static ftree_port_group_t *sw_get_port_group_by_remote_lid(IN ftree_sw_t * p_sw,
690 							   IN uint16_t
691 							   remote_lid,
692 							   IN ftree_direction_t
693 							   direction)
694 {
695 	uint32_t i;
696 	uint32_t size;
697 	ftree_port_group_t **port_groups;
698 
699 	if (direction == FTREE_DIRECTION_UP) {
700 		port_groups = p_sw->up_port_groups;
701 		size = p_sw->up_port_groups_num;
702 	} else if (direction == FTREE_DIRECTION_SAME) {
703 		port_groups = p_sw->sibling_port_groups;
704 		size = p_sw->sibling_port_groups_num;
705 	} else {
706 		port_groups = p_sw->down_port_groups;
707 		size = p_sw->down_port_groups_num;
708 	}
709 
710 	for (i = 0; i < size; i++)
711 		if (remote_lid == port_groups[i]->remote_lid)
712 			return port_groups[i];
713 
714 	return NULL;
715 }				/* sw_get_port_group_by_remote_lid() */
716 
717 /***************************************************/
718 
719 static void sw_add_port(IN ftree_sw_t * p_sw, IN uint8_t port_num,
720 			IN uint8_t remote_port_num, IN uint16_t lid,
721 			IN uint16_t remote_lid, IN ib_net64_t port_guid,
722 			IN ib_net64_t remote_port_guid,
723 			IN ib_net64_t remote_node_guid,
724 			IN uint8_t remote_node_type,
725 			IN void *p_remote_hca_or_sw,
726 			IN ftree_direction_t direction)
727 {
728 	ftree_port_group_t *p_group =
729 	    sw_get_port_group_by_remote_lid(p_sw, remote_lid, direction);
730 
731 	if (!p_group) {
732 		p_group = port_group_create(lid, remote_lid,
733 					    port_guid, sw_get_guid_no(p_sw),
734 					    IB_NODE_TYPE_SWITCH, p_sw,
735 					    remote_port_guid, remote_node_guid,
736 					    remote_node_type,
737 					    p_remote_hca_or_sw, FALSE, FALSE);
738 		CL_ASSERT(p_group);
739 
740 		if (direction == FTREE_DIRECTION_UP) {
741 			p_sw->up_port_groups[p_sw->up_port_groups_num++] =
742 			    p_group;
743 		} else if (direction == FTREE_DIRECTION_SAME) {
744 			p_sw->
745 			    sibling_port_groups[p_sw->sibling_port_groups_num++]
746 			    = p_group;
747 		} else
748 			p_sw->down_port_groups[p_sw->down_port_groups_num++] =
749 			    p_group;
750 	}
751 	port_group_add_port(p_group, port_num, remote_port_num);
752 
753 }				/* sw_add_port() */
754 
755 /***************************************************/
756 
757 static inline cl_status_t sw_set_hops(IN ftree_sw_t * p_sw, IN uint16_t lid,
758 				      IN uint8_t port_num, IN uint8_t hops,
759 				      IN boolean_t is_target_sw)
760 {
761 	/* set local min hop table(LID) */
762 	p_sw->hops[lid] = hops;
763 	if (is_target_sw)
764 		return osm_switch_set_hops(p_sw->p_osm_sw, lid, port_num, hops);
765 	return 0;
766 }
767 
768 /***************************************************/
769 
770 static int set_hops_on_remote_sw(IN ftree_port_group_t * p_group,
771 				 IN uint16_t target_lid, IN uint8_t hops,
772 				 IN boolean_t is_target_sw)
773 {
774 	ftree_port_t *p_port;
775 	uint8_t i, ports_num;
776 	ftree_sw_t *p_remote_sw = p_group->remote_hca_or_sw.p_sw;
777 
778 	/* if lid is a switch, we set the min hop table in the osm_switch struct */
779 	CL_ASSERT(p_group->remote_node_type == IB_NODE_TYPE_SWITCH);
780 	p_remote_sw->hops[target_lid] = hops;
781 
782 	/* If target lid is a switch we set the min hop table values
783 	 * for each port on the associated osm_sw struct */
784 	if (!is_target_sw)
785 		return 0;
786 
787 	ports_num = (uint8_t) cl_ptr_vector_get_size(&p_group->ports);
788 	for (i = 0; i < ports_num; i++) {
789 		cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port);
790 		if (sw_set_hops(p_remote_sw, target_lid,
791 				p_port->remote_port_num, hops, is_target_sw))
792 			return -1;
793 	}
794 	return 0;
795 }
796 
797 /***************************************************/
798 
799 static inline uint8_t
800 sw_get_least_hops(IN ftree_sw_t * p_sw, IN uint16_t target_lid)
801 {
802 	CL_ASSERT(p_sw->hops != NULL);
803 	return p_sw->hops[target_lid];
804 }
805 
806 /***************************************************
807  **
808  ** ftree_hca_t functions
809  **
810  ***************************************************/
811 
812 static ftree_hca_t *hca_create(IN osm_node_t * p_osm_node)
813 {
814 	ftree_hca_t *p_hca = (ftree_hca_t *) malloc(sizeof(ftree_hca_t));
815 	if (p_hca == NULL)
816 		return NULL;
817 	memset(p_hca, 0, sizeof(ftree_hca_t));
818 
819 	p_hca->p_osm_node = p_osm_node;
820 	p_hca->up_port_groups = (ftree_port_group_t **)
821 	    malloc(osm_node_get_num_physp(p_hca->p_osm_node) *
822 		   sizeof(ftree_port_group_t *));
823 	if (!p_hca->up_port_groups) {
824 		free(p_hca);
825 		return NULL;
826 	}
827 	memset(p_hca->up_port_groups, 0, osm_node_get_num_physp(p_hca->p_osm_node) *
828 	       sizeof(ftree_port_group_t *));
829 
830 	p_hca->disconnected_ports = (uint8_t *)
831 	    calloc(osm_node_get_num_physp(p_hca->p_osm_node) + 1, sizeof(uint8_t));
832 	if (!p_hca->disconnected_ports) {
833 		free(p_hca->up_port_groups);
834 		free(p_hca);
835 		return NULL;
836 	}
837 	p_hca->up_port_groups_num = 0;
838 	return p_hca;
839 }
840 
841 /***************************************************/
842 
843 static void hca_destroy(IN ftree_hca_t * p_hca)
844 {
845 	uint32_t i;
846 
847 	if (!p_hca)
848 		return;
849 
850 	for (i = 0; i < p_hca->up_port_groups_num; i++)
851 		port_group_destroy(p_hca->up_port_groups[i]);
852 
853 	free(p_hca->up_port_groups);
854 	free(p_hca->disconnected_ports);
855 
856 	free(p_hca);
857 }
858 
859 /***************************************************/
860 
861 static uint64_t hca_get_guid_no(IN ftree_hca_t * p_hca)
862 {
863 	if (!p_hca)
864 		return 0;
865 	return osm_node_get_node_guid(p_hca->p_osm_node);
866 }
867 
868 /***************************************************/
869 
870 static uint64_t hca_get_guid_ho(IN ftree_hca_t * p_hca)
871 {
872 	return cl_ntoh64(hca_get_guid_no(p_hca));
873 }
874 
875 /***************************************************/
876 
877 static void hca_dump(IN ftree_fabric_t * p_ftree, IN ftree_hca_t * p_hca)
878 {
879 	uint32_t i;
880 
881 	if (!p_hca)
882 		return;
883 
884 	if (!OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
885 		return;
886 
887 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
888 		"CA GUID: 0x%016" PRIx64 ", Ports: %u UP\n",
889 		hca_get_guid_ho(p_hca), p_hca->up_port_groups_num);
890 
891 	for (i = 0; i < p_hca->up_port_groups_num; i++)
892 		port_group_dump(p_ftree, p_hca->up_port_groups[i],
893 				FTREE_DIRECTION_UP);
894 }
895 
896 static ftree_port_group_t *hca_get_port_group_by_lid(IN ftree_hca_t *
897 						     p_hca,
898 						     IN uint16_t
899 						     lid)
900 {
901 	uint32_t i;
902 	for (i = 0; i < p_hca->up_port_groups_num; i++)
903 		if (lid ==
904 		    p_hca->up_port_groups[i]->lid)
905 			return p_hca->up_port_groups[i];
906 
907 	return NULL;
908 }
909 /***************************************************/
910 
911 static void hca_add_port(IN ftree_fabric_t * p_ftree,
912 			 IN ftree_hca_t * p_hca, IN uint8_t port_num,
913 			 IN uint8_t remote_port_num, IN uint16_t lid,
914 			 IN uint16_t remote_lid, IN ib_net64_t port_guid,
915 			 IN ib_net64_t remote_port_guid,
916 			 IN ib_net64_t remote_node_guid,
917 			 IN uint8_t remote_node_type,
918 			 IN void *p_remote_hca_or_sw, IN boolean_t is_cn,
919 			 IN boolean_t is_io)
920 {
921 	ftree_port_group_t *p_group;
922 
923 	/* this function is supposed to be called only for adding ports
924 	   in hca's that lead to switches */
925 	CL_ASSERT(remote_node_type == IB_NODE_TYPE_SWITCH);
926 
927 	p_group = hca_get_port_group_by_lid(p_hca, lid);
928 
929 	if (!p_group) {
930 		p_group = port_group_create(lid, remote_lid,
931 					    port_guid, hca_get_guid_no(p_hca),
932 					    IB_NODE_TYPE_CA, p_hca,
933 					    remote_port_guid, remote_node_guid,
934 					    remote_node_type,
935 					    p_remote_hca_or_sw, is_cn, is_io);
936 		CL_ASSERT(p_group);
937 		p_hca->up_port_groups[p_hca->up_port_groups_num++] = p_group;
938 		port_group_add_port(p_group, port_num, remote_port_num);
939 	} else
940 		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
941 			"ERR AB32: Duplicated LID for CA GUID: 0x%016" PRIx64 "\n",
942 			cl_ntoh64(port_guid));
943 }				/* hca_add_port() */
944 
945 /***************************************************
946  **
947  ** ftree_fabric_t functions
948  **
949  ***************************************************/
950 
951 static ftree_fabric_t *fabric_create()
952 {
953 	ftree_fabric_t *p_ftree =
954 	    (ftree_fabric_t *) malloc(sizeof(ftree_fabric_t));
955 	if (p_ftree == NULL)
956 		return NULL;
957 
958 	memset(p_ftree, 0, sizeof(ftree_fabric_t));
959 
960 	cl_qmap_init(&p_ftree->hca_tbl);
961 	cl_qmap_init(&p_ftree->sw_tbl);
962 	cl_qmap_init(&p_ftree->sw_by_tuple_tbl);
963 	cl_qmap_init(&p_ftree->cn_guid_tbl);
964 	cl_qmap_init(&p_ftree->io_guid_tbl);
965 
966 	return p_ftree;
967 }
968 
969 /***************************************************/
970 
971 static void fabric_clear(ftree_fabric_t * p_ftree)
972 {
973 	ftree_hca_t *p_hca;
974 	ftree_hca_t *p_next_hca;
975 	ftree_sw_t *p_sw;
976 	ftree_sw_t *p_next_sw;
977 	ftree_sw_tbl_element_t *p_element;
978 	ftree_sw_tbl_element_t *p_next_element;
979 	name_map_item_t *p_guid_element, *p_next_guid_element;
980 
981 	if (!p_ftree)
982 		return;
983 
984 	/* remove all the elements of hca_tbl */
985 
986 	p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
987 	while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
988 		p_hca = p_next_hca;
989 		p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
990 		hca_destroy(p_hca);
991 	}
992 	cl_qmap_remove_all(&p_ftree->hca_tbl);
993 
994 	/* remove all the elements of sw_tbl */
995 
996 	p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
997 	while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
998 		p_sw = p_next_sw;
999 		p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
1000 		sw_destroy(p_sw);
1001 	}
1002 	cl_qmap_remove_all(&p_ftree->sw_tbl);
1003 
1004 	/* remove all the elements of sw_by_tuple_tbl */
1005 
1006 	p_next_element =
1007 	    (ftree_sw_tbl_element_t *) cl_qmap_head(&p_ftree->sw_by_tuple_tbl);
1008 	while (p_next_element != (ftree_sw_tbl_element_t *)
1009 	       cl_qmap_end(&p_ftree->sw_by_tuple_tbl)) {
1010 		p_element = p_next_element;
1011 		p_next_element = (ftree_sw_tbl_element_t *)
1012 		    cl_qmap_next(&p_element->map_item);
1013 		sw_tbl_element_destroy(p_element);
1014 	}
1015 	cl_qmap_remove_all(&p_ftree->sw_by_tuple_tbl);
1016 
1017 	/* remove all the elements of cn_guid_tbl */
1018 	p_next_guid_element =
1019 	    (name_map_item_t *) cl_qmap_head(&p_ftree->cn_guid_tbl);
1020 	while (p_next_guid_element !=
1021 	       (name_map_item_t *) cl_qmap_end(&p_ftree->cn_guid_tbl)) {
1022 		p_guid_element = p_next_guid_element;
1023 		p_next_guid_element =
1024 		    (name_map_item_t *) cl_qmap_next(&p_guid_element->item);
1025 		free(p_guid_element);
1026 	}
1027 	cl_qmap_remove_all(&p_ftree->cn_guid_tbl);
1028 
1029 	/* remove all the elements of io_guid_tbl */
1030 	p_next_guid_element =
1031 	    (name_map_item_t *) cl_qmap_head(&p_ftree->io_guid_tbl);
1032 	while (p_next_guid_element !=
1033 	       (name_map_item_t *) cl_qmap_end(&p_ftree->io_guid_tbl)) {
1034 		p_guid_element = p_next_guid_element;
1035 		p_next_guid_element =
1036 		    (name_map_item_t *) cl_qmap_next(&p_guid_element->item);
1037 		free(p_guid_element);
1038 	}
1039 	cl_qmap_remove_all(&p_ftree->io_guid_tbl);
1040 
1041 	/* free the leaf switches array */
1042 	if ((p_ftree->leaf_switches_num > 0) && (p_ftree->leaf_switches))
1043 		free(p_ftree->leaf_switches);
1044 
1045 	p_ftree->leaf_switches_num = 0;
1046 	p_ftree->cn_num = 0;
1047 	p_ftree->ca_ports = 0;
1048 	p_ftree->leaf_switch_rank = 0;
1049 	p_ftree->max_switch_rank = 0;
1050 	p_ftree->max_cn_per_leaf = 0;
1051 	p_ftree->lft_max_lid = 0;
1052 	p_ftree->leaf_switches = NULL;
1053 	p_ftree->fabric_built = FALSE;
1054 
1055 }				/* fabric_destroy() */
1056 
1057 /***************************************************/
1058 
1059 static void fabric_destroy(ftree_fabric_t * p_ftree)
1060 {
1061 	if (!p_ftree)
1062 		return;
1063 	fabric_clear(p_ftree);
1064 	free(p_ftree);
1065 }
1066 
1067 /***************************************************/
1068 
1069 static uint8_t fabric_get_rank(ftree_fabric_t * p_ftree)
1070 {
1071 	return p_ftree->leaf_switch_rank + 1;
1072 }
1073 
1074 /***************************************************/
1075 
1076 static void fabric_add_hca(ftree_fabric_t * p_ftree, osm_node_t * p_osm_node)
1077 {
1078 	ftree_hca_t *p_hca;
1079 
1080 	CL_ASSERT(osm_node_get_type(p_osm_node) == IB_NODE_TYPE_CA);
1081 
1082 	p_hca = hca_create(p_osm_node);
1083 	if (!p_hca)
1084 		return;
1085 
1086 	cl_qmap_insert(&p_ftree->hca_tbl, p_osm_node->node_info.node_guid,
1087 		       &p_hca->map_item);
1088 }
1089 
1090 /***************************************************/
1091 
1092 static void fabric_add_sw(ftree_fabric_t * p_ftree, osm_switch_t * p_osm_sw)
1093 {
1094 	ftree_sw_t *p_sw;
1095 
1096 	CL_ASSERT(osm_node_get_type(p_osm_sw->p_node) == IB_NODE_TYPE_SWITCH);
1097 
1098 	p_sw = sw_create(p_osm_sw);
1099 	if (!p_sw)
1100 		return;
1101 
1102 	cl_qmap_insert(&p_ftree->sw_tbl, p_osm_sw->p_node->node_info.node_guid,
1103 		       &p_sw->map_item);
1104 
1105 	/* track the max lid (in host order) that exists in the fabric */
1106 	if (p_sw->lid > p_ftree->lft_max_lid)
1107 		p_ftree->lft_max_lid = p_sw->lid;
1108 }
1109 
1110 /***************************************************/
1111 
1112 static void fabric_add_sw_by_tuple(IN ftree_fabric_t * p_ftree,
1113 				   IN ftree_sw_t * p_sw)
1114 {
1115 	CL_ASSERT(tuple_assigned(p_sw->tuple));
1116 
1117 	cl_qmap_insert(&p_ftree->sw_by_tuple_tbl, tuple_to_key(p_sw->tuple),
1118 		       &sw_tbl_element_create(p_sw)->map_item);
1119 }
1120 
1121 /***************************************************/
1122 
1123 static ftree_sw_t *fabric_get_sw_by_tuple(IN ftree_fabric_t * p_ftree,
1124 					  IN ftree_tuple_t tuple)
1125 {
1126 	ftree_sw_tbl_element_t *p_element;
1127 
1128 	CL_ASSERT(tuple_assigned(tuple));
1129 
1130 	tuple_to_key(tuple);
1131 
1132 	p_element =
1133 	    (ftree_sw_tbl_element_t *) cl_qmap_get(&p_ftree->sw_by_tuple_tbl,
1134 						   tuple_to_key(tuple));
1135 	if (p_element ==
1136 	    (ftree_sw_tbl_element_t *) cl_qmap_end(&p_ftree->sw_by_tuple_tbl))
1137 		return NULL;
1138 
1139 	return p_element->p_sw;
1140 }
1141 
1142 /***************************************************/
1143 
1144 static ftree_sw_t *fabric_get_sw_by_guid(IN ftree_fabric_t * p_ftree,
1145 					 IN uint64_t guid)
1146 {
1147 	ftree_sw_t *p_sw;
1148 	p_sw = (ftree_sw_t *) cl_qmap_get(&p_ftree->sw_tbl, guid);
1149 	if (p_sw == (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl))
1150 		return NULL;
1151 	return p_sw;
1152 }
1153 
1154 /***************************************************/
1155 
1156 static ftree_hca_t *fabric_get_hca_by_guid(IN ftree_fabric_t * p_ftree,
1157 					   IN uint64_t guid)
1158 {
1159 	ftree_hca_t *p_hca;
1160 	p_hca = (ftree_hca_t *) cl_qmap_get(&p_ftree->hca_tbl, guid);
1161 	if (p_hca == (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl))
1162 		return NULL;
1163 	return p_hca;
1164 }
1165 
1166 /***************************************************/
1167 
1168 static void fabric_dump(ftree_fabric_t * p_ftree)
1169 {
1170 	uint32_t i;
1171 	ftree_hca_t *p_hca;
1172 	ftree_sw_t *p_sw;
1173 
1174 	if (!OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
1175 		return;
1176 
1177 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "\n"
1178 		"                       |-------------------------------|\n"
1179 		"                       |-  Full fabric topology dump  -|\n"
1180 		"                       |-------------------------------|\n\n");
1181 
1182 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "-- CAs:\n");
1183 
1184 	for (p_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
1185 	     p_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl);
1186 	     p_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item)) {
1187 		hca_dump(p_ftree, p_hca);
1188 	}
1189 
1190 	for (i = 0; i <= p_ftree->max_switch_rank; i++) {
1191 		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1192 			"-- Rank %u switches\n", i);
1193 		for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1194 		     p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
1195 		     p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {
1196 			if (p_sw->rank == i)
1197 				sw_dump(p_ftree, p_sw);
1198 		}
1199 	}
1200 
1201 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "\n"
1202 		"                       |---------------------------------------|\n"
1203 		"                       |- Full fabric topology dump completed -|\n"
1204 		"                       |---------------------------------------|\n\n");
1205 }				/* fabric_dump() */
1206 
1207 /***************************************************/
1208 
1209 static void fabric_dump_general_info(IN ftree_fabric_t * p_ftree)
1210 {
1211 	uint32_t i, j;
1212 	ftree_sw_t *p_sw;
1213 
1214 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1215 		"General fabric topology info\n");
1216 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1217 		"============================\n");
1218 
1219 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1220 		"  - FatTree rank (roots to leaf switches): %u\n",
1221 		p_ftree->leaf_switch_rank + 1);
1222 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1223 		"  - FatTree max switch rank: %u\n", p_ftree->max_switch_rank);
1224 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1225 		"  - Fabric has %u CAs, %u CA ports (%u of them CNs), %u switches\n",
1226 		cl_qmap_count(&p_ftree->hca_tbl), p_ftree->ca_ports,
1227 		p_ftree->cn_num, cl_qmap_count(&p_ftree->sw_tbl));
1228 
1229 	CL_ASSERT(p_ftree->ca_ports >= p_ftree->cn_num);
1230 
1231 	for (i = 0; i <= p_ftree->max_switch_rank; i++) {
1232 		j = 0;
1233 		for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1234 		     p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
1235 		     p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {
1236 			if (p_sw->rank == i)
1237 				j++;
1238 		}
1239 		if (i == 0)
1240 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1241 				"  - Fabric has %u switches at rank %u (roots)\n",
1242 				j, i);
1243 		else if (i == p_ftree->leaf_switch_rank)
1244 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1245 				"  - Fabric has %u switches at rank %u (%u of them leafs)\n",
1246 				j, i, p_ftree->leaf_switches_num);
1247 		else
1248 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO,
1249 				"  - Fabric has %u switches at rank %u\n", j,
1250 				i);
1251 	}
1252 
1253 	if (OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_VERBOSE)) {
1254 		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1255 			"  - Root switches:\n");
1256 		for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1257 		     p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
1258 		     p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {
1259 			if (p_sw->rank == 0)
1260 				OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1261 					"      GUID: 0x%016" PRIx64
1262 					", LID: %u, Index %s\n",
1263 					sw_get_guid_ho(p_sw),
1264 					p_sw->lid,
1265 					tuple_to_str(p_sw->tuple));
1266 		}
1267 
1268 		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1269 			"  - Leaf switches (sorted by index):\n");
1270 		for (i = 0; i < p_ftree->leaf_switches_num; i++) {
1271 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1272 				"      GUID: 0x%016" PRIx64
1273 				", LID: %u, Index %s\n",
1274 				sw_get_guid_ho(p_ftree->leaf_switches[i]),
1275 				p_ftree->leaf_switches[i]->lid,
1276 				tuple_to_str(p_ftree->leaf_switches[i]->tuple));
1277 		}
1278 	}
1279 }				/* fabric_dump_general_info() */
1280 
1281 /***************************************************/
1282 
1283 static void fabric_dump_hca_ordering(IN ftree_fabric_t * p_ftree)
1284 {
1285 	ftree_hca_t *p_hca;
1286 	ftree_sw_t *p_sw;
1287 	ftree_port_group_t *p_group_on_sw;
1288 	ftree_port_group_t *p_group_on_hca;
1289 	int rename_status = 0;
1290 	uint32_t i;
1291 	uint32_t j;
1292 	unsigned printed_hcas_on_leaf;
1293 
1294 	char path[1024], path_tmp[1032];
1295 	FILE *p_hca_ordering_file;
1296 	const char *filename = "opensm-ftree-ca-order.dump";
1297 
1298 	snprintf(path, sizeof(path), "%s/%s",
1299 		 p_ftree->p_osm->subn.opt.dump_files_dir, filename);
1300 
1301 	snprintf(path_tmp, sizeof(path_tmp), "%s.tmp", path);
1302 
1303 	p_hca_ordering_file = fopen(path_tmp, "w");
1304 	if (!p_hca_ordering_file) {
1305 		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB01: "
1306 			"cannot open file \'%s\': %s\n", path_tmp,
1307 			strerror(errno));
1308 		return;
1309 	}
1310 
1311 	/* for each leaf switch (in indexing order) */
1312 	for (i = 0; i < p_ftree->leaf_switches_num; i++) {
1313 		p_sw = p_ftree->leaf_switches[i];
1314 		printed_hcas_on_leaf = 0;
1315 
1316 		/* for each real CA (CNs and not) connected to this switch */
1317 		for (j = 0; j < p_sw->down_port_groups_num; j++) {
1318 			p_group_on_sw = p_sw->down_port_groups[j];
1319 
1320 			if (p_group_on_sw->remote_node_type != IB_NODE_TYPE_CA)
1321 				continue;
1322 
1323 			p_hca = p_group_on_sw->remote_hca_or_sw.p_hca;
1324 			p_group_on_hca =
1325 			    hca_get_port_group_by_lid(p_hca,
1326 						      p_group_on_sw->
1327 						      remote_lid);
1328 
1329 			/* treat non-compute nodes as dummies */
1330 			if (!p_group_on_hca->is_cn)
1331 				continue;
1332 
1333 			fprintf(p_hca_ordering_file, "0x%04x\t%s\n",
1334 				p_group_on_hca->lid,
1335 				p_hca->p_osm_node->print_desc);
1336 
1337 			printed_hcas_on_leaf++;
1338 		}
1339 
1340 		/* now print missing HCAs */
1341 		for (j = 0;
1342 		     j < (p_ftree->max_cn_per_leaf - printed_hcas_on_leaf); j++)
1343 			fprintf(p_hca_ordering_file, "0xFFFF\tDUMMY\n");
1344 
1345 	}
1346 	/* done going through all the leaf switches */
1347 
1348 	fclose(p_hca_ordering_file);
1349 
1350 	rename_status = rename(path_tmp, path);
1351 	if (rename_status) {
1352 		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB03: "
1353 			"cannot rename file \'%s\': %s\n", path_tmp,
1354 			strerror(errno));
1355 	}
1356 }				/* fabric_dump_hca_ordering() */
1357 
1358 /***************************************************/
1359 
1360 static void fabric_assign_tuple(IN ftree_fabric_t * p_ftree,
1361 				IN ftree_sw_t * p_sw,
1362 				IN ftree_tuple_t new_tuple)
1363 {
1364 	memcpy(p_sw->tuple, new_tuple, FTREE_TUPLE_LEN);
1365 	fabric_add_sw_by_tuple(p_ftree, p_sw);
1366 }
1367 
1368 /***************************************************/
1369 
1370 static void fabric_assign_first_tuple(IN ftree_fabric_t * p_ftree,
1371 				      IN ftree_sw_t * p_sw,
1372 				      IN unsigned int subtree)
1373 {
1374 	uint8_t i;
1375 	ftree_tuple_t new_tuple;
1376 
1377 	if (p_ftree->leaf_switch_rank >= FTREE_TUPLE_LEN)
1378 		return;
1379 
1380 	tuple_init(new_tuple);
1381 	new_tuple[0] = (uint8_t) p_sw->rank;
1382 
1383 	for (i = 1; i <= p_ftree->leaf_switch_rank; i++)
1384 		new_tuple[i] = 0;
1385 
1386 	if (p_sw->rank == 0) {
1387 		if (p_ftree->leaf_switch_rank > 1)
1388 			new_tuple[p_ftree->leaf_switch_rank] = subtree;
1389 
1390 		for (i = 0; i < 0xFF; i++) {
1391 			new_tuple[1] = i;
1392 			if (fabric_get_sw_by_tuple(p_ftree, new_tuple) == NULL)
1393 				break;
1394 		}
1395 		if (i == 0xFF) {
1396 			/* new tuple not found - there are more than 255 ports in one direction */
1397 			return;
1398 		}
1399 	}
1400 	fabric_assign_tuple(p_ftree, p_sw, new_tuple);
1401 }
1402 
1403 /***************************************************/
1404 
1405 static void fabric_get_new_tuple(IN ftree_fabric_t * p_ftree,
1406 				 OUT ftree_tuple_t new_tuple,
1407 				 IN ftree_tuple_t from_tuple,
1408 				 IN ftree_direction_t direction)
1409 {
1410 	ftree_sw_t *p_sw;
1411 	ftree_tuple_t temp_tuple;
1412 	uint8_t var_index;
1413 	uint8_t i;
1414 
1415 	tuple_init(new_tuple);
1416 	memcpy(temp_tuple, from_tuple, FTREE_TUPLE_LEN);
1417 
1418 	if (direction == FTREE_DIRECTION_DOWN) {
1419 		temp_tuple[0]++;
1420 		var_index = from_tuple[0] + 1;
1421 	} else {
1422 		temp_tuple[0]--;
1423 		var_index = from_tuple[0];
1424 	}
1425 
1426 	for (i = 0; i < 0xFF; i++) {
1427 		temp_tuple[var_index] = i;
1428 		p_sw = fabric_get_sw_by_tuple(p_ftree, temp_tuple);
1429 		if (p_sw == NULL)	/* found free tuple */
1430 			break;
1431 	}
1432 
1433 	if (i == 0xFF) {
1434 		/* new tuple not found - there are more than 255 ports in one direction */
1435 		return;
1436 	}
1437 	memcpy(new_tuple, temp_tuple, FTREE_TUPLE_LEN);
1438 
1439 }				/* fabric_get_new_tuple() */
1440 
1441 /***************************************************/
1442 
1443 static inline boolean_t fabric_roots_provided(IN ftree_fabric_t * p_ftree)
1444 {
1445 	return (p_ftree->p_osm->subn.opt.root_guid_file != NULL);
1446 }
1447 
1448 /***************************************************/
1449 
1450 static inline boolean_t fabric_cns_provided(IN ftree_fabric_t * p_ftree)
1451 {
1452 	return (p_ftree->p_osm->subn.opt.cn_guid_file != NULL);
1453 }
1454 
1455 /***************************************************/
1456 
1457 static inline boolean_t fabric_ios_provided(IN ftree_fabric_t * p_ftree)
1458 {
1459 	return (p_ftree->p_osm->subn.opt.io_guid_file != NULL);
1460 }
1461 
1462 /***************************************************/
1463 
1464 static int fabric_mark_leaf_switches(IN ftree_fabric_t * p_ftree)
1465 {
1466 	ftree_sw_t *p_sw;
1467 	ftree_hca_t *p_hca;
1468 	ftree_hca_t *p_next_hca;
1469 	unsigned i;
1470 	int res = 0;
1471 
1472 	OSM_LOG_ENTER(&p_ftree->p_osm->log);
1473 
1474 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1475 		"Marking leaf switches in fabric\n");
1476 
1477 	/* Scan all the CAs, if they have CNs - find CN port and mark switch
1478 	   that is connected to this port as leaf switch.
1479 	   Also, ensure that this marked leaf has rank of p_ftree->leaf_switch_rank. */
1480 	p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
1481 	while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
1482 		p_hca = p_next_hca;
1483 		p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
1484 		if (!p_hca->cn_num)
1485 			continue;
1486 
1487 		for (i = 0; i < p_hca->up_port_groups_num; i++) {
1488 			if (!p_hca->up_port_groups[i]->is_cn)
1489 				continue;
1490 
1491 			/* In CAs, port group alway has one port, and since this
1492 			   port group is CN, we know that this port is compute node */
1493 			CL_ASSERT(p_hca->up_port_groups[i]->remote_node_type ==
1494 				  IB_NODE_TYPE_SWITCH);
1495 			p_sw = p_hca->up_port_groups[i]->remote_hca_or_sw.p_sw;
1496 
1497 			/* check if this switch was already processed */
1498 			if (p_sw->is_leaf)
1499 				continue;
1500 			p_sw->is_leaf = TRUE;
1501 
1502 			/* ensure that this leaf switch is at the correct tree level */
1503 			if (p_sw->rank != p_ftree->leaf_switch_rank) {
1504 				OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
1505 					"ERR AB26: CN port 0x%" PRIx64
1506 					" is connected to switch 0x%" PRIx64
1507 					" with rank %u, "
1508 					"while FatTree leaf rank is %u\n",
1509 					cl_ntoh64(p_hca->
1510 						  up_port_groups[i]->port_guid),
1511 					sw_get_guid_ho(p_sw), p_sw->rank,
1512 					p_ftree->leaf_switch_rank);
1513 				res = -1;
1514 				goto Exit;
1515 
1516 			}
1517 		}
1518 	}
1519 
1520 Exit:
1521 	OSM_LOG_EXIT(&p_ftree->p_osm->log);
1522 	return res;
1523 }				/* fabric_mark_leaf_switches() */
1524 
1525 /***************************************************/
1526 static void bfs_fabric_indexing(IN ftree_fabric_t * p_ftree,
1527 				IN ftree_sw_t *p_first_sw)
1528 {
1529 	ftree_sw_t *p_remote_sw;
1530 	ftree_sw_t *p_sw = NULL;
1531 	ftree_tuple_t new_tuple;
1532 	uint32_t i;
1533 	cl_list_t bfs_list;
1534 
1535 	OSM_LOG_ENTER(&p_ftree->p_osm->log);
1536 	cl_list_init(&bfs_list, cl_qmap_count(&p_ftree->sw_tbl));
1537 	/*
1538 	 * Now run BFS and assign indexes to all switches
1539 	 * Pseudo code of the algorithm is as follows:
1540 	 *
1541 	 *  * Add first switch to BFS queue
1542 	 *  * While (BFS queue not empty)
1543 	 *      - Pop the switch from the head of the queue
1544 	 *      - Scan all the downward and upward ports
1545 	 *      - For each port
1546 	 *          + Get the remote switch
1547 	 *          + Assign index to the remote switch
1548 	 *          + Add remote switch to the BFS queue
1549 	 */
1550 
1551 	cl_list_insert_tail(&bfs_list, p_first_sw);
1552 
1553 	while (!cl_is_list_empty(&bfs_list)) {
1554 		p_sw = (ftree_sw_t *) cl_list_remove_head(&bfs_list);
1555 
1556 		/* Discover all the nodes from ports that are pointing down */
1557 
1558 		if (p_sw->rank >= p_ftree->leaf_switch_rank) {
1559 			/* whether downward ports are pointing to CAs or switches,
1560 			   we don't assign indexes to switches that are located
1561 			   lower than leaf switches */
1562 		} else {
1563 			/* This is not the leaf switch */
1564 			for (i = 0; i < p_sw->down_port_groups_num; i++) {
1565 				/* Work with port groups that are pointing to switches only.
1566 				   No need to assign indexing to HCAs */
1567 				if (p_sw->
1568 				    down_port_groups[i]->remote_node_type !=
1569 				    IB_NODE_TYPE_SWITCH)
1570 					continue;
1571 
1572 				p_remote_sw =
1573 				    p_sw->down_port_groups[i]->
1574 				    remote_hca_or_sw.p_sw;
1575 				if (tuple_assigned(p_remote_sw->tuple)) {
1576 					/* this switch has been already indexed */
1577 					continue;
1578 				}
1579 				/* allocate new tuple */
1580 				fabric_get_new_tuple(p_ftree, new_tuple,
1581 						     p_sw->tuple,
1582 						     FTREE_DIRECTION_DOWN);
1583 				/* Assign the new tuple to the remote switch.
1584 				   This fuction also adds the switch into the switch_by_tuple table. */
1585 				fabric_assign_tuple(p_ftree, p_remote_sw,
1586 						    new_tuple);
1587 
1588 				/* add the newly discovered switch to the BFS queue */
1589 				cl_list_insert_tail(&bfs_list, p_remote_sw);
1590 			}
1591 			/* Done assigning indexes to all the remote switches
1592 			   that are pointed by the downgoing ports.
1593 			   Now sort port groups according to remote index. */
1594 			qsort(p_sw->down_port_groups,	/* array */
1595 			      p_sw->down_port_groups_num,	/* number of elements */
1596 			      sizeof(ftree_port_group_t *),	/* size of each element */
1597 			      compare_port_groups_by_remote_switch_index);	/* comparator */
1598 		}
1599 
1600 		/* Done indexing switches from ports that go down.
1601 		   Now do the same with ports that are pointing up.
1602 		   if we started from root (rank == 0), the leaf is bsf termination point */
1603 
1604 		if (p_sw->rank != 0 && (p_first_sw->rank != 0 || !p_sw->is_leaf)) {
1605 			/* This is not the root switch, which means that all the ports
1606 			   that are pointing up are taking us to another switches. */
1607 			for (i = 0; i < p_sw->up_port_groups_num; i++) {
1608 				p_remote_sw =
1609 				    p_sw->up_port_groups[i]->
1610 				    remote_hca_or_sw.p_sw;
1611 				if (tuple_assigned(p_remote_sw->tuple))
1612 					continue;
1613 				/* allocate new tuple */
1614 				fabric_get_new_tuple(p_ftree, new_tuple,
1615 						     p_sw->tuple,
1616 						     FTREE_DIRECTION_UP);
1617 				/* Assign the new tuple to the remote switch.
1618 				   This fuction also adds the switch to the
1619 				   switch_by_tuple table. */
1620 				fabric_assign_tuple(p_ftree,
1621 						    p_remote_sw, new_tuple);
1622 				/* add the newly discovered switch to the BFS queue */
1623 				cl_list_insert_tail(&bfs_list, p_remote_sw);
1624 			}
1625 			/* Done assigning indexes to all the remote switches
1626 			   that are pointed by the upgoing ports.
1627 			   Now sort port groups according to remote index. */
1628 			qsort(p_sw->up_port_groups,	/* array */
1629 			      p_sw->up_port_groups_num,	/* number of elements */
1630 			      sizeof(ftree_port_group_t *),	/* size of each element */
1631 			      compare_port_groups_by_remote_switch_index);	/* comparator */
1632 		}
1633 		/* Done assigning indexes to all the switches that are directly connected
1634 		   to the current switch - go to the next switch in the BFS queue */
1635 	}
1636 	cl_list_destroy(&bfs_list);
1637 
1638 	OSM_LOG_EXIT(&p_ftree->p_osm->log);
1639 }
1640 
1641 static void fabric_make_indexing(IN ftree_fabric_t * p_ftree)
1642 {
1643 	ftree_sw_t *p_sw = NULL;
1644 	unsigned int subtree = 0;
1645 	OSM_LOG_ENTER(&p_ftree->p_osm->log);
1646 
1647 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1648 		"Starting FatTree indexing\n");
1649 
1650 	/* using the first switch as a starting point for indexing algorithm. */
1651 	for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1652 	     p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
1653 	     p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {
1654 		if (ftree_get_subnet(p_ftree)->opt.quasi_ftree_indexing) {
1655 			/* find first root switch */
1656 			if (p_sw->rank != 0)
1657 				continue;
1658 		} else {
1659 			/* find first leaf switch */
1660 			if (!p_sw->is_leaf)
1661 				continue;
1662 		}
1663 		/* Assign the first tuple to the switch that is used as BFS starting point
1664 		   in the subtree.
1665 		   The tuple will be as follows: [rank].0...0.subtree
1666 		   This fuction also adds the switch it into the switch_by_tuple table. */
1667 		if (!tuple_assigned(p_sw->tuple)) {
1668 			fabric_assign_first_tuple(p_ftree, p_sw, subtree++);
1669 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1670 			"Indexing starting point:\n"
1671 			"                                            - Switch rank  : %u\n"
1672 			"                                            - Switch index : %s\n"
1673 			"                                            - Node LID     : %u\n"
1674 			"                                            - Node GUID    : 0x%016"
1675 			PRIx64 "\n", p_sw->rank, tuple_to_str(p_sw->tuple),
1676 			p_sw->lid, sw_get_guid_ho(p_sw));
1677 		}
1678 
1679 		bfs_fabric_indexing(p_ftree, p_sw);
1680 
1681 		if (ftree_get_subnet(p_ftree)->opt.quasi_ftree_indexing == FALSE)
1682 			goto Exit;
1683 	}
1684 	p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1685 	while (p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
1686 		if (p_sw->is_leaf) {
1687 			qsort(p_sw->up_port_groups,	/* array */
1688 			      p_sw->up_port_groups_num,	/* number of elements */
1689 			      sizeof(ftree_port_group_t *),	/* size of each element */
1690 			      compare_port_groups_by_remote_switch_index);	/* comparator */
1691 		}
1692 		p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
1693 
1694 	}
1695 Exit:
1696 	OSM_LOG_EXIT(&p_ftree->p_osm->log);
1697 }				/* fabric_make_indexing() */
1698 /***************************************************/
1699 
1700 static int fabric_create_leaf_switch_array(IN ftree_fabric_t * p_ftree)
1701 {
1702 	ftree_sw_t *p_sw;
1703 	ftree_sw_t *p_next_sw;
1704 	ftree_sw_t **all_switches_at_leaf_level;
1705 	unsigned i;
1706 	unsigned all_leaf_idx = 0;
1707 	unsigned first_leaf_idx;
1708 	unsigned last_leaf_idx;
1709 	int res = 0;
1710 
1711 	OSM_LOG_ENTER(&p_ftree->p_osm->log);
1712 
1713 	/* create array of ALL the switches that have leaf rank */
1714 	all_switches_at_leaf_level = (ftree_sw_t **)
1715 	    malloc(cl_qmap_count(&p_ftree->sw_tbl) * sizeof(ftree_sw_t *));
1716 	if (!all_switches_at_leaf_level) {
1717 		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_SYS, FILE_ID,
1718 			   "Fat-tree routing: Memory allocation failed\n");
1719 		res = -1;
1720 		goto Exit;
1721 	}
1722 	memset(all_switches_at_leaf_level, 0,
1723 	       cl_qmap_count(&p_ftree->sw_tbl) * sizeof(ftree_sw_t *));
1724 
1725 	p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1726 	while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
1727 		p_sw = p_next_sw;
1728 		p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
1729 		if (p_sw->rank == p_ftree->leaf_switch_rank) {
1730 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1731 				"Adding switch 0x%" PRIx64
1732 				" to full leaf switch array\n",
1733 				sw_get_guid_ho(p_sw));
1734 			all_switches_at_leaf_level[all_leaf_idx++] = p_sw;
1735 		}
1736 	}
1737 
1738 	/* quick-sort array of leaf switches by index */
1739 	qsort(all_switches_at_leaf_level,	/* array */
1740 	      all_leaf_idx,	/* number of elements */
1741 	      sizeof(ftree_sw_t *),	/* size of each element */
1742 	      compare_switches_by_index);	/* comparator */
1743 
1744 	/* check the first and the last REAL leaf (the one
1745 	   that has CNs) in the array of all the leafs */
1746 
1747 	first_leaf_idx = all_leaf_idx;
1748 	last_leaf_idx = 0;
1749 	for (i = 0; i < all_leaf_idx; i++) {
1750 		if (all_switches_at_leaf_level[i]->is_leaf) {
1751 			if (i < first_leaf_idx)
1752 				first_leaf_idx = i;
1753 			last_leaf_idx = i;
1754 		}
1755 	}
1756 
1757 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1758 		"Full leaf array info: first_leaf_idx = %u, last_leaf_idx = %u\n",
1759 		first_leaf_idx, last_leaf_idx);
1760 
1761 	if (first_leaf_idx >= last_leaf_idx) {
1762 		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
1763 			   "Failed to find leaf switches - topology is not "
1764 			   "fat-tree\n");
1765 		res = -1;
1766 		goto Exit;
1767 	}
1768 
1769 	/* Create array of REAL leaf switches, sorted by index.
1770 	   This array may contain switches at the same rank w/o CNs,
1771 	   in case this is the order of indexing. */
1772 	p_ftree->leaf_switches_num = last_leaf_idx - first_leaf_idx + 1;
1773 	p_ftree->leaf_switches = (ftree_sw_t **)
1774 	    malloc(p_ftree->leaf_switches_num * sizeof(ftree_sw_t *));
1775 	if (!p_ftree->leaf_switches) {
1776 		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_SYS, FILE_ID,
1777 			   "Fat-tree routing: Memory allocation failed\n");
1778 		res = -1;
1779 		goto Exit;
1780 	}
1781 
1782 	memcpy(p_ftree->leaf_switches,
1783 	       &(all_switches_at_leaf_level[first_leaf_idx]),
1784 	       p_ftree->leaf_switches_num * sizeof(ftree_sw_t *));
1785 
1786 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
1787 		"Created array of %u leaf switches\n",
1788 		p_ftree->leaf_switches_num);
1789 
1790 Exit:
1791 	free(all_switches_at_leaf_level);
1792 	OSM_LOG_EXIT(&p_ftree->p_osm->log);
1793 	return res;
1794 }				/* fabric_create_leaf_switch_array() */
1795 
1796 /***************************************************/
1797 
1798 static void fabric_set_max_cn_per_leaf(IN ftree_fabric_t * p_ftree)
1799 {
1800 	unsigned i;
1801 	unsigned j;
1802 	unsigned cns_on_this_leaf;
1803 	ftree_sw_t *p_sw;
1804 	ftree_port_group_t *p_group, *p_up_group;
1805 	ftree_hca_t *p_hca;
1806 
1807 	for (i = 0; i < p_ftree->leaf_switches_num; i++) {
1808 		p_sw = p_ftree->leaf_switches[i];
1809 		cns_on_this_leaf = 0;
1810 		for (j = 0; j < p_sw->down_port_groups_num; j++) {
1811 			p_group = p_sw->down_port_groups[j];
1812 			if (p_group->remote_node_type != IB_NODE_TYPE_CA)
1813 				continue;
1814 			p_hca = p_group->remote_hca_or_sw.p_hca;
1815 			/*
1816 			 * Get the hca port group corresponding
1817 			 * to the LID of remote HCA port
1818 			 */
1819 			p_up_group = hca_get_port_group_by_lid(p_hca,
1820 				     p_group->remote_lid);
1821 
1822 			CL_ASSERT(p_up_group);
1823 
1824 			if (p_up_group->is_cn)
1825 				cns_on_this_leaf++;
1826 		}
1827 		if (cns_on_this_leaf > p_ftree->max_cn_per_leaf)
1828 			p_ftree->max_cn_per_leaf = cns_on_this_leaf;
1829 	}
1830 }				/* fabric_set_max_cn_per_leaf() */
1831 
1832 /***************************************************/
1833 
1834 static boolean_t fabric_validate_topology(IN ftree_fabric_t * p_ftree)
1835 {
1836 	ftree_port_group_t *p_group;
1837 	ftree_port_group_t *p_ref_group;
1838 	ftree_sw_t *p_sw;
1839 	ftree_sw_t *p_next_sw;
1840 	ftree_sw_t **reference_sw_arr;
1841 	uint16_t tree_rank = fabric_get_rank(p_ftree);
1842 	boolean_t res = TRUE;
1843 	uint8_t i;
1844 
1845 	OSM_LOG_ENTER(&p_ftree->p_osm->log);
1846 
1847 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
1848 		"Validating fabric topology\n");
1849 
1850 	reference_sw_arr =
1851 	    (ftree_sw_t **) malloc(tree_rank * sizeof(ftree_sw_t *));
1852 	if (reference_sw_arr == NULL) {
1853 		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_SYS, FILE_ID,
1854 			   "Fat-tree routing: Memory allocation failed\n");
1855 		return FALSE;
1856 	}
1857 	memset(reference_sw_arr, 0, tree_rank * sizeof(ftree_sw_t *));
1858 
1859 	p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
1860 	while (res && p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
1861 		p_sw = p_next_sw;
1862 		p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
1863 
1864 		if (!reference_sw_arr[p_sw->rank])
1865 			/* This is the first switch in the current level that
1866 			   we're checking - use it as a reference */
1867 			reference_sw_arr[p_sw->rank] = p_sw;
1868 		else {
1869 			/* compare this switch properties to the reference switch */
1870 
1871 			if (reference_sw_arr[p_sw->rank]->up_port_groups_num !=
1872 			    p_sw->up_port_groups_num) {
1873 				OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
1874 					"ERR AB09: Different number of upward port groups on switches:\n"
1875 					"       GUID 0x%016" PRIx64
1876 					", LID %u, Index %s - %u groups\n"
1877 					"       GUID 0x%016" PRIx64
1878 					", LID %u, Index %s - %u groups\n",
1879 					sw_get_guid_ho
1880 					(reference_sw_arr[p_sw->rank]),
1881 					reference_sw_arr[p_sw->rank]->lid,
1882 					tuple_to_str
1883 					(reference_sw_arr[p_sw->rank]->tuple),
1884 					reference_sw_arr[p_sw->
1885 							 rank]->
1886 					up_port_groups_num,
1887 					sw_get_guid_ho(p_sw), p_sw->lid,
1888 					tuple_to_str(p_sw->tuple),
1889 					p_sw->up_port_groups_num);
1890 				res = FALSE;
1891 				break;
1892 			}
1893 
1894 			if (p_sw->rank != (tree_rank - 1) &&
1895 			    reference_sw_arr[p_sw->
1896 					     rank]->down_port_groups_num !=
1897 			    p_sw->down_port_groups_num) {
1898 				/* we're allowing some hca's to be missing */
1899 				OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
1900 					"ERR AB0A: Different number of downward port groups on switches:\n"
1901 					"       GUID 0x%016" PRIx64
1902 					", LID %u, Index %s - %u port groups\n"
1903 					"       GUID 0x%016" PRIx64
1904 					", LID %u, Index %s - %u port groups\n",
1905 					sw_get_guid_ho
1906 					(reference_sw_arr[p_sw->rank]),
1907 					reference_sw_arr[p_sw->rank]->lid,
1908 					tuple_to_str
1909 					(reference_sw_arr[p_sw->rank]->tuple),
1910 					reference_sw_arr[p_sw->
1911 							 rank]->
1912 					down_port_groups_num,
1913 					sw_get_guid_ho(p_sw), p_sw->lid,
1914 					tuple_to_str(p_sw->tuple),
1915 					p_sw->down_port_groups_num);
1916 				res = FALSE;
1917 				break;
1918 			}
1919 
1920 			if (reference_sw_arr[p_sw->rank]->up_port_groups_num !=
1921 			    0) {
1922 				p_ref_group =
1923 				    reference_sw_arr[p_sw->
1924 						     rank]->up_port_groups[0];
1925 				for (i = 0; i < p_sw->up_port_groups_num; i++) {
1926 					p_group = p_sw->up_port_groups[i];
1927 					if (cl_ptr_vector_get_size
1928 					    (&p_ref_group->ports) !=
1929 					    cl_ptr_vector_get_size
1930 					    (&p_group->ports)) {
1931 						OSM_LOG(&p_ftree->p_osm->log,
1932 							OSM_LOG_ERROR,
1933 							"ERR AB0B: Different number of ports in an upward port group on switches:\n"
1934 							"       GUID 0x%016"
1935 							PRIx64
1936 							", LID %u, Index %s - %u ports\n"
1937 							"       GUID 0x%016"
1938 							PRIx64
1939 							", LID %u, Index %s - %u ports\n",
1940 							sw_get_guid_ho
1941 							(reference_sw_arr
1942 							 [p_sw->rank]),
1943 							reference_sw_arr[p_sw->
1944 									 rank]->
1945 							lid,
1946 							tuple_to_str
1947 							(reference_sw_arr
1948 							 [p_sw->rank]->tuple),
1949 							cl_ptr_vector_get_size
1950 							(&p_ref_group->ports),
1951 							sw_get_guid_ho(p_sw),
1952 							p_sw->lid,
1953 							tuple_to_str(p_sw->
1954 								     tuple),
1955 							cl_ptr_vector_get_size
1956 							(&p_group->ports));
1957 						res = FALSE;
1958 						break;
1959 					}
1960 				}
1961 			}
1962 			if (reference_sw_arr[p_sw->rank]->down_port_groups_num
1963 			    != 0 && p_sw->rank != (tree_rank - 1)) {
1964 				/* we're allowing some hca's to be missing */
1965 				p_ref_group =
1966 				    reference_sw_arr[p_sw->
1967 						     rank]->down_port_groups[0];
1968 				for (i = 0; i < p_sw->down_port_groups_num; i++) {
1969 					p_group = p_sw->down_port_groups[0];
1970 					if (cl_ptr_vector_get_size
1971 					    (&p_ref_group->ports) !=
1972 					    cl_ptr_vector_get_size
1973 					    (&p_group->ports)) {
1974 						OSM_LOG(&p_ftree->p_osm->log,
1975 							OSM_LOG_ERROR,
1976 							"ERR AB0C: Different number of ports in an downward port group on switches:\n"
1977 							"       GUID 0x%016"
1978 							PRIx64
1979 							", LID %u, Index %s - %u ports\n"
1980 							"       GUID 0x%016"
1981 							PRIx64
1982 							", LID %u, Index %s - %u ports\n",
1983 							sw_get_guid_ho
1984 							(reference_sw_arr
1985 							 [p_sw->rank]),
1986 							reference_sw_arr[p_sw->
1987 									 rank]->
1988 							lid,
1989 							tuple_to_str
1990 							(reference_sw_arr
1991 							 [p_sw->rank]->tuple),
1992 							cl_ptr_vector_get_size
1993 							(&p_ref_group->ports),
1994 							sw_get_guid_ho(p_sw),
1995 							p_sw->lid,
1996 							tuple_to_str(p_sw->
1997 								     tuple),
1998 							cl_ptr_vector_get_size
1999 							(&p_group->ports));
2000 						res = FALSE;
2001 						break;
2002 					}
2003 				}
2004 			}
2005 		}		/* end of else */
2006 	}			/* end of while */
2007 
2008 	if (res == TRUE)
2009 		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
2010 			"Fabric topology has been identified as FatTree\n");
2011 	else
2012 		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
2013 			"ERR AB0D: Fabric topology hasn't been identified as FatTree\n");
2014 
2015 	free(reference_sw_arr);
2016 	OSM_LOG_EXIT(&p_ftree->p_osm->log);
2017 	return res;
2018 }				/* fabric_validate_topology() */
2019 
2020 /***************************************************
2021  ***************************************************/
2022 
2023 static void set_sw_fwd_table(IN cl_map_item_t * const p_map_item,
2024 			     IN void *context)
2025 {
2026 	ftree_sw_t *p_sw = (ftree_sw_t * const)p_map_item;
2027 	ftree_fabric_t *p_ftree = (ftree_fabric_t *) context;
2028 
2029 	p_sw->p_osm_sw->max_lid_ho = p_ftree->lft_max_lid;
2030 }
2031 
2032 /***************************************************
2033  ***************************************************/
2034 
2035 /*
2036  * Function: Finds the least loaded port group and stores its counter
2037  * Given   : A switch
2038  */
2039 static inline void recalculate_min_counter_down(ftree_sw_t * p_sw)
2040 {
2041 	uint32_t min = (1 << 30);
2042 	uint32_t i;
2043 	for (i = 0; i < p_sw->down_port_groups_num; i++) {
2044 		if (p_sw->down_port_groups[i]->counter_down < min) {
2045 			min = p_sw->down_port_groups[i]->counter_down;
2046 		}
2047 	}
2048 	p_sw->min_counter_down = min;
2049 	return;
2050 }
2051 
2052 /*
2053  * Function: Return the counter value of the least loaded down port group
2054  * Given   : A switch
2055  */
2056 static inline uint32_t find_lowest_loaded_group_on_sw(ftree_sw_t * p_sw)
2057 {
2058 	return p_sw->min_counter_down;
2059 }
2060 
2061 /*
2062  * Function: Compare the load of two port groups and return which is the least loaded
2063  * Given   : Two port groups with remote switch
2064  * When both port groups are equally loaded, it picks the one whom
2065  * remote switch down ports are least loaded.
2066  * This way, it prefers the switch from where it will be easier to go down (creating upward routes).
2067  * If both are equal, it picks the lowest INDEX to be deterministic.
2068  */
2069 static inline int port_group_compare_load_down(const ftree_port_group_t * p1,
2070 					       const ftree_port_group_t * p2)
2071 {
2072 	int temp = p1->counter_down - p2->counter_down;
2073 	if (temp > 0)
2074 		return 1;
2075 	if (temp < 0)
2076 		return -1;
2077 
2078 	/* Find the less loaded remote sw and choose this one */
2079 	do {
2080 		uint32_t load1 =
2081 		    find_lowest_loaded_group_on_sw(p1->remote_hca_or_sw.p_sw);
2082 		uint32_t load2 =
2083 		    find_lowest_loaded_group_on_sw(p2->remote_hca_or_sw.p_sw);
2084 		temp = load1 - load2;
2085 		if (temp > 0)
2086 			return 1;
2087 	} while (0);
2088 	/* If they are both equal, choose the lowest index */
2089 	return compare_port_groups_by_remote_switch_index(&p1, &p2);
2090 }
2091 
2092 static inline int port_group_compare_load_up(const ftree_port_group_t * p1,
2093                                              const ftree_port_group_t * p2)
2094 {
2095         int temp = p1->counter_up - p2->counter_up;
2096         if (temp > 0)
2097                 return 1;
2098         if (temp < 0)
2099                 return -1;
2100 
2101         /* If they are both equal, choose the lowest index */
2102         return compare_port_groups_by_remote_switch_index (&p1,&p2);
2103 }
2104 
2105 /*
2106  * Function: Sorts an array of port group by up load order
2107  * Given   : A port group array and its length
2108  * As the list is mostly sorted, we used a bubble sort instead of qsort
2109  * as it is much faster.
2110  *
2111  * Important note:
2112  * This function and bubble_sort_down must NOT be factorized.
2113  * Although most of the code is the same and a function pointer could be used
2114  * for the compareason function, it would prevent the compareason function to be inlined
2115  * and cost a great deal to performances.
2116  */
2117 static inline void
2118 bubble_sort_up(ftree_port_group_t ** p_group_array, uint32_t nmemb)
2119 {
2120 	uint32_t i = 0;
2121 	uint32_t j = 0;
2122 	ftree_port_group_t *tmp = p_group_array[0];
2123 
2124 	/* As this function is a great number of times, we only go into the loop
2125 	 * if one of the port counters has changed, thus saving some tests */
2126 	if (tmp->hca_or_sw.p_sw->counter_up_changed == FALSE) {
2127 		return;
2128 	}
2129 	/* While we did modifications on the array order */
2130 	/* i may grew above array length but next loop will fail and tmp will be null for the next time
2131 	 * this way we save a test i < nmemb for each pass through the loop */
2132 	for (i = 0; tmp; i++) {
2133 		/* Assume the array is orderd */
2134 		tmp = NULL;
2135 		/* Comparing elements j and j-1 */
2136 		for (j = 1; j < (nmemb - i); j++) {
2137 			/* If they are the wrong way around */
2138 			if (port_group_compare_load_up(p_group_array[j],
2139 						       p_group_array[j - 1]) < 0) {
2140 				/* We invert them */
2141 				tmp = p_group_array[j - 1];
2142 				p_group_array[j - 1] = p_group_array[j];
2143 				p_group_array[j] = tmp;
2144 				/* This sets tmp != NULL so the main loop will make another pass */
2145 			}
2146 		}
2147 	}
2148 
2149 	/* We have reordered the array so as long noone changes the counter
2150 	 * it's not necessary to do it again */
2151 	p_group_array[0]->hca_or_sw.p_sw->counter_up_changed = FALSE;
2152 }
2153 
2154 static inline void
2155 bubble_sort_siblings(ftree_port_group_t ** p_group_array, uint32_t nmemb)
2156 {
2157 	uint32_t i = 0;
2158 	uint32_t j = 0;
2159 	ftree_port_group_t *tmp = p_group_array[0];
2160 
2161 	/* While we did modifications on the array order */
2162 	/* i may grew above array length but next loop will fail and tmp will be null for the next time
2163 	 * this way we save a test i < nmemb for each pass through the loop */
2164 	for (i = 0; tmp != NULL; i++) {
2165 		/* Assume the array is orderd */
2166 		tmp = NULL;
2167 		/* Comparing elements j and j-1 */
2168 		for (j = 1; j < (nmemb - i); j++) {
2169 			/* If they are the wrong way around */
2170 			if (port_group_compare_load_up(p_group_array[j],
2171 						       p_group_array[j - 1]) < 0) {
2172 				/* We invert them */
2173 				tmp = p_group_array[j - 1];
2174 				p_group_array[j - 1] = p_group_array[j];
2175 				p_group_array[j] = tmp;
2176 			}
2177 		}
2178 	}
2179 }
2180 
2181 /*
2182  * Function: Sorts an array of port group. Order is decide through
2183  * port_group_compare_load_down ( up counters, least load remote switch, biggest GUID)
2184  * Given   : A port group array and its length. Each port group points to a remote switch (not a HCA)
2185  * As the list is mostly sorted, we used a bubble sort instead of qsort
2186  * as it is much faster.
2187  *
2188  * Important note:
2189  * This function and bubble_sort_up must NOT be factorized.
2190  * Although most of the code is the same and a function pointer could be used
2191  * for the compareason function, it would prevent the compareason function to be inlined
2192  * and cost a great deal to performances.
2193  */
2194 static inline void
2195 bubble_sort_down(ftree_port_group_t ** p_group_array, uint32_t nmemb)
2196 {
2197 	uint32_t i = 0;
2198 	uint32_t j = 0;
2199 	ftree_port_group_t *tmp = p_group_array[0];
2200 
2201 	/* While we did modifications on the array order */
2202 	/* i may grew above array length but next loop will fail and tmp will be null for the next time
2203 	 * this way we save a test i < nmemb for each pass through the loop */
2204 	for (i = 0; tmp; i++) {
2205 		/* Assume the array is orderd */
2206 		tmp = NULL;
2207 		/* Comparing elements j and j-1 */
2208 		for (j = 1; j < (nmemb - i); j++) {
2209 			/* If they are the wrong way around */
2210 			if (port_group_compare_load_down
2211 			    (p_group_array[j], p_group_array[j - 1]) < 0) {
2212 				/* We invert them */
2213 				tmp = p_group_array[j - 1];
2214 				p_group_array[j - 1] = p_group_array[j];
2215 				p_group_array[j] = tmp;
2216 
2217 			}
2218 		}
2219 	}
2220 }
2221 
2222 /***************************************************
2223  ***************************************************/
2224 
2225 /*
2226  * Function: assign-up-going-port-by-descending-down
2227  * Given   : a switch and a LID
2228  * Pseudo code:
2229  *    foreach down-going-port-group (in indexing order)
2230  *        skip this group if the LFT(LID) port is part of this group
2231  *        find the least loaded port of the group (scan in indexing order)
2232  *        r-port is the remote port connected to it
2233  *        assign the remote switch node LFT(LID) to r-port
2234  *        increase r-port usage counter
2235  *        assign-up-going-port-by-descending-down to r-port node (recursion)
2236  */
2237 
2238 static boolean_t
2239 fabric_route_upgoing_by_going_down(IN ftree_fabric_t * p_ftree,
2240 				   IN ftree_sw_t * p_sw,
2241 				   IN ftree_sw_t * p_prev_sw,
2242 				   IN uint16_t target_lid,
2243 				   IN boolean_t is_main_path,
2244 				   IN boolean_t is_target_a_sw,
2245 				   IN uint8_t current_hops)
2246 {
2247 	ftree_sw_t *p_remote_sw;
2248 	uint16_t ports_num;
2249 	ftree_port_group_t *p_group;
2250 	ftree_port_t *p_port;
2251 	ftree_port_t *p_min_port;
2252 	uint16_t j;
2253 	uint16_t k;
2254 	boolean_t created_route = FALSE;
2255 	boolean_t routed = 0;
2256 	uint8_t least_hops;
2257 
2258 	/* if there is no down-going ports */
2259 	if (p_sw->down_port_groups_num == 0)
2260 		return FALSE;
2261 
2262 	/* foreach down-going port group (in load order) */
2263 	bubble_sort_up(p_sw->down_port_groups, p_sw->down_port_groups_num);
2264 
2265 	if (p_sw->sibling_port_groups_num > 0)
2266 		bubble_sort_siblings(p_sw->sibling_port_groups,
2267 				     p_sw->sibling_port_groups_num);
2268 
2269 	for (k = 0;
2270 	     k <
2271 	     (p_sw->down_port_groups_num +
2272 	      ((target_lid != 0) ? p_sw->sibling_port_groups_num : 0)); k++) {
2273 
2274 		if (k < p_sw->down_port_groups_num) {
2275 			p_group = p_sw->down_port_groups[k];
2276 		} else {
2277 			p_group =
2278 			    p_sw->sibling_port_groups[k -
2279 						      p_sw->
2280 						      down_port_groups_num];
2281 		}
2282 
2283 		/* If this port group doesn't point to a switch, mark
2284 		   that the route was created and skip to the next group */
2285 		if (p_group->remote_node_type != IB_NODE_TYPE_SWITCH) {
2286 			created_route = TRUE;
2287 			continue;
2288 		}
2289 
2290 		if (p_prev_sw
2291 		    && p_group->remote_lid == p_prev_sw->lid) {
2292 			/* This port group has a port that was used when we entered this switch,
2293 			   which means that the current group points to the switch where we were
2294 			   at the previous step of the algorithm (before going up).
2295 			   Skipping this group. */
2296 			continue;
2297 		}
2298 
2299 		/* find the least loaded port of the group (in indexing order) */
2300 		p_min_port = NULL;
2301 		ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports);
2302 		if(ports_num == 0)
2303 			continue;
2304 
2305 		for (j = 0; j < ports_num; j++) {
2306 			cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port);
2307 			/* first port that we're checking - set as port with the lowest load */
2308 			/* or this port is less loaded - use it as min */
2309 			if (!p_min_port ||
2310 			    p_port->counter_up < p_min_port->counter_up)
2311 				p_min_port = p_port;
2312 		}
2313 		/* At this point we have selected a port in this group with the
2314 		   lowest load of upgoing routes.
2315 		   Set on the remote switch how to get to the target_lid -
2316 		   set LFT(target_lid) on the remote switch to the remote port */
2317 		p_remote_sw = p_group->remote_hca_or_sw.p_sw;
2318 		least_hops = sw_get_least_hops(p_remote_sw, target_lid);
2319 
2320 		if (least_hops != OSM_NO_PATH) {
2321 			/* Loop in the fabric - we already routed the remote switch
2322 			   on our way UP, and now we see it again on our way DOWN */
2323 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2324 				"Loop of length %d in the fabric:\n                             "
2325 				"Switch %s (LID %u) closes loop through switch %s (LID %u)\n",
2326 				current_hops,
2327 				tuple_to_str(p_remote_sw->tuple),
2328 				p_group->lid,
2329 				tuple_to_str(p_sw->tuple),
2330 				p_group->remote_lid);
2331 			/* We skip only if we have come through a longer path */
2332 			if (current_hops + 1 >= least_hops)
2333 				continue;
2334 		}
2335 
2336 		/* Four possible cases:
2337 		 *
2338 		 *  1. is_main_path == TRUE:
2339 		 *      - going DOWN(TRUE,TRUE) through ALL the groups
2340 		 *         + promoting port counter
2341 		 *         + setting path in remote switch fwd tbl
2342 		 *         + setting hops in remote switch on all the ports of each group
2343 		 *
2344 		 *  2. is_main_path == FALSE:
2345 		 *      - going DOWN(TRUE,FALSE) through ALL the groups but only if
2346 		 *        the remote (lower) switch hasn't been already configured
2347 		 *        for this target LID (or with a longer path)
2348 		 *         + promoting port counter
2349 		 *         + setting path in remote switch fwd tbl if it hasn't been set yet
2350 		 *         + setting hops in remote switch on all the ports of each group
2351 		 *           if it hasn't been set yet
2352 		 */
2353 
2354 		/* setting fwd tbl port only */
2355 		p_remote_sw->p_osm_sw->new_lft[target_lid] =
2356 			    p_min_port->remote_port_num;
2357 		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2358 				"Switch %s: set path to CA LID %u through port %u\n",
2359 				tuple_to_str(p_remote_sw->tuple),
2360 				target_lid, p_min_port->remote_port_num);
2361 
2362 		/* On the remote switch that is pointed by the p_group,
2363 			set hops for ALL the ports in the remote group. */
2364 
2365 		set_hops_on_remote_sw(p_group, target_lid,
2366 				      current_hops + 1, is_target_a_sw);
2367 
2368 		/* Recursion step:
2369 		   Assign upgoing ports by stepping down, starting on REMOTE switch */
2370 		routed = fabric_route_upgoing_by_going_down(p_ftree, p_remote_sw,	/* remote switch - used as a route-upgoing alg. start point */
2371 							    NULL,	/* prev. position - NULL to mark that we went down and not up */
2372 							    target_lid,	/* LID that we're routing to */
2373 							    is_main_path,	/* whether this is path to HCA that should by tracked by counters */
2374 							    is_target_a_sw,	/* Whether target lid is a switch or not */
2375 							    current_hops + 1);	/* Number of hops done to this point */
2376 		created_route |= routed;
2377 		/* Counters are promoted only if a route toward a node is created */
2378 		if (routed) {
2379 			p_min_port->counter_up++;
2380 			p_group->counter_up++;
2381 			p_group->hca_or_sw.p_sw->counter_up_changed = TRUE;
2382 		}
2383 	}
2384 	/* done scanning all the down-going port groups */
2385 
2386 	/* if the route was created, promote the index that
2387 	   indicates which group should we start with when
2388 	   going through all the downgoing groups */
2389 	if (created_route)
2390 		p_sw->down_port_groups_idx = (p_sw->down_port_groups_idx + 1)
2391 		    % p_sw->down_port_groups_num;
2392 
2393 	return created_route;
2394 }				/* fabric_route_upgoing_by_going_down() */
2395 
2396 /***************************************************/
2397 
2398 /*
2399  * Function: assign-down-going-port-by-ascending-up
2400  * Given   : a switch and a LID
2401  * Pseudo code:
2402  *    find the least loaded port of all the upgoing groups (scan in indexing order)
2403  *    assign the LFT(LID) of remote switch to that port
2404  *    track that port usage
2405  *    assign-up-going-port-by-descending-down on CURRENT switch
2406  *    assign-down-going-port-by-ascending-up on REMOTE switch (recursion)
2407  */
2408 
2409 static boolean_t
2410 fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
2411 				   IN ftree_sw_t * p_sw,
2412 				   IN ftree_sw_t * p_prev_sw,
2413 				   IN uint16_t target_lid,
2414 				   IN boolean_t is_main_path,
2415 				   IN boolean_t is_target_a_sw,
2416 				   IN uint16_t reverse_hop_credit,
2417 				   IN uint16_t reverse_hops,
2418 				   IN uint8_t current_hops)
2419 {
2420 	ftree_sw_t *p_remote_sw;
2421 	uint16_t ports_num;
2422 	ftree_port_group_t *p_group;
2423 	ftree_port_t *p_port;
2424 	ftree_port_group_t *p_min_group;
2425 	ftree_port_t *p_min_port;
2426 	uint16_t i;
2427 	uint16_t j;
2428 	boolean_t created_route = FALSE;
2429 	boolean_t routed = FALSE;
2430 
2431 
2432 	/* Assign upgoing ports by stepping down, starting on THIS switch */
2433 	created_route = fabric_route_upgoing_by_going_down(p_ftree, p_sw,	/* local switch - used as a route-upgoing alg. start point */
2434 							   p_prev_sw,	/* switch that we went up from (NULL means that we went down) */
2435 							   target_lid,	/* LID that we're routing to */
2436 							   is_main_path,	/* whether this path to HCA should by tracked by counters */
2437 							   is_target_a_sw,	/* Whether target lid is a switch or not */
2438 							   current_hops);	/* Number of hops done up to this point */
2439 
2440 	/* recursion stop condition - if it's a root switch, */
2441 	if (p_sw->rank == 0) {
2442 		if (reverse_hop_credit > 0) {
2443 			/* We go up by going down as we have some reverse_hop_credit left */
2444 			/* We use the index to scatter a bit the reverse up routes */
2445 			p_sw->down_port_groups_idx =
2446 			    (p_sw->down_port_groups_idx +
2447 			     1) % p_sw->down_port_groups_num;
2448 			i = p_sw->down_port_groups_idx;
2449 			for (j = 0; j < p_sw->down_port_groups_num; j++) {
2450 
2451 				p_group = p_sw->down_port_groups[i];
2452 				i = (i + 1) % p_sw->down_port_groups_num;
2453 
2454 				/* Skip this port group unless it points to a switch */
2455 				if (p_group->remote_node_type !=
2456 				    IB_NODE_TYPE_SWITCH)
2457 					continue;
2458 				p_remote_sw = p_group->remote_hca_or_sw.p_sw;
2459 
2460 				created_route |= fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw,	/* remote switch - used as a route-downgoing alg. next step point */
2461 										    p_sw,	/* this switch - prev. position switch for the function */
2462 										    target_lid,	/* LID that we're routing to */
2463 										    is_main_path,	/* whether this is path to HCA that should by tracked by counters */
2464 										    is_target_a_sw,	/* Whether target lid is a switch or not */
2465 										    reverse_hop_credit - 1,	/* Remaining reverse_hops allowed */
2466 										    reverse_hops + 1,	/* Number of reverse_hops done up to this point */
2467 										    current_hops
2468 										    +
2469 										    1);
2470 			}
2471 
2472 		}
2473 		return created_route;
2474 	}
2475 
2476 	/* We should generate a list of port sorted by load so we can find easily the least
2477 	 * going port and explore the other pots on secondary routes more easily (and quickly) */
2478 	bubble_sort_down(p_sw->up_port_groups, p_sw->up_port_groups_num);
2479 
2480 	p_min_group = p_sw->up_port_groups[0];
2481 	/* Find the least loaded upgoing port in the selected group */
2482 	p_min_port = NULL;
2483 	ports_num = (uint16_t) cl_ptr_vector_get_size(&p_min_group->ports);
2484 	for (j = 0; j < ports_num; j++) {
2485 		cl_ptr_vector_at(&p_min_group->ports, j, (void *)&p_port);
2486 		if (!p_min_port) {
2487 			/* first port that we're checking - use
2488 			   it as a port with the lowest load */
2489 			p_min_port = p_port;
2490 		} else if (p_port->counter_down < p_min_port->counter_down) {
2491 			/* this port is less loaded - use it as min */
2492 			p_min_port = p_port;
2493 		}
2494 	}
2495 
2496 	/* At this point we have selected a group and port with the
2497 	   lowest load of downgoing routes.
2498 	   Set on the remote switch how to get to the target_lid -
2499 	   set LFT(target_lid) on the remote switch to the remote port */
2500 	p_remote_sw = p_min_group->remote_hca_or_sw.p_sw;
2501 
2502 	/* Four possible cases:
2503 	 *
2504 	 *  1. is_main_path == TRUE:
2505 	 *      - going UP(TRUE,TRUE) on selected min_group and min_port
2506 	 *         + promoting port counter
2507 	 *         + setting path in remote switch fwd tbl
2508 	 *         + setting hops in remote switch on all the ports of selected group
2509 	 *      - going UP(TRUE,FALSE) on rest of the groups, each time on port 0
2510 	 *         + NOT promoting port counter
2511 	 *         + setting path in remote switch fwd tbl if it hasn't been set yet
2512 	 *         + setting hops in remote switch on all the ports of each group
2513 	 *           if it hasn't been set yet
2514 	 *
2515 	 *  2. is_main_path == FALSE:
2516 	 *      - going UP(TRUE,FALSE) on ALL the groups, each time on port 0,
2517 	 *        but only if the remote (upper) switch hasn't been already
2518 	 *        configured for this target LID
2519 	 *         + NOT promoting port counter
2520 	 *         + setting path in remote switch fwd tbl if it hasn't been set yet
2521 	 *         + setting hops in remote switch on all the ports of each group
2522 	 *           if it hasn't been set yet
2523 	 */
2524 
2525 	/* covering first half of case 1, and case 3 */
2526 	if (is_main_path) {
2527 		if (p_sw->is_leaf) {
2528 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2529 				" - Routing MAIN path for %s CA LID %u: %s --> %s\n",
2530 				(target_lid != 0) ? "real" : "DUMMY",
2531 				target_lid,
2532 				tuple_to_str(p_sw->tuple),
2533 				tuple_to_str(p_remote_sw->tuple));
2534 		}
2535 		/* The number of downgoing routes is tracked in the
2536 		   p_group->counter_down p_port->counter_down counters of the
2537 		   group and port that belong to the lower side of the link
2538 		   (on switch with higher rank) */
2539 		p_min_group->counter_down++;
2540 		p_min_port->counter_down++;
2541 		if (p_min_group->counter_down ==
2542 		    (p_min_group->remote_hca_or_sw.p_sw->min_counter_down +
2543 		     1)) {
2544 			recalculate_min_counter_down
2545 			    (p_min_group->remote_hca_or_sw.p_sw);
2546 		}
2547 
2548 		/* This LID may already be in the LFT in the reverse_hop feature is used */
2549 		/* We update the LFT only if this LID isn't already present. */
2550 
2551 		/* skip if target lid has been already set on remote switch fwd tbl (with a bigger hop count) */
2552 		if ((p_remote_sw->p_osm_sw->new_lft[target_lid] == OSM_NO_PATH)
2553 		    ||
2554 		    (current_hops + 1 <
2555 		     sw_get_least_hops(p_remote_sw, target_lid))) {
2556 
2557 			p_remote_sw->p_osm_sw->new_lft[target_lid] =
2558 				p_min_port->remote_port_num;
2559 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2560 					"Switch %s: set path to CA LID %u through port %u\n",
2561 					tuple_to_str(p_remote_sw->tuple),
2562 					target_lid,
2563 					p_min_port->remote_port_num);
2564 
2565 			/* On the remote switch that is pointed by the min_group,
2566 			set hops for ALL the ports in the remote group. */
2567 
2568 			set_hops_on_remote_sw(p_min_group, target_lid,
2569 						      current_hops + 1,
2570 						      is_target_a_sw);
2571 		}
2572 	/* Recursion step: Assign downgoing ports by stepping up, starting on REMOTE switch. */
2573 	created_route |= fabric_route_downgoing_by_going_up(p_ftree,
2574 							    p_remote_sw,	/* remote switch - used as a route-downgoing alg. next step point */
2575 							    p_sw,		/* this switch - prev. position switch for the function */
2576 							    target_lid,		/* LID that we're routing to */
2577 							    is_main_path,	/* whether this is path to HCA that should by tracked by counters */
2578 							    is_target_a_sw,	/* Whether target lid is a switch or not */
2579 							    reverse_hop_credit,	/* Remaining reverse_hops allowed */
2580 							    reverse_hops,	/* Number of reverse_hops done up to this point */
2581 							    current_hops + 1);
2582 	}
2583 
2584 	/* What's left to do at this point:
2585 	 *
2586 	 *  1. is_main_path == TRUE:
2587 	 *      - going UP(TRUE,FALSE) on rest of the groups, each time on port 0,
2588 	 *        but only if the remote (upper) switch hasn't been already
2589 	 *        configured for this target LID
2590 	 *         + NOT promoting port counter
2591 	 *         + setting path in remote switch fwd tbl if it hasn't been set yet
2592 	 *         + setting hops in remote switch on all the ports of each group
2593 	 *           if it hasn't been set yet
2594 	 *
2595 	 *  2. is_main_path == FALSE:
2596 	 *      - going UP(TRUE,FALSE) on ALL the groups, each time on port 0,
2597 	 *        but only if the remote (upper) switch hasn't been already
2598 	 *        configured for this target LID
2599 	 *         + NOT promoting port counter
2600 	 *         + setting path in remote switch fwd tbl if it hasn't been set yet
2601 	 *         + setting hops in remote switch on all the ports of each group
2602 	 *           if it hasn't been set yet
2603 	 *
2604 	 *  These two rules can be rephrased this way:
2605 	 *   - foreach UP port group
2606 	 *      + if remote switch has been set with the target LID
2607 	 *         - skip this port group
2608 	 *      + else
2609 	 *         - select port 0
2610 	 *         - do NOT promote port counter
2611 	 *         - set path in remote switch fwd tbl
2612 	 *         - set hops in remote switch on all the ports of this group
2613 	 *         - go UP(TRUE,FALSE) to the remote switch
2614 	 */
2615 
2616 	for (i = is_main_path ? 1 : 0; i < p_sw->up_port_groups_num; i++) {
2617 		p_group = p_sw->up_port_groups[i];
2618 		p_remote_sw = p_group->remote_hca_or_sw.p_sw;
2619 
2620 		/* skip if target lid has been already set on remote switch fwd tbl (with a bigger hop count) */
2621 		if (p_remote_sw->p_osm_sw->new_lft[target_lid] != OSM_NO_PATH)
2622 			if (current_hops + 1 >=
2623 			    sw_get_least_hops(p_remote_sw, target_lid))
2624 				continue;
2625 
2626 		if (p_sw->is_leaf) {
2627 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2628 				" - Routing SECONDARY path for LID %u: %s --> %s\n",
2629 				target_lid,
2630 				tuple_to_str(p_sw->tuple),
2631 				tuple_to_str(p_remote_sw->tuple));
2632 		}
2633 
2634 		/* Routing REAL lids on SECONDARY path means routing
2635 		   switch-to-switch or switch-to-CA paths.
2636 		   We can safely assume that switch will initiate very
2637 		   few traffic, so there's no point wasting runtime on
2638 		   trying to balance these routes - always pick port 0. */
2639 		p_min_port = NULL;
2640 		ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports);
2641 		if(ports_num == 0)
2642 			continue;
2643 		for (j = 0; j < ports_num; j++) {
2644 			cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port);
2645 			if (!p_min_port) {
2646 				/* first port that we're checking - use
2647 				   it as a port with the lowest load */
2648 				p_min_port = p_port;
2649 			} else if (p_port->counter_down <
2650 				   p_min_port->counter_down) {
2651 				/* this port is less loaded - use it as min */
2652 				p_min_port = p_port;
2653 			}
2654 		}
2655 
2656 		p_port = p_min_port;
2657 		p_remote_sw->p_osm_sw->new_lft[target_lid] =
2658 		    p_port->remote_port_num;
2659 
2660 		/* On the remote switch that is pointed by the p_group,
2661 		   set hops for ALL the ports in the remote group. */
2662 
2663 		set_hops_on_remote_sw(p_group, target_lid,
2664 				      current_hops + 1, is_target_a_sw);
2665 
2666 		/* Recursion step:
2667 		   Assign downgoing ports by stepping up, starting on REMOTE switch. */
2668 		routed = fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw,	/* remote switch - used as a route-downgoing alg. next step point */
2669 							    p_sw,	/* this switch - prev. position switch for the function */
2670 							    target_lid,	/* LID that we're routing to */
2671 							    FALSE,	/* whether this is path to HCA that should by tracked by counters */
2672 							    is_target_a_sw,	/* Whether target lid is a switch or not */
2673 							    reverse_hop_credit,	/* Remaining reverse_hops allowed */
2674 							    reverse_hops,	/* Number of reverse_hops done up to this point */
2675 							    current_hops + 1);
2676 		created_route |= routed;
2677 	}
2678 
2679 	/* Now doing the same thing with horizontal links */
2680 	if (p_sw->sibling_port_groups_num > 0)
2681 		bubble_sort_down(p_sw->sibling_port_groups,
2682 				 p_sw->sibling_port_groups_num);
2683 
2684 	for (i = 0; i < p_sw->sibling_port_groups_num; i++) {
2685 		p_group = p_sw->sibling_port_groups[i];
2686 		p_remote_sw = p_group->remote_hca_or_sw.p_sw;
2687 
2688 		/* skip if target lid has been already set on remote switch fwd tbl (with a bigger hop count) */
2689 		if (p_remote_sw->p_osm_sw->new_lft[target_lid] != OSM_NO_PATH)
2690 			if (current_hops + 1 >=
2691 			    sw_get_least_hops(p_remote_sw, target_lid))
2692 				continue;
2693 
2694 		if (p_sw->is_leaf) {
2695 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2696 				" - Routing SECONDARY path for LID %u: %s --> %s\n",
2697 				target_lid,
2698 				tuple_to_str(p_sw->tuple),
2699 				tuple_to_str(p_remote_sw->tuple));
2700 		}
2701 
2702 		/* Routing REAL lids on SECONDARY path means routing
2703 		   switch-to-switch or switch-to-CA paths.
2704 		   We can safely assume that switch will initiate very
2705 		   few traffic, so there's no point wasting runtime on
2706 		   trying to balance these routes - always pick port 0. */
2707 
2708 		p_min_port = NULL;
2709 		ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports);
2710 		for (j = 0; j < ports_num; j++) {
2711 			cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port);
2712 			if (!p_min_port) {
2713 				/* first port that we're checking - use
2714 				   it as a port with the lowest load */
2715 				p_min_port = p_port;
2716 			} else if (p_port->counter_down <
2717 				   p_min_port->counter_down) {
2718 				/* this port is less loaded - use it as min */
2719 				p_min_port = p_port;
2720 			}
2721 		}
2722 
2723 		p_port = p_min_port;
2724 		p_remote_sw->p_osm_sw->new_lft[target_lid] =
2725 		    p_port->remote_port_num;
2726 
2727 		/* On the remote switch that is pointed by the p_group,
2728 		   set hops for ALL the ports in the remote group. */
2729 
2730 		set_hops_on_remote_sw(p_group, target_lid,
2731 				      current_hops + 1, is_target_a_sw);
2732 
2733 		/* Recursion step:
2734 		   Assign downgoing ports by stepping up, starting on REMOTE switch. */
2735 		routed = fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw,	/* remote switch - used as a route-downgoing alg. next step point */
2736 							    p_sw,	/* this switch - prev. position switch for the function */
2737 							    target_lid,	/* LID that we're routing to */
2738 							    FALSE,	/* whether this is path to HCA that should by tracked by counters */
2739 							    is_target_a_sw,	/* Whether target lid is a switch or not */
2740 							    reverse_hop_credit,	/* Remaining reverse_hops allowed */
2741 							    reverse_hops,	/* Number of reverse_hops done up to this point */
2742 							    current_hops + 1);
2743 		created_route |= routed;
2744 		if (routed) {
2745 			p_min_group->counter_down++;
2746 			p_min_port->counter_down++;
2747 		}
2748 	}
2749 
2750 	/* If we don't have any reverse hop credits, we are done */
2751 	if (reverse_hop_credit == 0)
2752 		return created_route;
2753 
2754 	if (p_sw->is_leaf)
2755 		return created_route;
2756 
2757 	/* We explore all the down group ports */
2758 	/* We try to reverse jump for each of them */
2759 	/* They already have a route to us from the upgoing_by_going_down started earlier */
2760 	/* This is only so it'll continue exploring up, after this step backwards */
2761 	for (i = 0; i < p_sw->down_port_groups_num; i++) {
2762 		p_group = p_sw->down_port_groups[i];
2763 		p_remote_sw = p_group->remote_hca_or_sw.p_sw;
2764 
2765 		/* Skip this port group unless it points to a switch */
2766 		if (p_group->remote_node_type != IB_NODE_TYPE_SWITCH)
2767 			continue;
2768 
2769 		/* Recursion step:
2770 		   Assign downgoing ports by stepping up, fter doing one step down starting on REMOTE switch. */
2771 		created_route |= fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw,	/* remote switch - used as a route-downgoing alg. next step point */
2772 								    p_sw,	/* this switch - prev. position switch for the function */
2773 								    target_lid,	/* LID that we're routing to */
2774 								    TRUE,	/* whether this is path to HCA that should by tracked by counters */
2775 								    is_target_a_sw,	/* Whether target lid is a switch or not */
2776 								    reverse_hop_credit - 1,	/* Remaining reverse_hops allowed */
2777 								    reverse_hops + 1,	/* Number of reverse_hops done up to this point */
2778 								    current_hops
2779 								    + 1);
2780 	}
2781 	return created_route;
2782 
2783 }				/* ftree_fabric_route_downgoing_by_going_up() */
2784 
2785 /***************************************************/
2786 
2787 /*
2788  * Pseudo code:
2789  *    foreach leaf switch (in indexing order)
2790  *       for each compute node (in indexing order)
2791  *          obtain the LID of the compute node
2792  *          set local LFT(LID) of the port connecting to compute node
2793  *          call assign-down-going-port-by-ascending-up(TRUE,TRUE) on CURRENT switch
2794  *       for each MISSING compute node
2795  *          call assign-down-going-port-by-ascending-up(FALSE,TRUE) on CURRENT switch
2796  */
2797 
2798 static void fabric_route_to_cns(IN ftree_fabric_t * p_ftree)
2799 {
2800 	ftree_sw_t *p_sw;
2801 	ftree_hca_t *p_hca;
2802 	ftree_port_group_t *p_leaf_port_group;
2803 	ftree_port_group_t *p_hca_port_group;
2804 	ftree_port_t *p_port;
2805 	unsigned int i, j;
2806 	uint16_t hca_lid;
2807 	unsigned routed_targets_on_leaf;
2808 
2809 	OSM_LOG_ENTER(&p_ftree->p_osm->log);
2810 
2811 	/* for each leaf switch (in indexing order) */
2812 	for (i = 0; i < p_ftree->leaf_switches_num; i++) {
2813 		p_sw = p_ftree->leaf_switches[i];
2814 		routed_targets_on_leaf = 0;
2815 
2816 		/* for each HCA connected to this switch */
2817 		for (j = 0; j < p_sw->down_port_groups_num; j++) {
2818 			p_leaf_port_group = p_sw->down_port_groups[j];
2819 
2820 			/* work with this port group only if the remote node is CA */
2821 			if (p_leaf_port_group->remote_node_type !=
2822 			    IB_NODE_TYPE_CA)
2823 				continue;
2824 
2825 			p_hca = p_leaf_port_group->remote_hca_or_sw.p_hca;
2826 
2827 			/* work with this port group only if remote HCA has CNs */
2828 			if (!p_hca->cn_num)
2829 				continue;
2830 
2831 			p_hca_port_group =
2832 			    hca_get_port_group_by_lid(p_hca,
2833 						      p_leaf_port_group->
2834 						      remote_lid);
2835 			CL_ASSERT(p_hca_port_group);
2836 
2837 			/* work with this port group only if remote port is CN */
2838 			if (!p_hca_port_group->is_cn)
2839 				continue;
2840 
2841 			/* obtain the LID of HCA port */
2842 			hca_lid = p_leaf_port_group->remote_lid;
2843 
2844 			/* set local LFT(LID) to the port that is connected to HCA */
2845 			cl_ptr_vector_at(&p_leaf_port_group->ports, 0,
2846 					 (void *)&p_port);
2847 			p_sw->p_osm_sw->new_lft[hca_lid] = p_port->port_num;
2848 
2849 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2850 				"Switch %s: set path to CN LID %u through port %u\n",
2851 				tuple_to_str(p_sw->tuple),
2852 				hca_lid, p_port->port_num);
2853 
2854 			/* set local min hop table(LID) to route to the CA */
2855 			sw_set_hops(p_sw, hca_lid, p_port->port_num, 1, FALSE);
2856 
2857 			/* Assign downgoing ports by stepping up.
2858 			   Since we're routing here only CNs, we're routing it as REAL
2859 			   LID and updating fat-tree balancing counters. */
2860 			fabric_route_downgoing_by_going_up(p_ftree, p_sw,	/* local switch - used as a route-downgoing alg. start point */
2861 							   NULL,	/* prev. position switch */
2862 							   hca_lid,	/* LID that we're routing to */
2863 							   TRUE,	/* whether this path to HCA should by tracked by counters */
2864 							   FALSE,	/* whether target lid is a switch or not */
2865 							   0,	/* Number of reverse hops allowed */
2866 							   0,	/* Number of reverse hops done yet */
2867 							   1);	/* Number of hops done yet */
2868 
2869 			/* count how many real targets have been routed from this leaf switch */
2870 			routed_targets_on_leaf++;
2871 		}
2872 
2873 		/* We're done with the real targets (all CNs) of this leaf switch.
2874 		   Now route the dummy HCAs that are missing or that are non-CNs.
2875 		   When routing to dummy HCAs we don't fill lid matrices. */
2876 		if (p_ftree->max_cn_per_leaf > routed_targets_on_leaf) {
2877 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2878 				"Routing %u dummy CAs\n",
2879 				p_ftree->max_cn_per_leaf -
2880 				p_sw->down_port_groups_num);
2881 			for (j = 0; j <
2882 			     p_ftree->max_cn_per_leaf - routed_targets_on_leaf;
2883 			     j++) {
2884 				ftree_sw_t *p_next_sw, *p_ftree_sw;
2885 				sw_set_hops(p_sw, 0, 0xFF, 1, FALSE);
2886 				/* assign downgoing ports by stepping up */
2887 				fabric_route_downgoing_by_going_up(p_ftree, p_sw,	/* local switch - used as a route-downgoing alg. start point */
2888 								   NULL,	/* prev. position switch */
2889 								   0,	/* LID that we're routing to - ignored for dummy HCA */
2890 								   TRUE,	/* whether this path to HCA should by tracked by counters */
2891 								   FALSE,	/* Whether the target LID is a switch or not */
2892 								   0,	/* Number of reverse hops allowed */
2893 								   0,	/* Number of reverse hops done yet */
2894 								   1);	/* Number of hops done yet */
2895 
2896 				p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
2897 				/* need to clean the LID 0 hops for dummy node */
2898 				while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
2899 					p_ftree_sw = p_next_sw;
2900 					p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_ftree_sw->map_item);
2901 					p_ftree_sw->hops[0] = OSM_NO_PATH;
2902 					p_ftree_sw->p_osm_sw->new_lft[0] = OSM_NO_PATH;
2903 				}
2904 
2905 			}
2906 		}
2907 	}
2908 	/* done going through all the leaf switches */
2909 	OSM_LOG_EXIT(&p_ftree->p_osm->log);
2910 }				/* fabric_route_to_cns() */
2911 
2912 /***************************************************/
2913 
2914 /*
2915  * Pseudo code:
2916  *    foreach HCA non-CN port in fabric
2917  *       obtain the LID of the HCA port
2918  *       get switch that is connected to this HCA port
2919  *       set switch LFT(LID) to the port connected to the HCA port
2920  *       call assign-down-going-port-by-ascending-up(TRUE,TRUE) on the switch
2921  *
2922  * Routing to these HCAs is routing a REAL hca lid on MAIN path.
2923  * We want to allow load-leveling of the traffic to the non-CNs,
2924  * because such nodes may include IO nodes with heavy usage
2925  *   - we should set fwd tables
2926  *   - we should update port counters
2927  * Routing to non-CNs is done after routing to CNs, so updated port
2928  * counters will not affect CN-to-CN routing.
2929  */
2930 
2931 static void fabric_route_to_non_cns(IN ftree_fabric_t * p_ftree)
2932 {
2933 	ftree_sw_t *p_sw;
2934 	ftree_hca_t *p_hca;
2935 	ftree_hca_t *p_next_hca;
2936 	ftree_port_t *p_hca_port;
2937 	ftree_port_group_t *p_hca_port_group;
2938 	uint16_t hca_lid;
2939 	unsigned port_num_on_switch;
2940 	unsigned i;
2941 
2942 	OSM_LOG_ENTER(&p_ftree->p_osm->log);
2943 
2944 	p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
2945 	while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
2946 		p_hca = p_next_hca;
2947 		p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
2948 
2949 		for (i = 0; i < p_hca->up_port_groups_num; i++) {
2950 			p_hca_port_group = p_hca->up_port_groups[i];
2951 
2952 			/* skip this port if it's CN, in which case it has been already routed */
2953 			if (p_hca_port_group->is_cn)
2954 				continue;
2955 
2956 			/* skip this port if it is not connected to switch */
2957 			if (p_hca_port_group->remote_node_type !=
2958 			    IB_NODE_TYPE_SWITCH)
2959 				continue;
2960 
2961 			p_sw = p_hca_port_group->remote_hca_or_sw.p_sw;
2962 			hca_lid = p_hca_port_group->lid;
2963 
2964 			/* set switches  LFT(LID) to the port that is connected to HCA */
2965 			cl_ptr_vector_at(&p_hca_port_group->ports, 0,
2966 					 (void *)&p_hca_port);
2967 			port_num_on_switch = p_hca_port->remote_port_num;
2968 			p_sw->p_osm_sw->new_lft[hca_lid] = port_num_on_switch;
2969 
2970 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
2971 				"Switch %s: set path to non-CN HCA LID %u through port %u\n",
2972 				tuple_to_str(p_sw->tuple),
2973 				hca_lid, port_num_on_switch);
2974 
2975 			/* set local min hop table(LID) to route to the CA */
2976 			sw_set_hops(p_sw, hca_lid, port_num_on_switch,	/* port num */
2977 				    1, FALSE);	/* hops */
2978 
2979 			/* Assign downgoing ports by stepping up.
2980 			   We're routing REAL targets. They are not CNs and not included
2981 			   in the leafs array, but we treat them as MAIN path to allow load
2982 			   leveling, which means that the counters will be updated. */
2983 			fabric_route_downgoing_by_going_up(p_ftree, p_sw,	/* local switch - used as a route-downgoing alg. start point */
2984 							   NULL,	/* prev. position switch */
2985 							   hca_lid,	/* LID that we're routing to */
2986 							   TRUE,	/* whether this path to HCA should by tracked by counters */
2987 							   FALSE,	/* Whether the target LID is a switch or not */
2988 							   p_hca_port_group->is_io ? p_ftree->p_osm->subn.opt.max_reverse_hops : 0,	/* Number or reverse hops allowed */
2989 							   0,	/* Number or reverse hops done yet */
2990 							   1);	/* Number of hops done yet */
2991 		}
2992 		/* done with all the port groups of this HCA - go to next HCA */
2993 	}
2994 
2995 	OSM_LOG_EXIT(&p_ftree->p_osm->log);
2996 }				/* fabric_route_to_non_cns() */
2997 
2998 /***************************************************/
2999 
3000 /*
3001  * Pseudo code:
3002  *    foreach switch in fabric
3003  *       obtain its LID
3004  *       set local LFT(LID) to port 0
3005  *       call assign-down-going-port-by-ascending-up(TRUE,FALSE) on CURRENT switch
3006  *
3007  * Routing to switch is similar to routing a REAL hca lid on SECONDARY path:
3008  *   - we should set fwd tables
3009  *   - we should NOT update port counters
3010  */
3011 
3012 static void fabric_route_to_switches(IN ftree_fabric_t * p_ftree)
3013 {
3014 	ftree_sw_t *p_sw;
3015 	ftree_sw_t *p_next_sw;
3016 
3017 	OSM_LOG_ENTER(&p_ftree->p_osm->log);
3018 
3019 	p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
3020 	while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
3021 		p_sw = p_next_sw;
3022 		p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
3023 
3024 		/* set local LFT(LID) to 0 (route to itself) */
3025 		p_sw->p_osm_sw->new_lft[p_sw->lid] = 0;
3026 
3027 		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3028 			"Switch %s (LID %u): routing switch-to-switch paths\n",
3029 			tuple_to_str(p_sw->tuple), p_sw->lid);
3030 
3031 		/* set min hop table of the switch to itself */
3032 		sw_set_hops(p_sw, p_sw->lid, 0,	/* port_num */
3033 			    0, TRUE);	/* hops     */
3034 
3035 		fabric_route_downgoing_by_going_up(p_ftree, p_sw,	/* local switch - used as a route-downgoing alg. start point */
3036 						   NULL,	/* prev. position switch */
3037 						   p_sw->lid,	/* LID that we're routing to */
3038 						   FALSE,	/* whether this path to HCA should by tracked by counters */
3039 						   TRUE,	/* Whether the target LID is a switch or not */
3040 						   0,	/* Number of reverse hops allowed */
3041 						   0,	/* Number of reverse hops done yet */
3042 						   0);	/* Number of hops done yet */
3043 	}
3044 
3045 	OSM_LOG_EXIT(&p_ftree->p_osm->log);
3046 }				/* fabric_route_to_switches() */
3047 
3048 /***************************************************
3049  ***************************************************/
3050 
3051 static void fabric_route_roots(IN ftree_fabric_t * p_ftree)
3052 {
3053 	uint16_t lid;
3054 	uint8_t port_num;
3055 	osm_port_t *p_port;
3056 	ftree_sw_t *p_sw;
3057 	ftree_sw_t *p_leaf_sw;
3058 
3059 	OSM_LOG_ENTER(&p_ftree->p_osm->log);
3060 
3061 	/*
3062 	 * We need a switch that will accomodate all the down/up turns in
3063 	 * the fabric. Having these turn in a single place in the fabric
3064 	 * will not create credit loops.
3065 	 * So we need to select this switch.
3066 	 * The idea here is to chose leaf with the highest index. I don't
3067 	 * have any theory to back me up on this. It's just a general thought
3068 	 * that this way the switch that might be a bottleneck for many mcast
3069 	 * groups will be far away from the OpenSM, so it will draw the
3070 	 * multicast traffic away from the SM.
3071 	 */
3072 
3073 	p_leaf_sw = p_ftree->leaf_switches[p_ftree->leaf_switches_num-1];
3074 
3075 	/*
3076 	 * Now go over all the switches in the fabric that
3077 	 * have lower rank, and route the missing LIDs to
3078 	 * the selected leaf switch.
3079 	 * In short, this leaf switch now poses a target
3080 	 * for all those missing LIDs.
3081 	 */
3082 
3083 	for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
3084 	     p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl);
3085 	     p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) {
3086 
3087 		if (p_sw->rank >= p_ftree->leaf_switch_rank)
3088 			continue;
3089 
3090 		for (lid = 1; lid <= p_leaf_sw->p_osm_sw->max_lid_ho; lid ++) {
3091 
3092 			if (p_sw->p_osm_sw->new_lft[lid] != OSM_NO_PATH ||
3093 			    p_leaf_sw->hops[lid] == OSM_NO_PATH)
3094 				continue;
3095 
3096 			p_port = osm_get_port_by_lid_ho(&p_ftree->p_osm->subn,
3097 							lid);
3098 
3099 			/* we're interested only in switches */
3100 			if (!p_port || !p_port->p_node->sw)
3101 				continue;
3102 
3103 			/*
3104 			 * the missing LID will be routed through the same
3105 			 * port that routes to the selected leaf switch
3106 			 */
3107 			port_num = p_sw->p_osm_sw->new_lft[p_leaf_sw->lid];
3108 
3109 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3110 				"Switch %s: setting path to LID %u "
3111 				"through port %u\n",
3112 				tuple_to_str(p_sw->tuple), lid, port_num);
3113 
3114 			/* set local lft */
3115 			p_sw->p_osm_sw->new_lft[lid] = port_num;
3116 
3117 			/*
3118 			 * Set local min hop table.
3119 			 * The distance to the target LID is a distance
3120 			 * to the selected leaf switch plus the distance
3121 			 * from the leaf to the target LID.
3122 			 */
3123 			sw_set_hops(p_sw, lid, port_num,
3124 				p_sw->hops[p_leaf_sw->lid] +
3125 				p_leaf_sw->hops[lid], TRUE);
3126 		}
3127 	}
3128 
3129 	OSM_LOG_EXIT(&p_ftree->p_osm->log);
3130 }				/* fabric_route_roots() */
3131 
3132 /***************************************************/
3133 
3134 static int fabric_populate_nodes(IN ftree_fabric_t * p_ftree)
3135 {
3136 	osm_node_t *p_osm_node;
3137 	osm_node_t *p_next_osm_node;
3138 
3139 	OSM_LOG_ENTER(&p_ftree->p_osm->log);
3140 
3141 	p_next_osm_node =
3142 	    (osm_node_t *) cl_qmap_head(&p_ftree->p_osm->subn.node_guid_tbl);
3143 	while (p_next_osm_node !=
3144 	       (osm_node_t *) cl_qmap_end(&p_ftree->p_osm->
3145 					  subn.node_guid_tbl)) {
3146 		p_osm_node = p_next_osm_node;
3147 		p_next_osm_node =
3148 		    (osm_node_t *) cl_qmap_next(&p_osm_node->map_item);
3149 		switch (osm_node_get_type(p_osm_node)) {
3150 		case IB_NODE_TYPE_CA:
3151 			fabric_add_hca(p_ftree, p_osm_node);
3152 			break;
3153 		case IB_NODE_TYPE_ROUTER:
3154 			break;
3155 		case IB_NODE_TYPE_SWITCH:
3156 			fabric_add_sw(p_ftree, p_osm_node->sw);
3157 			break;
3158 		default:
3159 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3160 				"ERR AB0E: " "Node GUID 0x%016" PRIx64
3161 				" - Unknown node type: %s\n",
3162 				cl_ntoh64(osm_node_get_node_guid(p_osm_node)),
3163 				ib_get_node_type_str(osm_node_get_type
3164 						     (p_osm_node)));
3165 			OSM_LOG_EXIT(&p_ftree->p_osm->log);
3166 			return -1;
3167 		}
3168 	}
3169 
3170 	OSM_LOG_EXIT(&p_ftree->p_osm->log);
3171 	return 0;
3172 }				/* fabric_populate_nodes() */
3173 
3174 /***************************************************
3175  ***************************************************/
3176 
3177 static boolean_t sw_update_rank(IN ftree_sw_t * p_sw, IN uint32_t new_rank)
3178 {
3179 	if (sw_ranked(p_sw) && p_sw->rank <= new_rank)
3180 		return FALSE;
3181 	p_sw->rank = new_rank;
3182 	return TRUE;
3183 
3184 }
3185 
3186 /***************************************************/
3187 
3188 static void rank_switches_from_leafs(IN ftree_fabric_t * p_ftree,
3189 				     IN cl_list_t * p_ranking_bfs_list)
3190 {
3191 	ftree_sw_t *p_sw;
3192 	ftree_sw_t *p_remote_sw;
3193 	osm_node_t *p_node;
3194 	osm_node_t *p_remote_node;
3195 	osm_physp_t *p_osm_port;
3196 	uint8_t i;
3197 	unsigned max_rank = 0;
3198 
3199 	while (!cl_is_list_empty(p_ranking_bfs_list)) {
3200 		p_sw = (ftree_sw_t *) cl_list_remove_head(p_ranking_bfs_list);
3201 		p_node = p_sw->p_osm_sw->p_node;
3202 
3203 		/* note: skipping port 0 on switches */
3204 		for (i = 1; i < osm_node_get_num_physp(p_node); i++) {
3205 			p_osm_port = osm_node_get_physp_ptr(p_node, i);
3206 			if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
3207 				continue;
3208 
3209 			p_remote_node =
3210 			    osm_node_get_remote_node(p_node, i, NULL);
3211 			if (!p_remote_node)
3212 				continue;
3213 			if (osm_node_get_type(p_remote_node) !=
3214 			    IB_NODE_TYPE_SWITCH)
3215 				continue;
3216 
3217 			p_remote_sw = fabric_get_sw_by_guid(p_ftree,
3218 							    osm_node_get_node_guid
3219 							    (p_remote_node));
3220 			if (!p_remote_sw) {
3221 				/* remote node is not a switch */
3222 				continue;
3223 			}
3224 
3225 			/* if needed, rank the remote switch and add it to the BFS list */
3226 			if (sw_update_rank(p_remote_sw, p_sw->rank + 1)) {
3227 				max_rank = p_remote_sw->rank;
3228 				cl_list_insert_tail(p_ranking_bfs_list,
3229 						    p_remote_sw);
3230 			}
3231 		}
3232 	}
3233 
3234 	/* set FatTree maximal switch rank */
3235 	p_ftree->max_switch_rank = max_rank;
3236 
3237 }				/* rank_switches_from_leafs() */
3238 
3239 /***************************************************/
3240 
3241 static int rank_leaf_switches(IN ftree_fabric_t * p_ftree,
3242 			      IN ftree_hca_t * p_hca,
3243 			      IN cl_list_t * p_ranking_bfs_list)
3244 {
3245 	ftree_sw_t *p_sw;
3246 	osm_node_t *p_osm_node = p_hca->p_osm_node;
3247 	osm_node_t *p_remote_osm_node;
3248 	osm_physp_t *p_osm_port;
3249 	static uint8_t i = 0;
3250 	int res = 0;
3251 
3252 	OSM_LOG_ENTER(&p_ftree->p_osm->log);
3253 
3254 	for (i = 0; i < osm_node_get_num_physp(p_osm_node); i++) {
3255 		p_osm_port = osm_node_get_physp_ptr(p_osm_node, i);
3256 		if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
3257 			continue;
3258 
3259 		p_remote_osm_node =
3260 		    osm_node_get_remote_node(p_osm_node, i, NULL);
3261 		if (!p_remote_osm_node)
3262 			continue;
3263 
3264 		switch (osm_node_get_type(p_remote_osm_node)) {
3265 		case IB_NODE_TYPE_CA:
3266 			/* HCA connected directly to another HCA - not FatTree */
3267 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3268 				"ERR AB0F: "
3269 				"CA conected directly to another CA: " "0x%016"
3270 				PRIx64 " <---> 0x%016" PRIx64 "\n",
3271 				hca_get_guid_ho(p_hca),
3272 				cl_ntoh64(osm_node_get_node_guid
3273 					  (p_remote_osm_node)));
3274 			res = -1;
3275 			goto Exit;
3276 
3277 		case IB_NODE_TYPE_ROUTER:
3278 			/* leaving this port - proceeding to the next one */
3279 			continue;
3280 
3281 		case IB_NODE_TYPE_SWITCH:
3282 			/* continue with this port */
3283 			break;
3284 
3285 		default:
3286 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3287 				"ERR AB10: Node GUID 0x%016" PRIx64
3288 				" - Unknown node type: %s\n",
3289 				cl_ntoh64(osm_node_get_node_guid
3290 					  (p_remote_osm_node)),
3291 				ib_get_node_type_str(osm_node_get_type
3292 						     (p_remote_osm_node)));
3293 			res = -1;
3294 			goto Exit;
3295 		}
3296 
3297 		/* remote node is switch */
3298 
3299 		p_sw = fabric_get_sw_by_guid(p_ftree,
3300 					     osm_node_get_node_guid
3301 					     (p_osm_port->p_remote_physp->
3302 					      p_node));
3303 		CL_ASSERT(p_sw);
3304 
3305 		/* if needed, rank the remote switch and add it to the BFS list */
3306 
3307 		if (!sw_update_rank(p_sw, 0))
3308 			continue;
3309 		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3310 			"Marking rank of switch that is directly connected to CA:\n"
3311 			"                                            - CA guid    : 0x%016"
3312 			PRIx64 "\n"
3313 			"                                            - Switch guid: 0x%016"
3314 			PRIx64 "\n"
3315 			"                                            - Switch LID : %u\n",
3316 			hca_get_guid_ho(p_hca),
3317 			sw_get_guid_ho(p_sw), p_sw->lid);
3318 		cl_list_insert_tail(p_ranking_bfs_list, p_sw);
3319 	}
3320 
3321 Exit:
3322 	OSM_LOG_EXIT(&p_ftree->p_osm->log);
3323 	return res;
3324 }				/* rank_leaf_switches() */
3325 
3326 /***************************************************/
3327 
3328 static void sw_reverse_rank(IN cl_map_item_t * const p_map_item,
3329 			    IN void *context)
3330 {
3331 	ftree_fabric_t *p_ftree = (ftree_fabric_t *) context;
3332 	ftree_sw_t *p_sw = (ftree_sw_t * const)p_map_item;
3333 	if (p_sw->rank != 0xFFFFFFFF)
3334 		p_sw->rank = p_ftree->max_switch_rank - p_sw->rank;
3335 }
3336 
3337 /***************************************************
3338  ***************************************************/
3339 
3340 static int
3341 fabric_construct_hca_ports(IN ftree_fabric_t * p_ftree, IN ftree_hca_t * p_hca)
3342 {
3343 	ftree_sw_t *p_remote_sw;
3344 	osm_node_t *p_node = p_hca->p_osm_node;
3345 	osm_node_t *p_remote_node;
3346 	uint8_t remote_node_type;
3347 	ib_net64_t remote_node_guid;
3348 	osm_physp_t *p_remote_osm_port;
3349 	uint8_t i;
3350 	uint8_t remote_port_num;
3351 	boolean_t is_cn;
3352 	boolean_t is_in_cn_file;
3353 	boolean_t is_io;
3354 	boolean_t is_cns_file_provided = fabric_cns_provided(p_ftree);
3355 	boolean_t is_ios_file_provided = fabric_ios_provided(p_ftree);
3356 	int res = 0;
3357 
3358 	for (i = 0; i < osm_node_get_num_physp(p_node); i++) {
3359 		osm_physp_t *p_osm_port = osm_node_get_physp_ptr(p_node, i);
3360 		is_io = FALSE;
3361 		is_cn = TRUE;
3362 		is_in_cn_file = FALSE;
3363 
3364 		if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
3365 			continue;
3366 
3367 		if (p_hca->disconnected_ports[i])
3368 			continue;
3369 
3370 		p_remote_osm_port = osm_physp_get_remote(p_osm_port);
3371 		p_remote_node =
3372 		    osm_node_get_remote_node(p_node, i, &remote_port_num);
3373 
3374 		if (!p_remote_osm_port || !p_remote_node)
3375 			continue;
3376 
3377 		remote_node_type = osm_node_get_type(p_remote_node);
3378 		remote_node_guid = osm_node_get_node_guid(p_remote_node);
3379 
3380 		switch (remote_node_type) {
3381 		case IB_NODE_TYPE_ROUTER:
3382 			/* leaving this port - proceeding to the next one */
3383 			continue;
3384 
3385 		case IB_NODE_TYPE_CA:
3386 			/* HCA connected directly to another HCA - not FatTree */
3387 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3388 				"ERR AB11: "
3389 				"CA conected directly to another CA: " "0x%016"
3390 				PRIx64 " <---> 0x%016" PRIx64 "\n",
3391 				cl_ntoh64(osm_node_get_node_guid(p_node)),
3392 				cl_ntoh64(remote_node_guid));
3393 			res = -1;
3394 			goto Exit;
3395 
3396 		case IB_NODE_TYPE_SWITCH:
3397 			/* continue with this port */
3398 			break;
3399 
3400 		default:
3401 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3402 				"ERR AB12: Node GUID 0x%016" PRIx64
3403 				" - Unknown node type: %s\n",
3404 				cl_ntoh64(remote_node_guid),
3405 				ib_get_node_type_str(remote_node_type));
3406 			res = -1;
3407 			goto Exit;
3408 		}
3409 
3410 		/* remote node is switch */
3411 
3412 		p_remote_sw = fabric_get_sw_by_guid(p_ftree, remote_node_guid);
3413 		CL_ASSERT(p_remote_sw);
3414 
3415 		/* If CN file is not supplied, then all the CAs considered as Compute Nodes.
3416 		   Otherwise all the CAs are not CNs, and only guids that are present in the
3417 		   CN file will be marked as compute nodes. */
3418 		if (is_cns_file_provided == TRUE) {
3419 			name_map_item_t *p_elem = (name_map_item_t *)
3420 			cl_qmap_get(&p_ftree->cn_guid_tbl,
3421 				    cl_ntoh64(osm_physp_get_port_guid
3422 					     (p_osm_port)));
3423 			if (p_elem == (name_map_item_t *)
3424 				cl_qmap_end(&p_ftree->cn_guid_tbl))
3425 				is_cn = FALSE;
3426 			else
3427 				is_in_cn_file = TRUE;
3428 		}
3429 		if (is_in_cn_file == FALSE && is_ios_file_provided == TRUE) {
3430 			name_map_item_t *p_elem = (name_map_item_t *)
3431 			cl_qmap_get(&p_ftree->io_guid_tbl,
3432 				    cl_ntoh64(osm_physp_get_port_guid
3433 					     (p_osm_port)));
3434 			if (p_elem != (name_map_item_t *)
3435 				cl_qmap_end(&p_ftree->io_guid_tbl)) {
3436 				is_io = TRUE;
3437 				is_cn = FALSE;
3438 			}
3439 		}
3440 
3441 		if (is_cn) {
3442 			p_ftree->cn_num++;
3443 			p_hca->cn_num++;
3444 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3445 				"Marking CN port GUID 0x%016" PRIx64 "\n",
3446 				cl_ntoh64(osm_physp_get_port_guid(p_osm_port)));
3447 		} else if (is_io) {
3448 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3449 				"Marking I/O port GUID 0x%016" PRIx64 "\n",
3450 				cl_ntoh64(osm_physp_get_port_guid(p_osm_port)));
3451 		} else {
3452 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3453 				"Marking non-CN port GUID 0x%016" PRIx64 "\n",
3454 				cl_ntoh64(osm_physp_get_port_guid(p_osm_port)));
3455 		}
3456 		p_ftree->ca_ports++;
3457 
3458 		hca_add_port(p_ftree,
3459 			     p_hca,	/* local ftree_hca object */
3460 			     i,	/* local port number */
3461 			     remote_port_num,	/* remote port number */
3462 			     cl_ntoh16(osm_node_get_base_lid(p_node, i)),	/* local lid */
3463 			     cl_ntoh16(osm_node_get_base_lid(p_remote_node, 0)),	/* remote lid */
3464 			     osm_physp_get_port_guid(p_osm_port),	/* local port guid */
3465 			     osm_physp_get_port_guid(p_remote_osm_port),	/* remote port guid */
3466 			     remote_node_guid,	/* remote node guid */
3467 			     remote_node_type,	/* remote node type */
3468 			     (void *)p_remote_sw,	/* remote ftree_hca/sw object */
3469 			     is_cn, is_io);	/* whether this port is compute node */
3470 	}
3471 
3472 Exit:
3473 	return res;
3474 }				/* fabric_construct_hca_ports() */
3475 
3476 /***************************************************
3477  ***************************************************/
3478 
3479 static int fabric_construct_sw_ports(IN ftree_fabric_t * p_ftree,
3480 				     IN ftree_sw_t * p_sw)
3481 {
3482 	ftree_hca_t *p_remote_hca;
3483 	ftree_sw_t *p_remote_sw;
3484 	osm_node_t *p_node = p_sw->p_osm_sw->p_node;
3485 	osm_node_t *p_remote_node;
3486 	uint16_t remote_lid;
3487 	uint8_t remote_node_type;
3488 	ib_net64_t remote_node_guid;
3489 	osm_physp_t *p_remote_osm_port;
3490 	ftree_direction_t direction;
3491 	void *p_remote_hca_or_sw;
3492 	uint8_t i;
3493 	uint8_t remote_port_num;
3494 	int res = 0;
3495 
3496 	CL_ASSERT(osm_node_get_type(p_node) == IB_NODE_TYPE_SWITCH);
3497 
3498 	for (i = 1; i < osm_node_get_num_physp(p_node); i++) {
3499 		osm_physp_t *p_osm_port = osm_node_get_physp_ptr(p_node, i);
3500 		if (!p_osm_port || !osm_link_is_healthy(p_osm_port))
3501 			continue;
3502 
3503 		p_remote_osm_port = osm_physp_get_remote(p_osm_port);
3504 		if (!p_remote_osm_port)
3505 			continue;
3506 
3507 		p_remote_node =
3508 		    osm_node_get_remote_node(p_node, i, &remote_port_num);
3509 		if (!p_remote_node)
3510 			continue;
3511 
3512 		/* ignore any loopback connection on switch */
3513 		if (p_node == p_remote_node) {
3514 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3515 				"Ignoring loopback on switch GUID 0x%016" PRIx64
3516 				", LID %u, rank %u\n",
3517 				sw_get_guid_ho(p_sw),
3518 				p_sw->lid, p_sw->rank);
3519 			continue;
3520 		}
3521 
3522 		remote_node_type = osm_node_get_type(p_remote_node);
3523 		remote_node_guid = osm_node_get_node_guid(p_remote_node);
3524 
3525 		switch (remote_node_type) {
3526 		case IB_NODE_TYPE_ROUTER:
3527 			/* leaving this port - proceeding to the next one */
3528 			continue;
3529 
3530 		case IB_NODE_TYPE_CA:
3531 			/* switch connected to hca */
3532 
3533 			p_remote_hca =
3534 			    fabric_get_hca_by_guid(p_ftree, remote_node_guid);
3535 			CL_ASSERT(p_remote_hca);
3536 
3537 			p_remote_hca_or_sw = (void *)p_remote_hca;
3538 			direction = FTREE_DIRECTION_DOWN;
3539 
3540 			remote_lid =
3541 			    cl_ntoh16(osm_physp_get_base_lid(p_remote_osm_port));
3542 			break;
3543 
3544 		case IB_NODE_TYPE_SWITCH:
3545 			/* switch connected to another switch */
3546 
3547 			p_remote_sw =
3548 			    fabric_get_sw_by_guid(p_ftree, remote_node_guid);
3549 			CL_ASSERT(p_remote_sw);
3550 
3551 			p_remote_hca_or_sw = (void *)p_remote_sw;
3552 
3553 			if (p_sw->rank > p_remote_sw->rank) {
3554 				direction = FTREE_DIRECTION_UP;
3555 			} else if (p_sw->rank == p_remote_sw->rank) {
3556 				direction = FTREE_DIRECTION_SAME;
3557 			} else
3558 				direction = FTREE_DIRECTION_DOWN;
3559 
3560 			/* switch LID is only in port 0 port_info structure */
3561 			remote_lid =
3562 			    cl_ntoh16(osm_node_get_base_lid(p_remote_node, 0));
3563 
3564 			break;
3565 
3566 		default:
3567 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3568 				"ERR AB13: Node GUID 0x%016" PRIx64
3569 				" - Unknown node type: %s\n",
3570 				cl_ntoh64(remote_node_guid),
3571 				ib_get_node_type_str(remote_node_type));
3572 			res = -1;
3573 			goto Exit;
3574 		}
3575 		sw_add_port(p_sw,	/* local ftree_sw object */
3576 			    i,	/* local port number */
3577 			    remote_port_num,	/* remote port number */
3578 			    p_sw->lid,	/* local lid */
3579 			    remote_lid,	/* remote lid */
3580 			    osm_physp_get_port_guid(p_osm_port),	/* local port guid */
3581 			    osm_physp_get_port_guid(p_remote_osm_port),	/* remote port guid */
3582 			    remote_node_guid,	/* remote node guid */
3583 			    remote_node_type,	/* remote node type */
3584 			    p_remote_hca_or_sw,	/* remote ftree_hca/sw object */
3585 			    direction);	/* port direction (up or down) */
3586 
3587 		/* Track the max lid (in host order) that exists in the fabric */
3588 		if (remote_lid > p_ftree->lft_max_lid)
3589 			p_ftree->lft_max_lid = remote_lid;
3590 	}
3591 
3592 Exit:
3593 	return res;
3594 }				/* fabric_construct_sw_ports() */
3595 
3596 /***************************************************
3597  ***************************************************/
3598 struct rank_root_cxt {
3599 	ftree_fabric_t *fabric;
3600 	cl_list_t *list;
3601 };
3602 /***************************************************
3603  ***************************************************/
3604 static int rank_root_sw_by_guid(void *cxt, uint64_t guid, char *p)
3605 {
3606 	struct rank_root_cxt *c = cxt;
3607 	ftree_sw_t *sw;
3608 
3609 	sw = fabric_get_sw_by_guid(c->fabric, cl_hton64(guid));
3610 	if (!sw) {
3611 		/* the specified root guid wasn't found in the fabric */
3612 		OSM_LOG(&c->fabric->p_osm->log, OSM_LOG_ERROR, "ERR AB24: "
3613 			"Root switch GUID 0x%" PRIx64 " not found\n", guid);
3614 		return 0;
3615 	}
3616 
3617 	OSM_LOG(&c->fabric->p_osm->log, OSM_LOG_DEBUG,
3618 		"Ranking root switch with GUID 0x%" PRIx64 "\n", guid);
3619 	sw->rank = 0;
3620 	cl_list_insert_tail(c->list, sw);
3621 
3622 	return 0;
3623 }
3624 /***************************************************
3625  ***************************************************/
3626 static boolean_t fabric_load_roots(IN ftree_fabric_t * p_ftree,
3627 				   IN cl_list_t* p_ranking_bfs_list)
3628 {
3629 	struct rank_root_cxt context;
3630 	unsigned num_roots;
3631 
3632 	if (p_ranking_bfs_list) {
3633 
3634 		/* Rank all the roots and add them to list */
3635 		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3636 			"Fetching root nodes from file %s\n",
3637 			p_ftree->p_osm->subn.opt.root_guid_file);
3638 
3639 		context.fabric = p_ftree;
3640 		context.list = p_ranking_bfs_list;
3641 		if (parse_node_map(p_ftree->p_osm->subn.opt.root_guid_file,
3642 				   rank_root_sw_by_guid, &context)) {
3643 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB2A: "
3644 				"cannot parse root guids file \'%s\'\n",
3645 				p_ftree->p_osm->subn.opt.root_guid_file);
3646 			return FALSE;
3647 		}
3648 
3649 		num_roots = cl_list_count(p_ranking_bfs_list);
3650 		if (!num_roots) {
3651 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB25: "
3652 				"No valid roots supplied\n");
3653 			return FALSE;
3654 		}
3655 
3656 		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3657 			"Ranked %u valid root switches\n", num_roots);
3658 	}
3659 	return TRUE;
3660 }
3661 /***************************************************
3662  ***************************************************/
3663 static int fabric_rank_from_roots(IN ftree_fabric_t * p_ftree,
3664 				  IN cl_list_t* p_ranking_bfs_list)
3665 {
3666 	osm_node_t *p_osm_node;
3667 	osm_node_t *p_remote_osm_node;
3668 	osm_physp_t *p_osm_physp;
3669 	ftree_sw_t *p_sw;
3670 	ftree_sw_t *p_remote_sw;
3671 	int res = 0;
3672 	unsigned max_rank = 0;
3673 	unsigned i;
3674 
3675 	OSM_LOG_ENTER(&p_ftree->p_osm->log);
3676 
3677 	if (!p_ranking_bfs_list) {
3678 		res = -1;
3679 		goto Exit;
3680 	}
3681 	while (!cl_is_list_empty(p_ranking_bfs_list)) {
3682 		p_sw = (ftree_sw_t *) cl_list_remove_head(p_ranking_bfs_list);
3683 		p_osm_node = p_sw->p_osm_sw->p_node;
3684 
3685 		/* note: skipping port 0 on switches */
3686 		for (i = 1; i < osm_node_get_num_physp(p_osm_node); i++) {
3687 			p_osm_physp = osm_node_get_physp_ptr(p_osm_node, i);
3688 			if (!p_osm_physp || !osm_link_is_healthy(p_osm_physp))
3689 				continue;
3690 
3691 			p_remote_osm_node =
3692 			    osm_node_get_remote_node(p_osm_node, i, NULL);
3693 			if (!p_remote_osm_node)
3694 				continue;
3695 
3696 			if (osm_node_get_type(p_remote_osm_node) !=
3697 			    IB_NODE_TYPE_SWITCH)
3698 				continue;
3699 
3700 			p_remote_sw = fabric_get_sw_by_guid(p_ftree,
3701 							    osm_node_get_node_guid
3702 							    (p_remote_osm_node));
3703 			CL_ASSERT(p_remote_sw);
3704 
3705 			/* if needed, rank the remote switch and add it to the BFS list */
3706 			if (sw_update_rank(p_remote_sw, p_sw->rank + 1)) {
3707 				OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3708 					"Ranking switch 0x%" PRIx64
3709 					" with rank %u\n",
3710 					sw_get_guid_ho(p_remote_sw),
3711 					p_remote_sw->rank);
3712 				max_rank = p_remote_sw->rank;
3713 				cl_list_insert_tail(p_ranking_bfs_list,
3714 						    p_remote_sw);
3715 			}
3716 		}
3717 		/* done with ports of this switch - go to the next switch in the list */
3718 	}
3719 
3720 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3721 		"Subnet ranking completed. Max Node Rank = %u\n", max_rank);
3722 
3723 	/* set FatTree maximal switch rank */
3724 	p_ftree->max_switch_rank = max_rank;
3725 
3726 Exit:
3727 	OSM_LOG_EXIT(&p_ftree->p_osm->log);
3728 	return res;
3729 }				/* fabric_rank_from_roots() */
3730 
3731 /***************************************************
3732  ***************************************************/
3733 
3734 static int fabric_rank_from_hcas(IN ftree_fabric_t * p_ftree)
3735 {
3736 	ftree_hca_t *p_hca;
3737 	ftree_hca_t *p_next_hca;
3738 	cl_list_t ranking_bfs_list;
3739 	int res = 0;
3740 
3741 	OSM_LOG_ENTER(&p_ftree->p_osm->log);
3742 
3743 	cl_list_init(&ranking_bfs_list, 10);
3744 
3745 	/* Mark REVERSED rank of all the switches in the subnet.
3746 	   Start from switches that are connected to hca's, and
3747 	   scan all the switches in the subnet. */
3748 	p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
3749 	while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
3750 		p_hca = p_next_hca;
3751 		p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
3752 		if (rank_leaf_switches(p_ftree, p_hca, &ranking_bfs_list) != 0) {
3753 			res = -1;
3754 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3755 				"ERR AB14: "
3756 				"Subnet ranking failed - subnet is not FatTree");
3757 			goto Exit;
3758 		}
3759 	}
3760 
3761 	/* Now rank rest of the switches in the fabric, while the
3762 	   list already contains all the ranked leaf switches */
3763 	rank_switches_from_leafs(p_ftree, &ranking_bfs_list);
3764 
3765 	/* fix ranking of the switches by reversing the ranking direction */
3766 	cl_qmap_apply_func(&p_ftree->sw_tbl, sw_reverse_rank, (void *)p_ftree);
3767 
3768 Exit:
3769 	cl_list_destroy(&ranking_bfs_list);
3770 	OSM_LOG_EXIT(&p_ftree->p_osm->log);
3771 	return res;
3772 }				/* fabric_rank_from_hcas() */
3773 
3774 /***************************************************
3775  * After ranking from HCA's we want to re-rank using
3776  * the roots
3777  ***************************************************/
3778 static int fabric_rerank_using_root(IN ftree_fabric_t * p_ftree,
3779 				    IN cl_list_t* p_ranking_bfs_list)
3780 {
3781 	ftree_sw_t *p_sw = NULL;
3782 	ftree_sw_t *p_next_sw;
3783 	int res;
3784 
3785 	OSM_LOG_ENTER(&p_ftree->p_osm->log);
3786 
3787 	p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
3788 	while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
3789 		p_sw = p_next_sw;
3790 		p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
3791 		if (p_sw->rank == 0)
3792 			cl_list_insert_tail(p_ranking_bfs_list, p_sw);
3793 		else
3794 			p_sw->rank = 0xFFFFFFFF;
3795 	}
3796 	res = fabric_rank_from_roots(p_ftree, p_ranking_bfs_list);
3797 	OSM_LOG_EXIT(&p_ftree->p_osm->log);
3798 	return res;
3799 }
3800 /***************************************************
3801  ***************************************************/
3802 static int fabric_rank(IN ftree_fabric_t * p_ftree)
3803 {
3804 	int res = -1;
3805 	cl_list_t ranking_bfs_list;
3806 
3807 	OSM_LOG_ENTER(&p_ftree->p_osm->log);
3808 	cl_list_init(&ranking_bfs_list, 10);
3809 
3810 	if (fabric_roots_provided(p_ftree) &&
3811 	    fabric_load_roots(p_ftree, &ranking_bfs_list))
3812 		res = fabric_rank_from_roots(p_ftree, &ranking_bfs_list);
3813 	else {
3814 		res = fabric_rank_from_hcas(p_ftree);
3815 		if (!res)
3816 			res = fabric_rerank_using_root(p_ftree, &ranking_bfs_list);
3817 	}
3818 
3819 	if (res)
3820 		goto Exit;
3821 
3822 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3823 		"FatTree max switch rank is %u\n", p_ftree->max_switch_rank);
3824 
3825 Exit:
3826 	cl_list_destroy(&ranking_bfs_list);
3827 	OSM_LOG_EXIT(&p_ftree->p_osm->log);
3828 	return res;
3829 }				/* fabric_rank() */
3830 
3831 /***************************************************
3832  ***************************************************/
3833 
3834 static void fabric_set_leaf_rank(IN ftree_fabric_t * p_ftree)
3835 {
3836 	unsigned i;
3837 	ftree_sw_t *p_sw;
3838 	ftree_hca_t *p_hca = NULL;
3839 	ftree_hca_t *p_next_hca;
3840 
3841 	OSM_LOG_ENTER(&p_ftree->p_osm->log);
3842 
3843 	if (!fabric_roots_provided(p_ftree)) {
3844 		/* If root file is not provided, the fabric has to be pure fat-tree
3845 		   in terms of ranking. Thus, leaf switches rank is the max rank. */
3846 		p_ftree->leaf_switch_rank = p_ftree->max_switch_rank;
3847 	} else {
3848 		/* Find the first CN and set the leaf_switch_rank to the rank
3849 		   of the switch that is connected to this CN. Later we will
3850 		   ensure that all the leaf switches have the same rank. */
3851 		p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
3852 		while (p_next_hca !=
3853 		       (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
3854 			p_hca = p_next_hca;
3855 			if (p_hca->cn_num)
3856 				break;
3857 			p_next_hca =
3858 			    (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
3859 		}
3860 		/* we know that there are CNs in the fabric, so just to be sure... */
3861 		CL_ASSERT(p_next_hca !=
3862 			  (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl));
3863 
3864 		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3865 			"Selected CN port GUID 0x%" PRIx64 "\n",
3866 			hca_get_guid_ho(p_hca));
3867 
3868 		for (i = 0; (i < p_hca->up_port_groups_num)
3869 		     && (!p_hca->up_port_groups[i]->is_cn); i++)
3870 			;
3871 		CL_ASSERT(i < p_hca->up_port_groups_num);
3872 		CL_ASSERT(p_hca->up_port_groups[i]->remote_node_type ==
3873 			  IB_NODE_TYPE_SWITCH);
3874 
3875 		p_sw = p_hca->up_port_groups[i]->remote_hca_or_sw.p_sw;
3876 		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3877 			"Selected leaf switch GUID 0x%" PRIx64 ", rank %u\n",
3878 			sw_get_guid_ho(p_sw), p_sw->rank);
3879 		p_ftree->leaf_switch_rank = p_sw->rank;
3880 	}
3881 
3882 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
3883 		"FatTree leaf switch rank is %u\n", p_ftree->leaf_switch_rank);
3884 	OSM_LOG_EXIT(&p_ftree->p_osm->log);
3885 }				/* fabric_set_leaf_rank() */
3886 
3887 /***************************************************
3888  ***************************************************/
3889 
3890 static int fabric_populate_ports(IN ftree_fabric_t * p_ftree)
3891 {
3892 	ftree_hca_t *p_hca;
3893 	ftree_hca_t *p_next_hca;
3894 	ftree_sw_t *p_sw;
3895 	ftree_sw_t *p_next_sw;
3896 	int res = 0;
3897 
3898 	OSM_LOG_ENTER(&p_ftree->p_osm->log);
3899 
3900 	p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl);
3901 	while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) {
3902 		p_hca = p_next_hca;
3903 		p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item);
3904 		if (fabric_construct_hca_ports(p_ftree, p_hca) != 0) {
3905 			res = -1;
3906 			goto Exit;
3907 		}
3908 	}
3909 
3910 	p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
3911 	while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
3912 		p_sw = p_next_sw;
3913 		p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
3914 		if (fabric_construct_sw_ports(p_ftree, p_sw) != 0) {
3915 			res = -1;
3916 			goto Exit;
3917 		}
3918 	}
3919 Exit:
3920 	OSM_LOG_EXIT(&p_ftree->p_osm->log);
3921 	return res;
3922 }				/* fabric_populate_ports() */
3923 
3924 /***************************************************
3925  ***************************************************/
3926 static int add_guid_item_to_map(void *cxt, uint64_t guid, char *p)
3927 {
3928 	cl_qmap_t *map = cxt;
3929 	name_map_item_t *item;
3930 	name_map_item_t *inserted_item;
3931 
3932 	item = malloc(sizeof(*item));
3933 	if (!item)
3934 		return -1;
3935 
3936 	item->guid = guid;
3937 	inserted_item = (name_map_item_t *) cl_qmap_insert(map, guid, &item->item);
3938 	if (inserted_item != item)
3939                 free(item);
3940 
3941 	return 0;
3942 }
3943 
3944 static int fabric_read_guid_files(IN ftree_fabric_t * p_ftree)
3945 {
3946 	int status = 0;
3947 
3948 	OSM_LOG_ENTER(&p_ftree->p_osm->log);
3949 
3950 	if (fabric_cns_provided(p_ftree)) {
3951 		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3952 			"Fetching compute nodes from file %s\n",
3953 			p_ftree->p_osm->subn.opt.cn_guid_file);
3954 
3955 		if (parse_node_map(p_ftree->p_osm->subn.opt.cn_guid_file,
3956 				   add_guid_item_to_map,
3957 				   &p_ftree->cn_guid_tbl)) {
3958 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3959 				"ERR AB23: " "Problem parsing CN guid file\n");
3960 			status = -1;
3961 			goto Exit;
3962 		}
3963 
3964 		if (!cl_qmap_count(&p_ftree->cn_guid_tbl)) {
3965 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3966 				"ERR AB27: "
3967 				"Compute node guids file has no valid guids\n");
3968 			status = -1;
3969 			goto Exit;
3970 		}
3971 	}
3972 
3973 	if (fabric_ios_provided(p_ftree)) {
3974 		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
3975 			"Fetching I/O nodes from file %s\n",
3976 			p_ftree->p_osm->subn.opt.io_guid_file);
3977 
3978 		if (parse_node_map(p_ftree->p_osm->subn.opt.io_guid_file,
3979 				   add_guid_item_to_map,
3980 				   &p_ftree->io_guid_tbl)) {
3981 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3982 				"ERR AB28: Problem parsing I/O guid file\n");
3983 			status = -1;
3984 			goto Exit;
3985 		}
3986 
3987 		if (!cl_qmap_count(&p_ftree->io_guid_tbl)) {
3988 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
3989 				"ERR AB29: "
3990 				"I/O node guids file has no valid guids\n");
3991 			status = -1;
3992 			goto Exit;
3993 		}
3994 	}
3995 Exit:
3996 	OSM_LOG_EXIT(&p_ftree->p_osm->log);
3997 	return status;
3998 }				/*fabric_read_guid_files() */
3999 
4000 /***************************************************
4001  ***************************************************/
4002 /* Get a Sw and remove all depended HCA's, meaning all
4003  * HCA's which this is the only switch they are connected
4004  * to	*/
4005 static int remove_depended_hca(IN ftree_fabric_t *p_ftree, IN ftree_sw_t *p_sw)
4006 {
4007 	ftree_hca_t *p_hca;
4008 	int counter = 0;
4009 	int port_num;
4010 	uint8_t remote_port_num;
4011 	osm_physp_t* physp;
4012 	osm_node_t* sw_node;
4013 	uint64_t remote_hca_guid;
4014 
4015 	sw_node = p_sw->p_osm_sw->p_node;
4016 	for (port_num = 0; port_num < sw_node->physp_tbl_size; port_num++) {
4017 		physp = osm_node_get_physp_ptr(sw_node, port_num);
4018 		if (physp && physp->p_remote_physp) {
4019 			if (osm_node_get_type(physp->p_remote_physp->p_node) == IB_NODE_TYPE_CA) {
4020 				remote_hca_guid =
4021 				    osm_node_get_node_guid(physp->p_remote_physp->p_node);
4022 				p_hca = fabric_get_hca_by_guid(p_ftree, remote_hca_guid);
4023 				if (!p_hca)
4024 					continue;
4025 
4026 				remote_port_num =
4027 				    osm_physp_get_port_num(physp->p_remote_physp);
4028 				p_hca->disconnected_ports[remote_port_num] = 1;
4029 			}
4030 		}
4031 	}
4032 	return counter;
4033 }
4034 /***************************************************
4035  ***************************************************/
4036 static void fabric_remove_unranked_sw(IN ftree_fabric_t *p_ftree)
4037 {
4038 	ftree_sw_t *p_sw = NULL;
4039 	ftree_sw_t *p_next_sw;
4040 	int removed_hca;
4041 	int count = 0;
4042 
4043 	p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
4044 	while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
4045 		p_sw = p_next_sw;
4046 		p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);
4047 		if (!sw_ranked(p_sw)) {
4048 			cl_qmap_remove_item(&p_ftree->sw_tbl,&p_sw->map_item);
4049 			removed_hca = remove_depended_hca(p_ftree, p_sw);
4050 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4051 				"Removing Unranked sw 0x%" PRIx64 " (with %d dependent hca's)\n",
4052 				sw_get_guid_ho(p_sw),removed_hca);
4053 			sw_destroy(p_sw);
4054 			count++;
4055 		}
4056 	}
4057 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
4058 		"Removed %d invalid switches\n", count);
4059 }
4060 /***************************************************
4061  ***************************************************/
4062 static int construct_fabric(IN void *context)
4063 {
4064 	ftree_fabric_t *p_ftree = context;
4065 	int status = 0;
4066 
4067 	OSM_LOG_ENTER(&p_ftree->p_osm->log);
4068 
4069 	fabric_clear(p_ftree);
4070 
4071 	if (p_ftree->p_osm->subn.opt.lmc > 0) {
4072 		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4073 			   "LMC > 0 is not supported by fat-tree routing.\n"
4074 			   "Falling back to default routing\n");
4075 		status = -1;
4076 		goto Exit;
4077 	}
4078 
4079 	if (cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl) < 2) {
4080 		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4081 			   "Fabric has %u switches - topology is not fat-tree.\n"
4082 			   "Falling back to default routing\n",
4083 			   cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl));
4084 		status = -1;
4085 		goto Exit;
4086 	}
4087 
4088 	if ((cl_qmap_count(&p_ftree->p_osm->subn.node_guid_tbl) -
4089 	     cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl)) < 2) {
4090 		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4091 			   "Fabric has %u nodes (%u switches) - topology is not fat-tree.\n"
4092 			   "Falling back to default routing\n",
4093 			   cl_qmap_count(&p_ftree->p_osm->subn.node_guid_tbl),
4094 			   cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl));
4095 		status = -1;
4096 		goto Exit;
4097 	}
4098 
4099 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "\n"
4100 		"                       |----------------------------------------|\n"
4101 		"                       |- Starting FatTree fabric construction -|\n"
4102 		"                       |----------------------------------------|\n\n");
4103 
4104 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4105 		"Populating FatTree Switch and CA tables\n");
4106 	if (fabric_populate_nodes(p_ftree) != 0) {
4107 		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4108 			   "Fabric topology is not fat-tree - "
4109 			   "falling back to default routing\n");
4110 		status = -1;
4111 		goto Exit;
4112 	}
4113 
4114 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4115 		"Reading guid files provided by user\n");
4116 	if (fabric_read_guid_files(p_ftree) != 0) {
4117 		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4118 			   "Failed reading guid files - "
4119 			   "falling back to default routing\n");
4120 		status = -1;
4121 		goto Exit;
4122 	}
4123 
4124 	if (cl_qmap_count(&p_ftree->hca_tbl) < 2) {
4125 		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4126 			   "Fabric has %u CAs - topology is not fat-tree.\n"
4127 			   "Falling back to default routing\n",
4128 			   cl_qmap_count(&p_ftree->hca_tbl));
4129 		status = -1;
4130 		goto Exit;
4131 	}
4132 
4133 	/* Rank all the switches in the fabric.
4134 	   After that we will know only fabric max switch rank.
4135 	   We will be able to check leaf switches rank and the
4136 	   whole tree rank after filling ports and marking CNs. */
4137 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "Ranking FatTree\n");
4138 	if (fabric_rank(p_ftree) != 0) {
4139 		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4140 			   "Failed ranking the tree\n");
4141 		status = -1;
4142 		goto Exit;
4143 	}
4144 	fabric_remove_unranked_sw(p_ftree);
4145 
4146 	if (p_ftree->max_switch_rank == 0 &&
4147 	    cl_qmap_count(&p_ftree->sw_tbl) > 1) {
4148 		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
4149 			"ERR AB2B: Found more than one root on fabric with "
4150 			"maximum rank 0\n");
4151 		status = -1;
4152 		goto Exit;
4153 	}
4154 
4155 	/* For each hca and switch, construct array of ports.
4156 	   This is done after the whole FatTree data structure is ready,
4157 	   because we want the ports to have pointers to ftree_{sw,hca}_t
4158 	   objects, and we need the switches to be already ranked because
4159 	   that's how the port direction is determined. */
4160 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4161 		"Populating CA & switch ports\n");
4162 	if (fabric_populate_ports(p_ftree) != 0) {
4163 		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4164 			   "Fabric topology is not a fat-tree\n");
4165 		status = -1;
4166 		goto Exit;
4167 	} else if (p_ftree->cn_num == 0) {
4168 		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4169 			   "Fabric has no valid compute nodes\n");
4170 		status = -1;
4171 		goto Exit;
4172 	}
4173 
4174 	/* Now that the CA ports have been created and CNs were marked,
4175 	   we can complete the fabric ranking - set leaf switches rank. */
4176 	fabric_set_leaf_rank(p_ftree);
4177 
4178 	if (fabric_get_rank(p_ftree) > FAT_TREE_MAX_RANK ||
4179 	    fabric_get_rank(p_ftree) < FAT_TREE_MIN_RANK) {
4180 		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4181 			   "Fabric rank is %u (should be between %u and %u)\n",
4182 			   fabric_get_rank(p_ftree), FAT_TREE_MIN_RANK,
4183 			   FAT_TREE_MAX_RANK);
4184 		status = -1;
4185 		goto Exit;
4186 	}
4187 
4188 	/* Mark all the switches in the fabric with rank equal to
4189 	   p_ftree->leaf_switch_rank and that are also connected to CNs.
4190 	   As a by-product, this function also runs basic topology
4191 	   validation - it checks that all the CNs are at the same rank. */
4192 	if (fabric_mark_leaf_switches(p_ftree)) {
4193 		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4194 			   "Fabric topology is not a fat-tree\n");
4195 		status = -1;
4196 		goto Exit;
4197 	}
4198 
4199 	/* Assign index to all the switches in the fabric.
4200 	   This function also sorts leaf switch array by the switch index,
4201 	   sorts all the port arrays of the indexed switches by remote
4202 	   switch index, and creates switch-by-tuple table (sw_by_tuple_tbl) */
4203 	fabric_make_indexing(p_ftree);
4204 
4205 	/* Create leaf switch array sorted by index.
4206 	   This array contains switches with rank equal to p_ftree->leaf_switch_rank
4207 	   and that are also connected to CNs (REAL leafs), and it may contain
4208 	   switches at the same leaf rank w/o CNs, if this is the order of indexing.
4209 	   In any case, the first and the last switches in the array are REAL leafs. */
4210 	if (fabric_create_leaf_switch_array(p_ftree)) {
4211 		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4212 			   "Fabric topology is not a fat-tree\n");
4213 		status = -1;
4214 		goto Exit;
4215 	}
4216 
4217 	/* calculate and set ftree.max_cn_per_leaf field */
4218 	fabric_set_max_cn_per_leaf(p_ftree);
4219 
4220 	/* print general info about fabric topology */
4221 	fabric_dump_general_info(p_ftree);
4222 
4223 	/* dump full tree topology */
4224 	if (OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG))
4225 		fabric_dump(p_ftree);
4226 
4227 	/* the fabric is required to be PURE fat-tree only if the root
4228 	   guid file hasn't been provided by user */
4229 	if (!fabric_roots_provided(p_ftree) &&
4230 	    !fabric_validate_topology(p_ftree)) {
4231 		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID,
4232 			   "Fabric topology is not a fat-tree\n");
4233 		status = -1;
4234 		goto Exit;
4235 	}
4236 
4237 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4238 		"Max LID in switch LFTs: %u\n", p_ftree->lft_max_lid);
4239 
4240 	/* Build the full lid matrices needed for multicast routing */
4241 	osm_ucast_mgr_build_lid_matrices(&p_ftree->p_osm->sm.ucast_mgr);
4242 
4243 Exit:
4244 	if (status != 0) {
4245 		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4246 			"Clearing FatTree Fabric data structures\n");
4247 		fabric_clear(p_ftree);
4248 	} else
4249 		p_ftree->fabric_built = TRUE;
4250 
4251 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "\n"
4252 		"                       |--------------------------------------------------|\n"
4253 		"                       |- Done constructing FatTree fabric (status = %d) -|\n"
4254 		"                       |--------------------------------------------------|\n\n",
4255 		status);
4256 
4257 	OSM_LOG_EXIT(&p_ftree->p_osm->log);
4258 	return status;
4259 }				/* construct_fabric() */
4260 
4261 /***************************************************
4262  ***************************************************/
4263 
4264 static int do_routing(IN void *context)
4265 {
4266 	ftree_fabric_t *p_ftree = context;
4267 	int status = 0;
4268 
4269 	OSM_LOG_ENTER(&p_ftree->p_osm->log);
4270 
4271 	if (!p_ftree->fabric_built) {
4272 		status = -1;
4273 		goto Exit;
4274 	}
4275 
4276 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4277 		"Starting FatTree routing\n");
4278 
4279 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4280 		"Filling switch forwarding tables for Compute Nodes\n");
4281 	fabric_route_to_cns(p_ftree);
4282 
4283 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4284 		"Filling switch forwarding tables for non-CN targets\n");
4285 	fabric_route_to_non_cns(p_ftree);
4286 
4287 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4288 		"Filling switch forwarding tables for switch-to-switch paths\n");
4289 	fabric_route_to_switches(p_ftree);
4290 
4291 	if (p_ftree->p_osm->subn.opt.connect_roots) {
4292 		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4293 			"Connecting switches that are unreachable within "
4294 			"Up/Down rules\n");
4295 		fabric_route_roots(p_ftree);
4296 	}
4297 
4298 	/* for each switch, set its fwd table */
4299 	cl_qmap_apply_func(&p_ftree->sw_tbl, set_sw_fwd_table, (void *)p_ftree);
4300 
4301 	/* write out hca ordering file */
4302 	fabric_dump_hca_ordering(p_ftree);
4303 
4304 	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
4305 		"FatTree routing is done\n");
4306 
4307 Exit:
4308 	OSM_LOG_EXIT(&p_ftree->p_osm->log);
4309 	return status;
4310 }
4311 
4312 /***************************************************
4313  ***************************************************/
4314 
4315 static void delete(IN void *context)
4316 {
4317 	if (!context)
4318 		return;
4319 	fabric_destroy((ftree_fabric_t *) context);
4320 }
4321 
4322 /***************************************************
4323  ***************************************************/
4324 
4325 int osm_ucast_ftree_setup(struct osm_routing_engine *r, osm_opensm_t * p_osm)
4326 {
4327 	ftree_fabric_t *p_ftree = fabric_create();
4328 	if (!p_ftree)
4329 		return -1;
4330 
4331 	p_ftree->p_osm = p_osm;
4332 	p_ftree->p_subn = p_osm->sm.ucast_mgr.p_subn;
4333 
4334 	r->context = (void *)p_ftree;
4335 	r->build_lid_matrices = construct_fabric;
4336 	r->ucast_build_fwd_tables = do_routing;
4337 	r->destroy = delete;
4338 
4339 	return 0;
4340 }
4341