1 /*
2  * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
3  * Copyright (c) 2002-2015 Mellanox Technologies LTD. All rights reserved.
4  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
5  * Copyright (c) 2009-2015 ZIH, TU Dresden, Federal Republic of Germany. All rights reserved.
6  * Copyright (C) 2012-2017 Tokyo Institute of Technology. All rights reserved.
7  *
8  * This software is available to you under a choice of one of two
9  * licenses.  You may choose to be licensed under the terms of the GNU
10  * General Public License (GPL) Version 2, available from the file
11  * COPYING in the main directory of this source tree, or the
12  * OpenIB.org BSD license below:
13  *
14  *     Redistribution and use in source and binary forms, with or
15  *     without modification, are permitted provided that the following
16  *     conditions are met:
17  *
18  *      - Redistributions of source code must retain the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer.
21  *
22  *      - Redistributions in binary form must reproduce the above
23  *        copyright notice, this list of conditions and the following
24  *        disclaimer in the documentation and/or other materials
25  *        provided with the distribution.
26  *
27  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
28  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
29  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
30  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
31  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
32  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
33  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
34  * SOFTWARE.
35  *
36  */
37 
38 /*
39  * Abstract:
40  *    Implementation of OpenSM (deadlock-free) single-source-shortest-path routing
41  *    (with dijkstra algorithm)
42  */
43 
44 #if HAVE_CONFIG_H
45 #  include <config.h>
46 #endif				/* HAVE_CONFIG_H */
47 
48 #include <stdio.h>
49 #include <stdlib.h>
50 #include <string.h>
51 #include <opensm/osm_file_ids.h>
52 #define FILE_ID OSM_FILE_UCAST_DFSSSP_C
53 #include <opensm/osm_ucast_mgr.h>
54 #include <opensm/osm_opensm.h>
55 #include <opensm/osm_node.h>
56 #include <opensm/osm_multicast.h>
57 #include <opensm/osm_mcast_mgr.h>
58 
59 /* "infinity" for dijkstra */
60 #define INF      0x7FFFFFFF
61 
62 enum {
63 	UNDISCOVERED = 0,
64 	DISCOVERED
65 };
66 
67 enum {
68 	UNKNOWN = 0,
69 	GRAY,
70 	BLACK,
71 };
72 
73 typedef struct link {
74 	uint64_t guid;		/* guid of the neighbor behind the link */
75 	uint32_t from;		/* base_index in the adjazenz list (start of the link) */
76 	uint8_t from_port;	/* port on the base_side (needed for weight update to identify the correct link for multigraphs) */
77 	uint32_t to;		/* index of the neighbor in the adjazenz list (end of the link) */
78 	uint8_t to_port;	/* port on the side of the neighbor (needed for the LFT) */
79 	uint64_t weight;	/* link weight */
80 	struct link *next;
81 } link_t;
82 
83 typedef struct vertex {
84 	/* informations of the fabric */
85 	uint64_t guid;
86 	uint16_t lid;		/* for lft filling */
87 	uint32_t num_hca;	/* numbers of Hca/LIDs on the switch, for weight calculation */
88 	link_t *links;
89 	uint8_t hops;
90 	/* for dijkstra routing */
91 	link_t *used_link;	/* link between the vertex discovered before and this vertex */
92 	uint64_t distance;	/* distance from source to this vertex */
93 	uint8_t state;
94 	/* for the binary heap */
95 	uint32_t heap_id;
96 	/* for LFT writing and debug */
97 	osm_switch_t *sw;	/* selfpointer */
98 	boolean_t dropped;	/* indicate dropped switches (w/ ucast cache) */
99 } vertex_t;
100 
101 typedef struct binary_heap {
102 	uint32_t size;		/* size of the heap */
103 	vertex_t **nodes;	/* array with pointers to elements of the adj_list */
104 } binary_heap_t;
105 
106 typedef struct vltable {
107 	uint64_t num_lids;	/* size of the lids array */
108 	uint16_t *lids;		/* sorted array of all lids in the subnet */
109 	uint8_t *vls;		/* matrix form assignment lid X lid -> virtual lane */
110 } vltable_t;
111 
112 typedef struct cdg_link {
113 	struct cdg_node *node;
114 	uint32_t num_pairs;	/* number of src->dest pairs incremented in path adding step */
115 	uint32_t max_len;	/* length of the srcdest array */
116 	uint32_t removed;	/* number of pairs removed in path deletion step */
117 	uint32_t *srcdest_pairs;
118 	struct cdg_link *next;
119 } cdg_link_t;
120 
121 /* struct for a node of a binary tree with additional parent pointer */
122 typedef struct cdg_node {
123 	uint64_t channelID;	/* unique key consist of src lid + port + dest lid + port */
124 	cdg_link_t *linklist;	/* edges to adjazent nodes */
125 	uint8_t status;		/* node status in cycle search to avoid recursive function */
126 	uint8_t visited;	/* needed to traverse the binary tree */
127 	struct cdg_node *pre;	/* to save the path in cycle detection algorithm */
128 	struct cdg_node *left, *right, *parent;
129 } cdg_node_t;
130 
131 typedef struct dfsssp_context {
132 	osm_routing_engine_type_t routing_type;
133 	osm_ucast_mgr_t *p_mgr;
134 	vertex_t *adj_list;
135 	uint32_t adj_list_size;
136 	vltable_t *srcdest2vl_table;
137 	uint8_t *vl_split_count;
138 } dfsssp_context_t;
139 
140 /**************** set initial values for structs **********************
141  **********************************************************************/
142 static inline void set_default_link(link_t * link)
143 {
144 	link->guid = 0;
145 	link->from = 0;
146 	link->from_port = 0;
147 	link->to = 0;
148 	link->to_port = 0;
149 	link->weight = 0;
150 	link->next = NULL;
151 }
152 
153 static inline void set_default_vertex(vertex_t * vertex)
154 {
155 	vertex->guid = 0;
156 	vertex->lid = 0;
157 	vertex->num_hca = 0;
158 	vertex->links = NULL;
159 	vertex->hops = 0;
160 	vertex->used_link = NULL;
161 	vertex->distance = 0;
162 	vertex->state = UNDISCOVERED;
163 	vertex->heap_id = 0;
164 	vertex->sw = NULL;
165 	vertex->dropped = FALSE;
166 }
167 
168 static inline void set_default_cdg_node(cdg_node_t * node)
169 {
170 	node->channelID = 0;
171 	node->linklist = NULL;
172 	node->status = UNKNOWN;
173 	node->visited = 0;
174 	node->pre = NULL;
175 	node->left = NULL;
176 	node->right = NULL;
177 	node->parent = NULL;
178 }
179 
180 /**********************************************************************
181  **********************************************************************/
182 
183 /************ helper functions for heap in dijkstra *******************
184  **********************************************************************/
185 /* returns true if element 1 is smaller than element 2 */
186 static inline uint32_t heap_smaller(binary_heap_t * heap, uint32_t i,
187 				    uint32_t j)
188 {
189 	return (heap->nodes[i]->distance < heap->nodes[j]->distance) ? 1 : 0;
190 }
191 
192 /* swap two elements */
193 static void heap_exchange(binary_heap_t * heap, uint32_t i, uint32_t j)
194 {
195 	uint32_t tmp_heap_id = 0;
196 	vertex_t *tmp_node = NULL;
197 
198 	/* 1. swap the heap_id */
199 	tmp_heap_id = heap->nodes[i]->heap_id;
200 	heap->nodes[i]->heap_id = heap->nodes[j]->heap_id;
201 	heap->nodes[j]->heap_id = tmp_heap_id;
202 	/* 2. swap pointers */
203 	tmp_node = heap->nodes[i];
204 	heap->nodes[i] = heap->nodes[j];
205 	heap->nodes[j] = tmp_node;
206 }
207 
208 /* changes position of element with parent until children are bigger */
209 static uint32_t heap_up(binary_heap_t * heap, uint32_t i)
210 {
211 	uint32_t curr = i, father = 0;
212 
213 	if (curr > 0) {
214 		father = (curr - 1) >> 1;
215 		while (heap_smaller(heap, curr, father)) {
216 			heap_exchange(heap, curr, father);
217 			/* try to go up when we arent already root */
218 			curr = father;
219 			if (curr > 0)
220 				father = (curr - 1) >> 1;
221 		}
222 	}
223 
224 	return curr;
225 }
226 
227 /* changes position of element with children until parent is smaller */
228 static uint32_t heap_down(binary_heap_t * heap, uint32_t i)
229 {
230 	uint32_t curr = i;
231 	uint32_t son1 = 0, son2 = 0, smaller_son = 0;
232 	uint32_t exchanged = 0;
233 
234 	do {
235 		son1 = ((curr + 1) << 1) - 1;
236 		son2 = (curr + 1) << 1;
237 		exchanged = 0;
238 
239 		/* exchange with smaller son */
240 		if (son1 < heap->size && son2 < heap->size) {
241 			if (heap_smaller(heap, son1, son2))
242 				smaller_son = son1;
243 			else
244 				smaller_son = son2;
245 		} else if (son1 < heap->size) {
246 			/* only one son */
247 			smaller_son = son1;
248 		} else {
249 			/* finished */
250 			break;
251 		}
252 
253 		/* only exchange when smaller */
254 		if (heap_smaller(heap, smaller_son, curr)) {
255 			heap_exchange(heap, curr, smaller_son);
256 			exchanged = 1;
257 			curr = smaller_son;
258 		}
259 	} while (exchanged);
260 
261 	return curr;
262 }
263 
264 /* reheapify element */
265 static inline void heap_heapify(binary_heap_t * heap, uint32_t i)
266 {
267 	heap_down(heap, heap_up(heap, i));
268 }
269 
270 /* creates heap for graph */
271 static int heap_create(vertex_t * adj_list, uint32_t adj_list_size,
272 		       binary_heap_t ** binheap)
273 {
274 	binary_heap_t *heap = NULL;
275 	uint32_t i = 0;
276 
277 	/* allocate the memory for the heap object */
278 	heap = (binary_heap_t *) malloc(sizeof(binary_heap_t));
279 	if (!heap)
280 		return 1;
281 
282 	/* the heap size is equivalent to the size of the adj_list */
283 	heap->size = adj_list_size;
284 
285 	/* allocate the pointer array, fill with the pointers to the elements of the adj_list and set the initial heap_id */
286 	heap->nodes = (vertex_t **) malloc(heap->size * sizeof(vertex_t *));
287 	if (!heap->nodes) {
288 		free(heap);
289 		return 1;
290 	}
291 	for (i = 0; i < heap->size; i++) {
292 		heap->nodes[i] = &adj_list[i];
293 		heap->nodes[i]->heap_id = i;
294 	}
295 
296 	/* sort elements */
297 	for (i = heap->size; i > 0; i--)
298 		heap_down(heap, i - 1);
299 
300 	*binheap = heap;
301 	return 0;
302 }
303 
304 /* returns current minimum and removes it from heap */
305 static vertex_t *heap_getmin(binary_heap_t * heap)
306 {
307 	vertex_t *min = NULL;
308 
309 	if (heap->size > 0)
310 		min = heap->nodes[0];
311 
312 	if (min == NULL)
313 		return min;
314 
315 	if (heap->size > 0) {
316 		if (heap->size > 1) {
317 			heap_exchange(heap, 0, heap->size - 1);
318 			heap->size--;
319 			heap_down(heap, 0);
320 		} else {
321 			heap->size--;
322 		}
323 	}
324 
325 	return min;
326 }
327 
328 /* cleanup heap */
329 static void heap_free(binary_heap_t * heap)
330 {
331 	if (heap) {
332 		if (heap->nodes) {
333 			free(heap->nodes);
334 			heap->nodes = NULL;
335 		}
336 		free(heap);
337 	}
338 }
339 
340 /**********************************************************************
341  **********************************************************************/
342 
343 /************ helper functions to save src/dest X vl combination ******
344  **********************************************************************/
345 /* compare function of two lids for stdlib qsort */
346 static int cmp_lids(const void *l1, const void *l2)
347 {
348 	ib_net16_t lid1 = *((ib_net16_t *) l1), lid2 = *((ib_net16_t *) l2);
349 
350 	if (lid1 < lid2)
351 		return -1;
352 	else if (lid1 > lid2)
353 		return 1;
354 	else
355 		return 0;
356 }
357 
358 /* use stdlib to sort the lid array */
359 static inline void vltable_sort_lids(vltable_t * vltable)
360 {
361 	qsort(vltable->lids, vltable->num_lids, sizeof(ib_net16_t), cmp_lids);
362 }
363 
364 /* use stdlib to get index of key in lid array;
365    return -1 if lid isn't found in lids array
366 */
367 static inline int64_t vltable_get_lidindex(ib_net16_t * key, vltable_t * vltable)
368 {
369 	ib_net16_t *found_lid = NULL;
370 
371 	found_lid =
372 	    (ib_net16_t *) bsearch(key, vltable->lids, vltable->num_lids,
373 				   sizeof(ib_net16_t), cmp_lids);
374 	if (found_lid)
375 		return found_lid - vltable->lids;
376 	else
377 		return -1;
378 }
379 
380 /* get virtual lane from src lid X dest lid combination;
381    return -1 for invalid lids
382 */
383 static int32_t vltable_get_vl(vltable_t * vltable, ib_net16_t slid, ib_net16_t dlid)
384 {
385 	int64_t ind1 = vltable_get_lidindex(&slid, vltable);
386 	int64_t ind2 = vltable_get_lidindex(&dlid, vltable);
387 
388 	if (ind1 > -1 && ind2 > -1)
389 		return (int32_t) (vltable->
390 				  vls[ind1 + ind2 * vltable->num_lids]);
391 	else
392 		return -1;
393 }
394 
395 /* set a virtual lane in the matrix */
396 static inline void vltable_insert(vltable_t * vltable, ib_net16_t slid,
397 				  ib_net16_t dlid, uint8_t vl)
398 {
399 	int64_t ind1 = vltable_get_lidindex(&slid, vltable);
400 	int64_t ind2 = vltable_get_lidindex(&dlid, vltable);
401 
402 	if (ind1 > -1 && ind2 > -1)
403 		vltable->vls[ind1 + ind2 * vltable->num_lids] = vl;
404 }
405 
406 /* change a number of lanes from lane xy to lane yz */
407 static void vltable_change_vl(vltable_t * vltable, uint8_t from, uint8_t to,
408 			      uint64_t count)
409 {
410 	uint64_t set = 0, stop = 0;
411 	uint64_t ind1 = 0, ind2 = 0;
412 
413 	for (ind1 = 0; ind1 < vltable->num_lids; ind1++) {
414 		for (ind2 = 0; ind2 < vltable->num_lids; ind2++) {
415 			if (set == count) {
416 				stop = 1;
417 				break;
418 			}
419 			if (ind1 != ind2) {
420 				if (vltable->
421 				    vls[ind1 + ind2 * vltable->num_lids] ==
422 				    from) {
423 					vltable->vls[ind1 +
424 						     ind2 * vltable->num_lids] =
425 					    to;
426 					set++;
427 				}
428 			}
429 		}
430 		if (stop)
431 			break;
432 	}
433 }
434 
435 static void vltable_print(osm_ucast_mgr_t * p_mgr, vltable_t * vltable)
436 {
437 	uint64_t ind1 = 0, ind2 = 0;
438 
439 	for (ind1 = 0; ind1 < vltable->num_lids; ind1++) {
440 		for (ind2 = 0; ind2 < vltable->num_lids; ind2++) {
441 			if (ind1 != ind2) {
442 				OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
443 					"   route from src_lid=%" PRIu16
444 					" to dest_lid=%" PRIu16 " on vl=%" PRIu8
445 					"\n", cl_ntoh16(vltable->lids[ind1]),
446 					cl_ntoh16(vltable->lids[ind2]),
447 					vltable->vls[ind1 +
448 						     ind2 * vltable->num_lids]);
449 			}
450 		}
451 	}
452 }
453 
454 static void vltable_dealloc(vltable_t ** vltable)
455 {
456 	if (*vltable) {
457 		if ((*vltable)->lids)
458 			free((*vltable)->lids);
459 		if ((*vltable)->vls)
460 			free((*vltable)->vls);
461 		free(*vltable);
462 		*vltable = NULL;
463 	}
464 }
465 
466 static int vltable_alloc(vltable_t ** vltable, uint64_t size)
467 {
468 	/* allocate VL table and indexing array */
469 	*vltable = (vltable_t *) malloc(sizeof(vltable_t));
470 	if (!(*vltable))
471 		goto ERROR;
472 	(*vltable)->num_lids = size;
473 	(*vltable)->lids = (ib_net16_t *) malloc(size * sizeof(ib_net16_t));
474 	if (!((*vltable)->lids))
475 		goto ERROR;
476 	(*vltable)->vls = (uint8_t *) malloc(size * size * sizeof(uint8_t));
477 	if (!((*vltable)->vls))
478 		goto ERROR;
479 	memset((*vltable)->vls, OSM_DEFAULT_SL, size * size);
480 
481 	return 0;
482 
483 ERROR:
484 	vltable_dealloc(vltable);
485 
486 	return 1;
487 }
488 
489 /**********************************************************************
490  **********************************************************************/
491 
492 /************ helper functions to save/manage the channel dep. graph **
493  **********************************************************************/
494 /* update the srcdest array;
495    realloc array (double the size) if size is not large enough
496 */
497 static void set_next_srcdest_pair(cdg_link_t * link, uint32_t srcdest)
498 {
499 	uint32_t new_size = 0, start_size = 2;
500 	uint32_t *tmp = NULL, *tmp2 = NULL;
501 
502 	if (link->num_pairs == 0) {
503 		link->srcdest_pairs =
504 		    (uint32_t *) malloc(start_size * sizeof(uint32_t));
505 		link->srcdest_pairs[link->num_pairs] = srcdest;
506 		link->max_len = start_size;
507 		link->removed = 0;
508 	} else if (link->num_pairs == link->max_len) {
509 		new_size = link->max_len << 1;
510 		tmp = (uint32_t *) malloc(new_size * sizeof(uint32_t));
511 		tmp =
512 		    memcpy(tmp, link->srcdest_pairs,
513 			   link->max_len * sizeof(uint32_t));
514 		tmp2 = link->srcdest_pairs;
515 		link->srcdest_pairs = tmp;
516 		link->srcdest_pairs[link->num_pairs] = srcdest;
517 		free(tmp2);
518 		link->max_len = new_size;
519 	} else {
520 		link->srcdest_pairs[link->num_pairs] = srcdest;
521 	}
522 	link->num_pairs++;
523 }
524 
525 static inline uint32_t get_next_srcdest_pair(cdg_link_t * link, uint32_t index)
526 {
527 	return link->srcdest_pairs[index];
528 }
529 
530 /* traverse binary tree to find a node */
531 static cdg_node_t *cdg_search(cdg_node_t * root, uint64_t channelID)
532 {
533 	while (root) {
534 		if (channelID < root->channelID)
535 			root = root->left;
536 		else if (channelID > root->channelID)
537 			root = root->right;
538 		else if (channelID == root->channelID)
539 			return root;
540 	}
541 	return NULL;
542 }
543 
544 /* insert new node into the binary tree */
545 static void cdg_insert(cdg_node_t ** root, cdg_node_t * new_node)
546 {
547 	cdg_node_t *current = *root;
548 
549 	if (!current) {
550 		current = new_node;
551 		*root = current;
552 		return;
553 	}
554 
555 	while (current) {
556 		if (new_node->channelID < current->channelID) {
557 			if (current->left) {
558 				current = current->left;
559 			} else {
560 				current->left = new_node;
561 				new_node->parent = current;
562 				break;
563 			}
564 		} else if (new_node->channelID > current->channelID) {
565 			if (current->right) {
566 				current = current->right;
567 			} else {
568 				current->right = new_node;
569 				new_node->parent = current;
570 				break;
571 			}
572 		} else if (new_node->channelID == current->channelID) {
573 			/* not really possible, maybe programming error */
574 			break;
575 		}
576 	}
577 }
578 
579 static void cdg_node_dealloc(cdg_node_t * node)
580 {
581 	cdg_link_t *link = node->linklist, *tmp = NULL;
582 
583 	/* dealloc linklist */
584 	while (link) {
585 		tmp = link;
586 		link = link->next;
587 
588 		if (tmp->num_pairs)
589 			free(tmp->srcdest_pairs);
590 		free(tmp);
591 	}
592 	/* dealloc node */
593 	free(node);
594 }
595 
596 static void cdg_dealloc(cdg_node_t ** root)
597 {
598 	cdg_node_t *current = *root;
599 
600 	while (current) {
601 		if (current->left) {
602 			current = current->left;
603 		} else if (current->right) {
604 			current = current->right;
605 		} else {
606 			if (current->parent == NULL) {
607 				cdg_node_dealloc(current);
608 				*root = NULL;
609 				break;
610 			}
611 			if (current->parent->left == current) {
612 				current = current->parent;
613 				cdg_node_dealloc(current->left);
614 				current->left = NULL;
615 			} else if (current->parent->right == current) {
616 				current = current->parent;
617 				cdg_node_dealloc(current->right);
618 				current->right = NULL;
619 			}
620 		}
621 	}
622 }
623 
624 /* search for a edge in the cdg which should be removed to break a cycle */
625 static cdg_link_t *get_weakest_link_in_cycle(cdg_node_t * cycle)
626 {
627 	cdg_node_t *current = cycle, *node_with_weakest_link = NULL;
628 	cdg_link_t *link = NULL, *weakest_link = NULL;
629 
630 	link = current->linklist;
631 	while (link) {
632 		if (link->node->status == GRAY) {
633 			weakest_link = link;
634 			node_with_weakest_link = current;
635 			current = link->node;
636 			break;
637 		}
638 		link = link->next;
639 	}
640 
641 	while (1) {
642 		current->status = UNKNOWN;
643 		link = current->linklist;
644 		while (link) {
645 			if (link->node->status == GRAY) {
646 				if ((link->num_pairs - link->removed) <
647 				    (weakest_link->num_pairs -
648 				     weakest_link->removed)) {
649 					weakest_link = link;
650 					node_with_weakest_link = current;
651 				}
652 				current = link->node;
653 				break;
654 			}
655 			link = link->next;
656 		}
657 		/* if complete cycle is traversed */
658 		if (current == cycle) {
659 			current->status = UNKNOWN;
660 			break;
661 		}
662 	}
663 
664 	if (node_with_weakest_link->linklist == weakest_link) {
665 		node_with_weakest_link->linklist = weakest_link->next;
666 	} else {
667 		link = node_with_weakest_link->linklist;
668 		while (link) {
669 			if (link->next == weakest_link) {
670 				link->next = weakest_link->next;
671 				break;
672 			}
673 			link = link->next;
674 		}
675 	}
676 
677 	return weakest_link;
678 }
679 
680 /* search for nodes in the cdg not yet reached in the cycle search process;
681    (some nodes are unreachable, e.g. a node is a source or the cdg has not connected parts)
682 */
683 static cdg_node_t *get_next_cdg_node(cdg_node_t * root)
684 {
685 	cdg_node_t *current = root, *res = NULL;
686 
687 	while (current) {
688 		current->visited = 1;
689 		if (current->status == UNKNOWN) {
690 			res = current;
691 			break;
692 		}
693 		if (current->left && !current->left->visited) {
694 			current = current->left;
695 		} else if (current->right && !current->right->visited) {
696 			current = current->right;
697 		} else {
698 			if (current->left)
699 				current->left->visited = 0;
700 			if (current->right)
701 				current->right->visited = 0;
702 			if (current->parent == NULL)
703 				break;
704 			else
705 				current = current->parent;
706 		}
707 	}
708 
709 	/* Clean up */
710 	while (current) {
711 		current->visited = 0;
712 		if (current->left)
713 			current->left->visited = 0;
714 		if (current->right)
715 			current->right->visited = 0;
716 		current = current->parent;
717 	}
718 
719 	return res;
720 }
721 
722 /* make a DFS on the cdg to check for a cycle */
723 static cdg_node_t *search_cycle_in_channel_dep_graph(cdg_node_t * cdg,
724 						     cdg_node_t * start_node)
725 {
726 	cdg_node_t *cycle = NULL;
727 	cdg_node_t *current = start_node, *next_node = NULL, *tmp = NULL;
728 	cdg_link_t *link = NULL;
729 
730 	while (current) {
731 		current->status = GRAY;
732 		link = current->linklist;
733 		next_node = NULL;
734 		while (link) {
735 			if (link->node->status == UNKNOWN) {
736 				next_node = link->node;
737 				break;
738 			}
739 			if (link->node->status == GRAY) {
740 				cycle = link->node;
741 				goto Exit;
742 			}
743 			link = link->next;
744 		}
745 		if (next_node) {
746 			next_node->pre = current;
747 			current = next_node;
748 		} else {
749 			/* found a sink in the graph, go to last node */
750 			current->status = BLACK;
751 
752 			/* srcdest_pairs of this node aren't relevant, free the allocated memory */
753 			link = current->linklist;
754 			while (link) {
755 				if (link->num_pairs)
756 					free(link->srcdest_pairs);
757 				link->srcdest_pairs = NULL;
758 				link->num_pairs = 0;
759 				link->removed = 0;
760 				link = link->next;
761 			}
762 
763 			if (current->pre) {
764 				tmp = current;
765 				current = current->pre;
766 				tmp->pre = NULL;
767 			} else {
768 				/* search for other subgraphs in cdg */
769 				current = get_next_cdg_node(cdg);
770 				if (!current)
771 					break;	/* all relevant nodes traversed, no more cycles found */
772 			}
773 		}
774 	}
775 
776 Exit:
777 	return cycle;
778 }
779 
780 /* calculate the path from source to destination port;
781    new channels are added directly to the cdg
782 */
783 static int update_channel_dep_graph(cdg_node_t ** cdg_root,
784 				    osm_port_t * src_port, uint16_t slid,
785 				    osm_port_t * dest_port, uint16_t dlid)
786 {
787 	osm_node_t *local_node = NULL, *remote_node = NULL;
788 	uint16_t local_lid = 0, remote_lid = 0;
789 	uint32_t srcdest = 0;
790 	uint8_t local_port = 0, remote_port = 0;
791 	uint64_t channelID = 0;
792 
793 	cdg_node_t *channel_head = NULL, *channel = NULL, *last_channel = NULL;
794 	cdg_link_t *linklist = NULL;
795 
796 	/* set the identifier for the src/dest pair to save this on each edge of the cdg */
797 	srcdest = (((uint32_t) slid) << 16) + ((uint32_t) dlid);
798 
799 	channel_head = (cdg_node_t *) malloc(sizeof(cdg_node_t));
800 	if (!channel_head)
801 		goto ERROR;
802 	set_default_cdg_node(channel_head);
803 	last_channel = channel_head;
804 
805 	/* if src is a Hca, then the channel from Hca to switch would be a source in the graph
806 	   sources can't be part of a cycle -> skip this channel
807 	 */
808 	remote_node =
809 	    osm_node_get_remote_node(src_port->p_node,
810 				     src_port->p_physp->port_num, &remote_port);
811 
812 	while (remote_node && remote_node->sw) {
813 		local_node = remote_node;
814 		local_port = local_node->sw->new_lft[dlid];
815 		/* sanity check: local_port must be set or routing is broken */
816 		if (local_port == OSM_NO_PATH)
817 			goto ERROR;
818 		local_lid = cl_ntoh16(osm_node_get_base_lid(local_node, 0));
819 		/* each port belonging to a switch has lmc==0 -> get_base_lid is fine
820 		   (local/remote port in this function are always part of a switch)
821 		 */
822 
823 		remote_node =
824 		    osm_node_get_remote_node(local_node, local_port,
825 					     &remote_port);
826 		/* if remote_node is a Hca, then the last channel from switch to Hca would be a sink in the cdg -> skip */
827 		if (!remote_node || !remote_node->sw)
828 			break;
829 		remote_lid = cl_ntoh16(osm_node_get_base_lid(remote_node, 0));
830 
831 		channelID =
832 		    (((uint64_t) local_lid) << 48) +
833 		    (((uint64_t) local_port) << 32) +
834 		    (((uint64_t) remote_lid) << 16) + ((uint64_t) remote_port);
835 		channel = cdg_search(*cdg_root, channelID);
836 		if (channel) {
837 			/* check whether last channel has connection to this channel, i.e. subpath already exists in cdg */
838 			linklist = last_channel->linklist;
839 			while (linklist && linklist->node != channel
840 			       && linklist->next)
841 				linklist = linklist->next;
842 			/* if there is no connection, add one */
843 			if (linklist) {
844 				if (linklist->node == channel) {
845 					set_next_srcdest_pair(linklist,
846 							      srcdest);
847 				} else {
848 					linklist->next =
849 					    (cdg_link_t *)
850 					    malloc(sizeof(cdg_link_t));
851 					if (!linklist->next)
852 						goto ERROR;
853 					linklist = linklist->next;
854 					linklist->node = channel;
855 					linklist->num_pairs = 0;
856 					linklist->srcdest_pairs = NULL;
857 					set_next_srcdest_pair(linklist,
858 							      srcdest);
859 					linklist->next = NULL;
860 				}
861 			} else {
862 				/* either this is the first channel of the path, or the last channel was a new channel, or last channel was a sink */
863 				last_channel->linklist =
864 				    (cdg_link_t *) malloc(sizeof(cdg_link_t));
865 				if (!last_channel->linklist)
866 					goto ERROR;
867 				last_channel->linklist->node = channel;
868 				last_channel->linklist->num_pairs = 0;
869 				last_channel->linklist->srcdest_pairs = NULL;
870 				set_next_srcdest_pair(last_channel->linklist,
871 						      srcdest);
872 				last_channel->linklist->next = NULL;
873 			}
874 		} else {
875 			/* create new channel */
876 			channel = (cdg_node_t *) malloc(sizeof(cdg_node_t));
877 			if (!channel)
878 				goto ERROR;
879 			set_default_cdg_node(channel);
880 			channel->channelID = channelID;
881 			cdg_insert(cdg_root, channel);
882 
883 			/* go to end of link list of last channel */
884 			linklist = last_channel->linklist;
885 			while (linklist && linklist->next)
886 				linklist = linklist->next;
887 			if (linklist) {
888 				/* update last link of an existing channel */
889 				linklist->next =
890 				    (cdg_link_t *) malloc(sizeof(cdg_link_t));
891 				if (!linklist->next)
892 					goto ERROR;
893 				linklist = linklist->next;
894 				linklist->node = channel;
895 				linklist->num_pairs = 0;
896 				linklist->srcdest_pairs = NULL;
897 				set_next_srcdest_pair(linklist, srcdest);
898 				linklist->next = NULL;
899 			} else {
900 				/* either this is the first channel of the path, or the last channel was a new channel, or last channel was a sink */
901 				last_channel->linklist =
902 				    (cdg_link_t *) malloc(sizeof(cdg_link_t));
903 				if (!last_channel->linklist)
904 					goto ERROR;
905 				last_channel->linklist->node = channel;
906 				last_channel->linklist->num_pairs = 0;
907 				last_channel->linklist->srcdest_pairs = NULL;
908 				set_next_srcdest_pair(last_channel->linklist,
909 						      srcdest);
910 				last_channel->linklist->next = NULL;
911 			}
912 		}
913 		last_channel = channel;
914 	}
915 
916 	if (channel_head->linklist) {
917 		if (channel_head->linklist->srcdest_pairs)
918 			free(channel_head->linklist->srcdest_pairs);
919 		free(channel_head->linklist);
920 	}
921 	free(channel_head);
922 
923 	return 0;
924 
925 ERROR:
926 	/* cleanup data and exit */
927 	if (channel_head) {
928 		if (channel_head->linklist)
929 			free(channel_head->linklist);
930 		free(channel_head);
931 	}
932 
933 	return 1;
934 }
935 
936 /* calculate the path from source to destination port;
937    the links in the cdg representing this path are decremented to simulate the removal
938 */
939 static int remove_path_from_cdg(cdg_node_t ** cdg_root, osm_port_t * src_port,
940 				uint16_t slid, osm_port_t * dest_port,
941 				uint16_t dlid)
942 {
943 	osm_node_t *local_node = NULL, *remote_node = NULL;
944 	uint16_t local_lid = 0, remote_lid = 0;
945 	uint8_t local_port = 0, remote_port = 0;
946 	uint64_t channelID = 0;
947 
948 	cdg_node_t *channel_head = NULL, *channel = NULL, *last_channel = NULL;
949 	cdg_link_t *linklist = NULL;
950 
951 	channel_head = (cdg_node_t *) malloc(sizeof(cdg_node_t));
952 	if (!channel_head)
953 		goto ERROR;
954 	set_default_cdg_node(channel_head);
955 	last_channel = channel_head;
956 
957 	/* if src is a Hca, then the channel from Hca to switch would be a source in the graph
958 	   sources can't be part of a cycle -> skip this channel
959 	 */
960 	remote_node =
961 	    osm_node_get_remote_node(src_port->p_node,
962 				     src_port->p_physp->port_num, &remote_port);
963 
964 	while (remote_node && remote_node->sw) {
965 		local_node = remote_node;
966 		local_port = local_node->sw->new_lft[dlid];
967 		/* sanity check: local_port must be set or routing is broken */
968 		if (local_port == OSM_NO_PATH)
969 			goto ERROR;
970 		local_lid = cl_ntoh16(osm_node_get_base_lid(local_node, 0));
971 
972 		remote_node =
973 		    osm_node_get_remote_node(local_node, local_port,
974 					     &remote_port);
975 		/* if remote_node is a Hca, then the last channel from switch to Hca would be a sink in the cdg -> skip */
976 		if (!remote_node || !remote_node->sw)
977 			break;
978 		remote_lid = cl_ntoh16(osm_node_get_base_lid(remote_node, 0));
979 
980 		channelID =
981 		    (((uint64_t) local_lid) << 48) +
982 		    (((uint64_t) local_port) << 32) +
983 		    (((uint64_t) remote_lid) << 16) + ((uint64_t) remote_port);
984 		channel = cdg_search(*cdg_root, channelID);
985 		if (channel) {
986 			/* check whether last channel has connection to this channel, i.e. subpath already exists in cdg */
987 			linklist = last_channel->linklist;
988 			while (linklist && linklist->node != channel
989 			       && linklist->next)
990 				linklist = linklist->next;
991 			/* remove the srcdest from the link */
992 			if (linklist) {
993 				if (linklist->node == channel) {
994 					linklist->removed++;
995 				} else {
996 					/* may happen if the link is missing (thru cycle detect algorithm) */
997 				}
998 			} else {
999 				/* may happen if the link is missing (thru cycle detect algorithm or last_channel==channel_head (dummy channel)) */
1000 			}
1001 		} else {
1002 			/* must be an error, channels for the path are added before, so a missing channel would be a corrupt data structure */
1003 			goto ERROR;
1004 		}
1005 		last_channel = channel;
1006 	}
1007 
1008 	if (channel_head->linklist)
1009 		free(channel_head->linklist);
1010 	free(channel_head);
1011 
1012 	return 0;
1013 
1014 ERROR:
1015 	/* cleanup data and exit */
1016 	if (channel_head) {
1017 		if (channel_head->linklist)
1018 			free(channel_head->linklist);
1019 		free(channel_head);
1020 	}
1021 
1022 	return 1;
1023 }
1024 
1025 /**********************************************************************
1026  **********************************************************************/
1027 
1028 /************ helper functions to generate an ordered list of ports ***
1029  ************ (functions copied from osm_ucast_mgr.c and modified) ****
1030  **********************************************************************/
1031 static void add_sw_endports_to_order_list(osm_switch_t * sw,
1032 					  osm_ucast_mgr_t * m,
1033 					  cl_qmap_t * guid_tbl,
1034 					  boolean_t add_guids)
1035 {
1036 	osm_port_t *port;
1037 	ib_net64_t port_guid;
1038 	uint64_t sw_guid;
1039 	osm_physp_t *p;
1040 	int i;
1041 	boolean_t found;
1042 
1043 	for (i = 1; i < sw->num_ports; i++) {
1044 		p = osm_node_get_physp_ptr(sw->p_node, i);
1045 		if (p && p->p_remote_physp && !p->p_remote_physp->p_node->sw) {
1046 			port_guid = p->p_remote_physp->port_guid;
1047 			/* check if link is healthy, otherwise ignore CA */
1048 			if (!osm_link_is_healthy(p)) {
1049 				sw_guid =
1050 				    cl_ntoh64(osm_node_get_node_guid
1051 					      (sw->p_node));
1052 				OSM_LOG(m->p_log, OSM_LOG_INFO,
1053 					"WRN AD40: ignoring CA due to unhealthy"
1054 					" link from switch 0x%016" PRIx64
1055 					" port %" PRIu8 " to CA 0x%016" PRIx64
1056 					"\n", sw_guid, i, cl_ntoh64(port_guid));
1057 			}
1058 			port = osm_get_port_by_guid(m->p_subn, port_guid);
1059 			if (!port)
1060 				continue;
1061 			if (!cl_is_qmap_empty(guid_tbl)) {
1062 				found = (cl_qmap_get(guid_tbl, port_guid)
1063 					 != cl_qmap_end(guid_tbl));
1064 				if ((add_guids && !found)
1065 				    || (!add_guids && found))
1066 					continue;
1067 			}
1068 			if (!cl_is_item_in_qlist(&m->port_order_list,
1069 						 &port->list_item))
1070 				cl_qlist_insert_tail(&m->port_order_list,
1071 						     &port->list_item);
1072 			else
1073 				OSM_LOG(m->p_log, OSM_LOG_INFO,
1074 					"WRN AD37: guid 0x%016" PRIx64
1075 					" already in list\n", port_guid);
1076 		}
1077 	}
1078 }
1079 
1080 static void add_guid_to_order_list(uint64_t guid, osm_ucast_mgr_t * m)
1081 {
1082 	osm_port_t *port = osm_get_port_by_guid(m->p_subn, cl_hton64(guid));
1083 
1084 	if (!port) {
1085 		 OSM_LOG(m->p_log, OSM_LOG_DEBUG,
1086 			 "port guid not found: 0x%016" PRIx64 "\n", guid);
1087 	}
1088 
1089 	if (!cl_is_item_in_qlist(&m->port_order_list, &port->list_item))
1090 		cl_qlist_insert_tail(&m->port_order_list, &port->list_item);
1091 	else
1092 		OSM_LOG(m->p_log, OSM_LOG_INFO,
1093 			"WRN AD38: guid 0x%016" PRIx64 " already in list\n",
1094 			guid);
1095 }
1096 
1097 /* compare function of #Hca attached to a switch for stdlib qsort */
1098 static int cmp_num_hca(const void * l1, const void * l2)
1099 {
1100 	vertex_t *sw1 = *((vertex_t **) l1);
1101 	vertex_t *sw2 = *((vertex_t **) l2);
1102 	uint32_t num_hca1 = 0, num_hca2 = 0;
1103 
1104 	if (sw1)
1105 		num_hca1 = sw1->num_hca;
1106 	if (sw2)
1107 		num_hca2 = sw2->num_hca;
1108 
1109 	if (num_hca1 > num_hca2)
1110 		return -1;
1111 	else if (num_hca1 < num_hca2)
1112 		return 1;
1113 	else
1114 		return 0;
1115 }
1116 
1117 /* use stdlib to sort the switch array depending on num_hca */
1118 static inline void sw_list_sort_by_num_hca(vertex_t ** sw_list,
1119 					   uint32_t sw_list_size)
1120 {
1121 	qsort(sw_list, sw_list_size, sizeof(vertex_t *), cmp_num_hca);
1122 }
1123 
1124 /**********************************************************************
1125  **********************************************************************/
1126 
1127 /************ helper functions to manage a map of CN and I/O guids ****
1128  **********************************************************************/
1129 static int add_guid_to_map(void * cxt, uint64_t guid, char * p)
1130 {
1131 	cl_qmap_t *map = cxt;
1132 	name_map_item_t *item;
1133 	name_map_item_t *inserted_item;
1134 
1135 	item = malloc(sizeof(*item));
1136 	if (!item)
1137 		return -1;
1138 
1139 	item->guid = cl_hton64(guid);	/* internal: network byte order */
1140 	item->name = NULL;		/* name isn't needed */
1141 	inserted_item = (name_map_item_t *) cl_qmap_insert(map, item->guid, &item->item);
1142 	if (inserted_item != item)
1143                 free(item);
1144 
1145 	return 0;
1146 }
1147 
1148 static void destroy_guid_map(cl_qmap_t * guid_tbl)
1149 {
1150 	name_map_item_t *p_guid = NULL, *p_next_guid = NULL;
1151 
1152 	p_next_guid = (name_map_item_t *) cl_qmap_head(guid_tbl);
1153 	while (p_next_guid != (name_map_item_t *) cl_qmap_end(guid_tbl)) {
1154 		p_guid = p_next_guid;
1155 		p_next_guid = (name_map_item_t *) cl_qmap_next(&p_guid->item);
1156 		free(p_guid);
1157 	}
1158 	cl_qmap_remove_all(guid_tbl);
1159 }
1160 
1161 /**********************************************************************
1162  **********************************************************************/
1163 
1164 static void dfsssp_print_graph(osm_ucast_mgr_t * p_mgr, vertex_t * adj_list,
1165 			       uint32_t size)
1166 {
1167 	uint32_t i = 0, c = 0;
1168 	link_t *link = NULL;
1169 
1170 	/* index 0 is for the source in dijkstra -> ignore */
1171 	for (i = 1; i < size; i++) {
1172 		OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG, "adj_list[%" PRIu32 "]:\n",
1173 			i);
1174 		OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1175 			"   guid = 0x%" PRIx64 " lid = %" PRIu16 " (%s)\n",
1176 			adj_list[i].guid, adj_list[i].lid,
1177 			adj_list[i].sw->p_node->print_desc);
1178 		OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1179 			"   num_hca = %" PRIu32 "\n", adj_list[i].num_hca);
1180 
1181 		c = 1;
1182 		for (link = adj_list[i].links; link != NULL;
1183 		     link = link->next, c++) {
1184 			OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1185 				"   link[%" PRIu32 "]:\n", c);
1186 			OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1187 				"      to guid = 0x%" PRIx64 " (%s) port %"
1188 				PRIu8 "\n", link->guid,
1189 				adj_list[link->to].sw->p_node->print_desc,
1190 				link->to_port);
1191 			OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1192 				"      weight on this link = %" PRIu64 "\n",
1193 				link->weight);
1194 		}
1195 	}
1196 }
1197 
1198 /* predefine, to use this in next function */
1199 static void dfsssp_context_destroy(void *context);
1200 static int dijkstra(osm_ucast_mgr_t * p_mgr, vertex_t * adj_list,
1201 		    uint32_t adj_list_size, osm_port_t * port, uint16_t lid);
1202 
1203 /* traverse subnet to gather information about the connected switches */
1204 static int dfsssp_build_graph(void *context)
1205 {
1206 	dfsssp_context_t *dfsssp_ctx = (dfsssp_context_t *) context;
1207 	osm_ucast_mgr_t *p_mgr = (osm_ucast_mgr_t *) (dfsssp_ctx->p_mgr);
1208 
1209 	cl_qmap_t *port_tbl = &p_mgr->p_subn->port_guid_tbl;	/* 1 management port per switch + 1 or 2 ports for each Hca */
1210 	osm_port_t *p_port = NULL;
1211 	cl_qmap_t *sw_tbl = &p_mgr->p_subn->sw_guid_tbl;
1212 	cl_map_item_t *item = NULL;
1213 	osm_switch_t *sw = NULL;
1214 	osm_node_t *remote_node = NULL;
1215 	uint8_t port = 0, remote_port = 0;
1216 	uint32_t i = 0, j = 0, err = 0, undiscov = 0, max_num_undiscov = 0;
1217 	uint64_t total_num_hca = 0;
1218 	vertex_t *adj_list = NULL;
1219 	osm_physp_t *p_physp = NULL;
1220 	link_t *link = NULL, *head = NULL;
1221 	uint32_t num_sw = 0, adj_list_size = 0;
1222 	uint8_t lmc = 0;
1223 	uint16_t sm_lid = 0;
1224 
1225 	OSM_LOG_ENTER(p_mgr->p_log);
1226 	OSM_LOG(p_mgr->p_log, OSM_LOG_VERBOSE,
1227 		"Building graph for df-/sssp routing\n");
1228 
1229 	/* if this pointer isn't NULL, this is a reroute step;
1230 	   old context will be destroyed (adj_list and srcdest2vl_table)
1231 	 */
1232 	if (dfsssp_ctx->adj_list)
1233 		dfsssp_context_destroy(context);
1234 
1235 	num_sw = cl_qmap_count(sw_tbl);
1236 	adj_list_size = num_sw + 1;
1237 	/* allocate an adjazenz list (array), 0. element is reserved for the source (Hca) in the routing algo, others are switches */
1238 	adj_list = (vertex_t *) malloc(adj_list_size * sizeof(vertex_t));
1239 	if (!adj_list) {
1240 		OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1241 			"ERR AD02: cannot allocate memory for adj_list\n");
1242 		goto ERROR;
1243 	}
1244 	for (i = 0; i < adj_list_size; i++)
1245 		set_default_vertex(&adj_list[i]);
1246 
1247 	dfsssp_ctx->adj_list = adj_list;
1248 	dfsssp_ctx->adj_list_size = adj_list_size;
1249 
1250 	/* count the total number of Hca / LIDs (for lmc>0) in the fabric;
1251 	   even include base/enhanced switch port 0; base SP0 will have lmc=0
1252 	 */
1253 	for (item = cl_qmap_head(port_tbl); item != cl_qmap_end(port_tbl);
1254 	     item = cl_qmap_next(item)) {
1255 		p_port = (osm_port_t *) item;
1256 		if (osm_node_get_type(p_port->p_node) == IB_NODE_TYPE_CA ||
1257 		    osm_node_get_type(p_port->p_node) == IB_NODE_TYPE_SWITCH) {
1258 			lmc = osm_port_get_lmc(p_port);
1259 			total_num_hca += (1 << lmc);
1260 		}
1261 	}
1262 
1263 	i = 1;			/* fill adj_list -> start with index 1 */
1264 	for (item = cl_qmap_head(sw_tbl); item != cl_qmap_end(sw_tbl);
1265 	     item = cl_qmap_next(item), i++) {
1266 		sw = (osm_switch_t *) item;
1267 		OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1268 			"Processing switch with GUID 0x%" PRIx64 "\n",
1269 			cl_ntoh64(osm_node_get_node_guid(sw->p_node)));
1270 
1271 		adj_list[i].guid =
1272 		    cl_ntoh64(osm_node_get_node_guid(sw->p_node));
1273 		adj_list[i].lid =
1274 		    cl_ntoh16(osm_node_get_base_lid(sw->p_node, 0));
1275 		adj_list[i].sw = sw;
1276 
1277 		link = (link_t *) malloc(sizeof(link_t));
1278 		if (!link) {
1279 			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1280 				"ERR AD03: cannot allocate memory for a link\n");
1281 			goto ERROR;
1282 		}
1283 		head = link;
1284 		head->next = NULL;
1285 
1286 		/* add SP0 to number of CA connected to a switch */
1287 		lmc = osm_node_get_lmc(sw->p_node, 0);
1288 		adj_list[i].num_hca += (1 << lmc);
1289 
1290 		/* iterate over all ports in the switch, start with port 1 (port 0 is a management port) */
1291 		for (port = 1; port < sw->num_ports; port++) {
1292 			/* get the node behind the port */
1293 			remote_node =
1294 			    osm_node_get_remote_node(sw->p_node, port,
1295 						     &remote_port);
1296 			/* if there is no remote node on this port or it's the same switch -> try next port */
1297 			if (!remote_node || remote_node->sw == sw)
1298 				continue;
1299 			/* make sure the link is healthy */
1300 			p_physp = osm_node_get_physp_ptr(sw->p_node, port);
1301 			if (!p_physp || !osm_link_is_healthy(p_physp))
1302 				continue;
1303 			/* if there is a Hca connected -> count and cycle */
1304 			if (!remote_node->sw) {
1305 				lmc = osm_node_get_lmc(remote_node, (uint32_t)remote_port);
1306 				adj_list[i].num_hca += (1 << lmc);
1307 				continue;
1308 			}
1309 			OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1310 				"Node 0x%" PRIx64 ", remote node 0x%" PRIx64
1311 				", port %" PRIu8 ", remote port %" PRIu8 "\n",
1312 				cl_ntoh64(osm_node_get_node_guid(sw->p_node)),
1313 				cl_ntoh64(osm_node_get_node_guid(remote_node)),
1314 				port, remote_port);
1315 
1316 			link->next = (link_t *) malloc(sizeof(link_t));
1317 			if (!link->next) {
1318 				OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1319 					"ERR AD08: cannot allocate memory for a link\n");
1320 				while (head) {
1321 					link = head;
1322 					head = head->next;
1323 					free(link);
1324 				}
1325 				goto ERROR;
1326 			}
1327 			link = link->next;
1328 			set_default_link(link);
1329 			link->guid =
1330 			    cl_ntoh64(osm_node_get_node_guid(remote_node));
1331 			link->from = i;
1332 			link->from_port = port;
1333 			link->to_port = remote_port;
1334 			link->weight = total_num_hca * total_num_hca;	/* initialize with P^2 to force shortest paths */
1335 		}
1336 
1337 		adj_list[i].links = head->next;
1338 		free(head);
1339 	}
1340 	/* connect the links with it's second adjacent node in the list */
1341 	for (i = 1; i < adj_list_size; i++) {
1342 		link = adj_list[i].links;
1343 		while (link) {
1344 			for (j = 1; j < adj_list_size; j++) {
1345 				if (link->guid == adj_list[j].guid) {
1346 					link->to = j;
1347 					break;
1348 				}
1349 			}
1350 			link = link->next;
1351 		}
1352 	}
1353 	/* do one dry run to determine connectivity issues */
1354 	sm_lid = p_mgr->p_subn->master_sm_base_lid;
1355 	p_port = osm_get_port_by_lid(p_mgr->p_subn, sm_lid);
1356 	err = dijkstra(p_mgr, adj_list, adj_list_size, p_port, sm_lid);
1357 	if (err) {
1358 		goto ERROR;
1359 	} else {
1360 		/* if sm is running on a switch, then dijkstra doesn't
1361 		   initialize the used_link for this switch
1362 		 */
1363 		if (osm_node_get_type(p_port->p_node) != IB_NODE_TYPE_CA)
1364 			max_num_undiscov = 1;
1365 		for (i = 1; i < adj_list_size; i++)
1366 			undiscov += (adj_list[i].used_link) ? 0 : 1;
1367 		if (max_num_undiscov < undiscov) {
1368 			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1369 				"ERR AD0C: unsupported network state (detached"
1370 				" and inaccessible switches found; gracefully"
1371 				" shutdown this routing engine)\n");
1372 			goto ERROR;
1373 		}
1374 	}
1375 	/* print the discovered graph */
1376 	if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_DEBUG))
1377 		dfsssp_print_graph(p_mgr, adj_list, adj_list_size);
1378 
1379 	OSM_LOG_EXIT(p_mgr->p_log);
1380 	return 0;
1381 
1382 ERROR:
1383 	dfsssp_context_destroy(context);
1384 	return -1;
1385 }
1386 
1387 static void print_routes(osm_ucast_mgr_t * p_mgr, vertex_t * adj_list,
1388 			 uint32_t adj_list_size, osm_port_t * port)
1389 {
1390 	uint32_t i = 0, j = 0;
1391 
1392 	for (i = 1; i < adj_list_size; i++) {
1393 		if (adj_list[i].state == DISCOVERED) {
1394 			OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1395 				"Route from 0x%" PRIx64 " (%s) to 0x%" PRIx64
1396 				" (%s):\n", adj_list[i].guid,
1397 				adj_list[i].sw->p_node->print_desc,
1398 				cl_ntoh64(osm_node_get_node_guid(port->p_node)),
1399 				port->p_node->print_desc);
1400 			j = i;
1401 			while (adj_list[j].used_link) {
1402 				if (j > 0) {
1403 					OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1404 						"   0x%" PRIx64
1405 						" (%s) routes thru port %" PRIu8
1406 						"\n", adj_list[j].guid,
1407 						adj_list[j].sw->p_node->
1408 						print_desc,
1409 						adj_list[j].used_link->to_port);
1410 				} else {
1411 					OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1412 						"   0x%" PRIx64
1413 						" (%s) routes thru port %" PRIu8
1414 						"\n", adj_list[j].guid,
1415 						port->p_node->print_desc,
1416 						adj_list[j].used_link->to_port);
1417 				}
1418 				j = adj_list[j].used_link->from;
1419 			}
1420 		}
1421 	}
1422 }
1423 
1424 /* dijkstra step from one source to all switches in the df-/sssp graph */
1425 static int dijkstra(osm_ucast_mgr_t * p_mgr, vertex_t * adj_list,
1426 		    uint32_t adj_list_size, osm_port_t * port, uint16_t lid)
1427 {
1428 	uint32_t i = 0, j = 0, index = 0;
1429 	osm_node_t *remote_node = NULL;
1430 	uint8_t remote_port = 0;
1431 	vertex_t *current = NULL;
1432 	link_t *link = NULL;
1433 	uint64_t guid = 0;
1434 	binary_heap_t *heap = NULL;
1435 	int err = 0;
1436 
1437 	OSM_LOG_ENTER(p_mgr->p_log);
1438 
1439 	/* reset all switches for new round with a new source for dijkstra */
1440 	for (i = 1; i < adj_list_size; i++) {
1441 		adj_list[i].hops = 0;
1442 		adj_list[i].used_link = NULL;
1443 		adj_list[i].distance = INF;
1444 		adj_list[i].state = UNDISCOVERED;
1445 	}
1446 
1447 	/* if behind port is a Hca -> set adj_list[0] */
1448 	if (osm_node_get_type(port->p_node) == IB_NODE_TYPE_CA) {
1449 		/* save old link to prevent many mallocs after set_default_... */
1450 		link = adj_list[0].links;
1451 		/* initialize adj_list[0] (the source for the routing, a Hca) */
1452 		set_default_vertex(&adj_list[0]);
1453 		adj_list[0].guid =
1454 		    cl_ntoh64(osm_node_get_node_guid(port->p_node));
1455 		adj_list[0].lid = lid;
1456 		index = 0;
1457 		/* write saved link back to new adj_list[0] */
1458 		adj_list[0].links = link;
1459 
1460 		/* initialize link to neighbor for adj_list[0];
1461 		   make sure the link is healthy
1462 		 */
1463 		if (port->p_physp && osm_link_is_healthy(port->p_physp)) {
1464 			remote_node =
1465 			    osm_node_get_remote_node(port->p_node,
1466 						     port->p_physp->port_num,
1467 						     &remote_port);
1468 			/* if there is no remote node on this port or it's the same Hca -> ignore */
1469 			if (remote_node
1470 			    && (osm_node_get_type(remote_node) ==
1471 				IB_NODE_TYPE_SWITCH)) {
1472 				if (!(adj_list[0].links)) {
1473 					adj_list[0].links =
1474 					    (link_t *) malloc(sizeof(link_t));
1475 					if (!(adj_list[0].links)) {
1476 						OSM_LOG(p_mgr->p_log,
1477 							OSM_LOG_ERROR,
1478 							"ERR AD07: cannot allocate memory for a link\n");
1479 						return 1;
1480 					}
1481 				}
1482 				set_default_link(adj_list[0].links);
1483 				adj_list[0].links->guid =
1484 				    cl_ntoh64(osm_node_get_node_guid
1485 					      (remote_node));
1486 				adj_list[0].links->from_port =
1487 				    port->p_physp->port_num;
1488 				adj_list[0].links->to_port = remote_port;
1489 				adj_list[0].links->weight = 1;
1490 				for (j = 1; j < adj_list_size; j++) {
1491 					if (adj_list[0].links->guid ==
1492 					    adj_list[j].guid) {
1493 						adj_list[0].links->to = j;
1494 						break;
1495 					}
1496 				}
1497 			}
1498 		} else {
1499 			/* if link is unhealthy then there's a severe issue */
1500 			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1501 				"ERR AD0B: unsupported network state (CA with"
1502 				" unhealthy link state discovered; should have"
1503 				" been filtered out before already; gracefully"
1504 				" shutdown this routing engine)\n");
1505 			return 1;
1506 		}
1507 		/* if behind port is a switch -> search switch in adj_list */
1508 	} else {
1509 		/* reset adj_list[0], if links=NULL reset was done before, then skip */
1510 		if (adj_list[0].links) {
1511 			free(adj_list[0].links);
1512 			set_default_vertex(&adj_list[0]);
1513 		}
1514 		/* search for the switch which is the source in this round */
1515 		guid = cl_ntoh64(osm_node_get_node_guid(port->p_node));
1516 		for (i = 1; i < adj_list_size; i++) {
1517 			if (guid == adj_list[i].guid) {
1518 				index = i;
1519 				break;
1520 			}
1521 		}
1522 	}
1523 
1524 	/* source in dijkstra */
1525 	adj_list[index].distance = 0;
1526 	adj_list[index].state = DISCOVERED;
1527 	adj_list[index].hops = 0;	/* the source has hop count = 0 */
1528 
1529 	/* create a heap to find (efficient) the node with the smallest distance */
1530 	if (osm_node_get_type(port->p_node) == IB_NODE_TYPE_CA)
1531 		err = heap_create(adj_list, adj_list_size, &heap);
1532 	else
1533 		err = heap_create(&adj_list[1], adj_list_size - 1, &heap);
1534 	if (err) {
1535 		OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1536 			"ERR AD09: cannot allocate memory for heap or heap->node in heap_create(...)\n");
1537 		return err;
1538 	}
1539 
1540 	current = heap_getmin(heap);
1541 	while (current) {
1542 		current->state = DISCOVERED;
1543 		if (current->used_link)	/* increment the number of hops to the source for each new node */
1544 			current->hops =
1545 			    adj_list[current->used_link->from].hops + 1;
1546 
1547 		/* add/update nodes which aren't discovered but accessible */
1548 		for (link = current->links; link != NULL; link = link->next) {
1549 			if ((adj_list[link->to].state != DISCOVERED)
1550 			    && (current->distance + link->weight <
1551 				adj_list[link->to].distance)) {
1552 				adj_list[link->to].used_link = link;
1553 				adj_list[link->to].distance =
1554 				    current->distance + link->weight;
1555 				heap_heapify(heap, adj_list[link->to].heap_id);
1556 			}
1557 		}
1558 
1559 		current = heap_getmin(heap);
1560 	}
1561 
1562 	/* destroy the heap */
1563 	heap_free(heap);
1564 	heap = NULL;
1565 
1566 	OSM_LOG_EXIT(p_mgr->p_log);
1567 	return 0;
1568 }
1569 
1570 /* update the linear forwarding tables of all switches with the informations
1571    from the last dijsktra step
1572 */
1573 static int update_lft(osm_ucast_mgr_t * p_mgr, vertex_t * adj_list,
1574 		      uint32_t adj_list_size, osm_port_t * p_port, uint16_t lid)
1575 {
1576 	uint32_t i = 0;
1577 	uint8_t port = 0;
1578 	uint8_t hops = 0;
1579 	osm_switch_t *p_sw = NULL;
1580 	boolean_t is_ignored_by_port_prof = FALSE;
1581 	osm_physp_t *p = NULL;
1582 	cl_status_t ret;
1583 
1584 	OSM_LOG_ENTER(p_mgr->p_log);
1585 
1586 	for (i = 1; i < adj_list_size; i++) {
1587 		/* if no route goes thru this switch -> cycle */
1588 		if (!(adj_list[i].used_link))
1589 			continue;
1590 
1591 		p_sw = adj_list[i].sw;
1592 		hops = adj_list[i].hops;
1593 		port = adj_list[i].used_link->to_port;
1594 		/* the used_link is the link that was used in dijkstra to reach this node,
1595 		   so the to_port is the local port on this node
1596 		 */
1597 
1598 		if (port == OSM_NO_PATH) {	/* if clause shouldn't be possible in this routing, but who cares */
1599 			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1600 				"ERR AD06: No path to get to LID %" PRIu16
1601 				" from switch 0x%" PRIx64 "\n", lid,
1602 				cl_ntoh64(osm_node_get_node_guid
1603 					  (p_sw->p_node)));
1604 
1605 			/* do not try to overwrite the ppro of non existing port ... */
1606 			is_ignored_by_port_prof = TRUE;
1607 			return 1;
1608 		} else {
1609 			OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1610 				"Routing LID %" PRIu16 " to port %" PRIu8
1611 				" for switch 0x%" PRIx64 "\n", lid, port,
1612 				cl_ntoh64(osm_node_get_node_guid
1613 					  (p_sw->p_node)));
1614 
1615 			p = osm_node_get_physp_ptr(p_sw->p_node, port);
1616 			if (!p) {
1617 				OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1618 					"ERR AD0A: Physical port %d of Node GUID 0x%"
1619 					PRIx64 "not found\n", port,
1620 					cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)));
1621 				return 1;
1622 			}
1623 
1624 			/* we would like to optionally ignore this port in equalization
1625 			   as in the case of the Mellanox Anafa Internal PCI TCA port
1626 			 */
1627 			is_ignored_by_port_prof = p->is_prof_ignored;
1628 
1629 			/* We also would ignore this route if the target lid is of
1630 			   a switch and the port_profile_switch_node is not TRUE
1631 			 */
1632 			if (!p_mgr->p_subn->opt.port_profile_switch_nodes)
1633 				is_ignored_by_port_prof |=
1634 				    (osm_node_get_type(p_port->p_node) ==
1635 				     IB_NODE_TYPE_SWITCH);
1636 		}
1637 
1638 		/* to support lmc > 0 the functions alloc_ports_priv, free_ports_priv, find_and_add_remote_sys
1639 		   from minhop aren't needed cause osm_switch_recommend_path is implicitly calculated
1640 		   for each LID pair thru dijkstra;
1641 		   for each port the dijkstra algorithm calculates (max_lid_ho - min_lid_ho)-times maybe
1642 		   disjoint routes to spread the bandwidth -> diffent routes for one port and lmc>0
1643 		 */
1644 
1645 		/* set port in LFT */
1646 		p_sw->new_lft[lid] = port;
1647 		if (!is_ignored_by_port_prof) {
1648 			/* update the number of path routing thru this port */
1649 			osm_switch_count_path(p_sw, port);
1650 		}
1651 		/* set the hop count from this switch to the lid */
1652 		ret = osm_switch_set_hops(p_sw, lid, port, hops);
1653 		if (ret != CL_SUCCESS)
1654 			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1655 				"ERR AD05: cannot set hops for LID %" PRIu16
1656 				" at switch 0x%" PRIx64 "\n", lid,
1657 				cl_ntoh64(osm_node_get_node_guid
1658 					  (p_sw->p_node)));
1659 	}
1660 
1661 	OSM_LOG_EXIT(p_mgr->p_log);
1662 	return 0;
1663 }
1664 
1665 /* the function updates the multicast group membership information
1666    similar to create_mgrp_switch_map (osm_mcast_mgr.c)
1667    => with it we can identify if a switch needs to be processed
1668    or not in update_mcft
1669 */
1670 static void update_mgrp_membership(cl_qlist_t * port_list)
1671 {
1672 	osm_mcast_work_obj_t *wobj = NULL;
1673 	osm_port_t *port = NULL;
1674 	osm_switch_t *sw = NULL;
1675 	cl_list_item_t *i = NULL;
1676 
1677 	for (i = cl_qlist_head(port_list); i != cl_qlist_end(port_list);
1678 	     i = cl_qlist_next(i)) {
1679 		wobj = cl_item_obj(i, wobj, list_item);
1680 		port = wobj->p_port;
1681 		if (port->p_node->sw) {
1682 			sw = port->p_node->sw;
1683 			sw->is_mc_member = 1;
1684 		} else {
1685 			sw = port->p_physp->p_remote_physp->p_node->sw;
1686 			sw->num_of_mcm++;
1687 		}
1688 	}
1689 }
1690 
1691 /* reset is_mc_member and num_of_mcm for future computations */
1692 static void reset_mgrp_membership(vertex_t * adj_list, uint32_t adj_list_size)
1693 {
1694 	uint32_t i = 0;
1695 
1696 	for (i = 1; i < adj_list_size; i++) {
1697 		if (adj_list[i].dropped)
1698 			continue;
1699 
1700 		adj_list[i].sw->is_mc_member = 0;
1701 		adj_list[i].sw->num_of_mcm = 0;
1702 	}
1703 }
1704 
1705 /* update the multicast forwarding tables of all switches with the informations
1706    from the previous dijsktra step for the current mlid
1707 */
1708 static int update_mcft(osm_sm_t * p_sm, vertex_t * adj_list,
1709 		       uint32_t adj_list_size, uint16_t mlid_ho,
1710 		       cl_qmap_t * port_map, osm_switch_t * root_sw)
1711 {
1712 	uint32_t i = 0;
1713 	uint8_t port = 0, remote_port = 0;
1714 	uint8_t upstream_port = 0, downstream_port = 0;
1715 	ib_net64_t guid = 0;
1716 	osm_switch_t *p_sw = NULL;
1717 	osm_node_t *remote_node = NULL;
1718 	osm_physp_t *p_physp = NULL;
1719 	osm_mcast_tbl_t *p_tbl = NULL;
1720 	vertex_t *curr_adj = NULL;
1721 
1722 	OSM_LOG_ENTER(p_sm->p_log);
1723 
1724 	for (i = 1; i < adj_list_size; i++) {
1725 		if (adj_list[i].dropped)
1726 			continue;
1727 
1728 		p_sw = adj_list[i].sw;
1729 		OSM_LOG(p_sm->p_log, OSM_LOG_VERBOSE,
1730 			"Processing switch 0x%016" PRIx64
1731 			" (%s) for MLID 0x%X\n", cl_ntoh64(adj_list[i].guid),
1732 			p_sw->p_node->print_desc, mlid_ho);
1733 
1734 		/* if a) the switch does not support mcast  or
1735 		      b) no ports of this switch are part or the mcast group
1736 		   then cycle
1737 		 */
1738 		if (osm_switch_supports_mcast(p_sw) == FALSE ||
1739 		    (p_sw->num_of_mcm == 0 && !(p_sw->is_mc_member)))
1740 			continue;
1741 
1742 		p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
1743 
1744 		/* add all ports of this sw to the mcast table,
1745 		   if they are part of the mcast grp
1746 		 */
1747 		if (p_sw->is_mc_member)
1748 			osm_mcast_tbl_set(p_tbl, mlid_ho, 0);
1749 		for (port = 1; port < p_sw->num_ports; port++) {
1750 			/* get the node behind the port */
1751 			remote_node =
1752 				osm_node_get_remote_node(p_sw->p_node, port,
1753 							 &remote_port);
1754 			/* check if connected and its not the same switch */
1755 			if (!remote_node || remote_node->sw == p_sw)
1756 				continue;
1757 			/* make sure the link is healthy */
1758 			p_physp = osm_node_get_physp_ptr(p_sw->p_node, port);
1759 			if (!p_physp || !osm_link_is_healthy(p_physp))
1760 				continue;
1761 			/* we don't add upstream ports in this step */
1762 			if (osm_node_get_type(remote_node) != IB_NODE_TYPE_CA)
1763 				continue;
1764 
1765 			guid = osm_physp_get_port_guid(osm_node_get_physp_ptr(
1766 						       remote_node,
1767 						       remote_port));
1768 			if (cl_qmap_get(port_map, guid)
1769 			    != cl_qmap_end(port_map))
1770 				osm_mcast_tbl_set(p_tbl, mlid_ho, port);
1771 		}
1772 
1773 		/* now we have to add the upstream port of 'this' switch and
1774 		   the downstream port of the next switch to the mcast table
1775 		   until we reach the root_sw
1776 		 */
1777 		curr_adj = &adj_list[i];
1778 		while (curr_adj->sw != root_sw) {
1779 			/* the used_link is the link that was used in dijkstra to reach this node,
1780 			   so the to_port is the local (upstream) port on curr_adj->sw
1781 			 */
1782 			upstream_port = curr_adj->used_link->to_port;
1783 			osm_mcast_tbl_set(p_tbl, mlid_ho, upstream_port);
1784 
1785 			/* now we go one step in direction root_sw and add the
1786 			   downstream port for the spanning tree
1787 			 */
1788 			downstream_port = curr_adj->used_link->from_port;
1789 			p_tbl = osm_switch_get_mcast_tbl_ptr(
1790 				adj_list[curr_adj->used_link->from].sw);
1791 			osm_mcast_tbl_set(p_tbl, mlid_ho, downstream_port);
1792 
1793 			curr_adj = &adj_list[curr_adj->used_link->from];
1794 		}
1795 	}
1796 
1797 	OSM_LOG_EXIT(p_sm->p_log);
1798 	return 0;
1799 }
1800 
1801 /* increment the edge weights of the df-/sssp graph which represent the number
1802    of paths on this link
1803 */
1804 static void update_weights(osm_ucast_mgr_t * p_mgr, vertex_t * adj_list,
1805 			   uint32_t adj_list_size)
1806 {
1807 	uint32_t i = 0, j = 0;
1808 	uint32_t additional_weight = 0;
1809 
1810 	OSM_LOG_ENTER(p_mgr->p_log);
1811 
1812 	for (i = 1; i < adj_list_size; i++) {
1813 		/* if no route goes thru this switch -> cycle */
1814 		if (!(adj_list[i].used_link))
1815 			continue;
1816 		additional_weight = adj_list[i].num_hca;
1817 
1818 		j = i;
1819 		while (adj_list[j].used_link) {
1820 			/* update the link from pre to this node */
1821 			adj_list[j].used_link->weight += additional_weight;
1822 
1823 			j = adj_list[j].used_link->from;
1824 		}
1825 	}
1826 
1827 	OSM_LOG_EXIT(p_mgr->p_log);
1828 }
1829 
1830 /* get the largest number of virtual lanes which is supported by all switches
1831    in the subnet
1832 */
1833 static uint8_t get_avail_vl_in_subn(osm_ucast_mgr_t * p_mgr)
1834 {
1835 	uint32_t i = 0;
1836 	uint8_t vls_avail = 0xFF, port_vls_avail = 0;
1837 	cl_qmap_t *sw_tbl = &p_mgr->p_subn->sw_guid_tbl;
1838 	cl_map_item_t *item = NULL;
1839 	osm_switch_t *sw = NULL;
1840 
1841 	/* traverse all switches to get the number of available virtual lanes in the subnet */
1842 	for (item = cl_qmap_head(sw_tbl); item != cl_qmap_end(sw_tbl);
1843 	     item = cl_qmap_next(item)) {
1844 		sw = (osm_switch_t *) item;
1845 
1846 		/* ignore management port 0 */
1847 		for (i = 1; i < osm_node_get_num_physp(sw->p_node); i++) {
1848 			osm_physp_t *p_physp =
1849 			    osm_node_get_physp_ptr(sw->p_node, i);
1850 
1851 			if (p_physp && p_physp->p_remote_physp) {
1852 				port_vls_avail =
1853 				    ib_port_info_get_op_vls(&p_physp->
1854 							    port_info);
1855 				if (port_vls_avail
1856 				    && port_vls_avail < vls_avail)
1857 					vls_avail = port_vls_avail;
1858 			}
1859 		}
1860 	}
1861 
1862 	/* ib_port_info_get_op_vls gives values 1 ... 5 (s. IBAS 14.2.5.6) */
1863 	vls_avail = 1 << (vls_avail - 1);
1864 
1865 	/* set boundaries (s. IBAS 3.5.7) */
1866 	if (vls_avail > 15)
1867 		vls_avail = 15;
1868 	if (vls_avail < 1)
1869 		vls_avail = 1;
1870 
1871 	return vls_avail;
1872 }
1873 
1874 /* search for cycles in the channel dependency graph to identify possible
1875    deadlocks in the network;
1876    assign new virtual lanes to some paths to break the deadlocks
1877 */
1878 static int dfsssp_remove_deadlocks(dfsssp_context_t * dfsssp_ctx)
1879 {
1880 	osm_ucast_mgr_t *p_mgr = (osm_ucast_mgr_t *) dfsssp_ctx->p_mgr;
1881 
1882 	cl_qlist_t *port_tbl = &p_mgr->port_order_list;	/* 1 management port per switch + 1 or 2 ports for each Hca */
1883 	cl_list_item_t *item1 = NULL, *item2 = NULL;
1884 	osm_port_t *src_port = NULL, *dest_port = NULL;
1885 
1886 	uint32_t i = 0, j = 0, err = 0;
1887 	uint8_t vl = 0, test_vl = 0, vl_avail = 0, vl_needed = 1;
1888 	double most_avg_paths = 0.0;
1889 	cdg_node_t **cdg = NULL, *start_here = NULL, *cycle = NULL;
1890 	cdg_link_t *weakest_link = NULL;
1891 	uint32_t srcdest = 0;
1892 
1893 	vltable_t *srcdest2vl_table = NULL;
1894 	uint8_t lmc = 0;
1895 	uint16_t slid = 0, dlid = 0, min_lid_ho = 0, max_lid_ho =
1896 	    0, min_lid_ho2 = 0, max_lid_ho2 = 0;;
1897 	uint64_t *paths_per_vl = NULL;
1898 	uint64_t from = 0, to = 0, count = 0;
1899 	uint8_t *split_count = NULL;
1900 	uint8_t ntype = 0;
1901 
1902 	OSM_LOG_ENTER(p_mgr->p_log);
1903 	OSM_LOG(p_mgr->p_log, OSM_LOG_VERBOSE,
1904 		"Assign each src/dest pair a Virtual Lanes, to remove deadlocks in the routing\n");
1905 
1906 	vl_avail = get_avail_vl_in_subn(p_mgr);
1907 	OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
1908 		"Virtual Lanes available: %" PRIu8 "\n", vl_avail);
1909 
1910 	paths_per_vl = (uint64_t *) malloc(vl_avail * sizeof(uint64_t));
1911 	if (!paths_per_vl) {
1912 		OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1913 			"ERR AD22: cannot allocate memory for paths_per_vl\n");
1914 		return 1;
1915 	}
1916 	memset(paths_per_vl, 0, vl_avail * sizeof(uint64_t));
1917 
1918 	cdg = (cdg_node_t **) malloc(vl_avail * sizeof(cdg_node_t *));
1919 	if (!cdg) {
1920 		OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1921 			"ERR AD23: cannot allocate memory for cdg\n");
1922 		free(paths_per_vl);
1923 		return 1;
1924 	}
1925 	for (i = 0; i < vl_avail; i++)
1926 		cdg[i] = NULL;
1927 
1928 	count = 0;
1929 	/* count all ports (also multiple LIDs) of type CA or SP0 for size of VL table */
1930 	for (item1 = cl_qlist_head(port_tbl); item1 != cl_qlist_end(port_tbl);
1931 	     item1 = cl_qlist_next(item1)) {
1932 		dest_port = (osm_port_t *)cl_item_obj(item1, dest_port,
1933 						      list_item);
1934 		ntype = osm_node_get_type(dest_port->p_node);
1935 		if (ntype == IB_NODE_TYPE_CA || ntype == IB_NODE_TYPE_SWITCH) {
1936 			/* only SP0 with SLtoVLMapping support will be processed */
1937 			if (ntype == IB_NODE_TYPE_SWITCH
1938 			    && !(dest_port->p_physp->port_info.capability_mask
1939 			    & IB_PORT_CAP_HAS_SL_MAP))
1940 				continue;
1941 
1942 			lmc = osm_port_get_lmc(dest_port);
1943 			count += (1 << lmc);
1944 		}
1945 	}
1946 	/* allocate VL table and indexing array */
1947 	err = vltable_alloc(&srcdest2vl_table, count);
1948 	if (err) {
1949 		OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1950 			"ERR AD26: cannot allocate memory for srcdest2vl_table\n");
1951 		goto ERROR;
1952 	}
1953 
1954 	i = 0;
1955 	/* fill lids into indexing array */
1956 	for (item1 = cl_qlist_head(port_tbl); item1 != cl_qlist_end(port_tbl);
1957 	     item1 = cl_qlist_next(item1)) {
1958 		dest_port = (osm_port_t *)cl_item_obj(item1, dest_port,
1959 						      list_item);
1960 		ntype = osm_node_get_type(dest_port->p_node);
1961 		if (ntype == IB_NODE_TYPE_CA || ntype == IB_NODE_TYPE_SWITCH) {
1962 			/* only SP0 with SLtoVLMapping support will be processed */
1963 			if (ntype == IB_NODE_TYPE_SWITCH
1964 			    && !(dest_port->p_physp->port_info.capability_mask
1965 			    & IB_PORT_CAP_HAS_SL_MAP))
1966 				continue;
1967 
1968 			osm_port_get_lid_range_ho(dest_port, &min_lid_ho,
1969 						  &max_lid_ho);
1970 			for (dlid = min_lid_ho; dlid <= max_lid_ho; dlid++, i++)
1971 				srcdest2vl_table->lids[i] = cl_hton16(dlid);
1972 		}
1973 	}
1974 	/* sort lids */
1975 	vltable_sort_lids(srcdest2vl_table);
1976 
1977 	test_vl = 0;
1978 	/* fill cdg[0] with routes from each src/dest port combination for all Hca/SP0 in the subnet */
1979 	for (item1 = cl_qlist_head(port_tbl); item1 != cl_qlist_end(port_tbl);
1980 	     item1 = cl_qlist_next(item1)) {
1981 		dest_port = (osm_port_t *)cl_item_obj(item1, dest_port,
1982 						      list_item);
1983 		ntype = osm_node_get_type(dest_port->p_node);
1984 		if ((ntype != IB_NODE_TYPE_CA && ntype != IB_NODE_TYPE_SWITCH)
1985 		    || !(dest_port->p_physp->port_info.capability_mask
1986 		    & IB_PORT_CAP_HAS_SL_MAP))
1987 			continue;
1988 
1989 		for (item2 = cl_qlist_head(port_tbl);
1990 		     item2 != cl_qlist_end(port_tbl);
1991 		     item2 = cl_qlist_next(item2)) {
1992 			src_port = (osm_port_t *)cl_item_obj(item2, src_port,
1993 							     list_item);
1994 			ntype = osm_node_get_type(src_port->p_node);
1995 			if ((ntype != IB_NODE_TYPE_CA
1996 			    && ntype != IB_NODE_TYPE_SWITCH)
1997 			    || !(src_port->p_physp->port_info.capability_mask
1998 			    & IB_PORT_CAP_HAS_SL_MAP))
1999 				continue;
2000 
2001 			if (src_port != dest_port) {
2002 				/* iterate over LIDs of src and dest port */
2003 				osm_port_get_lid_range_ho(src_port, &min_lid_ho,
2004 							  &max_lid_ho);
2005 				for (slid = min_lid_ho; slid <= max_lid_ho;
2006 				     slid++) {
2007 					osm_port_get_lid_range_ho
2008 					    (dest_port, &min_lid_ho2,
2009 					     &max_lid_ho2);
2010 					for (dlid = min_lid_ho2;
2011 					     dlid <= max_lid_ho2;
2012 					     dlid++) {
2013 
2014 						/* try to add the path to cdg[0] */
2015 						err =
2016 						    update_channel_dep_graph
2017 						    (&(cdg[test_vl]),
2018 						     src_port, slid,
2019 						     dest_port, dlid);
2020 						if (err) {
2021 							OSM_LOG(p_mgr->
2022 								p_log,
2023 								OSM_LOG_ERROR,
2024 								"ERR AD14: cannot allocate memory for cdg node or link in update_channel_dep_graph(...)\n");
2025 							goto ERROR;
2026 						}
2027 						/* add the <s,d> combination / corresponding virtual lane to the VL table */
2028 						vltable_insert
2029 						    (srcdest2vl_table,
2030 						     cl_hton16(slid),
2031 						     cl_hton16(dlid),
2032 						     test_vl);
2033 						paths_per_vl[test_vl]++;
2034 
2035 					}
2036 
2037 				}
2038 			}
2039 
2040 		}
2041 	}
2042 	dfsssp_ctx->srcdest2vl_table = srcdest2vl_table;
2043 
2044 	/* test all cdg for cycles and break the cycles by moving paths on the weakest link to the next cdg */
2045 	for (test_vl = 0; test_vl < vl_avail - 1; test_vl++) {
2046 		start_here = cdg[test_vl];
2047 		while (start_here) {
2048 			cycle =
2049 			    search_cycle_in_channel_dep_graph(cdg[test_vl],
2050 							      start_here);
2051 
2052 			if (cycle) {
2053 				vl_needed = test_vl + 2;
2054 
2055 				/* calc weakest link n cycle */
2056 				weakest_link = get_weakest_link_in_cycle(cycle);
2057 				if (!weakest_link) {
2058 					OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2059 						"ERR AD27: something went wrong in get_weakest_link_in_cycle(...)\n");
2060 					err = 1;
2061 					goto ERROR;
2062 				}
2063 
2064 				paths_per_vl[test_vl] -=
2065 				    weakest_link->num_pairs;
2066 				paths_per_vl[test_vl + 1] +=
2067 				    weakest_link->num_pairs;
2068 
2069 				/* move all <s,d> paths on this link to the next cdg */
2070 				for (i = 0; i < weakest_link->num_pairs; i++) {
2071 					srcdest =
2072 					    get_next_srcdest_pair(weakest_link,
2073 								  i);
2074 					slid = (uint16_t) (srcdest >> 16);
2075 					dlid =
2076 					    (uint16_t) ((srcdest << 16) >> 16);
2077 
2078 					/* only move if not moved in a previous step */
2079 					if (test_vl !=
2080 					    (uint8_t)
2081 					    vltable_get_vl(srcdest2vl_table,
2082 							   cl_hton16(slid),
2083 							   cl_hton16(dlid))) {
2084 						/* this path has been moved
2085 						   before -> don't count
2086 						 */
2087 						paths_per_vl[test_vl]++;
2088 						paths_per_vl[test_vl + 1]--;
2089 						continue;
2090 					}
2091 
2092 					src_port =
2093 					    osm_get_port_by_lid(p_mgr->p_subn,
2094 								cl_hton16
2095 								(slid));
2096 					dest_port =
2097 					    osm_get_port_by_lid(p_mgr->p_subn,
2098 								cl_hton16
2099 								(dlid));
2100 
2101 					/* remove path from current cdg / vl */
2102 					err =
2103 					    remove_path_from_cdg(&
2104 								 (cdg[test_vl]),
2105 								 src_port, slid,
2106 								 dest_port,
2107 								 dlid);
2108 					if (err) {
2109 						OSM_LOG(p_mgr->p_log,
2110 							OSM_LOG_ERROR,
2111 							"ERR AD44: something went wrong in remove_path_from_cdg(...)\n");
2112 						goto ERROR;
2113 					}
2114 
2115 					/* add path to next cdg / vl */
2116 					err =
2117 					    update_channel_dep_graph(&
2118 								     (cdg
2119 								      [test_vl +
2120 								       1]),
2121 								     src_port,
2122 								     slid,
2123 								     dest_port,
2124 								     dlid);
2125 					if (err) {
2126 						OSM_LOG(p_mgr->p_log,
2127 							OSM_LOG_ERROR,
2128 							"ERR AD14: cannot allocate memory for cdg node or link in update_channel_dep_graph(...)\n");
2129 						goto ERROR;
2130 					}
2131 					vltable_insert(srcdest2vl_table,
2132 						       cl_hton16(slid),
2133 						       cl_hton16(dlid),
2134 						       test_vl + 1);
2135 				}
2136 
2137 				if (weakest_link->num_pairs)
2138 					free(weakest_link->srcdest_pairs);
2139 				if (weakest_link)
2140 					free(weakest_link);
2141 			}
2142 
2143 			start_here = cycle;
2144 		}
2145 	}
2146 
2147 	/* test the last avail cdg for a cycle;
2148 	   if there is one, than vl_needed > vl_avail
2149 	 */
2150 	start_here = cdg[vl_avail - 1];
2151 	if (start_here) {
2152 		cycle =
2153 		    search_cycle_in_channel_dep_graph(cdg[vl_avail - 1],
2154 						      start_here);
2155 		if (cycle) {
2156 			vl_needed = vl_avail + 1;
2157 		}
2158 	}
2159 
2160 	OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2161 		"Virtual Lanes needed: %" PRIu8 "\n", vl_needed);
2162 	if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_INFO)) {
2163 		OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2164 			"Paths per VL (before balancing):\n");
2165 		for (i = 0; i < vl_avail; i++)
2166 			OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2167 				"   %" PRIu32 ". lane: %" PRIu64 "\n", i,
2168 				paths_per_vl[i]);
2169 	}
2170 
2171 	OSM_LOG(p_mgr->p_log, OSM_LOG_VERBOSE,
2172 		"Balancing the paths on the available Virtual Lanes\n");
2173 
2174 	/* optimal balancing virtual lanes, under condition: no additional cycle checks;
2175 	   sl/vl != 0 might be assigned to loopback packets (i.e. slid/dlid on the
2176 	   same port for lmc>0), but thats no problem, see IBAS 10.2.2.3
2177 	 */
2178 	split_count = (uint8_t *) calloc(vl_avail, sizeof(uint8_t));
2179 	if (!split_count) {
2180 		OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2181 			"ERR AD24: cannot allocate memory for split_count, skip balancing\n");
2182 		err = 1;
2183 		goto ERROR;
2184 	}
2185 	/* initial state: paths for VLs won't be separated */
2186 	for (i = 0; i < ((vl_needed < vl_avail) ? vl_needed : vl_avail); i++)
2187 		split_count[i] = 1;
2188 	dfsssp_ctx->vl_split_count = split_count;
2189 	/* balancing is necessary if we have empty VLs */
2190 	if (vl_needed < vl_avail) {
2191 		/* split paths of VLs until we find an equal distribution */
2192 		for (i = vl_needed; i < vl_avail; i++) {
2193 			/* find VL with most paths in it */
2194 			vl = 0;
2195 			most_avg_paths = 0.0;
2196 			for (test_vl = 0; test_vl < vl_needed; test_vl++) {
2197 				if (most_avg_paths <
2198 				    ((double)paths_per_vl[test_vl] /
2199 				    split_count[test_vl])) {
2200 					vl = test_vl;
2201 					most_avg_paths =
2202 						(double)paths_per_vl[test_vl] /
2203 						split_count[test_vl];
2204 				}
2205 			}
2206 			split_count[vl]++;
2207 		}
2208 		/* change the VL assignment depending on split_count for
2209 		   all VLs except VL 0
2210 		 */
2211 		for (from = vl_needed - 1; from > 0; from--) {
2212 			/* how much space needed for others? */
2213 			to = 0;
2214 			for (i = 0; i < from; i++)
2215 				to += split_count[i];
2216 			count = paths_per_vl[from];
2217 			vltable_change_vl(srcdest2vl_table, from, to, count);
2218 			/* change also the information within the split_count
2219 			   array; this is important for fast calculation later
2220 			 */
2221 			split_count[to] = split_count[from];
2222 			split_count[from] = 0;
2223 			paths_per_vl[to] = paths_per_vl[from];
2224 			paths_per_vl[from] = 0;
2225 		}
2226 	} else if (vl_needed > vl_avail) {
2227 		/* routing not possible, a further development would be the LASH-TOR approach (update: LASH-TOR isn't possible, there is a mistake in the theory) */
2228 		OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2229 			"ERR AD25: Not enough VLs available (avail=%d, needed=%d); Stopping dfsssp routing!\n",
2230 			vl_avail, vl_needed);
2231 		err = 1;
2232 		goto ERROR;
2233 	}
2234 	/* else { no balancing } */
2235 
2236 	if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_DEBUG)) {
2237 		OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2238 			"Virtual Lanes per src/dest combination after balancing:\n");
2239 		vltable_print(p_mgr, srcdest2vl_table);
2240 	}
2241 	if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_INFO)) {
2242 		OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2243 			"Approx. #paths per VL (after balancing):\n");
2244 		j = 0;
2245 		count = 1; /* to prevent div. by 0 */
2246 		for (i = 0; i < vl_avail; i++) {
2247 			if (split_count[i] > 0) {
2248 				j = i;
2249 				count = split_count[i];
2250 			}
2251 			OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2252 				"   %" PRIu32 ". lane: %" PRIu64 "\n", i,
2253 				paths_per_vl[j] / count);
2254 		}
2255 	}
2256 
2257 	free(paths_per_vl);
2258 
2259 	/* deallocate channel dependency graphs */
2260 	for (i = 0; i < vl_avail; i++)
2261 		cdg_dealloc(&cdg[i]);
2262 	free(cdg);
2263 
2264 	OSM_LOG_EXIT(p_mgr->p_log);
2265 	return 0;
2266 
2267 ERROR:
2268 	free(paths_per_vl);
2269 
2270 	for (i = 0; i < vl_avail; i++)
2271 		cdg_dealloc(&cdg[i]);
2272 	free(cdg);
2273 
2274 	vltable_dealloc(&srcdest2vl_table);
2275 	dfsssp_ctx->srcdest2vl_table = NULL;
2276 
2277 	return err;
2278 }
2279 
2280 /* meta function which calls subfunctions for dijkstra, update lft and weights,
2281    (and remove deadlocks) to calculate the routing for the subnet
2282 */
2283 static int dfsssp_do_dijkstra_routing(void *context)
2284 {
2285 	dfsssp_context_t *dfsssp_ctx = (dfsssp_context_t *) context;
2286 	osm_ucast_mgr_t *p_mgr = (osm_ucast_mgr_t *) dfsssp_ctx->p_mgr;
2287 	vertex_t *adj_list = (vertex_t *) dfsssp_ctx->adj_list;
2288 	uint32_t adj_list_size = dfsssp_ctx->adj_list_size;
2289 
2290 	vertex_t **sw_list = NULL;
2291 	uint32_t sw_list_size = 0;
2292 	uint64_t guid = 0;
2293 	cl_qlist_t *qlist = NULL;
2294 	cl_list_item_t *qlist_item = NULL;
2295 
2296 	cl_qmap_t *sw_tbl = &p_mgr->p_subn->sw_guid_tbl;
2297 	cl_qmap_t cn_tbl, io_tbl, *p_mixed_tbl = NULL;
2298 	cl_map_item_t *item = NULL;
2299 	osm_switch_t *sw = NULL;
2300 	osm_port_t *port = NULL;
2301 	uint32_t i = 0, err = 0;
2302 	uint16_t lid = 0, min_lid_ho = 0, max_lid_ho = 0;
2303 	uint8_t lmc = 0;
2304 	boolean_t cn_nodes_provided = FALSE, io_nodes_provided = FALSE;
2305 
2306 	OSM_LOG_ENTER(p_mgr->p_log);
2307 	OSM_LOG(p_mgr->p_log, OSM_LOG_VERBOSE,
2308 		"Calculating shortest path from all Hca/switches to all\n");
2309 
2310 	cl_qmap_init(&cn_tbl);
2311 	cl_qmap_init(&io_tbl);
2312 	p_mixed_tbl = &cn_tbl;
2313 
2314 	cl_qlist_init(&p_mgr->port_order_list);
2315 
2316 	/* reset the new_lft for each switch */
2317 	for (item = cl_qmap_head(sw_tbl); item != cl_qmap_end(sw_tbl);
2318 	     item = cl_qmap_next(item)) {
2319 		sw = (osm_switch_t *) item;
2320 		/* initialize LIDs in buffer to invalid port number */
2321 		memset(sw->new_lft, OSM_NO_PATH, sw->max_lid_ho + 1);
2322 		/* initialize LFT and hop count for bsp0/esp0 of the switch */
2323 		min_lid_ho = cl_ntoh16(osm_node_get_base_lid(sw->p_node, 0));
2324 		lmc = osm_node_get_lmc(sw->p_node, 0);
2325 		for (i = min_lid_ho; i < min_lid_ho + (1 << lmc); i++) {
2326 			/* for each switch the port to the 'self'lid is the management port 0 */
2327 			sw->new_lft[i] = 0;
2328 			/* the hop count to the 'self'lid is 0 for each switch */
2329 			osm_switch_set_hops(sw, i, 0, 0);
2330 		}
2331 	}
2332 
2333 	/* we need an intermediate array of pointers to switches in adj_list;
2334 	   this array will be sorted in respect to num_hca (descending)
2335 	 */
2336 	sw_list_size = adj_list_size - 1;
2337 	sw_list = (vertex_t **)malloc(sw_list_size * sizeof(vertex_t *));
2338 	if (!sw_list) {
2339 		OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2340 			"ERR AD29: cannot allocate memory for sw_list in dfsssp_do_dijkstra_routing\n");
2341 		goto ERROR;
2342 	}
2343 	memset(sw_list, 0, sw_list_size * sizeof(vertex_t *));
2344 
2345 	/* fill the array with references to the 'real' sw in adj_list */
2346 	for (i = 0; i < sw_list_size; i++)
2347 		sw_list[i] = &(adj_list[i + 1]);
2348 
2349 	/* sort the sw_list in descending order */
2350 	sw_list_sort_by_num_hca(sw_list, sw_list_size);
2351 
2352 	/* parse compute node guid file, if provided by the user */
2353 	if (p_mgr->p_subn->opt.cn_guid_file) {
2354 		OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2355 			"Parsing compute nodes from file %s\n",
2356 			p_mgr->p_subn->opt.cn_guid_file);
2357 
2358 		if (parse_node_map(p_mgr->p_subn->opt.cn_guid_file,
2359 				   add_guid_to_map, &cn_tbl)) {
2360 			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2361 				"ERR AD33: Problem parsing compute node guid file\n");
2362 			goto ERROR;
2363 		}
2364 
2365 		if (cl_is_qmap_empty(&cn_tbl))
2366 			OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2367 				"WRN AD34: compute node guids file contains no valid guids\n");
2368 		else
2369 			cn_nodes_provided = TRUE;
2370 	}
2371 
2372 	/* parse I/O guid file, if provided by the user */
2373 	if (p_mgr->p_subn->opt.io_guid_file) {
2374 		OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2375 			"Parsing I/O nodes from file %s\n",
2376 			p_mgr->p_subn->opt.io_guid_file);
2377 
2378 		if (parse_node_map(p_mgr->p_subn->opt.io_guid_file,
2379 				   add_guid_to_map, &io_tbl)) {
2380 			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2381 				"ERR AD35: Problem parsing I/O guid file\n");
2382 			goto ERROR;
2383 		}
2384 
2385 		if (cl_is_qmap_empty(&io_tbl))
2386 			OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2387 				"WRN AD36: I/O node guids file contains no valid guids\n");
2388 		else
2389 			io_nodes_provided = TRUE;
2390 	}
2391 
2392 	/* if we mix Hca/Tca/SP0 during the dijkstra routing, we might end up
2393 	   in rare cases with a bad balancing for Hca<->Hca connections, i.e.
2394 	   some inter-switch links get oversubscribed with paths;
2395 	   therefore: add Hca ports first to ensure good Hca<->Hca balancing
2396 	 */
2397 	if (cn_nodes_provided) {
2398 		for (i = 0; i < adj_list_size - 1; i++) {
2399 			if (sw_list[i] && sw_list[i]->sw) {
2400 				sw = (osm_switch_t *)(sw_list[i]->sw);
2401 				add_sw_endports_to_order_list(sw, p_mgr,
2402 							      &cn_tbl, TRUE);
2403 			} else {
2404 				OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2405 					"ERR AD30: corrupted sw_list array in dfsssp_do_dijkstra_routing\n");
2406 				goto ERROR;
2407 			}
2408 		}
2409 	}
2410 	/* then: add Tca ports to ensure good Hca->Tca balancing and separate
2411 	   paths towards I/O nodes on the same switch (if possible)
2412 	 */
2413 	if (io_nodes_provided) {
2414 		for (i = 0; i < adj_list_size - 1; i++) {
2415 			if (sw_list[i] && sw_list[i]->sw) {
2416 				sw = (osm_switch_t *)(sw_list[i]->sw);
2417 				add_sw_endports_to_order_list(sw, p_mgr,
2418 							      &io_tbl, TRUE);
2419 			} else {
2420 				OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2421 					"ERR AD32: corrupted sw_list array in dfsssp_do_dijkstra_routing\n");
2422 				goto ERROR;
2423 			}
2424 		}
2425 	}
2426 	/* then: add anything else, such as administration nodes, ... */
2427 	if (cn_nodes_provided && io_nodes_provided) {
2428 		cl_qmap_merge(&cn_tbl, &io_tbl);
2429 	} else if (io_nodes_provided) {
2430 		p_mixed_tbl = &io_tbl;
2431 	}
2432 	for (i = 0; i < adj_list_size - 1; i++) {
2433 		if (sw_list[i] && sw_list[i]->sw) {
2434 			sw = (osm_switch_t *)(sw_list[i]->sw);
2435 			add_sw_endports_to_order_list(sw, p_mgr, p_mixed_tbl,
2436 						      FALSE);
2437 		} else {
2438 			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2439 				"ERR AD39: corrupted sw_list array in dfsssp_do_dijkstra_routing\n");
2440 			goto ERROR;
2441 		}
2442 	}
2443 	/* last: add SP0 afterwards which have lower priority for balancing */
2444 	for (i = 0; i < sw_list_size; i++) {
2445 		if (sw_list[i] && sw_list[i]->sw) {
2446 			sw = (osm_switch_t *)(sw_list[i]->sw);
2447 			guid = cl_ntoh64(osm_node_get_node_guid(sw->p_node));
2448 			add_guid_to_order_list(guid, p_mgr);
2449 		} else {
2450 			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2451 				"ERR AD31: corrupted sw_list array in dfsssp_do_dijkstra_routing\n");
2452 			goto ERROR;
2453 		}
2454 	}
2455 
2456 	/* the intermediate array lived long enough */
2457 	free(sw_list);
2458 	sw_list = NULL;
2459 	/* same is true for the compute node and I/O guid map */
2460 	destroy_guid_map(&cn_tbl);
2461 	cn_nodes_provided = FALSE;
2462 	destroy_guid_map(&io_tbl);
2463 	io_nodes_provided = FALSE;
2464 
2465 	/* do the routing for the each Hca in the subnet and each switch
2466 	   in the subnet (to add the routes to base/enhanced SP0)
2467 	 */
2468 	qlist = &p_mgr->port_order_list;
2469 	for (qlist_item = cl_qlist_head(qlist);
2470 	     qlist_item != cl_qlist_end(qlist);
2471 	     qlist_item = cl_qlist_next(qlist_item)) {
2472 		port = (osm_port_t *)cl_item_obj(qlist_item, port, list_item);
2473 
2474 		/* calculate shortest path with dijkstra from node to all switches/Hca */
2475 		if (osm_node_get_type(port->p_node) == IB_NODE_TYPE_CA) {
2476 			OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2477 				"Processing Hca with GUID 0x%" PRIx64 "\n",
2478 				cl_ntoh64(osm_node_get_node_guid
2479 					  (port->p_node)));
2480 		} else if (osm_node_get_type(port->p_node) == IB_NODE_TYPE_SWITCH) {
2481 			OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2482 				"Processing switch with GUID 0x%" PRIx64 "\n",
2483 				cl_ntoh64(osm_node_get_node_guid
2484 					  (port->p_node)));
2485 		} else {
2486 			/* we don't handle routers, in case they show up */
2487 			continue;
2488 		}
2489 
2490 		/* distribute the LID range across the ports that can reach those LIDs
2491 		   to have disjoint paths for one destination port with lmc>0;
2492 		   for switches with bsp0: min=max; with esp0: max>min if lmc>0
2493 		 */
2494 		osm_port_get_lid_range_ho(port, &min_lid_ho,
2495 					  &max_lid_ho);
2496 		for (lid = min_lid_ho; lid <= max_lid_ho; lid++) {
2497 			/* do dijkstra from this Hca/LID/SP0 to each switch */
2498 			err =
2499 			    dijkstra(p_mgr, adj_list, adj_list_size, port, lid);
2500 			if (err)
2501 				goto ERROR;
2502 			if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_DEBUG))
2503 				print_routes(p_mgr, adj_list, adj_list_size,
2504 					     port);
2505 
2506 			/* make an update for the linear forwarding tables of the switches */
2507 			err =
2508 			    update_lft(p_mgr, adj_list, adj_list_size, port, lid);
2509 			if (err)
2510 				goto ERROR;
2511 
2512 			/* add weights for calculated routes to adjust the weights for the next cycle */
2513 			update_weights(p_mgr, adj_list, adj_list_size);
2514 
2515 			if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_DEBUG))
2516 				dfsssp_print_graph(p_mgr, adj_list,
2517 						   adj_list_size);
2518 		}
2519 	}
2520 
2521 	/* try deadlock removal only for the dfsssp routing (not for the sssp case, which is a subset of the dfsssp algorithm) */
2522 	if (dfsssp_ctx->routing_type == OSM_ROUTING_ENGINE_TYPE_DFSSSP) {
2523 		/* remove potential deadlocks by assigning different virtual lanes to src/dest paths and balance the lanes */
2524 		err = dfsssp_remove_deadlocks(dfsssp_ctx);
2525 		if (err)
2526 			goto ERROR;
2527 	} else if (dfsssp_ctx->routing_type == OSM_ROUTING_ENGINE_TYPE_SSSP) {
2528 		OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
2529 			"SSSP routing specified -> skipping deadlock removal thru dfsssp_remove_deadlocks(...)\n");
2530 	} else {
2531 		OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
2532 			"ERR AD28: wrong routing engine specified in dfsssp_ctx\n");
2533 		goto ERROR;
2534 	}
2535 
2536 	/* list not needed after the dijkstra steps and deadlock removal */
2537 	cl_qlist_remove_all(&p_mgr->port_order_list);
2538 
2539 	/* print the new_lft for each switch after routing is done */
2540 	if (OSM_LOG_IS_ACTIVE_V2(p_mgr->p_log, OSM_LOG_DEBUG)) {
2541 		for (item = cl_qmap_head(sw_tbl); item != cl_qmap_end(sw_tbl);
2542 		     item = cl_qmap_next(item)) {
2543 			sw = (osm_switch_t *) item;
2544 			OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2545 				"Summary of the (new) LFT for switch 0x%" PRIx64
2546 				" (%s):\n",
2547 				cl_ntoh64(osm_node_get_node_guid(sw->p_node)),
2548 				sw->p_node->print_desc);
2549 			for (i = 0; i < sw->max_lid_ho + 1; i++)
2550 				if (sw->new_lft[i] != OSM_NO_PATH) {
2551 					OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
2552 						"   for LID=%" PRIu32
2553 						" use port=%" PRIu8 "\n", i,
2554 						sw->new_lft[i]);
2555 				}
2556 		}
2557 	}
2558 
2559 	OSM_LOG_EXIT(p_mgr->p_log);
2560 	return 0;
2561 
2562 ERROR:
2563 	if (!cl_is_qlist_empty(&p_mgr->port_order_list))
2564 		cl_qlist_remove_all(&p_mgr->port_order_list);
2565 	if (cn_nodes_provided)
2566 		destroy_guid_map(&cn_tbl);
2567 	if (io_nodes_provided)
2568 		destroy_guid_map(&io_tbl);
2569 	if (sw_list)
2570 		free(sw_list);
2571 	return -1;
2572 }
2573 
2574 /* meta function which calls subfunctions for finding the optimal switch
2575    for the spanning tree, performing a dijkstra step with this sw as root,
2576    and calculating the mcast table for MLID
2577 */
2578 static ib_api_status_t dfsssp_do_mcast_routing(void * context,
2579 					       osm_mgrp_box_t * mbox)
2580 {
2581 	dfsssp_context_t *dfsssp_ctx = (dfsssp_context_t *) context;
2582 	osm_ucast_mgr_t *p_mgr = (osm_ucast_mgr_t *) dfsssp_ctx->p_mgr;
2583 	osm_sm_t *sm = (osm_sm_t *) p_mgr->sm;
2584 	vertex_t *adj_list = (vertex_t *) dfsssp_ctx->adj_list;
2585 	uint32_t adj_list_size = dfsssp_ctx->adj_list_size;
2586 	cl_qlist_t mcastgrp_port_list;
2587 	cl_qmap_t mcastgrp_port_map;
2588 	osm_switch_t *root_sw = NULL, *p_sw = NULL;
2589 	osm_port_t *port = NULL;
2590 	ib_net16_t lid = 0;
2591 	uint32_t err = 0, num_ports = 0, i = 0;
2592 	ib_net64_t guid = 0;
2593 	ib_api_status_t status = IB_SUCCESS;
2594 
2595 	OSM_LOG_ENTER(sm->p_log);
2596 
2597 	/* using the ucast cache feature with dfsssp might mean that a leaf sw
2598 	   got removed (and got back) without calling dfsssp_build_graph
2599 	   and therefore the adj_list (and pointers to osm's internal switches)
2600 	   could be outdated (here we have no knowledge if it has happened, so
2601 	   unfortunately a check is necessary... still better than rebuilding
2602 	   adj_list every time we arrive here)
2603 	 */
2604 	if (p_mgr->p_subn->opt.use_ucast_cache && p_mgr->cache_valid) {
2605 		for (i = 1; i < adj_list_size; i++) {
2606 			guid = cl_hton64(adj_list[i].guid);
2607 			p_sw = osm_get_switch_by_guid(p_mgr->p_subn, guid);
2608 			if (p_sw) {
2609 				/* check if switch came back from the dead */
2610 				if (adj_list[i].dropped)
2611 					adj_list[i].dropped = FALSE;
2612 
2613 				/* verify that sw object has not been moved
2614 				   (this can happen for a leaf switch, if it
2615 				   was dropped and came back later without a
2616 				   rerouting), otherwise we have to update
2617 				   dfsssp's internal switch list with the new
2618 				   sw pointer
2619 				 */
2620 				if (p_sw == adj_list[i].sw)
2621 					continue;
2622 				else
2623 					adj_list[i].sw = p_sw;
2624 			} else {
2625 				/* if a switch from adj_list is not in the
2626 				   sw_guid_tbl anymore, then the only reason is
2627 				   that it was a leaf switch and opensm dropped
2628 				   it without calling a rerouting
2629 				   -> calling dijkstra is no problem, since it
2630 				      is a leaf and different from root_sw
2631 				   -> only update_mcft and reset_mgrp_membership
2632 				      need to be aware of these dropped switches
2633 				 */
2634 				if (!adj_list[i].dropped)
2635 					adj_list[i].dropped = TRUE;
2636 			}
2637 		}
2638 	}
2639 
2640 	/* create a map and a list of all ports which are member in the mcast
2641 	   group; map for searching elements and list for iteration
2642 	 */
2643 	if (osm_mcast_make_port_list_and_map(&mcastgrp_port_list,
2644 					     &mcastgrp_port_map, mbox)) {
2645 		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR AD50: "
2646 			"Insufficient memory to make port list\n");
2647 		status = IB_ERROR;
2648 		goto Exit;
2649 	}
2650 
2651 	num_ports = cl_qlist_count(&mcastgrp_port_list);
2652 	if (num_ports < 2) {
2653 		OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
2654 			"MLID 0x%X has %u members - nothing to do\n",
2655 			mbox->mlid, num_ports);
2656 		goto Exit;
2657 	}
2658 
2659 	/* find the root switch for the spanning tree, which has the smallest
2660 	   hops count to all LIDs in the mcast group
2661 	 */
2662 	root_sw = osm_mcast_mgr_find_root_switch(sm, &mcastgrp_port_list);
2663 	if (!root_sw) {
2664 		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR AD51: "
2665 			"Unable to locate a suitable switch for group 0x%X\n",
2666 			mbox->mlid);
2667 		status = IB_ERROR;
2668 		goto Exit;
2669 	}
2670 
2671 	/* a) start one dijkstra step from the root switch to generate a
2672 	   spanning tree
2673 	   b) this might be a bit of an overkill to span the whole
2674 	   network, if there are only a few ports in the mcast group, but
2675 	   its only one dijkstra step for each mcast group and we did many
2676 	   steps before in the ucast routing for each LID in the subnet;
2677 	   c) we can use the subnet structure from the ucast routing, and
2678 	   don't even have to reset the link weights (=> therefore the mcast
2679 	   spanning tree will use less 'growded' links in the network)
2680 	   d) the mcast dfsssp algorithm will not change the link weights
2681 	 */
2682 	lid = osm_node_get_base_lid(root_sw->p_node, 0);
2683 	port = osm_get_port_by_lid(sm->p_subn, lid);
2684 	err = dijkstra(p_mgr, adj_list, adj_list_size, port, lid);
2685 	if (err) {
2686 		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR AD52: "
2687 			"Dijkstra step for mcast failed for group 0x%X\n",
2688 			mbox->mlid);
2689 		status = IB_ERROR;
2690 		goto Exit;
2691 	}
2692 
2693 	/* set mcast group membership again for update_mcft
2694 	   (unfortunately: osm_mcast_mgr_find_root_switch resets it)
2695 	 */
2696 	update_mgrp_membership(&mcastgrp_port_list);
2697 
2698 	/* update the mcast forwarding tables of the switches */
2699 	err = update_mcft(sm, adj_list, adj_list_size, mbox->mlid,
2700 			  &mcastgrp_port_map, root_sw);
2701 	if (err) {
2702 		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR AD53: "
2703 			"Update of mcast forwarding tables failed for group 0x%X\n",
2704 			mbox->mlid);
2705 		status = IB_ERROR;
2706 		goto Exit;
2707 	}
2708 
2709 Exit:
2710 	reset_mgrp_membership(adj_list, adj_list_size);
2711 	osm_mcast_drop_port_list(&mcastgrp_port_list);
2712 	OSM_LOG_EXIT(sm->p_log);
2713 	return status;
2714 }
2715 
2716 /* called from extern in QP creation process to gain the the service level and
2717    the virtual lane respectively for a <s,d> pair
2718 */
2719 static uint8_t get_dfsssp_sl(void *context, uint8_t hint_for_default_sl,
2720 			     const ib_net16_t slid, const ib_net16_t dlid)
2721 {
2722 	dfsssp_context_t *dfsssp_ctx = (dfsssp_context_t *) context;
2723 	osm_port_t *src_port, *dest_port;
2724 	vltable_t *srcdest2vl_table = NULL;
2725 	uint8_t *vl_split_count = NULL;
2726 	osm_ucast_mgr_t *p_mgr = NULL;
2727 	int32_t res = 0;
2728 
2729 	if (dfsssp_ctx
2730 	    && dfsssp_ctx->routing_type == OSM_ROUTING_ENGINE_TYPE_DFSSSP) {
2731 		p_mgr = (osm_ucast_mgr_t *) dfsssp_ctx->p_mgr;
2732 		srcdest2vl_table = (vltable_t *) (dfsssp_ctx->srcdest2vl_table);
2733 		vl_split_count = (uint8_t *) (dfsssp_ctx->vl_split_count);
2734 	}
2735 	else
2736 		return hint_for_default_sl;
2737 
2738 	src_port = osm_get_port_by_lid(p_mgr->p_subn, slid);
2739 	if (!src_port)
2740 		return hint_for_default_sl;
2741 
2742 	dest_port = osm_get_port_by_lid(p_mgr->p_subn, dlid);
2743 	if (!dest_port)
2744 		return hint_for_default_sl;
2745 
2746 	if (!srcdest2vl_table)
2747 		return hint_for_default_sl;
2748 
2749 	res = vltable_get_vl(srcdest2vl_table, slid, dlid);
2750 
2751 	/* we will randomly distribute the traffic over multiple VLs if
2752 	   necessary for good balancing; therefore vl_split_count provides
2753 	   the number of VLs to use for certain traffic
2754 	 */
2755 	if (res > -1) {
2756 		if (vl_split_count[res] > 1)
2757 			return (uint8_t) (res + rand()%(vl_split_count[res]));
2758 		else
2759 			return (uint8_t) res;
2760 	} else
2761 		return hint_for_default_sl;
2762 }
2763 
2764 static dfsssp_context_t *dfsssp_context_create(osm_opensm_t * p_osm,
2765 					       osm_routing_engine_type_t
2766 					       routing_type)
2767 {
2768 	dfsssp_context_t *dfsssp_ctx = NULL;
2769 
2770 	/* allocate memory */
2771 	dfsssp_ctx = (dfsssp_context_t *) malloc(sizeof(dfsssp_context_t));
2772 	if (dfsssp_ctx) {
2773 		/* set initial values */
2774 		dfsssp_ctx->routing_type = routing_type;
2775 		dfsssp_ctx->p_mgr = (osm_ucast_mgr_t *) & (p_osm->sm.ucast_mgr);
2776 		dfsssp_ctx->adj_list = NULL;
2777 		dfsssp_ctx->adj_list_size = 0;
2778 		dfsssp_ctx->srcdest2vl_table = NULL;
2779 		dfsssp_ctx->vl_split_count = NULL;
2780 	} else {
2781 		OSM_LOG(p_osm->sm.ucast_mgr.p_log, OSM_LOG_ERROR,
2782 			"ERR AD04: cannot allocate memory for dfsssp_ctx in dfsssp_context_create\n");
2783 		return NULL;
2784 	}
2785 
2786 	return dfsssp_ctx;
2787 }
2788 
2789 static void dfsssp_context_destroy(void *context)
2790 {
2791 	dfsssp_context_t *dfsssp_ctx = (dfsssp_context_t *) context;
2792 	vertex_t *adj_list = (vertex_t *) (dfsssp_ctx->adj_list);
2793 	uint32_t i = 0;
2794 	link_t *link = NULL, *tmp = NULL;
2795 
2796 	/* free adj_list */
2797 	for (i = 0; i < dfsssp_ctx->adj_list_size; i++) {
2798 		link = adj_list[i].links;
2799 		while (link) {
2800 			tmp = link;
2801 			link = link->next;
2802 			free(tmp);
2803 		}
2804 	}
2805 	free(adj_list);
2806 	dfsssp_ctx->adj_list = NULL;
2807 	dfsssp_ctx->adj_list_size = 0;
2808 
2809 	/* free srcdest2vl table and the split count information table
2810 	   (can be done, because dfsssp_context_destroy is called after
2811 	    osm_get_dfsssp_sl)
2812 	 */
2813 	vltable_dealloc(&(dfsssp_ctx->srcdest2vl_table));
2814 	dfsssp_ctx->srcdest2vl_table = NULL;
2815 
2816 	if (dfsssp_ctx->vl_split_count) {
2817 		free(dfsssp_ctx->vl_split_count);
2818 		dfsssp_ctx->vl_split_count = NULL;
2819 	}
2820 }
2821 
2822 static void delete(void *context)
2823 {
2824 	if (!context)
2825 		return;
2826 	dfsssp_context_destroy(context);
2827 
2828 	free(context);
2829 }
2830 
2831 int osm_ucast_dfsssp_setup(struct osm_routing_engine *r, osm_opensm_t * p_osm)
2832 {
2833 	/* create context container and add ucast management object */
2834 	dfsssp_context_t *dfsssp_context =
2835 	    dfsssp_context_create(p_osm, OSM_ROUTING_ENGINE_TYPE_DFSSSP);
2836 	if (!dfsssp_context) {
2837 		return 1;	/* alloc failed -> skip this routing */
2838 	}
2839 
2840 	/* reset function pointers to dfsssp routines */
2841 	r->context = (void *)dfsssp_context;
2842 	r->build_lid_matrices = dfsssp_build_graph;
2843 	r->ucast_build_fwd_tables = dfsssp_do_dijkstra_routing;
2844 	r->mcast_build_stree = dfsssp_do_mcast_routing;
2845 	r->path_sl = get_dfsssp_sl;
2846 	r->destroy = delete;
2847 
2848 	/* we initialize with the current time to achieve a 'good' randomized
2849 	   assignment in get_dfsssp_sl(...)
2850 	 */
2851 	srand(time(NULL));
2852 
2853 	return 0;
2854 }
2855 
2856 int osm_ucast_sssp_setup(struct osm_routing_engine *r, osm_opensm_t * p_osm)
2857 {
2858 	/* create context container and add ucast management object */
2859 	dfsssp_context_t *dfsssp_context =
2860 	    dfsssp_context_create(p_osm, OSM_ROUTING_ENGINE_TYPE_SSSP);
2861 	if (!dfsssp_context) {
2862 		return 1;	/* alloc failed -> skip this routing */
2863 	}
2864 
2865 	/* reset function pointers to sssp routines */
2866 	r->context = (void *)dfsssp_context;
2867 	r->build_lid_matrices = dfsssp_build_graph;
2868 	r->ucast_build_fwd_tables = dfsssp_do_dijkstra_routing;
2869 	r->mcast_build_stree = dfsssp_do_mcast_routing;
2870 	r->destroy = delete;
2871 
2872 	return 0;
2873 }
2874