1 /*
2  * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved.
3  * Copyright (c) 2002-2015 Mellanox Technologies LTD. All rights reserved.
4  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
5  * Copyright (c) 2009 HNR Consulting. All rights reserved.
6  *
7  * This software is available to you under a choice of one of two
8  * licenses.  You may choose to be licensed under the terms of the GNU
9  * General Public License (GPL) Version 2, available from the file
10  * COPYING in the main directory of this source tree, or the
11  * OpenIB.org BSD license below:
12  *
13  *     Redistribution and use in source and binary forms, with or
14  *     without modification, are permitted provided that the following
15  *     conditions are met:
16  *
17  *      - Redistributions of source code must retain the above
18  *        copyright notice, this list of conditions and the following
19  *        disclaimer.
20  *
21  *      - Redistributions in binary form must reproduce the above
22  *        copyright notice, this list of conditions and the following
23  *        disclaimer in the documentation and/or other materials
24  *        provided with the distribution.
25  *
26  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
27  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
28  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
29  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
30  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
31  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
32  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33  * SOFTWARE.
34  *
35  */
36 
37 /*
38  * Abstract:
39  *    Implementation of osm_switch_t.
40  * This object represents an Infiniband switch.
41  * This object is part of the opensm family of objects.
42  */
43 
44 #if HAVE_CONFIG_H
45 #  include <config.h>
46 #endif				/* HAVE_CONFIG_H */
47 
48 #include <stdlib.h>
49 #include <string.h>
50 #include <complib/cl_math.h>
51 #include <iba/ib_types.h>
52 #include <opensm/osm_file_ids.h>
53 #define FILE_ID OSM_FILE_SWITCH_C
54 #include <opensm/osm_switch.h>
55 
/*
 * One candidate egress port recorded by osm_switch_recommend_path() while
 * it scans the switch, so that the optional port-shifting pass can repeat
 * the selection over the same candidates in a different order.
 */
struct switch_port_path {
	uint8_t port_num;	/* candidate port number on the switch */
	uint32_t path_count;	/* path count read from the port's profile */
	int found_sys_guid;	/* LMC routing only: remote system GUID was already tracked */
	int found_node_guid;	/* LMC routing only: remote node GUID was already tracked */
	uint32_t forwarded_to;	/* forwarded_to of the matched remote node entry, else 0 */
};
63 
64 cl_status_t osm_switch_set_hops(IN osm_switch_t * p_sw, IN uint16_t lid_ho,
65 				IN uint8_t port_num, IN uint8_t num_hops)
66 {
67 	if (!lid_ho || lid_ho > p_sw->max_lid_ho)
68 		return -1;
69 	if (port_num >= p_sw->num_ports)
70 		return -1;
71 	if (!p_sw->hops[lid_ho]) {
72 		p_sw->hops[lid_ho] = malloc(p_sw->num_ports);
73 		if (!p_sw->hops[lid_ho])
74 			return -1;
75 		memset(p_sw->hops[lid_ho], OSM_NO_PATH, p_sw->num_ports);
76 	}
77 
78 	p_sw->hops[lid_ho][port_num] = num_hops;
79 	if (p_sw->hops[lid_ho][0] > num_hops)
80 		p_sw->hops[lid_ho][0] = num_hops;
81 
82 	return 0;
83 }
84 
85 void osm_switch_delete(IN OUT osm_switch_t ** pp_sw)
86 {
87 	osm_switch_t *p_sw = *pp_sw;
88 	unsigned i;
89 
90 	osm_mcast_tbl_destroy(&p_sw->mcast_tbl);
91 	if (p_sw->p_prof)
92 		free(p_sw->p_prof);
93 	if (p_sw->search_ordering_ports)
94 		free(p_sw->search_ordering_ports);
95 	if (p_sw->lft)
96 		free(p_sw->lft);
97 	if (p_sw->new_lft)
98 		free(p_sw->new_lft);
99 	if (p_sw->hops) {
100 		for (i = 0; i < p_sw->num_hops; i++)
101 			if (p_sw->hops[i])
102 				free(p_sw->hops[i]);
103 		free(p_sw->hops);
104 	}
105 	free(*pp_sw);
106 	*pp_sw = NULL;
107 }
108 
109 osm_switch_t *osm_switch_new(IN osm_node_t * p_node,
110 			     IN const osm_madw_t * p_madw)
111 {
112 	osm_switch_t *p_sw;
113 	ib_switch_info_t *p_si;
114 	ib_smp_t *p_smp;
115 	uint8_t num_ports;
116 	uint32_t port_num;
117 
118 	CL_ASSERT(p_madw);
119 	CL_ASSERT(p_node);
120 
121 	p_smp = osm_madw_get_smp_ptr(p_madw);
122 	p_si = ib_smp_get_payload_ptr(p_smp);
123 	num_ports = osm_node_get_num_physp(p_node);
124 
125 	CL_ASSERT(p_smp->attr_id == IB_MAD_ATTR_SWITCH_INFO);
126 
127 	if (!p_si->lin_cap) /* The switch doesn't support LFT */
128 		return NULL;
129 
130 	p_sw = malloc(sizeof(*p_sw));
131 	if (!p_sw)
132 		return NULL;
133 
134 	memset(p_sw, 0, sizeof(*p_sw));
135 
136 	p_sw->p_node = p_node;
137 	p_sw->switch_info = *p_si;
138 	p_sw->num_ports = num_ports;
139 	p_sw->need_update = 2;
140 
141 	p_sw->p_prof = malloc(sizeof(*p_sw->p_prof) * num_ports);
142 	if (!p_sw->p_prof)
143 		goto err;
144 
145 	memset(p_sw->p_prof, 0, sizeof(*p_sw->p_prof) * num_ports);
146 
147 	osm_mcast_tbl_init(&p_sw->mcast_tbl, osm_node_get_num_physp(p_node),
148 			   cl_ntoh16(p_si->mcast_cap));
149 
150 	for (port_num = 0; port_num < num_ports; port_num++)
151 		osm_port_prof_construct(&p_sw->p_prof[port_num]);
152 
153 	return p_sw;
154 
155 err:
156 	osm_switch_delete(&p_sw);
157 	return NULL;
158 }
159 
160 boolean_t osm_switch_get_lft_block(IN const osm_switch_t * p_sw,
161 				   IN uint16_t block_id, OUT uint8_t * p_block)
162 {
163 	uint16_t base_lid_ho = block_id * IB_SMP_DATA_SIZE;
164 
165 	CL_ASSERT(p_sw);
166 	CL_ASSERT(p_block);
167 
168 	if (base_lid_ho > p_sw->max_lid_ho)
169 		return FALSE;
170 
171 	CL_ASSERT(base_lid_ho + IB_SMP_DATA_SIZE - 1 <= IB_LID_UCAST_END_HO);
172 	memcpy(p_block, &(p_sw->new_lft[base_lid_ho]), IB_SMP_DATA_SIZE);
173 	return TRUE;
174 }
175 
176 static struct osm_remote_node *
177 switch_find_guid_common(IN const osm_switch_t * p_sw,
178 			IN struct osm_remote_guids_count *r,
179 			IN uint8_t port_num, IN int find_sys_guid,
180 			IN int find_node_guid)
181 {
182 	struct osm_remote_node *p_remote_guid = NULL;
183 	osm_physp_t *p_physp;
184 	osm_physp_t *p_rem_physp;
185 	osm_node_t *p_rem_node;
186 	uint64_t sys_guid;
187 	uint64_t node_guid;
188 	unsigned int i;
189 
190 	CL_ASSERT(p_sw);
191 
192 	if (!r)
193 		goto out;
194 
195 	p_physp = osm_node_get_physp_ptr(p_sw->p_node, port_num);
196 	if (!p_physp)
197 		goto out;
198 
199 	p_rem_physp = osm_physp_get_remote(p_physp);
200 	p_rem_node = osm_physp_get_node_ptr(p_rem_physp);
201 	sys_guid = p_rem_node->node_info.sys_guid;
202 	node_guid = p_rem_node->node_info.node_guid;
203 
204 	for (i = 0; i < r->count; i++) {
205 		if ((!find_sys_guid
206 		     || r->guids[i].node->node_info.sys_guid == sys_guid)
207 		    && (!find_node_guid
208 			|| r->guids[i].node->node_info.node_guid == node_guid)) {
209 			p_remote_guid = &r->guids[i];
210 			break;
211 		}
212 	}
213 
214 out:
215 	return p_remote_guid;
216 }
217 
218 static struct osm_remote_node *
219 switch_find_sys_guid_count(IN const osm_switch_t * p_sw,
220 			   IN struct osm_remote_guids_count *r,
221 			   IN uint8_t port_num)
222 {
223 	return switch_find_guid_common(p_sw, r, port_num, 1, 0);
224 }
225 
226 static struct osm_remote_node *
227 switch_find_node_guid_count(IN const osm_switch_t * p_sw,
228 			    IN struct osm_remote_guids_count *r,
229 			    IN uint8_t port_num)
230 {
231 	return switch_find_guid_common(p_sw, r, port_num, 0, 1);
232 }
233 
234 uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw,
235 				  IN osm_port_t * p_port, IN uint16_t lid_ho,
236 				  IN unsigned start_from,
237 				  IN boolean_t ignore_existing,
238 				  IN boolean_t routing_for_lmc,
239 				  IN boolean_t dor,
240 				  IN boolean_t port_shifting,
241 				  IN uint32_t scatter_ports,
242 				  IN osm_lft_type_enum lft_enum)
243 {
244 	/*
245 	   We support an enhanced LMC aware routing mode:
246 	   In the case of LMC > 0, we can track the remote side
247 	   system and node for all of the lids of the target
248 	   and try and avoid routing again through the same
249 	   system / node.
250 
251 	   Assume if routing_for_lmc is true that this procedure was
252 	   provided the tracking array and counter via p_port->priv,
253 	   and we can conduct this algorithm.
254 	 */
255 	uint16_t base_lid;
256 	uint8_t hops;
257 	uint8_t least_hops;
258 	uint8_t port_num;
259 	uint8_t num_ports;
260 	uint32_t least_paths = 0xFFFFFFFF;
261 	unsigned i;
262 	/*
263 	   The following will track the least paths if the
264 	   route should go through a new system/node
265 	 */
266 	uint32_t least_paths_other_sys = 0xFFFFFFFF;
267 	uint32_t least_paths_other_nodes = 0xFFFFFFFF;
268 	uint32_t least_forwarded_to = 0xFFFFFFFF;
269 	uint32_t check_count;
270 	uint8_t best_port = 0;
271 	/*
272 	   These vars track the best port if it connects to
273 	   not used system/node.
274 	 */
275 	uint8_t best_port_other_sys = 0;
276 	uint8_t best_port_other_node = 0;
277 	boolean_t port_found = FALSE;
278 	osm_physp_t *p_physp;
279 	osm_physp_t *p_rem_physp;
280 	osm_node_t *p_rem_node;
281 	osm_node_t *p_rem_node_first = NULL;
282 	struct osm_remote_node *p_remote_guid = NULL;
283 	struct osm_remote_node null_remote_node = {NULL, 0, 0};
284 	struct switch_port_path port_paths[IB_NODE_NUM_PORTS_MAX];
285 	unsigned int port_paths_total_paths = 0;
286 	unsigned int port_paths_count = 0;
287 	uint8_t scatter_possible_ports[IB_NODE_NUM_PORTS_MAX];
288 	unsigned int scatter_possible_ports_count = 0;
289 	int found_sys_guid = 0;
290 	int found_node_guid = 0;
291 
292 	CL_ASSERT(lid_ho > 0);
293 
294 	if (p_port->p_node->sw) {
295 		if (p_port->p_node->sw == p_sw)
296 			return 0;
297 		base_lid = osm_port_get_base_lid(p_port);
298 	} else {
299 		p_physp = p_port->p_physp;
300 		if (!p_physp || !p_physp->p_remote_physp ||
301 		    !p_physp->p_remote_physp->p_node->sw)
302 			return OSM_NO_PATH;
303 
304 		if (p_physp->p_remote_physp->p_node->sw == p_sw)
305 			return p_physp->p_remote_physp->port_num;
306 		base_lid =
307 		    osm_node_get_base_lid(p_physp->p_remote_physp->p_node, 0);
308 	}
309 	base_lid = cl_ntoh16(base_lid);
310 
311 	num_ports = p_sw->num_ports;
312 
313 	least_hops = osm_switch_get_least_hops(p_sw, base_lid);
314 	if (least_hops == OSM_NO_PATH)
315 		return OSM_NO_PATH;
316 
317 	/*
318 	   First, inquire with the forwarding table for an existing
319 	   route.  If one is found, honor it unless:
320 	   1. the ignore existing flag is set.
321 	   2. the physical port is not a valid one or not healthy
322 	   3. the physical port has a remote port (the link is up)
323 	   4. the port has min-hops to the target (avoid loops)
324 	 */
325 	if (!ignore_existing) {
326 		port_num = osm_switch_get_port_by_lid(p_sw, lid_ho, lft_enum);
327 
328 		if (port_num != OSM_NO_PATH) {
329 			CL_ASSERT(port_num < num_ports);
330 
331 			p_physp =
332 			    osm_node_get_physp_ptr(p_sw->p_node, port_num);
333 			/*
334 			   Don't be too trusting of the current forwarding table!
335 			   Verify that the port number is legal and that the
336 			   LID is reachable through this port.
337 			 */
338 			if (p_physp && osm_physp_is_healthy(p_physp) &&
339 			    osm_physp_get_remote(p_physp)) {
340 				hops =
341 				    osm_switch_get_hop_count(p_sw, base_lid,
342 							     port_num);
343 				/*
344 				   If we aren't using pre-defined user routes
345 				   function, then we need to make sure that the
346 				   current path is the minimum one. In case of
347 				   having such a user function - this check will
348 				   not be done, and the old routing will be used.
349 				   Note: This means that it is the user's job to
350 				   clean all data in the forwarding tables that
351 				   he wants to be overridden by the minimum
352 				   hop function.
353 				 */
354 				if (hops == least_hops)
355 					return port_num;
356 			}
357 		}
358 	}
359 
360 	/*
361 	   This algorithm selects a port based on a static load balanced
362 	   selection across equal hop-count ports.
363 	   There is lots of room for improved sophistication here,
364 	   possibly guided by user configuration info.
365 	 */
366 
367 	/*
368 	   OpenSM routing is "local" - not considering a full lid to lid
369 	   path. As such we can not guarantee a path will not loop if we
370 	   do not always follow least hops.
371 	   So we must abort if not least hops.
372 	 */
373 
374 	/* port number starts with one and num_ports is 1 + num phys ports */
375 	for (i = start_from; i < start_from + num_ports; i++) {
376 		port_num = osm_switch_get_dimn_port(p_sw, i % num_ports);
377 		if (!port_num ||
378 		    osm_switch_get_hop_count(p_sw, base_lid, port_num) !=
379 		    least_hops)
380 			continue;
381 
382 		/* let us make sure it is not down or unhealthy */
383 		p_physp = osm_node_get_physp_ptr(p_sw->p_node, port_num);
384 		if (!p_physp || !osm_physp_is_healthy(p_physp) ||
385 		    /*
386 		       we require all - non sma ports to be linked
387 		       to be routed through
388 		     */
389 		    !osm_physp_get_remote(p_physp))
390 			continue;
391 
392 		/*
393 		   We located a least-hop port, possibly one of many.
394 		   For this port, check the running total count of
395 		   the number of paths through this port.  Select
396 		   the port routing the least number of paths.
397 		 */
398 		check_count =
399 		    osm_port_prof_path_count_get(&p_sw->p_prof[port_num]);
400 
401 
402 		if (dor) {
403 			/* Get the Remote Node */
404 			p_rem_physp = osm_physp_get_remote(p_physp);
405 			p_rem_node = osm_physp_get_node_ptr(p_rem_physp);
406 			/* use the first dimension, but spread traffic
407 			 * out among the group of ports representing
408 			 * that dimension */
409 			if (!p_rem_node_first)
410 				p_rem_node_first = p_rem_node;
411 			else if (p_rem_node != p_rem_node_first)
412 				continue;
413 			if (routing_for_lmc) {
414 				struct osm_remote_guids_count *r = p_port->priv;
415 				uint8_t rem_port = osm_physp_get_port_num(p_rem_physp);
416 				unsigned int j;
417 
418 				for (j = 0; j < r->count; j++) {
419 					p_remote_guid = &r->guids[j];
420 					if ((p_remote_guid->node == p_rem_node)
421 					    && (p_remote_guid->port == rem_port))
422 						break;
423 				}
424 				if (j == r->count)
425 					p_remote_guid = &null_remote_node;
426 			}
427 		/*
428 		   Advanced LMC routing requires tracking of the
429 		   best port by the node connected to the other side of
430 		   it.
431 		 */
432 		} else if (routing_for_lmc) {
433 			/* Is the sys guid already used ? */
434 			p_remote_guid = switch_find_sys_guid_count(p_sw,
435 								   p_port->priv,
436 								   port_num);
437 
438 			/* If not update the least hops for this case */
439 			if (!p_remote_guid) {
440 				if (check_count < least_paths_other_sys) {
441 					least_paths_other_sys = check_count;
442 					best_port_other_sys = port_num;
443 					least_forwarded_to = 0;
444 				}
445 				found_sys_guid = 0;
446 			} else {	/* same sys found - try node */
447 
448 
449 				/* Else is the node guid already used ? */
450 				p_remote_guid = switch_find_node_guid_count(p_sw,
451 									    p_port->priv,
452 									    port_num);
453 
454 				/* If not update the least hops for this case */
455 				if (!p_remote_guid
456 				    && check_count < least_paths_other_nodes) {
457 					least_paths_other_nodes = check_count;
458 					best_port_other_node = port_num;
459 					least_forwarded_to = 0;
460 				}
461 				/* else prior sys and node guid already used */
462 
463 				if (!p_remote_guid)
464 					found_node_guid = 0;
465 				else
466 					found_node_guid = 1;
467 				found_sys_guid = 1;
468 			}	/* same sys found */
469 		}
470 
471 		port_paths[port_paths_count].port_num = port_num;
472 		port_paths[port_paths_count].path_count = check_count;
473 		if (routing_for_lmc) {
474 			port_paths[port_paths_count].found_sys_guid = found_sys_guid;
475 			port_paths[port_paths_count].found_node_guid = found_node_guid;
476 		}
477 		if (routing_for_lmc && p_remote_guid)
478 			port_paths[port_paths_count].forwarded_to = p_remote_guid->forwarded_to;
479 		else
480 			port_paths[port_paths_count].forwarded_to = 0;
481 		port_paths_total_paths += check_count;
482 		port_paths_count++;
483 
484 		/* routing for LMC mode */
485 		/*
486 		   the count is min but also lower then the max subscribed
487 		 */
488 		if (check_count < least_paths) {
489 			port_found = TRUE;
490 			best_port = port_num;
491 			least_paths = check_count;
492 			scatter_possible_ports_count = 0;
493 			scatter_possible_ports[scatter_possible_ports_count++] = port_num;
494 			if (routing_for_lmc
495 			    && p_remote_guid
496 			    && p_remote_guid->forwarded_to < least_forwarded_to)
497 				least_forwarded_to = p_remote_guid->forwarded_to;
498 		} else if (scatter_ports
499 			   && check_count == least_paths) {
500 			scatter_possible_ports[scatter_possible_ports_count++] = port_num;
501 		} else if (routing_for_lmc
502 			   && p_remote_guid
503 			   && check_count == least_paths
504 			   && p_remote_guid->forwarded_to < least_forwarded_to) {
505 			least_forwarded_to = p_remote_guid->forwarded_to;
506 			best_port = port_num;
507 		}
508 	}
509 
510 	if (port_found == FALSE)
511 		return OSM_NO_PATH;
512 
513 	if (port_shifting && port_paths_count) {
514 		/* In the port_paths[] array, we now have all the ports that we
515 		 * can route out of.  Using some shifting math below, possibly
516 		 * select a different one so that lids won't align in LFTs
517 		 *
518 		 * If lmc > 0, we need to loop through these ports to find the
519 		 * least_forwarded_to port, best_port_other_sys, and
520 		 * best_port_other_node just like before but through the different
521 		 * ordering.
522 		 */
523 
524 		least_paths = 0xFFFFFFFF;
525 		least_paths_other_sys = 0xFFFFFFFF;
526 		least_paths_other_nodes = 0xFFFFFFFF;
527 	        least_forwarded_to = 0xFFFFFFFF;
528 		best_port = 0;
529 		best_port_other_sys = 0;
530 		best_port_other_node = 0;
531 
532 		for (i = 0; i < port_paths_count; i++) {
533 			unsigned int idx;
534 
535 			idx = (port_paths_total_paths/port_paths_count + i) % port_paths_count;
536 
537 			if (routing_for_lmc) {
538 				if (!port_paths[idx].found_sys_guid
539 				    && port_paths[idx].path_count < least_paths_other_sys) {
540 					least_paths_other_sys = port_paths[idx].path_count;
541 					best_port_other_sys = port_paths[idx].port_num;
542 					least_forwarded_to = 0;
543 				}
544 				else if (!port_paths[idx].found_node_guid
545 					 && port_paths[idx].path_count < least_paths_other_nodes) {
546 					least_paths_other_nodes = port_paths[idx].path_count;
547 					best_port_other_node = port_paths[idx].port_num;
548 					least_forwarded_to = 0;
549 				}
550 			}
551 
552 			if (port_paths[idx].path_count < least_paths) {
553 				best_port = port_paths[idx].port_num;
554 				least_paths = port_paths[idx].path_count;
555 				if (routing_for_lmc
556 				    && (port_paths[idx].found_sys_guid
557 					|| port_paths[idx].found_node_guid)
558 				    && port_paths[idx].forwarded_to < least_forwarded_to)
559 					least_forwarded_to = port_paths[idx].forwarded_to;
560 			}
561 			else if (routing_for_lmc
562 				 && (port_paths[idx].found_sys_guid
563 				     || port_paths[idx].found_node_guid)
564 				 && port_paths[idx].path_count == least_paths
565 				 && port_paths[idx].forwarded_to < least_forwarded_to) {
566 				least_forwarded_to = port_paths[idx].forwarded_to;
567 				best_port = port_paths[idx].port_num;
568 			}
569 
570 		}
571 	}
572 
573 	/*
574 	   if we are in enhanced routing mode and the best port is not
575 	   the local port 0
576 	 */
577 	if (routing_for_lmc && best_port && !scatter_ports) {
578 		/* Select the least hop port of the non used sys first */
579 		if (best_port_other_sys)
580 			best_port = best_port_other_sys;
581 		else if (best_port_other_node)
582 			best_port = best_port_other_node;
583 	} else if (scatter_ports) {
584 		/*
585 		 * There is some danger that this random could "rebalance" the routes
586 		 * every time, to combat this there is a global srandom that
587 		 * occurs at the start of every sweep.
588 		 */
589 		unsigned int idx = random() % scatter_possible_ports_count;
590 		best_port = scatter_possible_ports[idx];
591 	}
592 	return best_port;
593 }
594 
595 void osm_switch_clear_hops(IN osm_switch_t * p_sw)
596 {
597 	unsigned i;
598 
599 	for (i = 0; i < p_sw->num_hops; i++)
600 		if (p_sw->hops[i])
601 			memset(p_sw->hops[i], OSM_NO_PATH, p_sw->num_ports);
602 }
603 
604 static int alloc_lft(IN osm_switch_t * p_sw, uint16_t lids)
605 {
606 	uint16_t lft_size;
607 
608 	/* Ensure LFT is in units of LFT block size */
609 	lft_size = (lids / IB_SMP_DATA_SIZE + 1) * IB_SMP_DATA_SIZE;
610 	if (lft_size > p_sw->lft_size) {
611 		uint8_t *new_lft = realloc(p_sw->lft, lft_size);
612 		if (!new_lft)
613 			return -1;
614 		memset(new_lft + p_sw->lft_size, OSM_NO_PATH,
615 		       lft_size - p_sw->lft_size);
616 		p_sw->lft = new_lft;
617 		p_sw->lft_size = lft_size;
618 	}
619 
620 	return 0;
621 }
622 
623 int osm_switch_prepare_path_rebuild(IN osm_switch_t * p_sw, IN uint16_t max_lids)
624 {
625 	uint8_t **hops;
626 	uint8_t *new_lft;
627 	unsigned i;
628 
629 	if (alloc_lft(p_sw, max_lids))
630 		return -1;
631 
632 	for (i = 0; i < p_sw->num_ports; i++)
633 		osm_port_prof_construct(&p_sw->p_prof[i]);
634 
635 	osm_switch_clear_hops(p_sw);
636 
637 	if (!(new_lft = realloc(p_sw->new_lft, p_sw->lft_size)))
638 		return -1;
639 
640 	p_sw->new_lft = new_lft;
641 
642 	memset(p_sw->new_lft, OSM_NO_PATH, p_sw->lft_size);
643 
644 	if (!p_sw->hops) {
645 		hops = malloc((max_lids + 1) * sizeof(hops[0]));
646 		if (!hops)
647 			return -1;
648 		memset(hops, 0, (max_lids + 1) * sizeof(hops[0]));
649 		p_sw->hops = hops;
650 		p_sw->num_hops = max_lids + 1;
651 	} else if (max_lids + 1 > p_sw->num_hops) {
652 		hops = realloc(p_sw->hops, (max_lids + 1) * sizeof(hops[0]));
653 		if (!hops)
654 			return -1;
655 		memset(hops + p_sw->num_hops, 0,
656 		       (max_lids + 1 - p_sw->num_hops) * sizeof(hops[0]));
657 		p_sw->hops = hops;
658 		p_sw->num_hops = max_lids + 1;
659 	}
660 	p_sw->max_lid_ho = max_lids;
661 
662 	return 0;
663 }
664 
665 uint8_t osm_switch_get_port_least_hops(IN const osm_switch_t * p_sw,
666 				       IN const osm_port_t * p_port)
667 {
668 	uint16_t lid;
669 
670 	if (p_port->p_node->sw) {
671 		if (p_port->p_node->sw == p_sw)
672 			return 0;
673 		lid = osm_node_get_base_lid(p_port->p_node, 0);
674 		return osm_switch_get_least_hops(p_sw, cl_ntoh16(lid));
675 	} else {
676 		osm_physp_t *p = p_port->p_physp;
677 		uint8_t hops;
678 
679 		if (!p || !p->p_remote_physp || !p->p_remote_physp->p_node->sw)
680 			return OSM_NO_PATH;
681 		if (p->p_remote_physp->p_node->sw == p_sw)
682 			return 1;
683 		lid = osm_node_get_base_lid(p->p_remote_physp->p_node, 0);
684 		hops = osm_switch_get_least_hops(p_sw, cl_ntoh16(lid));
685 		return hops != OSM_NO_PATH ? hops + 1 : OSM_NO_PATH;
686 	}
687 }
688 
689 uint8_t osm_switch_recommend_mcast_path(IN osm_switch_t * p_sw,
690 					IN osm_port_t * p_port,
691 					IN uint16_t mlid_ho,
692 					IN boolean_t ignore_existing)
693 {
694 	uint16_t base_lid;
695 	uint8_t hops;
696 	uint8_t port_num;
697 	uint8_t num_ports;
698 	uint8_t least_hops;
699 
700 	CL_ASSERT(mlid_ho >= IB_LID_MCAST_START_HO);
701 
702 	if (p_port->p_node->sw) {
703 		if (p_port->p_node->sw == p_sw)
704 			return 0;
705 		base_lid = osm_port_get_base_lid(p_port);
706 	} else {
707 		osm_physp_t *p_physp = p_port->p_physp;
708 		if (!p_physp || !p_physp->p_remote_physp ||
709 		    !p_physp->p_remote_physp->p_node->sw)
710 			return OSM_NO_PATH;
711 		if (p_physp->p_remote_physp->p_node->sw == p_sw)
712 			return p_physp->p_remote_physp->port_num;
713 		base_lid =
714 		    osm_node_get_base_lid(p_physp->p_remote_physp->p_node, 0);
715 	}
716 	base_lid = cl_ntoh16(base_lid);
717 	num_ports = p_sw->num_ports;
718 
719 	/*
720 	   If the user wants us to ignore existing multicast routes,
721 	   then simply return the shortest hop count path to the
722 	   target port.
723 
724 	   Otherwise, return the first port that has a path to the target,
725 	   picking from the ports that are already in the multicast group.
726 	 */
727 	if (!ignore_existing) {
728 		for (port_num = 1; port_num < num_ports; port_num++) {
729 			if (!osm_mcast_tbl_is_port
730 			    (&p_sw->mcast_tbl, mlid_ho, port_num))
731 				continue;
732 			/*
733 			   Don't be too trusting of the current forwarding table!
734 			   Verify that the LID is reachable through this port.
735 			 */
736 			hops =
737 			    osm_switch_get_hop_count(p_sw, base_lid, port_num);
738 			if (hops != OSM_NO_PATH)
739 				return port_num;
740 		}
741 	}
742 
743 	/*
744 	   Either no existing mcast paths reach this port or we are
745 	   ignoring existing paths.
746 
747 	   Determine the best multicast path to the target.  Note that this
748 	   algorithm is slightly different from the one used for unicast route
749 	   recommendation.  In this case (multicast), we must NOT
750 	   perform any sort of load balancing.  We MUST take the FIRST
751 	   port found that has <= the lowest hop count path.  This prevents
752 	   more than one multicast path to the same remote switch which
753 	   prevents a multicast loop.  Multicast loops are bad since the same
754 	   multicast packet will go around and around, inevitably creating
755 	   a black hole that will destroy the Earth in a firey conflagration.
756 	 */
757 	least_hops = osm_switch_get_least_hops(p_sw, base_lid);
758 	if (least_hops == OSM_NO_PATH)
759 		return OSM_NO_PATH;
760 	for (port_num = 1; port_num < num_ports; port_num++)
761 		if (osm_switch_get_hop_count(p_sw, base_lid, port_num) ==
762 		    least_hops)
763 			break;
764 
765 	CL_ASSERT(port_num < num_ports);
766 	return port_num;
767 }
768