1 /*
2  * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved.
3  * Copyright (c) 2002-2011 Mellanox Technologies LTD. All rights reserved.
4  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
5  * Copyright (c) 2008 Xsigo Systems Inc.  All rights reserved.
6  * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
7  * Copyright (c) 2010 HNR Consulting. All rights reserved.
8  * Copyright (C) 2012-2013 Tokyo Institute of Technology. All rights reserved.
9  *
10  * This software is available to you under a choice of one of two
11  * licenses.  You may choose to be licensed under the terms of the GNU
12  * General Public License (GPL) Version 2, available from the file
13  * COPYING in the main directory of this source tree, or the
14  * OpenIB.org BSD license below:
15  *
16  *     Redistribution and use in source and binary forms, with or
17  *     without modification, are permitted provided that the following
18  *     conditions are met:
19  *
20  *      - Redistributions of source code must retain the above
21  *        copyright notice, this list of conditions and the following
22  *        disclaimer.
23  *
24  *      - Redistributions in binary form must reproduce the above
25  *        copyright notice, this list of conditions and the following
26  *        disclaimer in the documentation and/or other materials
27  *        provided with the distribution.
28  *
29  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
30  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
31  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
32  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
33  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
34  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
35  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
36  * SOFTWARE.
37  *
38  */
39 
40 /*
41  * Abstract:
42  *    Implementation of osm_mcast_mgr_t.
43  * This file implements the Multicast Manager object.
44  */
45 
46 #if HAVE_CONFIG_H
47 #  include <config.h>
48 #endif				/* HAVE_CONFIG_H */
49 
50 #include <stdlib.h>
51 #include <string.h>
52 #include <iba/ib_types.h>
53 #include <complib/cl_debug.h>
54 #include <opensm/osm_file_ids.h>
55 #define FILE_ID OSM_FILE_MCAST_MGR_C
56 #include <opensm/osm_opensm.h>
57 #include <opensm/osm_sm.h>
58 #include <opensm/osm_multicast.h>
59 #include <opensm/osm_node.h>
60 #include <opensm/osm_switch.h>
61 #include <opensm/osm_helper.h>
62 #include <opensm/osm_msgdef.h>
63 #include <opensm/osm_mcast_mgr.h>
64 
mcast_work_obj_new(IN osm_port_t * p_port)65 static osm_mcast_work_obj_t *mcast_work_obj_new(IN osm_port_t * p_port)
66 {
67 	osm_mcast_work_obj_t *p_obj;
68 
69 	/*
70 	   clean allocated memory to avoid assertion when trying to insert to
71 	   qlist.
72 	   see cl_qlist_insert_tail(): CL_ASSERT(p_list_item->p_list != p_list)
73 	 */
74 	p_obj = malloc(sizeof(*p_obj));
75 	if (p_obj) {
76 		memset(p_obj, 0, sizeof(*p_obj));
77 		p_obj->p_port = p_port;
78 	}
79 
80 	return p_obj;
81 }
82 
/* Release a work object allocated by mcast_work_obj_new(). */
static void mcast_work_obj_delete(IN osm_mcast_work_obj_t * p_wobj)
{
	free(p_wobj);
}
87 
/*
 * Build the list and GUID-keyed map of unique member ports across all
 * multicast groups sharing this mbox's MLID.
 *
 * One work object is created per distinct port GUID and linked into
 * both 'list' and 'map' (the map items live inside the work objects).
 *
 * Returns 0 on success.  On allocation failure, frees everything that
 * was already queued and returns -1, so the caller need not clean up.
 */
int osm_mcast_make_port_list_and_map(cl_qlist_t * list, cl_qmap_t * map,
				     osm_mgrp_box_t * mbox)
{
	cl_map_item_t *map_item;
	cl_list_item_t *list_item;
	osm_mgrp_t *mgrp;
	osm_mcm_port_t *mcm_port;
	osm_mcast_work_obj_t *wobj;

	cl_qmap_init(map);
	cl_qlist_init(list);

	for (list_item = cl_qlist_head(&mbox->mgrp_list);
	     list_item != cl_qlist_end(&mbox->mgrp_list);
	     list_item = cl_qlist_next(list_item)) {
		mgrp = cl_item_obj(list_item, mgrp, list_item);
		for (map_item = cl_qmap_head(&mgrp->mcm_port_tbl);
		     map_item != cl_qmap_end(&mgrp->mcm_port_tbl);
		     map_item = cl_qmap_next(map_item)) {
			/* Acquire the port object for this port guid, then
			   create the new worker object to build the list. */
			mcm_port = cl_item_obj(map_item, mcm_port, map_item);
			/* Skip a port already collected via another group
			   sharing this MLID. */
			if (cl_qmap_get(map, mcm_port->port->guid) !=
			    cl_qmap_end(map))
				continue;
			wobj = mcast_work_obj_new(mcm_port->port);
			if (!wobj) {
				/* Don't leak the objects already queued;
				   callers treat -1 as fatal and do not
				   drain the list themselves. */
				osm_mcast_drop_port_list(list);
				return -1;
			}
			cl_qlist_insert_tail(list, &wobj->list_item);
			cl_qmap_insert(map, mcm_port->port->guid,
				       &wobj->map_item);
		}
	}
	return 0;
}
123 
/* Drain 'list', releasing every work object it holds. */
void osm_mcast_drop_port_list(cl_qlist_t * list)
{
	osm_mcast_work_obj_t *wobj;

	while (cl_qlist_count(list) > 0) {
		wobj = (osm_mcast_work_obj_t *) cl_qlist_remove_head(list);
		mcast_work_obj_delete(wobj);
	}
}
130 
/* Tear down the mbox's spanning tree (if any) and clear its root. */
void osm_purge_mtree(osm_sm_t * sm, IN osm_mgrp_box_t * mbox)
{
	OSM_LOG_ENTER(sm->p_log);

	if (mbox->root != NULL)
		osm_mtree_destroy(mbox->root);

	mbox->root = NULL;

	OSM_LOG_EXIT(sm->p_log);
}
141 
/*
 * Populate map 'm' with every switch that has group members, keyed by
 * switch node GUID.  As a side effect, marks each switch that is itself
 * a member (is_mc_member) and counts member hosts hanging off each
 * switch (num_of_mcm); destroy_mgrp_switch_map() undoes both.
 */
static void create_mgrp_switch_map(cl_qmap_t * m, cl_qlist_t * port_list)
{
	cl_list_item_t *item;
	osm_mcast_work_obj_t *p_wobj;
	osm_port_t *p_port;
	osm_switch_t *p_sw;
	ib_net64_t sw_guid;

	cl_qmap_init(m);
	for (item = cl_qlist_head(port_list);
	     item != cl_qlist_end(port_list); item = cl_qlist_next(item)) {
		p_wobj = cl_item_obj(item, p_wobj, list_item);
		p_port = p_wobj->p_port;
		if (p_port->p_node->sw) {
			/* The member port belongs to a switch. */
			p_sw = p_port->p_node->sw;
			p_sw->is_mc_member = 1;
		} else if (p_port->p_physp->p_remote_physp) {
			/* Host port: account it on the adjacent switch. */
			p_sw = p_port->p_physp->p_remote_physp->p_node->sw;
			p_sw->num_of_mcm++;
		} else
			continue;
		sw_guid = osm_node_get_node_guid(p_sw->p_node);
		if (cl_qmap_get(m, sw_guid) == cl_qmap_end(m))
			cl_qmap_insert(m, sw_guid, &p_sw->mgrp_item);
	}
}
168 
/*
 * Reset the per-switch membership markers set by
 * create_mgrp_switch_map(), then empty the map.
 */
static void destroy_mgrp_switch_map(cl_qmap_t * m)
{
	cl_map_item_t *item;
	osm_switch_t *p_sw;

	for (item = cl_qmap_head(m); item != cl_qmap_end(m);
	     item = cl_qmap_next(item)) {
		p_sw = cl_item_obj(item, p_sw, mgrp_item);
		p_sw->is_mc_member = 0;
		p_sw->num_of_mcm = 0;
	}
	cl_qmap_remove_all(m);
}
181 
182 /**********************************************************************
183  Calculate the maximal "min hops" from the given switch to any
184  of the group HCAs
185  **********************************************************************/
186 #ifdef OSM_VENDOR_INTF_ANAFA
static float mcast_mgr_compute_avg_hops(osm_sm_t * sm, cl_qmap_t * m,
					const osm_switch_t * this_sw)
{
	float avg_hops = 0;
	uint32_t hops = 0;
	uint32_t num_ports = 0;
	uint16_t lid;
	uint32_t least_hops;
	cl_map_item_t *i;
	osm_switch_t *sw;

	OSM_LOG_ENTER(sm->p_log);

	for (i = cl_qmap_head(m); i != cl_qmap_end(m); i = cl_qmap_next(i)) {
		/* Items were inserted via &sw->mgrp_item in
		   create_mgrp_switch_map(), so they must be recovered
		   through the same member (mcast_item would compute a
		   wrong container pointer). */
		sw = cl_item_obj(i, sw, mgrp_item);
		lid = cl_ntoh16(osm_node_get_base_lid(sw->p_node, 0));
		least_hops = osm_switch_get_least_hops(this_sw, lid);
		/* for all host that are MC members and attached to the switch,
		   we should add the (least_hops + 1) * number_of_such_hosts.
		   If switch itself is in the MC, we should add the least_hops only */
		hops += (least_hops + 1) * sw->num_of_mcm +
		    least_hops * sw->is_mc_member;
		num_ports += sw->num_of_mcm + sw->is_mc_member;
	}

	/* We shouldn't be here if there aren't any ports in the group. */
	CL_ASSERT(num_ports);

	/* Divide in floating point; integer division would truncate the
	   fractional part of the average before the cast. */
	avg_hops = (float)hops / (float)num_ports;

	OSM_LOG_EXIT(sm->p_log);
	return avg_hops;
}
220 #else
static float mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qmap_t * m,
					const osm_switch_t * this_sw)
{
	uint32_t worst = 0;
	uint32_t path_len;
	uint16_t base_lid;
	cl_map_item_t *item;
	osm_switch_t *member_sw;

	OSM_LOG_ENTER(sm->p_log);

	/*
	   Walk every switch holding group members and track the longest
	   hop count from this_sw to a member's base LID.  A host hanging
	   off a remote switch costs one extra hop beyond the switch.
	 */
	for (item = cl_qmap_head(m); item != cl_qmap_end(m);
	     item = cl_qmap_next(item)) {
		member_sw = cl_item_obj(item, member_sw, mgrp_item);
		base_lid =
		    cl_ntoh16(osm_node_get_base_lid(member_sw->p_node, 0));
		path_len = osm_switch_get_least_hops(this_sw, base_lid);
		if (!member_sw->is_mc_member)
			path_len++;
		if (path_len > worst)
			worst = path_len;
	}

	/* A result of 0 is possible: it means the only member of the
	   mcast group is this_sw itself. */

	OSM_LOG_EXIT(sm->p_log);
	return (float)worst;
}
252 #endif
253 
254 /**********************************************************************
255    This function attempts to locate the optimal switch for the
256    center of the spanning tree.  The current algorithm chooses
257    a switch with the lowest average hop count to the members
258    of the multicast group.
259 **********************************************************************/
mcast_mgr_find_optimal_switch(osm_sm_t * sm,cl_qlist_t * list)260 static osm_switch_t *mcast_mgr_find_optimal_switch(osm_sm_t * sm,
261 						   cl_qlist_t * list)
262 {
263 	cl_qmap_t mgrp_sw_map;
264 	cl_qmap_t *p_sw_tbl;
265 	osm_switch_t *p_sw, *p_best_sw = NULL;
266 	float hops = 0;
267 	float best_hops = 10000;	/* any big # will do */
268 
269 	OSM_LOG_ENTER(sm->p_log);
270 
271 	p_sw_tbl = &sm->p_subn->sw_guid_tbl;
272 
273 	create_mgrp_switch_map(&mgrp_sw_map, list);
274 	for (p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
275 	     p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl);
276 	     p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item)) {
277 		if (!osm_switch_supports_mcast(p_sw))
278 			continue;
279 
280 #ifdef OSM_VENDOR_INTF_ANAFA
281 		hops = mcast_mgr_compute_avg_hops(sm, &mgrp_sw_map, p_sw);
282 #else
283 		hops = mcast_mgr_compute_max_hops(sm, &mgrp_sw_map, p_sw);
284 #endif
285 
286 		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
287 			"Switch 0x%016" PRIx64 ", hops = %f\n",
288 			cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)), hops);
289 
290 		if (hops < best_hops) {
291 			p_best_sw = p_sw;
292 			best_hops = hops;
293 		}
294 	}
295 
296 	if (p_best_sw)
297 		OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
298 			"Best switch is 0x%" PRIx64 " (%s), hops = %f\n",
299 			cl_ntoh64(osm_node_get_node_guid(p_best_sw->p_node)),
300 			p_best_sw->p_node->print_desc, best_hops);
301 	else
302 		OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
303 			"No multicast capable switches detected\n");
304 
305 	destroy_mgrp_switch_map(&mgrp_sw_map);
306 	OSM_LOG_EXIT(sm->p_log);
307 	return p_best_sw;
308 }
309 
310 /**********************************************************************
311    This function returns the existing or optimal root switch for the tree.
312 **********************************************************************/
osm_mcast_mgr_find_root_switch(osm_sm_t * sm,cl_qlist_t * list)313 osm_switch_t *osm_mcast_mgr_find_root_switch(osm_sm_t * sm, cl_qlist_t *list)
314 {
315 	osm_switch_t *p_sw = NULL;
316 
317 	OSM_LOG_ENTER(sm->p_log);
318 
319 	/*
320 	   We always look for the best multicast tree root switch.
321 	   Otherwise since we always start with a a single join
322 	   the root will be always on the first switch attached to it.
323 	   - Very bad ...
324 	 */
325 	p_sw = mcast_mgr_find_optimal_switch(sm, list);
326 
327 	OSM_LOG_EXIT(sm->p_log);
328 	return p_sw;
329 }
330 
/*
 * Push one block of the multicast forwarding table (MFT) to a switch.
 *
 * Reads block (block_num, position) from the switch's software copy of
 * the MFT; if osm_mcast_tbl_get_block() reports it needs writing, sends
 * it with a Set(MulticastForwardingTable) MAD over the switch's
 * directed-route path.
 *
 * Returns 0 on success (including "nothing to send"), -1 if the MAD
 * could not be sent.
 */
static int mcast_mgr_set_mft_block(osm_sm_t * sm, IN osm_switch_t * p_sw,
				   uint32_t block_num, uint32_t position)
{
	osm_node_t *p_node;
	osm_physp_t *p_physp;
	osm_dr_path_t *p_path;
	osm_madw_context_t context;
	ib_api_status_t status;
	uint32_t block_id_ho;
	osm_mcast_tbl_t *p_tbl;
	ib_net16_t block[IB_MCAST_BLOCK_SIZE];
	int ret = 0;

	CL_ASSERT(sm);

	OSM_LOG_ENTER(sm->p_log);

	CL_ASSERT(p_sw);

	p_node = p_sw->p_node;

	CL_ASSERT(p_node);

	/* The directed route to the switch hangs off its port 0. */
	p_physp = osm_node_get_physp_ptr(p_node, 0);
	p_path = osm_physp_get_dr_path_ptr(p_physp);

	/*
	   Send multicast forwarding table blocks to the switch
	   as long as the switch indicates it has blocks needing
	   configuration.
	 */

	context.mft_context.node_guid = osm_node_get_node_guid(p_node);
	context.mft_context.set_method = TRUE;

	p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);

	if (osm_mcast_tbl_get_block(p_tbl, (uint16_t) block_num,
				    (uint8_t) position, block)) {
		/* Attribute modifier (host order): position occupies the
		   top nibble, block number the low bits. */
		block_id_ho = block_num + (position << 28);

		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
			"Writing MFT block %u position %u to switch 0x%" PRIx64
			"\n", block_num, position,
			cl_ntoh64(context.mft_context.node_guid));

		status = osm_req_set(sm, p_path, (void *)block, sizeof(block),
				     IB_MAD_ATTR_MCAST_FWD_TBL,
				     cl_hton32(block_id_ho), FALSE,
				     ib_port_info_get_m_key(&p_physp->port_info),
				     CL_DISP_MSGID_NONE, &context);
		if (status != IB_SUCCESS) {
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A02: "
				"Sending multicast fwd. tbl. block 0x%X to %s "
				"failed (%s)\n", block_id_ho,
				p_node->print_desc, ib_get_err_str(status));
			ret = -1;
		}
	}

	OSM_LOG_EXIT(sm->p_log);
	return ret;
}
394 
395 /**********************************************************************
396   This is part of the recursive function to compute the paths in the
397   spanning tree that emanate from this switch.  On input, the p_list
398   contains the group members that must be routed from this switch.
399 **********************************************************************/
/*
 * Distribute the work objects in p_list into list_array, indexed by the
 * switch egress port recommended for reaching each member.  Members for
 * which no usable path exists (OSM_NO_PATH, or a recommended port
 * beyond array_size) are logged and their work objects freed.  On
 * return p_list is empty.
 */
static void mcast_mgr_subdivide(osm_sm_t * sm, uint16_t mlid_ho,
				osm_switch_t * p_sw, cl_qlist_t * p_list,
				cl_qlist_t * list_array, uint8_t array_size)
{
	uint8_t port_num;
	boolean_t ignore_existing;
	osm_mcast_work_obj_t *p_wobj;

	OSM_LOG_ENTER(sm->p_log);

	/*
	   For Multicast Groups, we don't want to count on previous
	   configurations - since we can easily generate a storm
	   by loops.
	 */
	ignore_existing = TRUE;

	/*
	   Subdivide the set of ports into non-overlapping subsets
	   that will be routed to other switches.
	 */
	while ((p_wobj =
		(osm_mcast_work_obj_t *) cl_qlist_remove_head(p_list)) !=
	       (osm_mcast_work_obj_t *) cl_qlist_end(p_list)) {
		port_num =
		    osm_switch_recommend_mcast_path(p_sw, p_wobj->p_port,
						    mlid_ho, ignore_existing);
		if (port_num == OSM_NO_PATH) {
			/*
			   This typically occurs if the switch does not support
			   multicast and the multicast tree must branch at this
			   switch.
			 */
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A03: "
				"Error routing MLID 0x%X through switch 0x%"
				PRIx64 " %s\n"
				"\t\t\t\tNo multicast paths from this switch "
				"for port with LID %u\n", mlid_ho,
				cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)),
				p_sw->p_node->print_desc,
				cl_ntoh16(osm_port_get_base_lid
					  (p_wobj->p_port)));
			mcast_work_obj_delete(p_wobj);
			continue;
		}

		/* Defensive: the recommended port must index into the
		   caller-provided per-port list array. */
		if (port_num >= array_size) {
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A04: "
				"Error routing MLID 0x%X through switch 0x%"
				PRIx64 " %s\n"
				"\t\t\t\tNo multicast paths from this switch "
				"to port with LID %u\n", mlid_ho,
				cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)),
				p_sw->p_node->print_desc,
				cl_ntoh16(osm_port_get_base_lid
					  (p_wobj->p_port)));
			mcast_work_obj_delete(p_wobj);
			continue;
		}

		cl_qlist_insert_tail(&list_array[port_num], &p_wobj->list_item);
	}

	OSM_LOG_EXIT(sm->p_log);
}
465 
/* Log (at ERROR level, if active) every member that could not be
   routed for this MLID, then free the whole list. */
static void mcast_mgr_purge_list(osm_sm_t * sm, uint16_t mlid, cl_qlist_t * list)
{
	if (OSM_LOG_IS_ACTIVE_V2(sm->p_log, OSM_LOG_ERROR)) {
		cl_list_item_t *item;
		osm_mcast_work_obj_t *wobj;

		for (item = cl_qlist_head(list);
		     item != cl_qlist_end(list); item = cl_qlist_next(item)) {
			wobj = cl_item_obj(item, wobj, list_item);
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A06: "
				"Unable to route MLID 0x%X for port 0x%" PRIx64 "\n",
				mlid, cl_ntoh64(osm_port_get_guid(wobj->p_port)));
		}
	}
	osm_mcast_drop_port_list(list);
}
481 
482 /**********************************************************************
483   This is the recursive function to compute the paths in the spanning
484   tree that emanate from this switch.  On input, the p_list contains
485   the group members that must be routed from this switch.
486 
487   The function returns the newly created mtree node element.
488 **********************************************************************/
static osm_mtree_node_t *mcast_mgr_branch(osm_sm_t * sm, uint16_t mlid_ho,
					  osm_switch_t * p_sw,
					  cl_qlist_t * p_list, uint8_t depth,
					  uint8_t upstream_port,
					  uint8_t * p_max_depth)
{
	uint8_t max_children;
	osm_mtree_node_t *p_mtn = NULL;
	cl_qlist_t *list_array = NULL;
	uint8_t i;
	ib_net64_t node_guid;
	osm_mcast_work_obj_t *p_wobj;
	cl_qlist_t *p_port_list;
	size_t count;
	osm_mcast_tbl_t *p_tbl;

	OSM_LOG_ENTER(sm->p_log);

	CL_ASSERT(p_sw);
	CL_ASSERT(p_list);
	CL_ASSERT(p_max_depth);

	node_guid = osm_node_get_node_guid(p_sw->p_node);

	OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
		"Routing MLID 0x%X through switch 0x%" PRIx64
		" %s, %u nodes at depth %u\n",
		mlid_ho, cl_ntoh64(node_guid), p_sw->p_node->print_desc,
		cl_qlist_count(p_list), depth);

	CL_ASSERT(cl_qlist_count(p_list) > 0);

	depth++;

	/* Recursing deeper than 64 hops means the routing tables are
	   broken (e.g. contain a loop); abandon this branch. */
	if (depth >= 64) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A21: "
			"Maximal hops number is reached for MLID 0x%x."
			" Break processing\n", mlid_ho);
		mcast_mgr_purge_list(sm, mlid_ho, p_list);
		goto Exit;
	}

	if (depth > *p_max_depth) {
		CL_ASSERT(depth == *p_max_depth + 1);
		*p_max_depth = depth;
	}

	if (osm_switch_supports_mcast(p_sw) == FALSE) {
		/*
		   This switch doesn't do multicast.  Clean-up.
		 */
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A14: "
			"Switch 0x%" PRIx64 " %s does not support multicast\n",
			cl_ntoh64(node_guid), p_sw->p_node->print_desc);

		/*
		   Deallocate all the work objects on this branch of the tree.
		 */
		mcast_mgr_purge_list(sm, mlid_ho, p_list);
		goto Exit;
	}

	p_mtn = osm_mtree_node_new(p_sw);
	if (p_mtn == NULL) {
		/*
		   We are unable to continue routing down this
		   leg of the tree.  Clean-up.
		 */
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A15: "
			"Insufficient memory to build multicast tree\n");

		/*
		   Deallocate all the work objects on this branch of the tree.
		 */
		mcast_mgr_purge_list(sm, mlid_ho, p_list);
		goto Exit;
	}

	max_children = osm_mtree_node_get_max_children(p_mtn);

	CL_ASSERT(max_children > 1);

	/*
	   Prepare an empty list for each port in the switch.
	   TO DO - this list array could probably be moved
	   inside the switch element to save on malloc thrashing.
	 */
	list_array = malloc(sizeof(cl_qlist_t) * max_children);
	if (list_array == NULL) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A16: "
			"Unable to allocate list array\n");
		mcast_mgr_purge_list(sm, mlid_ho, p_list);
		osm_mtree_destroy(p_mtn);
		p_mtn = NULL;
		goto Exit;
	}

	memset(list_array, 0, sizeof(cl_qlist_t) * max_children);

	for (i = 0; i < max_children; i++)
		cl_qlist_init(&list_array[i]);

	/* Split p_list into per-egress-port sublists; p_list is empty
	   after this call. */
	mcast_mgr_subdivide(sm, mlid_ho, p_sw, p_list, list_array, max_children);

	p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);

	/*
	   Add the upstream port to the forwarding table unless
	   we're at the root of the spanning tree.
	 */
	if (depth > 1) {
		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
			"Adding upstream port %u\n", upstream_port);

		CL_ASSERT(upstream_port);
		osm_mcast_tbl_set(p_tbl, mlid_ho, upstream_port);
	}

	/*
	   For each port that was allocated some routes,
	   recurse into this function to continue building the tree
	   if the node on the other end of that port is another switch.
	   Otherwise, the node is an endpoint, and we've found a leaf
	   of the tree.  Mark leaves with our special pointer value.
	 */

	for (i = 0; i < max_children; i++) {
		const osm_physp_t *p_physp;
		const osm_physp_t *p_remote_physp;
		osm_node_t *p_node;
		const osm_node_t *p_remote_node;

		p_port_list = &list_array[i];

		count = cl_qlist_count(p_port_list);

		/*
		   There should be no children routed through the upstream port!
		 */
		CL_ASSERT(upstream_port == 0 || i != upstream_port ||
			  (i == upstream_port && count == 0));

		if (count == 0)
			continue;	/* No routes down this port. */

		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
			"Routing %zu destinations via switch port %u\n",
			count, i);

		if (i == 0) {
			/* This means we are adding the switch to the MC group.
			   We do not need to continue looking at the remote
			   port, just needed to add the port to the table */
			CL_ASSERT(count == 1);

			osm_mcast_tbl_set(p_tbl, mlid_ho, i);

			p_wobj = (osm_mcast_work_obj_t *)
			    cl_qlist_remove_head(p_port_list);
			mcast_work_obj_delete(p_wobj);
			continue;
		}

		p_node = p_sw->p_node;
		p_remote_node = osm_node_get_remote_node(p_node, i, NULL);
		if (!p_remote_node) {
			/*
			 * If we reached here, it means the minhop table has
			 * invalid entries that leads to disconnected ports.
			 *
			 * A possible reason for the code to reach here is
			 * that ucast cache is enabled, and a leaf switch that
			 * is used as a non-leaf switch in a multicast has been
			 * removed from the fabric.
			 *
			 * When it happens, we should invalidate the cache
			 * and force rerouting of the fabric.
			 */

			OSM_LOG(sm->p_log, OSM_LOG_ERROR,
				"ERR 0A1E: Tried to route MLID 0x%X through "
				"disconnected switch 0x%" PRIx64 " port %d\n",
				mlid_ho, cl_ntoh64(node_guid), i);

			/* Free memory */
			mcast_mgr_purge_list(sm, mlid_ho, p_port_list);

			/* Invalidate ucast cache */
			if (sm->ucast_mgr.p_subn->opt.use_ucast_cache &&
			    sm->ucast_mgr.cache_valid) {
				OSM_LOG(sm->p_log, OSM_LOG_INFO,
					"Unicast Cache will be invalidated due "
					"to multicast routing errors\n");
				osm_ucast_cache_invalidate(&sm->ucast_mgr);
				sm->p_subn->force_heavy_sweep = TRUE;
			}

			continue;
		}

		/*
		   This port routes frames for this mcast group.  Therefore,
		   set the appropriate bit in the multicast forwarding
		   table for this switch.
		 */
		osm_mcast_tbl_set(p_tbl, mlid_ho, i);

		if (osm_node_get_type(p_remote_node) == IB_NODE_TYPE_SWITCH) {
			/*
			   Acquire a pointer to the remote switch then recurse.
			 */
			CL_ASSERT(p_remote_node->sw);

			p_physp = osm_node_get_physp_ptr(p_node, i);
			CL_ASSERT(p_physp);

			p_remote_physp = osm_physp_get_remote(p_physp);
			CL_ASSERT(p_remote_physp);

			p_mtn->child_array[i] =
			    mcast_mgr_branch(sm, mlid_ho, p_remote_node->sw,
					     p_port_list, depth,
					     osm_physp_get_port_num
					     (p_remote_physp), p_max_depth);
		} else {
			/*
			   The neighbor node is not a switch, so this
			   must be a leaf.
			 */
			CL_ASSERT(count == 1);

			p_mtn->child_array[i] = OSM_MTREE_LEAF;
			p_wobj = (osm_mcast_work_obj_t *)
			    cl_qlist_remove_head(p_port_list);

			CL_ASSERT(cl_is_qlist_empty(p_port_list));

			OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
				"Found leaf for port 0x%016" PRIx64
				" on switch port %u\n",
				cl_ntoh64(osm_port_get_guid(p_wobj->p_port)),
				i);
			mcast_work_obj_delete(p_wobj);
		}
	}

	free(list_array);
Exit:
	OSM_LOG_EXIT(sm->p_log);
	return p_mtn;
}
740 
/*
 * Rebuild the single spanning tree for the mbox's MLID from scratch:
 * destroy any existing tree, collect the member ports, pick a root
 * switch, and recurse with mcast_mgr_branch() to construct the tree
 * and program the switches' software MFT copies.
 *
 * Returns IB_SUCCESS (also for the trivial <2-member case) or IB_ERROR
 * when the port list cannot be built or no root switch is found.
 */
static ib_api_status_t mcast_mgr_build_spanning_tree(osm_sm_t * sm,
						     osm_mgrp_box_t * mbox)
{
	cl_qlist_t port_list;
	cl_qmap_t port_map;
	uint32_t num_ports;
	osm_switch_t *p_sw;
	ib_api_status_t status = IB_SUCCESS;
	uint8_t max_depth = 0;

	OSM_LOG_ENTER(sm->p_log);

	/*
	   TO DO - for now, just blow away the old tree.
	   In the future we'll need to construct the tree based
	   on multicast forwarding table information if the user wants to
	   preserve existing multicast routes.
	 */
	osm_purge_mtree(sm, mbox);

	/* build the first "subset" containing all member ports */
	if (osm_mcast_make_port_list_and_map(&port_list, &port_map, mbox)) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A10: "
			"Insufficient memory to make port list\n");
		status = IB_ERROR;
		goto Exit;
	}

	num_ports = cl_qlist_count(&port_list);
	if (num_ports < 2) {
		/* 0 or 1 members - no tree is needed at all. */
		OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
			"MLID 0x%X has %u members - nothing to do\n",
			mbox->mlid, num_ports);
		osm_mcast_drop_port_list(&port_list);
		goto Exit;
	}

	/*
	   This function builds the single spanning tree recursively.
	   At each stage, the ports to be reached are divided into
	   non-overlapping subsets of member ports that can be reached through
	   a given switch port.  Construction then moves down each
	   branch, and the process starts again with each branch computing
	   for its own subset of the member ports.

	   The maximum recursion depth is at worst the maximum hop count in the
	   subnet, which is spec limited to 64.
	 */

	/*
	   Locate the switch around which to create the spanning
	   tree for this multicast group.
	 */
	p_sw = osm_mcast_mgr_find_root_switch(sm, &port_list);
	if (p_sw == NULL) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A08: "
			"Unable to locate a suitable switch for group 0x%X\n",
			mbox->mlid);
		osm_mcast_drop_port_list(&port_list);
		status = IB_ERROR;
		goto Exit;
	}

	/* mcast_mgr_branch consumes port_list and returns the tree root
	   (may be NULL on error; errors are logged inside). */
	mbox->root = mcast_mgr_branch(sm, mbox->mlid, p_sw, &port_list, 0, 0,
				      &max_depth);

	OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
		"Configured MLID 0x%X for %u ports, max tree depth = %u\n",
		mbox->mlid, num_ports, max_depth);
Exit:
	OSM_LOG_EXIT(sm->p_log);
	return status;
}
814 
815 #if 0
816 /* unused */
/*
 * NOTE: this function is compiled out (#if 0) and unused.
 *
 * Program one switch's software MFT for the group's MLID from a single
 * mtree node: clear the MLID's entries, then set a port bit for every
 * non-NULL child of p_mtn.
 */
void osm_mcast_mgr_set_table(osm_sm_t * sm, IN const osm_mgrp_t * p_mgrp,
			     IN const osm_mtree_node_t * p_mtn)
{
	uint8_t i;
	uint8_t max_children;
	osm_mtree_node_t *p_child_mtn;
	uint16_t mlid_ho;
	osm_mcast_tbl_t *p_tbl;
	osm_switch_t *p_sw;

	OSM_LOG_ENTER(sm->p_log);

	mlid_ho = cl_ntoh16(osm_mgrp_get_mlid(p_mgrp));
	p_sw = osm_mtree_node_get_switch_ptr(p_mtn);

	CL_ASSERT(p_sw);

	OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
		"Configuring MLID 0x%X on switch 0x%" PRIx64 "\n",
		mlid_ho, osm_node_get_node_guid(p_sw->p_node));

	/*
	   For every child of this tree node, set the corresponding
	   bit in the switch's mcast table.
	 */
	p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
	max_children = osm_mtree_node_get_max_children(p_mtn);

	CL_ASSERT(max_children <= osm_switch_get_num_ports(p_sw));

	/* Start from a clean slate for this MLID. */
	osm_mcast_tbl_clear_mlid(p_tbl, mlid_ho);

	for (i = 0; i < max_children; i++) {
		p_child_mtn = osm_mtree_node_get_child(p_mtn, i);
		if (p_child_mtn == NULL)
			continue;

		osm_mcast_tbl_set(p_tbl, mlid_ho, i);
	}

	OSM_LOG_EXIT(sm->p_log);
}
859 #endif
860 
/* Remove this MLID's routing entries from every switch's software MFT. */
static void mcast_mgr_clear(osm_sm_t * sm, uint16_t mlid)
{
	cl_qmap_t *p_sw_tbl;
	cl_map_item_t *item;
	osm_switch_t *p_sw;
	osm_mcast_tbl_t *p_mcast_tbl;

	OSM_LOG_ENTER(sm->p_log);

	p_sw_tbl = &sm->p_subn->sw_guid_tbl;
	for (item = cl_qmap_head(p_sw_tbl); item != cl_qmap_end(p_sw_tbl);
	     item = cl_qmap_next(item)) {
		p_sw = (osm_switch_t *) item;
		p_mcast_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
		osm_mcast_tbl_clear_mlid(p_mcast_tbl, mlid);
	}

	OSM_LOG_EXIT(sm->p_log);
}
880 
881 #if 0
882 /* TO DO - make this real -- at least update spanning tree */
883 /**********************************************************************
884    Lock must be held on entry.
885 **********************************************************************/
/*
 * NOTE: this function is compiled out (#if 0) and unused; see the
 * "TO DO - make this real" comment above.
 *
 * Attempt to add a single port to an existing multicast tree without a
 * full rebuild: if the switch adjacent to the port is already in the
 * tree for this MLID, set the corresponding bit in that switch's MFT.
 *
 * Returns IB_SUCCESS when the port was added (or nothing needed to be
 * done), IB_ERROR when the port/switch objects cannot be resolved or
 * the join state is unrecognized.
 */
ib_api_status_t osm_mcast_mgr_process_single(osm_sm_t * sm,
					     IN ib_net16_t const mlid,
					     IN ib_net64_t const port_guid,
					     IN uint8_t const join_state)
{
	uint8_t port_num;
	uint16_t mlid_ho;
	ib_net64_t sw_guid;
	osm_port_t *p_port;
	osm_physp_t *p_physp;
	osm_physp_t *p_remote_physp;
	osm_node_t *p_remote_node;
	osm_mcast_tbl_t *p_mcast_tbl;
	ib_api_status_t status = IB_SUCCESS;

	OSM_LOG_ENTER(sm->p_log);

	CL_ASSERT(mlid);
	CL_ASSERT(port_guid);

	mlid_ho = cl_ntoh16(mlid);

	OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
		"Attempting to add port 0x%" PRIx64 " to MLID 0x%X, "
		"\n\t\t\t\tjoin state = 0x%X\n",
		cl_ntoh64(port_guid), mlid_ho, join_state);

	/*
	   Acquire the Port object.
	 */
	p_port = osm_get_port_by_guid(sm->p_subn, port_guid);
	if (!p_port) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A01: "
			"Unable to acquire port object for 0x%" PRIx64 "\n",
			cl_ntoh64(port_guid));
		status = IB_ERROR;
		goto Exit;
	}

	p_physp = p_port->p_physp;
	if (p_physp == NULL) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A05: "
			"Unable to acquire phsyical port object for 0x%" PRIx64
			"\n", cl_ntoh64(port_guid));
		status = IB_ERROR;
		goto Exit;
	}

	p_remote_physp = osm_physp_get_remote(p_physp);
	if (p_remote_physp == NULL) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A11: "
			"Unable to acquire remote phsyical port object "
			"for 0x%" PRIx64 "\n", cl_ntoh64(port_guid));
		status = IB_ERROR;
		goto Exit;
	}

	p_remote_node = osm_physp_get_node_ptr(p_remote_physp);

	CL_ASSERT(p_remote_node);

	sw_guid = osm_node_get_node_guid(p_remote_node);

	/* The shortcut only works when the port hangs directly off a
	   switch that is already part of the multicast tree. */
	if (osm_node_get_type(p_remote_node) != IB_NODE_TYPE_SWITCH) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A22: "
			"Remote node not a switch node 0x%" PRIx64 "\n",
			cl_ntoh64(sw_guid));
		status = IB_ERROR;
		goto Exit;
	}

	if (!p_remote_node->sw) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A12: "
			"No switch object 0x%" PRIx64 "\n", cl_ntoh64(sw_guid));
		status = IB_ERROR;
		goto Exit;
	}

	if (osm_switch_is_in_mcast_tree(p_remote_node->sw, mlid_ho)) {
		/*
		   We're in luck. The switch attached to this port
		   is already in the multicast group, so we can just
		   add the specified port as a new leaf of the tree.
		 */
		if (join_state & (IB_JOIN_STATE_FULL | IB_JOIN_STATE_NON)) {
			/*
			   This node wants to receive multicast frames.
			   Get the switch port number to which the new member port
			   is attached, then configure this single mcast table.
			 */
			port_num = osm_physp_get_port_num(p_remote_physp);
			CL_ASSERT(port_num);

			p_mcast_tbl =
			    osm_switch_get_mcast_tbl_ptr(p_remote_node->sw);
			osm_mcast_tbl_set(p_mcast_tbl, mlid_ho, port_num);
		} else {
			/* Send-only members need no receive path, so no
			   table change is required. */
			if (join_state & IB_JOIN_STATE_SEND_ONLY)
				OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
					"Success.  Nothing to do for send"
					"only member\n");
			else {
				OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A13: "
					"Unknown join state 0x%X\n",
					join_state);
				status = IB_ERROR;
				goto Exit;
			}
		}
	} else
		OSM_LOG(sm->p_log, OSM_LOG_DEBUG, "Unable to add port\n");

Exit:
	OSM_LOG_EXIT(sm->p_log);
	return status;
}
1002 #endif
1003 
1004 /**********************************************************************
1005  Process the entire group.
1006  NOTE : The lock should be held externally!
1007  **********************************************************************/
mcast_mgr_process_mlid(osm_sm_t * sm,uint16_t mlid)1008 static ib_api_status_t mcast_mgr_process_mlid(osm_sm_t * sm, uint16_t mlid)
1009 {
1010 	ib_api_status_t status = IB_SUCCESS;
1011 	struct osm_routing_engine *re = sm->p_subn->p_osm->routing_engine_used;
1012 	osm_mgrp_box_t *mbox;
1013 
1014 	OSM_LOG_ENTER(sm->p_log);
1015 
1016 	OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
1017 		"Processing multicast group with mlid 0x%X\n", mlid);
1018 
1019 	/* Clear the multicast tables to start clean, then build
1020 	   the spanning tree which sets the mcast table bits for each
1021 	   port in the group. */
1022 	mcast_mgr_clear(sm, mlid);
1023 
1024 	mbox = osm_get_mbox_by_mlid(sm->p_subn, cl_hton16(mlid));
1025 	if (mbox) {
1026 		if (re && re->mcast_build_stree)
1027 			status = re->mcast_build_stree(re->context, mbox);
1028 		else
1029 			status = mcast_mgr_build_spanning_tree(sm, mbox);
1030 
1031 		if (status != IB_SUCCESS)
1032 			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A17: "
1033 				"Unable to create spanning tree (%s) for mlid "
1034 				"0x%x\n", ib_get_err_str(status), mlid);
1035 	}
1036 
1037 	OSM_LOG_EXIT(sm->p_log);
1038 	return status;
1039 }
1040 
/**********************************************************************
 Update the switch's MulticastFDBTop (top of the multicast forwarding
 table) via a Set(SwitchInfo), if the switch supports the capability
 and the value actually changed.
 **********************************************************************/
static void mcast_mgr_set_mfttop(IN osm_sm_t * sm, IN osm_switch_t * p_sw)
{
	osm_node_t *p_node;
	osm_dr_path_t *p_path;
	osm_physp_t *p_physp;
	osm_mcast_tbl_t *p_tbl;
	osm_madw_context_t context;
	ib_api_status_t status;
	ib_switch_info_t si;
	ib_net16_t mcast_top;

	OSM_LOG_ENTER(sm->p_log);

	CL_ASSERT(p_sw);

	p_node = p_sw->p_node;

	CL_ASSERT(p_node);

	/* Switch management port (port 0) carries the SwitchInfo path. */
	p_physp = osm_node_get_physp_ptr(p_node, 0);
	p_path = osm_physp_get_dr_path_ptr(p_physp);
	p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);

	if (sm->p_subn->opt.use_mfttop &&
	    p_physp->port_info.capability_mask & IB_PORT_CAP_HAS_MCAST_FDB_TOP) {
		/*
		   Set the top of the multicast forwarding table.
		 */
		si = p_sw->switch_info;
		if (sm->p_subn->first_time_master_sweep == TRUE)
			/* First master sweep: cover every mlid seen at init. */
			mcast_top = cl_hton16(sm->mlids_init_max);
		else {
			if (p_tbl->max_block_in_use == -1)
				/* No blocks in use: top sits just below the
				   multicast lid range. */
				mcast_top = cl_hton16(IB_LID_MCAST_START_HO - 1);
			else
				mcast_top = cl_hton16(IB_LID_MCAST_START_HO +
						      (p_tbl->max_block_in_use + 1) * IB_MCAST_BLOCK_SIZE - 1);
		}
		/* Unchanged value: skip the Set.  Fixed: use goto rather
		   than a bare return so OSM_LOG_EXIT below stays paired
		   with OSM_LOG_ENTER, matching the rest of this file. */
		if (mcast_top == si.mcast_top)
			goto Exit;

		si.mcast_top = mcast_top;

		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
			"Setting switch MFT top to MLID 0x%x\n",
			cl_ntoh16(si.mcast_top));

		context.si_context.light_sweep = FALSE;
		context.si_context.node_guid = osm_node_get_node_guid(p_node);
		context.si_context.set_method = TRUE;
		context.si_context.lft_top_change = FALSE;

		status = osm_req_set(sm, p_path, (uint8_t *) & si,
				     sizeof(si), IB_MAD_ATTR_SWITCH_INFO,
				     0, FALSE,
				     ib_port_info_get_m_key(&p_physp->port_info),
				     CL_DISP_MSGID_NONE, &context);

		if (status != IB_SUCCESS)
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A1B: "
				"Sending SwitchInfo attribute failed (%s)\n",
				ib_get_err_str(status));
	}
Exit:
	OSM_LOG_EXIT(sm->p_log);
}
1105 
/**********************************************************************
 Send every in-use multicast forwarding table (MFT) block to its
 switch, interleaving the sends round-robin across all switches.
 Returns 0 on success, -1 if any block send failed.
 **********************************************************************/
static int mcast_mgr_set_mftables(osm_sm_t * sm)
{
	cl_qmap_t *p_sw_tbl = &sm->p_subn->sw_guid_tbl;
	osm_switch_t *p_sw;
	osm_mcast_tbl_t *p_tbl;
	int block_notdone, ret = 0;
	int16_t block_num, max_block = -1;

	/* First pass: reset each switch's block/position send cursor,
	   track the highest MFT block in use across all switches, and
	   refresh each switch's MulticastFDBTop. */
	p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
	while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
		p_sw->mft_block_num = 0;
		p_sw->mft_position = 0;
		p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
		if (osm_mcast_tbl_get_max_block_in_use(p_tbl) > max_block)
			max_block = osm_mcast_tbl_get_max_block_in_use(p_tbl);
		mcast_mgr_set_mfttop(sm, p_sw);
		p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
	}

	/* Stripe the MFT blocks across the switches */
	/* For each block number, repeatedly sweep all switches; a switch
	   whose cursor is still on this block sends one (block, position)
	   chunk per sweep, advancing position and rolling over to the next
	   block past max_position.  block_notdone stays set until no switch
	   remains on the current block. */
	for (block_num = 0; block_num <= max_block; block_num++) {
		block_notdone = 1;
		while (block_notdone) {
			block_notdone = 0;
			p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
			while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
				if (p_sw->mft_block_num == block_num) {
					block_notdone = 1;
					/* Any single failure marks the whole
					   run failed but does not stop it. */
					if (mcast_mgr_set_mft_block(sm, p_sw,
								    p_sw->mft_block_num,
								    p_sw->mft_position))
						ret = -1;
					p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
					if (++p_sw->mft_position > p_tbl->max_position) {
						p_sw->mft_position = 0;
						p_sw->mft_block_num++;
					}
				}
				p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
			}
		}
	}

	return ret;
}
1151 
/* Size every switch's multicast table to cover the highest mlid
   that has an active group; returns 0 on success, -1 on failure. */
static int alloc_mfts(osm_sm_t * sm)
{
	int idx;
	cl_map_item_t *item;
	osm_switch_t *p_sw;

	/* Scan downward for the highest mlid offset with a group box. */
	idx = sm->p_subn->max_mcast_lid_ho - IB_LID_MCAST_START_HO;
	while (idx >= 0 && !sm->p_subn->mboxes[idx])
		idx--;
	if (idx < 0)
		return 0;	/* no multicast groups at all */

	/* Now, walk switches and (re)allocate multicast tables */
	for (item = cl_qmap_head(&sm->p_subn->sw_guid_tbl);
	     item != cl_qmap_end(&sm->p_subn->sw_guid_tbl);
	     item = cl_qmap_next(item)) {
		p_sw = (osm_switch_t *) item;
		if (osm_mcast_tbl_realloc(&p_sw->mcast_tbl, idx))
			return -1;
	}
	return 0;
}
1175 
1176 /**********************************************************************
1177   This is the function that is invoked during idle time and sweep to
1178   handle the process request for mcast groups where join/leave/delete
1179   was required.
1180  **********************************************************************/
int osm_mcast_mgr_process(osm_sm_t * sm, boolean_t config_all)
{
	unsigned idx;
	unsigned last_mlid;
	int ret = 0;

	OSM_LOG_ENTER(sm->p_log);

	CL_PLOCK_EXCL_ACQUIRE(sm->p_lock);

	/* Without switches there is no multicast routing to program. */
	if (cl_qmap_count(&sm->p_subn->sw_guid_tbl) == 0) {
		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
			"No switches in subnet. Nothing to do\n");
		goto exit;
	}

	if (alloc_mfts(sm)) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR,
			"ERR 0A09: alloc_mfts failed\n");
		ret = -1;
		goto exit;
	}

	/* Full-config sweeps cover the whole mlid range; otherwise only
	   the mlids explicitly flagged as needing processing. */
	if (config_all)
		last_mlid = sm->p_subn->max_mcast_lid_ho - IB_LID_MCAST_START_HO;
	else
		last_mlid = sm->mlids_req_max;

	for (idx = 0; idx <= last_mlid; idx++) {
		if (!sm->mlids_req[idx] &&
		    !(config_all && sm->p_subn->mboxes[idx]))
			continue;
		sm->mlids_req[idx] = 0;
		mcast_mgr_process_mlid(sm, idx + IB_LID_MCAST_START_HO);
	}

	sm->mlids_req_max = 0;

	ret = mcast_mgr_set_mftables(sm);

	osm_dump_mcast_routes(sm->p_subn->p_osm);

exit:
	CL_PLOCK_RELEASE(sm->p_lock);
	OSM_LOG_EXIT(sm->p_log);
	return ret;
}
1226