1 /* 2 * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved. 3 * Copyright (c) 2002-2011 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. 5 * Copyright (c) 2008 Xsigo Systems Inc. All rights reserved. 6 * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. 7 * Copyright (c) 2010 HNR Consulting. All rights reserved. 8 * Copyright (C) 2012-2013 Tokyo Institute of Technology. All rights reserved. 9 * 10 * This software is available to you under a choice of one of two 11 * licenses. You may choose to be licensed under the terms of the GNU 12 * General Public License (GPL) Version 2, available from the file 13 * COPYING in the main directory of this source tree, or the 14 * OpenIB.org BSD license below: 15 * 16 * Redistribution and use in source and binary forms, with or 17 * without modification, are permitted provided that the following 18 * conditions are met: 19 * 20 * - Redistributions of source code must retain the above 21 * copyright notice, this list of conditions and the following 22 * disclaimer. 23 * 24 * - Redistributions in binary form must reproduce the above 25 * copyright notice, this list of conditions and the following 26 * disclaimer in the documentation and/or other materials 27 * provided with the distribution. 28 * 29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 30 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 31 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 32 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 33 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 34 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 35 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 36 * SOFTWARE. 37 * 38 */ 39 40 /* 41 * Abstract: 42 * Implementation of osm_mcast_mgr_t. 43 * This file implements the Multicast Manager object. 44 */ 45 46 #if HAVE_CONFIG_H 47 # include <config.h> 48 #endif /* HAVE_CONFIG_H */ 49 50 #include <stdlib.h> 51 #include <string.h> 52 #include <iba/ib_types.h> 53 #include <complib/cl_debug.h> 54 #include <opensm/osm_file_ids.h> 55 #define FILE_ID OSM_FILE_MCAST_MGR_C 56 #include <opensm/osm_opensm.h> 57 #include <opensm/osm_sm.h> 58 #include <opensm/osm_multicast.h> 59 #include <opensm/osm_node.h> 60 #include <opensm/osm_switch.h> 61 #include <opensm/osm_helper.h> 62 #include <opensm/osm_msgdef.h> 63 #include <opensm/osm_mcast_mgr.h> 64 65 static osm_mcast_work_obj_t *mcast_work_obj_new(IN osm_port_t * p_port) 66 { 67 osm_mcast_work_obj_t *p_obj; 68 69 /* 70 clean allocated memory to avoid assertion when trying to insert to 71 qlist. 72 see cl_qlist_insert_tail(): CL_ASSERT(p_list_item->p_list != p_list) 73 */ 74 p_obj = malloc(sizeof(*p_obj)); 75 if (p_obj) { 76 memset(p_obj, 0, sizeof(*p_obj)); 77 p_obj->p_port = p_port; 78 } 79 80 return p_obj; 81 } 82 83 static void mcast_work_obj_delete(IN osm_mcast_work_obj_t * p_wobj) 84 { 85 free(p_wobj); 86 } 87 88 int osm_mcast_make_port_list_and_map(cl_qlist_t * list, cl_qmap_t * map, 89 osm_mgrp_box_t * mbox) 90 { 91 cl_map_item_t *map_item; 92 cl_list_item_t *list_item; 93 osm_mgrp_t *mgrp; 94 osm_mcm_port_t *mcm_port; 95 osm_mcast_work_obj_t *wobj; 96 97 cl_qmap_init(map); 98 cl_qlist_init(list); 99 100 for (list_item = cl_qlist_head(&mbox->mgrp_list); 101 list_item != cl_qlist_end(&mbox->mgrp_list); 102 list_item = cl_qlist_next(list_item)) { 103 mgrp = cl_item_obj(list_item, mgrp, list_item); 104 for (map_item = cl_qmap_head(&mgrp->mcm_port_tbl); 105 map_item != cl_qmap_end(&mgrp->mcm_port_tbl); 106 map_item = cl_qmap_next(map_item)) { 107 /* Acquire the port object for this port guid, then 108 create the new worker object to build the list. */ 109 mcm_port = cl_item_obj(map_item, mcm_port, map_item); 110 if (cl_qmap_get(map, mcm_port->port->guid) != 111 cl_qmap_end(map)) 112 continue; 113 wobj = mcast_work_obj_new(mcm_port->port); 114 if (!wobj) 115 return -1; 116 cl_qlist_insert_tail(list, &wobj->list_item); 117 cl_qmap_insert(map, mcm_port->port->guid, 118 &wobj->map_item); 119 } 120 } 121 return 0; 122 } 123 124 void osm_mcast_drop_port_list(cl_qlist_t * list) 125 { 126 while (cl_qlist_count(list)) 127 mcast_work_obj_delete((osm_mcast_work_obj_t *) 128 cl_qlist_remove_head(list)); 129 } 130 131 void osm_purge_mtree(osm_sm_t * sm, IN osm_mgrp_box_t * mbox) 132 { 133 OSM_LOG_ENTER(sm->p_log); 134 135 if (mbox->root) 136 osm_mtree_destroy(mbox->root); 137 mbox->root = NULL; 138 139 OSM_LOG_EXIT(sm->p_log); 140 } 141 142 static void create_mgrp_switch_map(cl_qmap_t * m, cl_qlist_t * port_list) 143 { 144 osm_mcast_work_obj_t *wobj; 145 osm_port_t *port; 146 osm_switch_t *sw; 147 ib_net64_t guid; 148 cl_list_item_t *i; 149 150 cl_qmap_init(m); 151 for (i = cl_qlist_head(port_list); i != cl_qlist_end(port_list); 152 i = cl_qlist_next(i)) { 153 wobj = cl_item_obj(i, wobj, list_item); 154 port = wobj->p_port; 155 if (port->p_node->sw) { 156 sw = port->p_node->sw; 157 sw->is_mc_member = 1; 158 } else if (port->p_physp->p_remote_physp) { 159 sw = port->p_physp->p_remote_physp->p_node->sw; 160 sw->num_of_mcm++; 161 } else 162 continue; 163 guid = osm_node_get_node_guid(sw->p_node); 164 if (cl_qmap_get(m, guid) == cl_qmap_end(m)) 165 cl_qmap_insert(m, guid, &sw->mgrp_item); 166 } 167 } 168 169 static void destroy_mgrp_switch_map(cl_qmap_t * m) 170 { 171 osm_switch_t *sw; 172 cl_map_item_t *i; 173 174 for (i = cl_qmap_head(m); i != cl_qmap_end(m); i = cl_qmap_next(i)) { 175 sw = cl_item_obj(i, sw, mgrp_item); 176 sw->num_of_mcm = 0; 177 sw->is_mc_member = 0; 178 } 179 cl_qmap_remove_all(m); 180 } 181 182 /********************************************************************** 183 Calculate the maximal "min hops" from the given switch to any 184 of the group HCAs 185 **********************************************************************/ 186 #ifdef OSM_VENDOR_INTF_ANAFA 187 static float mcast_mgr_compute_avg_hops(osm_sm_t * sm, cl_qmap_t * m, 188 const osm_switch_t * this_sw) 189 { 190 float avg_hops = 0; 191 uint32_t hops = 0; 192 uint32_t num_ports = 0; 193 uint16_t lid; 194 uint32_t least_hops; 195 cl_map_item_t *i; 196 osm_switch_t *sw; 197 198 OSM_LOG_ENTER(sm->p_log); 199 200 for (i = cl_qmap_head(m); i != cl_qmap_end(m); i = cl_qmap_next(i)) { 201 sw = cl_item_obj(i, sw, mcast_item); 202 lid = cl_ntoh16(osm_node_get_base_lid(sw->p_node, 0)); 203 least_hops = osm_switch_get_least_hops(this_sw, lid); 204 /* for all host that are MC members and attached to the switch, 205 we should add the (least_hops + 1) * number_of_such_hosts. 206 If switch itself is in the MC, we should add the least_hops only */ 207 hops += (least_hops + 1) * sw->num_of_mcm + 208 least_hops * sw->is_mc_member; 209 num_ports += sw->num_of_mcm + sw->is_mc_member; 210 } 211 212 /* We shouldn't be here if there aren't any ports in the group. */ 213 CL_ASSERT(num_ports); 214 215 avg_hops = (float)(hops / num_ports); 216 217 OSM_LOG_EXIT(sm->p_log); 218 return avg_hops; 219 } 220 #else 221 static float mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qmap_t * m, 222 const osm_switch_t * this_sw) 223 { 224 uint32_t max_hops = 0, hops; 225 uint16_t lid; 226 cl_map_item_t *i; 227 osm_switch_t *sw; 228 229 OSM_LOG_ENTER(sm->p_log); 230 231 /* 232 For each member of the multicast group, compute the 233 number of hops to its base LID. 234 */ 235 for (i = cl_qmap_head(m); i != cl_qmap_end(m); i = cl_qmap_next(i)) { 236 sw = cl_item_obj(i, sw, mgrp_item); 237 lid = cl_ntoh16(osm_node_get_base_lid(sw->p_node, 0)); 238 hops = osm_switch_get_least_hops(this_sw, lid); 239 if (!sw->is_mc_member) 240 hops += 1; 241 if (hops > max_hops) 242 max_hops = hops; 243 } 244 245 /* Note that at this point we might get (max_hops == 0), 246 which means that there's only one member in the mcast 247 group, and it's the current switch */ 248 249 OSM_LOG_EXIT(sm->p_log); 250 return (float)max_hops; 251 } 252 #endif 253 254 /********************************************************************** 255 This function attempts to locate the optimal switch for the 256 center of the spanning tree. The current algorithm chooses 257 a switch with the lowest average hop count to the members 258 of the multicast group. 259 **********************************************************************/ 260 static osm_switch_t *mcast_mgr_find_optimal_switch(osm_sm_t * sm, 261 cl_qlist_t * list) 262 { 263 cl_qmap_t mgrp_sw_map; 264 cl_qmap_t *p_sw_tbl; 265 osm_switch_t *p_sw, *p_best_sw = NULL; 266 float hops = 0; 267 float best_hops = 10000; /* any big # will do */ 268 269 OSM_LOG_ENTER(sm->p_log); 270 271 p_sw_tbl = &sm->p_subn->sw_guid_tbl; 272 273 create_mgrp_switch_map(&mgrp_sw_map, list); 274 for (p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl); 275 p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl); 276 p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item)) { 277 if (!osm_switch_supports_mcast(p_sw)) 278 continue; 279 280 #ifdef OSM_VENDOR_INTF_ANAFA 281 hops = mcast_mgr_compute_avg_hops(sm, &mgrp_sw_map, p_sw); 282 #else 283 hops = mcast_mgr_compute_max_hops(sm, &mgrp_sw_map, p_sw); 284 #endif 285 286 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 287 "Switch 0x%016" PRIx64 ", hops = %f\n", 288 cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)), hops); 289 290 if (hops < best_hops) { 291 p_best_sw = p_sw; 292 best_hops = hops; 293 } 294 } 295 296 if (p_best_sw) 297 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE, 298 "Best switch is 0x%" PRIx64 " (%s), hops = %f\n", 299 cl_ntoh64(osm_node_get_node_guid(p_best_sw->p_node)), 300 p_best_sw->p_node->print_desc, best_hops); 301 else 302 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE, 303 "No multicast capable switches detected\n"); 304 305 destroy_mgrp_switch_map(&mgrp_sw_map); 306 OSM_LOG_EXIT(sm->p_log); 307 return p_best_sw; 308 } 309 310 /********************************************************************** 311 This function returns the existing or optimal root switch for the tree. 312 **********************************************************************/ 313 osm_switch_t *osm_mcast_mgr_find_root_switch(osm_sm_t * sm, cl_qlist_t *list) 314 { 315 osm_switch_t *p_sw = NULL; 316 317 OSM_LOG_ENTER(sm->p_log); 318 319 /* 320 We always look for the best multicast tree root switch. 321 Otherwise since we always start with a a single join 322 the root will be always on the first switch attached to it. 323 - Very bad ... 324 */ 325 p_sw = mcast_mgr_find_optimal_switch(sm, list); 326 327 OSM_LOG_EXIT(sm->p_log); 328 return p_sw; 329 } 330 331 static int mcast_mgr_set_mft_block(osm_sm_t * sm, IN osm_switch_t * p_sw, 332 uint32_t block_num, uint32_t position) 333 { 334 osm_node_t *p_node; 335 osm_physp_t *p_physp; 336 osm_dr_path_t *p_path; 337 osm_madw_context_t context; 338 ib_api_status_t status; 339 uint32_t block_id_ho; 340 osm_mcast_tbl_t *p_tbl; 341 ib_net16_t block[IB_MCAST_BLOCK_SIZE]; 342 int ret = 0; 343 344 CL_ASSERT(sm); 345 346 OSM_LOG_ENTER(sm->p_log); 347 348 CL_ASSERT(p_sw); 349 350 p_node = p_sw->p_node; 351 352 CL_ASSERT(p_node); 353 354 p_physp = osm_node_get_physp_ptr(p_node, 0); 355 p_path = osm_physp_get_dr_path_ptr(p_physp); 356 357 /* 358 Send multicast forwarding table blocks to the switch 359 as long as the switch indicates it has blocks needing 360 configuration. 361 */ 362 363 context.mft_context.node_guid = osm_node_get_node_guid(p_node); 364 context.mft_context.set_method = TRUE; 365 366 p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw); 367 368 if (osm_mcast_tbl_get_block(p_tbl, (uint16_t) block_num, 369 (uint8_t) position, block)) { 370 block_id_ho = block_num + (position << 28); 371 372 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 373 "Writing MFT block %u position %u to switch 0x%" PRIx64 374 "\n", block_num, position, 375 cl_ntoh64(context.mft_context.node_guid)); 376 377 status = osm_req_set(sm, p_path, (void *)block, sizeof(block), 378 IB_MAD_ATTR_MCAST_FWD_TBL, 379 cl_hton32(block_id_ho), FALSE, 380 ib_port_info_get_m_key(&p_physp->port_info), 381 CL_DISP_MSGID_NONE, &context); 382 if (status != IB_SUCCESS) { 383 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A02: " 384 "Sending multicast fwd. tbl. block 0x%X to %s " 385 "failed (%s)\n", block_id_ho, 386 p_node->print_desc, ib_get_err_str(status)); 387 ret = -1; 388 } 389 } 390 391 OSM_LOG_EXIT(sm->p_log); 392 return ret; 393 } 394 395 /********************************************************************** 396 This is part of the recursive function to compute the paths in the 397 spanning tree that emanate from this switch. On input, the p_list 398 contains the group members that must be routed from this switch. 399 **********************************************************************/ 400 static void mcast_mgr_subdivide(osm_sm_t * sm, uint16_t mlid_ho, 401 osm_switch_t * p_sw, cl_qlist_t * p_list, 402 cl_qlist_t * list_array, uint8_t array_size) 403 { 404 uint8_t port_num; 405 boolean_t ignore_existing; 406 osm_mcast_work_obj_t *p_wobj; 407 408 OSM_LOG_ENTER(sm->p_log); 409 410 /* 411 For Multicast Groups, we don't want to count on previous 412 configurations - since we can easily generate a storm 413 by loops. 414 */ 415 ignore_existing = TRUE; 416 417 /* 418 Subdivide the set of ports into non-overlapping subsets 419 that will be routed to other switches. 420 */ 421 while ((p_wobj = 422 (osm_mcast_work_obj_t *) cl_qlist_remove_head(p_list)) != 423 (osm_mcast_work_obj_t *) cl_qlist_end(p_list)) { 424 port_num = 425 osm_switch_recommend_mcast_path(p_sw, p_wobj->p_port, 426 mlid_ho, ignore_existing); 427 if (port_num == OSM_NO_PATH) { 428 /* 429 This typically occurs if the switch does not support 430 multicast and the multicast tree must branch at this 431 switch. 432 */ 433 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A03: " 434 "Error routing MLID 0x%X through switch 0x%" 435 PRIx64 " %s\n" 436 "\t\t\t\tNo multicast paths from this switch " 437 "for port with LID %u\n", mlid_ho, 438 cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)), 439 p_sw->p_node->print_desc, 440 cl_ntoh16(osm_port_get_base_lid 441 (p_wobj->p_port))); 442 mcast_work_obj_delete(p_wobj); 443 continue; 444 } 445 446 if (port_num >= array_size) { 447 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A04: " 448 "Error routing MLID 0x%X through switch 0x%" 449 PRIx64 " %s\n" 450 "\t\t\t\tNo multicast paths from this switch " 451 "to port with LID %u\n", mlid_ho, 452 cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)), 453 p_sw->p_node->print_desc, 454 cl_ntoh16(osm_port_get_base_lid 455 (p_wobj->p_port))); 456 mcast_work_obj_delete(p_wobj); 457 continue; 458 } 459 460 cl_qlist_insert_tail(&list_array[port_num], &p_wobj->list_item); 461 } 462 463 OSM_LOG_EXIT(sm->p_log); 464 } 465 466 static void mcast_mgr_purge_list(osm_sm_t * sm, uint16_t mlid, cl_qlist_t * list) 467 { 468 if (OSM_LOG_IS_ACTIVE_V2(sm->p_log, OSM_LOG_ERROR)) { 469 osm_mcast_work_obj_t *wobj; 470 cl_list_item_t *i; 471 for (i = cl_qlist_head(list); i != cl_qlist_end(list); 472 i = cl_qlist_next(i)) { 473 wobj = cl_item_obj(i, wobj, list_item); 474 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A06: " 475 "Unable to route MLID 0x%X for port 0x%" PRIx64 "\n", 476 mlid, cl_ntoh64(osm_port_get_guid(wobj->p_port))); 477 } 478 } 479 osm_mcast_drop_port_list(list); 480 } 481 482 /********************************************************************** 483 This is the recursive function to compute the paths in the spanning 484 tree that emanate from this switch. On input, the p_list contains 485 the group members that must be routed from this switch. 486 487 The function returns the newly created mtree node element. 488 **********************************************************************/ 489 static osm_mtree_node_t *mcast_mgr_branch(osm_sm_t * sm, uint16_t mlid_ho, 490 osm_switch_t * p_sw, 491 cl_qlist_t * p_list, uint8_t depth, 492 uint8_t upstream_port, 493 uint8_t * p_max_depth) 494 { 495 uint8_t max_children; 496 osm_mtree_node_t *p_mtn = NULL; 497 cl_qlist_t *list_array = NULL; 498 uint8_t i; 499 ib_net64_t node_guid; 500 osm_mcast_work_obj_t *p_wobj; 501 cl_qlist_t *p_port_list; 502 size_t count; 503 osm_mcast_tbl_t *p_tbl; 504 505 OSM_LOG_ENTER(sm->p_log); 506 507 CL_ASSERT(p_sw); 508 CL_ASSERT(p_list); 509 CL_ASSERT(p_max_depth); 510 511 node_guid = osm_node_get_node_guid(p_sw->p_node); 512 513 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE, 514 "Routing MLID 0x%X through switch 0x%" PRIx64 515 " %s, %u nodes at depth %u\n", 516 mlid_ho, cl_ntoh64(node_guid), p_sw->p_node->print_desc, 517 cl_qlist_count(p_list), depth); 518 519 CL_ASSERT(cl_qlist_count(p_list) > 0); 520 521 depth++; 522 523 if (depth >= 64) { 524 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A21: " 525 "Maximal hops number is reached for MLID 0x%x." 526 " Break processing\n", mlid_ho); 527 mcast_mgr_purge_list(sm, mlid_ho, p_list); 528 goto Exit; 529 } 530 531 if (depth > *p_max_depth) { 532 CL_ASSERT(depth == *p_max_depth + 1); 533 *p_max_depth = depth; 534 } 535 536 if (osm_switch_supports_mcast(p_sw) == FALSE) { 537 /* 538 This switch doesn't do multicast. Clean-up. 539 */ 540 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A14: " 541 "Switch 0x%" PRIx64 " %s does not support multicast\n", 542 cl_ntoh64(node_guid), p_sw->p_node->print_desc); 543 544 /* 545 Deallocate all the work objects on this branch of the tree. 546 */ 547 mcast_mgr_purge_list(sm, mlid_ho, p_list); 548 goto Exit; 549 } 550 551 p_mtn = osm_mtree_node_new(p_sw); 552 if (p_mtn == NULL) { 553 /* 554 We are unable to continue routing down this 555 leg of the tree. Clean-up. 556 */ 557 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A15: " 558 "Insufficient memory to build multicast tree\n"); 559 560 /* 561 Deallocate all the work objects on this branch of the tree. 562 */ 563 mcast_mgr_purge_list(sm, mlid_ho, p_list); 564 goto Exit; 565 } 566 567 max_children = osm_mtree_node_get_max_children(p_mtn); 568 569 CL_ASSERT(max_children > 1); 570 571 /* 572 Prepare an empty list for each port in the switch. 573 TO DO - this list array could probably be moved 574 inside the switch element to save on malloc thrashing. 575 */ 576 list_array = malloc(sizeof(cl_qlist_t) * max_children); 577 if (list_array == NULL) { 578 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A16: " 579 "Unable to allocate list array\n"); 580 mcast_mgr_purge_list(sm, mlid_ho, p_list); 581 osm_mtree_destroy(p_mtn); 582 p_mtn = NULL; 583 goto Exit; 584 } 585 586 memset(list_array, 0, sizeof(cl_qlist_t) * max_children); 587 588 for (i = 0; i < max_children; i++) 589 cl_qlist_init(&list_array[i]); 590 591 mcast_mgr_subdivide(sm, mlid_ho, p_sw, p_list, list_array, max_children); 592 593 p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw); 594 595 /* 596 Add the upstream port to the forwarding table unless 597 we're at the root of the spanning tree. 598 */ 599 if (depth > 1) { 600 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 601 "Adding upstream port %u\n", upstream_port); 602 603 CL_ASSERT(upstream_port); 604 osm_mcast_tbl_set(p_tbl, mlid_ho, upstream_port); 605 } 606 607 /* 608 For each port that was allocated some routes, 609 recurse into this function to continue building the tree 610 if the node on the other end of that port is another switch. 611 Otherwise, the node is an endpoint, and we've found a leaf 612 of the tree. Mark leaves with our special pointer value. 613 */ 614 615 for (i = 0; i < max_children; i++) { 616 const osm_physp_t *p_physp; 617 const osm_physp_t *p_remote_physp; 618 osm_node_t *p_node; 619 const osm_node_t *p_remote_node; 620 621 p_port_list = &list_array[i]; 622 623 count = cl_qlist_count(p_port_list); 624 625 /* 626 There should be no children routed through the upstream port! 627 */ 628 CL_ASSERT(upstream_port == 0 || i != upstream_port || 629 (i == upstream_port && count == 0)); 630 631 if (count == 0) 632 continue; /* No routes down this port. */ 633 634 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 635 "Routing %zu destinations via switch port %u\n", 636 count, i); 637 638 if (i == 0) { 639 /* This means we are adding the switch to the MC group. 640 We do not need to continue looking at the remote 641 port, just needed to add the port to the table */ 642 CL_ASSERT(count == 1); 643 644 osm_mcast_tbl_set(p_tbl, mlid_ho, i); 645 646 p_wobj = (osm_mcast_work_obj_t *) 647 cl_qlist_remove_head(p_port_list); 648 mcast_work_obj_delete(p_wobj); 649 continue; 650 } 651 652 p_node = p_sw->p_node; 653 p_remote_node = osm_node_get_remote_node(p_node, i, NULL); 654 if (!p_remote_node) { 655 /* 656 * If we reached here, it means the minhop table has 657 * invalid entries that leads to disconnected ports. 658 * 659 * A possible reason for the code to reach here is 660 * that ucast cache is enabled, and a leaf switch that 661 * is used as a non-leaf switch in a multicast has been 662 * removed from the fabric. 663 * 664 * When it happens, we should invalidate the cache 665 * and force rerouting of the fabric. 666 */ 667 668 OSM_LOG(sm->p_log, OSM_LOG_ERROR, 669 "ERR 0A1E: Tried to route MLID 0x%X through " 670 "disconnected switch 0x%" PRIx64 " port %d\n", 671 mlid_ho, cl_ntoh64(node_guid), i); 672 673 /* Free memory */ 674 mcast_mgr_purge_list(sm, mlid_ho, p_port_list); 675 676 /* Invalidate ucast cache */ 677 if (sm->ucast_mgr.p_subn->opt.use_ucast_cache && 678 sm->ucast_mgr.cache_valid) { 679 OSM_LOG(sm->p_log, OSM_LOG_INFO, 680 "Unicast Cache will be invalidated due " 681 "to multicast routing errors\n"); 682 osm_ucast_cache_invalidate(&sm->ucast_mgr); 683 sm->p_subn->force_heavy_sweep = TRUE; 684 } 685 686 continue; 687 } 688 689 /* 690 This port routes frames for this mcast group. Therefore, 691 set the appropriate bit in the multicast forwarding 692 table for this switch. 693 */ 694 osm_mcast_tbl_set(p_tbl, mlid_ho, i); 695 696 if (osm_node_get_type(p_remote_node) == IB_NODE_TYPE_SWITCH) { 697 /* 698 Acquire a pointer to the remote switch then recurse. 699 */ 700 CL_ASSERT(p_remote_node->sw); 701 702 p_physp = osm_node_get_physp_ptr(p_node, i); 703 CL_ASSERT(p_physp); 704 705 p_remote_physp = osm_physp_get_remote(p_physp); 706 CL_ASSERT(p_remote_physp); 707 708 p_mtn->child_array[i] = 709 mcast_mgr_branch(sm, mlid_ho, p_remote_node->sw, 710 p_port_list, depth, 711 osm_physp_get_port_num 712 (p_remote_physp), p_max_depth); 713 } else { 714 /* 715 The neighbor node is not a switch, so this 716 must be a leaf. 717 */ 718 CL_ASSERT(count == 1); 719 720 p_mtn->child_array[i] = OSM_MTREE_LEAF; 721 p_wobj = (osm_mcast_work_obj_t *) 722 cl_qlist_remove_head(p_port_list); 723 724 CL_ASSERT(cl_is_qlist_empty(p_port_list)); 725 726 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 727 "Found leaf for port 0x%016" PRIx64 728 " on switch port %u\n", 729 cl_ntoh64(osm_port_get_guid(p_wobj->p_port)), 730 i); 731 mcast_work_obj_delete(p_wobj); 732 } 733 } 734 735 free(list_array); 736 Exit: 737 OSM_LOG_EXIT(sm->p_log); 738 return p_mtn; 739 } 740 741 static ib_api_status_t mcast_mgr_build_spanning_tree(osm_sm_t * sm, 742 osm_mgrp_box_t * mbox) 743 { 744 cl_qlist_t port_list; 745 cl_qmap_t port_map; 746 uint32_t num_ports; 747 osm_switch_t *p_sw; 748 ib_api_status_t status = IB_SUCCESS; 749 uint8_t max_depth = 0; 750 751 OSM_LOG_ENTER(sm->p_log); 752 753 /* 754 TO DO - for now, just blow away the old tree. 755 In the future we'll need to construct the tree based 756 on multicast forwarding table information if the user wants to 757 preserve existing multicast routes. 758 */ 759 osm_purge_mtree(sm, mbox); 760 761 /* build the first "subset" containing all member ports */ 762 if (osm_mcast_make_port_list_and_map(&port_list, &port_map, mbox)) { 763 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A10: " 764 "Insufficient memory to make port list\n"); 765 status = IB_ERROR; 766 goto Exit; 767 } 768 769 num_ports = cl_qlist_count(&port_list); 770 if (num_ports < 2) { 771 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE, 772 "MLID 0x%X has %u members - nothing to do\n", 773 mbox->mlid, num_ports); 774 osm_mcast_drop_port_list(&port_list); 775 goto Exit; 776 } 777 778 /* 779 This function builds the single spanning tree recursively. 780 At each stage, the ports to be reached are divided into 781 non-overlapping subsets of member ports that can be reached through 782 a given switch port. Construction then moves down each 783 branch, and the process starts again with each branch computing 784 for its own subset of the member ports. 785 786 The maximum recursion depth is at worst the maximum hop count in the 787 subnet, which is spec limited to 64. 788 */ 789 790 /* 791 Locate the switch around which to create the spanning 792 tree for this multicast group. 793 */ 794 p_sw = osm_mcast_mgr_find_root_switch(sm, &port_list); 795 if (p_sw == NULL) { 796 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A08: " 797 "Unable to locate a suitable switch for group 0x%X\n", 798 mbox->mlid); 799 osm_mcast_drop_port_list(&port_list); 800 status = IB_ERROR; 801 goto Exit; 802 } 803 804 mbox->root = mcast_mgr_branch(sm, mbox->mlid, p_sw, &port_list, 0, 0, 805 &max_depth); 806 807 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE, 808 "Configured MLID 0x%X for %u ports, max tree depth = %u\n", 809 mbox->mlid, num_ports, max_depth); 810 Exit: 811 OSM_LOG_EXIT(sm->p_log); 812 return status; 813 } 814 815 #if 0 816 /* unused */ 817 void osm_mcast_mgr_set_table(osm_sm_t * sm, IN const osm_mgrp_t * p_mgrp, 818 IN const osm_mtree_node_t * p_mtn) 819 { 820 uint8_t i; 821 uint8_t max_children; 822 osm_mtree_node_t *p_child_mtn; 823 uint16_t mlid_ho; 824 osm_mcast_tbl_t *p_tbl; 825 osm_switch_t *p_sw; 826 827 OSM_LOG_ENTER(sm->p_log); 828 829 mlid_ho = cl_ntoh16(osm_mgrp_get_mlid(p_mgrp)); 830 p_sw = osm_mtree_node_get_switch_ptr(p_mtn); 831 832 CL_ASSERT(p_sw); 833 834 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE, 835 "Configuring MLID 0x%X on switch 0x%" PRIx64 "\n", 836 mlid_ho, osm_node_get_node_guid(p_sw->p_node)); 837 838 /* 839 For every child of this tree node, set the corresponding 840 bit in the switch's mcast table. 841 */ 842 p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw); 843 max_children = osm_mtree_node_get_max_children(p_mtn); 844 845 CL_ASSERT(max_children <= osm_switch_get_num_ports(p_sw)); 846 847 osm_mcast_tbl_clear_mlid(p_tbl, mlid_ho); 848 849 for (i = 0; i < max_children; i++) { 850 p_child_mtn = osm_mtree_node_get_child(p_mtn, i); 851 if (p_child_mtn == NULL) 852 continue; 853 854 osm_mcast_tbl_set(p_tbl, mlid_ho, i); 855 } 856 857 OSM_LOG_EXIT(sm->p_log); 858 } 859 #endif 860 861 static void mcast_mgr_clear(osm_sm_t * sm, uint16_t mlid) 862 { 863 osm_switch_t *p_sw; 864 cl_qmap_t *p_sw_tbl; 865 osm_mcast_tbl_t *p_mcast_tbl; 866 867 OSM_LOG_ENTER(sm->p_log); 868 869 /* Walk the switches and clear the routing entries for this MLID. */ 870 p_sw_tbl = &sm->p_subn->sw_guid_tbl; 871 p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl); 872 while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) { 873 p_mcast_tbl = osm_switch_get_mcast_tbl_ptr(p_sw); 874 osm_mcast_tbl_clear_mlid(p_mcast_tbl, mlid); 875 p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item); 876 } 877 878 OSM_LOG_EXIT(sm->p_log); 879 } 880 881 #if 0 882 /* TO DO - make this real -- at least update spanning tree */ 883 /********************************************************************** 884 Lock must be held on entry. 885 **********************************************************************/ 886 ib_api_status_t osm_mcast_mgr_process_single(osm_sm_t * sm, 887 IN ib_net16_t const mlid, 888 IN ib_net64_t const port_guid, 889 IN uint8_t const join_state) 890 { 891 uint8_t port_num; 892 uint16_t mlid_ho; 893 ib_net64_t sw_guid; 894 osm_port_t *p_port; 895 osm_physp_t *p_physp; 896 osm_physp_t *p_remote_physp; 897 osm_node_t *p_remote_node; 898 osm_mcast_tbl_t *p_mcast_tbl; 899 ib_api_status_t status = IB_SUCCESS; 900 901 OSM_LOG_ENTER(sm->p_log); 902 903 CL_ASSERT(mlid); 904 CL_ASSERT(port_guid); 905 906 mlid_ho = cl_ntoh16(mlid); 907 908 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 909 "Attempting to add port 0x%" PRIx64 " to MLID 0x%X, " 910 "\n\t\t\t\tjoin state = 0x%X\n", 911 cl_ntoh64(port_guid), mlid_ho, join_state); 912 913 /* 914 Acquire the Port object. 915 */ 916 p_port = osm_get_port_by_guid(sm->p_subn, port_guid); 917 if (!p_port) { 918 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A01: " 919 "Unable to acquire port object for 0x%" PRIx64 "\n", 920 cl_ntoh64(port_guid)); 921 status = IB_ERROR; 922 goto Exit; 923 } 924 925 p_physp = p_port->p_physp; 926 if (p_physp == NULL) { 927 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A05: " 928 "Unable to acquire phsyical port object for 0x%" PRIx64 929 "\n", cl_ntoh64(port_guid)); 930 status = IB_ERROR; 931 goto Exit; 932 } 933 934 p_remote_physp = osm_physp_get_remote(p_physp); 935 if (p_remote_physp == NULL) { 936 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A11: " 937 "Unable to acquire remote phsyical port object " 938 "for 0x%" PRIx64 "\n", cl_ntoh64(port_guid)); 939 status = IB_ERROR; 940 goto Exit; 941 } 942 943 p_remote_node = osm_physp_get_node_ptr(p_remote_physp); 944 945 CL_ASSERT(p_remote_node); 946 947 sw_guid = osm_node_get_node_guid(p_remote_node); 948 949 if (osm_node_get_type(p_remote_node) != IB_NODE_TYPE_SWITCH) { 950 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A22: " 951 "Remote node not a switch node 0x%" PRIx64 "\n", 952 cl_ntoh64(sw_guid)); 953 status = IB_ERROR; 954 goto Exit; 955 } 956 957 if (!p_remote_node->sw) { 958 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A12: " 959 "No switch object 0x%" PRIx64 "\n", cl_ntoh64(sw_guid)); 960 status = IB_ERROR; 961 goto Exit; 962 } 963 964 if (osm_switch_is_in_mcast_tree(p_remote_node->sw, mlid_ho)) { 965 /* 966 We're in luck. The switch attached to this port 967 is already in the multicast group, so we can just 968 add the specified port as a new leaf of the tree. 969 */ 970 if (join_state & (IB_JOIN_STATE_FULL | IB_JOIN_STATE_NON)) { 971 /* 972 This node wants to receive multicast frames. 973 Get the switch port number to which the new member port 974 is attached, then configure this single mcast table. 975 */ 976 port_num = osm_physp_get_port_num(p_remote_physp); 977 CL_ASSERT(port_num); 978 979 p_mcast_tbl = 980 osm_switch_get_mcast_tbl_ptr(p_remote_node->sw); 981 osm_mcast_tbl_set(p_mcast_tbl, mlid_ho, port_num); 982 } else { 983 if (join_state & IB_JOIN_STATE_SEND_ONLY) 984 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 985 "Success. Nothing to do for send" 986 "only member\n"); 987 else { 988 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A13: " 989 "Unknown join state 0x%X\n", 990 join_state); 991 status = IB_ERROR; 992 goto Exit; 993 } 994 } 995 } else 996 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, "Unable to add port\n"); 997 998 Exit: 999 OSM_LOG_EXIT(sm->p_log); 1000 return status; 1001 } 1002 #endif 1003 1004 /********************************************************************** 1005 Process the entire group. 1006 NOTE : The lock should be held externally! 1007 **********************************************************************/ 1008 static ib_api_status_t mcast_mgr_process_mlid(osm_sm_t * sm, uint16_t mlid) 1009 { 1010 ib_api_status_t status = IB_SUCCESS; 1011 struct osm_routing_engine *re = sm->p_subn->p_osm->routing_engine_used; 1012 osm_mgrp_box_t *mbox; 1013 1014 OSM_LOG_ENTER(sm->p_log); 1015 1016 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 1017 "Processing multicast group with mlid 0x%X\n", mlid); 1018 1019 /* Clear the multicast tables to start clean, then build 1020 the spanning tree which sets the mcast table bits for each 1021 port in the group. */ 1022 mcast_mgr_clear(sm, mlid); 1023 1024 mbox = osm_get_mbox_by_mlid(sm->p_subn, cl_hton16(mlid)); 1025 if (mbox) { 1026 if (re && re->mcast_build_stree) 1027 status = re->mcast_build_stree(re->context, mbox); 1028 else 1029 status = mcast_mgr_build_spanning_tree(sm, mbox); 1030 1031 if (status != IB_SUCCESS) 1032 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A17: " 1033 "Unable to create spanning tree (%s) for mlid " 1034 "0x%x\n", ib_get_err_str(status), mlid); 1035 } 1036 1037 OSM_LOG_EXIT(sm->p_log); 1038 return status; 1039 } 1040 1041 static void mcast_mgr_set_mfttop(IN osm_sm_t * sm, IN osm_switch_t * p_sw) 1042 { 1043 osm_node_t *p_node; 1044 osm_dr_path_t *p_path; 1045 osm_physp_t *p_physp; 1046 osm_mcast_tbl_t *p_tbl; 1047 osm_madw_context_t context; 1048 ib_api_status_t status; 1049 ib_switch_info_t si; 1050 ib_net16_t mcast_top; 1051 1052 OSM_LOG_ENTER(sm->p_log); 1053 1054 CL_ASSERT(p_sw); 1055 1056 p_node = p_sw->p_node; 1057 1058 CL_ASSERT(p_node); 1059 1060 p_physp = osm_node_get_physp_ptr(p_node, 0); 1061 p_path = osm_physp_get_dr_path_ptr(p_physp); 1062 p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw); 1063 1064 if (sm->p_subn->opt.use_mfttop && 1065 p_physp->port_info.capability_mask & IB_PORT_CAP_HAS_MCAST_FDB_TOP) { 1066 /* 1067 Set the top of the multicast forwarding table. 1068 */ 1069 si = p_sw->switch_info; 1070 if (sm->p_subn->first_time_master_sweep == TRUE) 1071 mcast_top = cl_hton16(sm->mlids_init_max); 1072 else { 1073 if (p_tbl->max_block_in_use == -1) 1074 mcast_top = cl_hton16(IB_LID_MCAST_START_HO - 1); 1075 else 1076 mcast_top = cl_hton16(IB_LID_MCAST_START_HO + 1077 (p_tbl->max_block_in_use + 1) * IB_MCAST_BLOCK_SIZE - 1); 1078 } 1079 if (mcast_top == si.mcast_top) 1080 return; 1081 1082 si.mcast_top = mcast_top; 1083 1084 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 1085 "Setting switch MFT top to MLID 0x%x\n", 1086 cl_ntoh16(si.mcast_top)); 1087 1088 context.si_context.light_sweep = FALSE; 1089 context.si_context.node_guid = osm_node_get_node_guid(p_node); 1090 context.si_context.set_method = TRUE; 1091 context.si_context.lft_top_change = FALSE; 1092 1093 status = osm_req_set(sm, p_path, (uint8_t *) & si, 1094 sizeof(si), IB_MAD_ATTR_SWITCH_INFO, 1095 0, FALSE, 1096 ib_port_info_get_m_key(&p_physp->port_info), 1097 CL_DISP_MSGID_NONE, &context); 1098 1099 if (status != IB_SUCCESS) 1100 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A1B: " 1101 "Sending SwitchInfo attribute failed (%s)\n", 1102 ib_get_err_str(status)); 1103 } 1104 } 1105 1106 static int mcast_mgr_set_mftables(osm_sm_t * sm) 1107 { 1108 cl_qmap_t *p_sw_tbl = &sm->p_subn->sw_guid_tbl; 1109 osm_switch_t *p_sw; 1110 osm_mcast_tbl_t *p_tbl; 1111 int block_notdone, ret = 0; 1112 int16_t block_num, max_block = -1; 1113 1114 p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl); 1115 while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) { 1116 p_sw->mft_block_num = 0; 1117 p_sw->mft_position = 0; 1118 p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw); 1119 if (osm_mcast_tbl_get_max_block_in_use(p_tbl) > max_block) 1120 max_block = osm_mcast_tbl_get_max_block_in_use(p_tbl); 1121 mcast_mgr_set_mfttop(sm, p_sw); 1122 p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item); 1123 } 1124 1125 /* Stripe the MFT blocks across the switches */ 1126 for (block_num = 0; block_num <= max_block; block_num++) { 1127 block_notdone = 1; 1128 while (block_notdone) { 1129 block_notdone = 0; 1130 p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl); 1131 while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) { 1132 if (p_sw->mft_block_num == block_num) { 1133 block_notdone = 1; 1134 if (mcast_mgr_set_mft_block(sm, p_sw, 1135 p_sw->mft_block_num, 1136 p_sw->mft_position)) 1137 ret = -1; 1138 p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw); 1139 if (++p_sw->mft_position > p_tbl->max_position) { 1140 p_sw->mft_position = 0; 1141 p_sw->mft_block_num++; 1142 } 1143 } 1144 p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item); 1145 } 1146 } 1147 } 1148 1149 return ret; 1150 } 1151 1152 static int alloc_mfts(osm_sm_t * sm) 1153 { 1154 int i; 1155 cl_map_item_t *item; 1156 osm_switch_t *p_sw; 1157 1158 for (i = sm->p_subn->max_mcast_lid_ho - IB_LID_MCAST_START_HO; i >= 0; 1159 i--) 1160 if (sm->p_subn->mboxes[i]) 1161 break; 1162 if (i < 0) 1163 return 0; 1164 1165 /* Now, walk switches and (re)allocate multicast tables */ 1166 for (item = cl_qmap_head(&sm->p_subn->sw_guid_tbl); 1167 item != cl_qmap_end(&sm->p_subn->sw_guid_tbl); 1168 item = cl_qmap_next(item)) { 1169 p_sw = (osm_switch_t *) item; 1170 if (osm_mcast_tbl_realloc(&p_sw->mcast_tbl, i)) 1171 return -1; 1172 } 1173 return 0; 1174 } 1175 1176 /********************************************************************** 1177 This is the function that is invoked during idle time and sweep to 1178 handle the process request for mcast groups where join/leave/delete 1179 was required. 1180 **********************************************************************/ 1181 int osm_mcast_mgr_process(osm_sm_t * sm, boolean_t config_all) 1182 { 1183 int ret = 0; 1184 unsigned i; 1185 unsigned max_mlid; 1186 1187 OSM_LOG_ENTER(sm->p_log); 1188 1189 CL_PLOCK_EXCL_ACQUIRE(sm->p_lock); 1190 1191 /* If there are no switches in the subnet we have nothing to do. */ 1192 if (cl_qmap_count(&sm->p_subn->sw_guid_tbl) == 0) { 1193 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 1194 "No switches in subnet. Nothing to do\n"); 1195 goto exit; 1196 } 1197 1198 if (alloc_mfts(sm)) { 1199 OSM_LOG(sm->p_log, OSM_LOG_ERROR, 1200 "ERR 0A09: alloc_mfts failed\n"); 1201 ret = -1; 1202 goto exit; 1203 } 1204 1205 max_mlid = config_all ? sm->p_subn->max_mcast_lid_ho 1206 - IB_LID_MCAST_START_HO : sm->mlids_req_max; 1207 for (i = 0; i <= max_mlid; i++) { 1208 if (sm->mlids_req[i] || 1209 (config_all && sm->p_subn->mboxes[i])) { 1210 sm->mlids_req[i] = 0; 1211 mcast_mgr_process_mlid(sm, i + IB_LID_MCAST_START_HO); 1212 } 1213 } 1214 1215 sm->mlids_req_max = 0; 1216 1217 ret = mcast_mgr_set_mftables(sm); 1218 1219 osm_dump_mcast_routes(sm->p_subn->p_osm); 1220 1221 exit: 1222 CL_PLOCK_RELEASE(sm->p_lock); 1223 OSM_LOG_EXIT(sm->p_log); 1224 return ret; 1225 } 1226