1 /* 2 * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved. 3 * Copyright (c) 2002-2015 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. 5 * 6 * This software is available to you under a choice of one of two 7 * licenses. You may choose to be licensed under the terms of the GNU 8 * General Public License (GPL) Version 2, available from the file 9 * COPYING in the main directory of this source tree, or the 10 * OpenIB.org BSD license below: 11 * 12 * Redistribution and use in source and binary forms, with or 13 * without modification, are permitted provided that the following 14 * conditions are met: 15 * 16 * - Redistributions of source code must retain the above 17 * copyright notice, this list of conditions and the following 18 * disclaimer. 19 * 20 * - Redistributions in binary form must reproduce the above 21 * copyright notice, this list of conditions and the following 22 * disclaimer in the documentation and/or other materials 23 * provided with the distribution. 24 * 25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 32 * SOFTWARE. 33 * 34 */ 35 36 /* 37 * Abstract: 38 * Implementation of osm_ucast_mgr_t. 39 * This file implements the Unicast Manager object. 40 */ 41 42 #if HAVE_CONFIG_H 43 # include <config.h> 44 #endif /* HAVE_CONFIG_H */ 45 46 #include <stdio.h> 47 #include <stdlib.h> 48 #include <string.h> 49 #include <ctype.h> 50 #include <iba/ib_types.h> 51 #include <complib/cl_qmap.h> 52 #include <complib/cl_debug.h> 53 #include <complib/cl_qlist.h> 54 #include <opensm/osm_file_ids.h> 55 #define FILE_ID OSM_FILE_UCAST_MGR_C 56 #include <opensm/osm_ucast_mgr.h> 57 #include <opensm/osm_sm.h> 58 #include <opensm/osm_log.h> 59 #include <opensm/osm_node.h> 60 #include <opensm/osm_switch.h> 61 #include <opensm/osm_helper.h> 62 #include <opensm/osm_msgdef.h> 63 #include <opensm/osm_opensm.h> 64 65 void osm_ucast_mgr_construct(IN osm_ucast_mgr_t * p_mgr) 66 { 67 memset(p_mgr, 0, sizeof(*p_mgr)); 68 } 69 70 void osm_ucast_mgr_destroy(IN osm_ucast_mgr_t * p_mgr) 71 { 72 CL_ASSERT(p_mgr); 73 74 OSM_LOG_ENTER(p_mgr->p_log); 75 76 if (p_mgr->cache_valid) 77 osm_ucast_cache_invalidate(p_mgr); 78 79 OSM_LOG_EXIT(p_mgr->p_log); 80 } 81 82 ib_api_status_t osm_ucast_mgr_init(IN osm_ucast_mgr_t * p_mgr, IN osm_sm_t * sm) 83 { 84 ib_api_status_t status = IB_SUCCESS; 85 86 OSM_LOG_ENTER(sm->p_log); 87 88 osm_ucast_mgr_construct(p_mgr); 89 90 p_mgr->sm = sm; 91 p_mgr->p_log = sm->p_log; 92 p_mgr->p_subn = sm->p_subn; 93 p_mgr->p_lock = sm->p_lock; 94 95 if (sm->p_subn->opt.use_ucast_cache) 96 cl_qmap_init(&p_mgr->cache_sw_tbl); 97 98 OSM_LOG_EXIT(p_mgr->p_log); 99 return status; 100 } 101 102 /********************************************************************** 103 Add each switch's own and neighbor LIDs to its LID matrix 104 **********************************************************************/ 105 static void ucast_mgr_process_hop_0_1(IN cl_map_item_t * p_map_item, 106 IN void *context) 107 { 108 osm_switch_t * p_sw = (osm_switch_t *) p_map_item; 109 osm_node_t *p_remote_node; 110 uint16_t lid, remote_lid; 111 uint8_t i; 112 113 lid = cl_ntoh16(osm_node_get_base_lid(p_sw->p_node, 0)); 114 osm_switch_set_hops(p_sw, lid, 0, 0); 115 116 for (i = 1; i < p_sw->num_ports; i++) { 117 osm_physp_t *p = osm_node_get_physp_ptr(p_sw->p_node, i); 118 p_remote_node = (p && p->p_remote_physp) ? 119 p->p_remote_physp->p_node : NULL; 120 121 if (p_remote_node && p_remote_node->sw && 122 p_remote_node != p_sw->p_node) { 123 remote_lid = osm_node_get_base_lid(p_remote_node, 0); 124 remote_lid = cl_ntoh16(remote_lid); 125 osm_switch_set_hops(p_sw, remote_lid, i, p->hop_wf); 126 } 127 } 128 } 129 130 static void ucast_mgr_process_neighbor(IN osm_ucast_mgr_t * p_mgr, 131 IN osm_switch_t * p_this_sw, 132 IN osm_switch_t * p_remote_sw, 133 IN uint8_t port_num, 134 IN uint8_t remote_port_num) 135 { 136 osm_switch_t *p_sw; 137 cl_map_item_t *item; 138 uint16_t lid_ho; 139 uint16_t hops; 140 osm_physp_t *p; 141 142 OSM_LOG_ENTER(p_mgr->p_log); 143 144 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG, 145 "Node 0x%" PRIx64 ", remote node 0x%" PRIx64 146 ", port %u, remote port %u\n", 147 cl_ntoh64(osm_node_get_node_guid(p_this_sw->p_node)), 148 cl_ntoh64(osm_node_get_node_guid(p_remote_sw->p_node)), 149 port_num, remote_port_num); 150 151 p = osm_node_get_physp_ptr(p_this_sw->p_node, port_num); 152 153 for (item = cl_qmap_head(&p_mgr->p_subn->sw_guid_tbl); 154 item != cl_qmap_end(&p_mgr->p_subn->sw_guid_tbl); 155 item = cl_qmap_next(item)) { 156 p_sw = (osm_switch_t *) item; 157 lid_ho = cl_ntoh16(osm_node_get_base_lid(p_sw->p_node, 0)); 158 hops = osm_switch_get_least_hops(p_remote_sw, lid_ho); 159 if (hops == OSM_NO_PATH) 160 continue; 161 hops += p->hop_wf; 162 if (hops < 163 osm_switch_get_hop_count(p_this_sw, lid_ho, port_num)) { 164 if (osm_switch_set_hops 165 (p_this_sw, lid_ho, port_num, (uint8_t) hops) != 0) 166 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A03: " 167 "cannot set hops for lid %u at switch 0x%" 168 PRIx64 "\n", lid_ho, 169 cl_ntoh64(osm_node_get_node_guid 170 (p_this_sw->p_node))); 171 p_mgr->some_hop_count_set = TRUE; 172 } 173 } 174 175 OSM_LOG_EXIT(p_mgr->p_log); 176 } 177 178 static struct osm_remote_node *find_and_add_remote_sys(osm_switch_t * sw, 179 uint8_t port, 180 boolean_t dor, struct 181 osm_remote_guids_count 182 *r) 183 { 184 unsigned i; 185 osm_physp_t *p = osm_node_get_physp_ptr(sw->p_node, port); 186 osm_node_t *node = p->p_remote_physp->p_node; 187 uint8_t rem_port = osm_physp_get_port_num(p->p_remote_physp); 188 189 for (i = 0; i < r->count; i++) 190 if (r->guids[i].node == node) 191 if (!dor || (r->guids[i].port == rem_port)) 192 return &r->guids[i]; 193 194 r->guids[i].node = node; 195 r->guids[i].forwarded_to = 0; 196 r->guids[i].port = rem_port; 197 r->count++; 198 return &r->guids[i]; 199 } 200 201 static void ucast_mgr_process_port(IN osm_ucast_mgr_t * p_mgr, 202 IN osm_switch_t * p_sw, 203 IN osm_port_t * p_port, 204 IN unsigned lid_offset) 205 { 206 uint16_t min_lid_ho; 207 uint16_t max_lid_ho; 208 uint16_t lid_ho; 209 uint8_t port; 210 boolean_t is_ignored_by_port_prof; 211 ib_net64_t node_guid; 212 unsigned start_from = 1; 213 214 OSM_LOG_ENTER(p_mgr->p_log); 215 216 osm_port_get_lid_range_ho(p_port, &min_lid_ho, &max_lid_ho); 217 218 /* If the lids are zero - then there was some problem with 219 * the initialization. Don't handle this port. */ 220 if (min_lid_ho == 0 || max_lid_ho == 0) { 221 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A04: " 222 "Port 0x%" PRIx64 " (%s port %d) has LID 0. An " 223 "initialization error occurred. Ignoring port\n", 224 cl_ntoh64(osm_port_get_guid(p_port)), 225 p_port->p_node->print_desc, 226 p_port->p_physp->port_num); 227 goto Exit; 228 } 229 230 lid_ho = min_lid_ho + lid_offset; 231 232 if (lid_ho > max_lid_ho) 233 goto Exit; 234 235 if (lid_offset && !p_mgr->is_dor) 236 /* ignore potential overflow - it is handled in osm_switch.c */ 237 start_from = 238 osm_switch_get_port_by_lid(p_sw, lid_ho - 1, OSM_NEW_LFT) + 1; 239 240 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG, 241 "Processing port 0x%" PRIx64 242 " (\'%s\' port %u), LID %u [%u,%u]\n", 243 cl_ntoh64(osm_port_get_guid(p_port)), 244 p_port->p_node->print_desc, p_port->p_physp->port_num, lid_ho, 245 min_lid_ho, max_lid_ho); 246 247 /* TODO - This should be runtime error, not a CL_ASSERT() */ 248 CL_ASSERT(max_lid_ho <= IB_LID_UCAST_END_HO); 249 250 node_guid = osm_node_get_node_guid(p_sw->p_node); 251 252 /* 253 The lid matrix contains the number of hops to each 254 lid from each port. From this information we determine 255 how best to distribute the LID range across the ports 256 that can reach those LIDs. 257 */ 258 port = osm_switch_recommend_path(p_sw, p_port, lid_ho, start_from, 259 p_mgr->p_subn->ignore_existing_lfts, 260 p_mgr->p_subn->opt.lmc, 261 p_mgr->is_dor, 262 p_mgr->p_subn->opt.port_shifting, 263 !lid_offset && p_port->use_scatter, 264 OSM_LFT); 265 266 if (port == OSM_NO_PATH) { 267 /* do not try to overwrite the ppro of non existing port ... */ 268 is_ignored_by_port_prof = TRUE; 269 270 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG, 271 "No path to get to LID %u from switch 0x%" PRIx64 "\n", 272 lid_ho, cl_ntoh64(node_guid)); 273 } else { 274 osm_physp_t *p = osm_node_get_physp_ptr(p_sw->p_node, port); 275 if (!p) 276 goto Exit; 277 278 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG, 279 "Routing LID %u to port %u for switch 0x%" PRIx64 "\n", 280 lid_ho, port, cl_ntoh64(node_guid)); 281 282 /* 283 we would like to optionally ignore this port in equalization 284 as in the case of the Mellanox Anafa Internal PCI TCA port 285 */ 286 is_ignored_by_port_prof = p->is_prof_ignored; 287 288 /* 289 We also would ignore this route if the target lid is of 290 a switch and the port_profile_switch_node is not TRUE 291 */ 292 if (!p_mgr->p_subn->opt.port_profile_switch_nodes) 293 is_ignored_by_port_prof |= 294 (osm_node_get_type(p_port->p_node) == 295 IB_NODE_TYPE_SWITCH); 296 } 297 298 /* 299 We have selected the port for this LID. 300 Write it to the forwarding tables. 301 */ 302 p_sw->new_lft[lid_ho] = port; 303 if (!is_ignored_by_port_prof) { 304 struct osm_remote_node *rem_node_used; 305 osm_switch_count_path(p_sw, port); 306 if (port > 0 && p_port->priv && 307 (rem_node_used = find_and_add_remote_sys(p_sw, port, 308 p_mgr->is_dor, 309 p_port->priv))) 310 rem_node_used->forwarded_to++; 311 } 312 313 Exit: 314 OSM_LOG_EXIT(p_mgr->p_log); 315 } 316 317 static void alloc_ports_priv(osm_ucast_mgr_t * mgr) 318 { 319 cl_qmap_t *port_tbl = &mgr->p_subn->port_guid_tbl; 320 struct osm_remote_guids_count *r; 321 osm_port_t *port; 322 cl_map_item_t *item; 323 unsigned lmc; 324 325 for (item = cl_qmap_head(port_tbl); item != cl_qmap_end(port_tbl); 326 item = cl_qmap_next(item)) { 327 port = (osm_port_t *) item; 328 lmc = ib_port_info_get_lmc(&port->p_physp->port_info); 329 r = malloc(sizeof(*r) + sizeof(r->guids[0]) * (1 << lmc)); 330 if (!r) { 331 OSM_LOG(mgr->p_log, OSM_LOG_ERROR, "ERR 3A09: " 332 "cannot allocate memory to track remote" 333 " systems for lmc > 0\n"); 334 port->priv = NULL; 335 continue; 336 } 337 memset(r, 0, sizeof(*r) + sizeof(r->guids[0]) * (1 << lmc)); 338 port->priv = r; 339 } 340 } 341 342 static void free_ports_priv(osm_ucast_mgr_t * mgr) 343 { 344 cl_qmap_t *port_tbl = &mgr->p_subn->port_guid_tbl; 345 osm_port_t *port; 346 cl_map_item_t *item; 347 for (item = cl_qmap_head(port_tbl); item != cl_qmap_end(port_tbl); 348 item = cl_qmap_next(item)) { 349 port = (osm_port_t *) item; 350 if (port->priv) { 351 free(port->priv); 352 port->priv = NULL; 353 } 354 } 355 } 356 357 static void ucast_mgr_process_tbl(IN cl_map_item_t * p_map_item, 358 IN void *context) 359 { 360 osm_ucast_mgr_t *p_mgr = context; 361 osm_switch_t * p_sw = (osm_switch_t *) p_map_item; 362 unsigned i, lids_per_port; 363 364 OSM_LOG_ENTER(p_mgr->p_log); 365 366 CL_ASSERT(p_sw && p_sw->p_node); 367 368 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG, 369 "Processing switch 0x%" PRIx64 "\n", 370 cl_ntoh64(osm_node_get_node_guid(p_sw->p_node))); 371 372 /* Initialize LIDs in buffer to invalid port number. */ 373 memset(p_sw->new_lft, OSM_NO_PATH, p_sw->max_lid_ho + 1); 374 375 alloc_ports_priv(p_mgr); 376 377 /* 378 Iterate through every port setting LID routes for each 379 port based on base LID and LMC value. 380 */ 381 lids_per_port = 1 << p_mgr->p_subn->opt.lmc; 382 for (i = 0; i < lids_per_port; i++) { 383 cl_qlist_t *list = &p_mgr->port_order_list; 384 cl_list_item_t *item; 385 for (item = cl_qlist_head(list); item != cl_qlist_end(list); 386 item = cl_qlist_next(item)) { 387 osm_port_t *port = cl_item_obj(item, port, list_item); 388 ucast_mgr_process_port(p_mgr, p_sw, port, i); 389 } 390 } 391 392 free_ports_priv(p_mgr); 393 394 OSM_LOG_EXIT(p_mgr->p_log); 395 } 396 397 static void ucast_mgr_process_neighbors(IN cl_map_item_t * p_map_item, 398 IN void *context) 399 { 400 osm_switch_t * p_sw = (osm_switch_t *) p_map_item; 401 osm_ucast_mgr_t * p_mgr = context; 402 osm_node_t *p_node; 403 osm_node_t *p_remote_node; 404 uint32_t port_num; 405 uint8_t remote_port_num; 406 uint32_t num_ports; 407 osm_physp_t *p_physp; 408 409 OSM_LOG_ENTER(p_mgr->p_log); 410 411 p_node = p_sw->p_node; 412 413 CL_ASSERT(p_node); 414 CL_ASSERT(osm_node_get_type(p_node) == IB_NODE_TYPE_SWITCH); 415 416 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG, 417 "Processing switch with GUID 0x%" PRIx64 "\n", 418 cl_ntoh64(osm_node_get_node_guid(p_node))); 419 420 num_ports = osm_node_get_num_physp(p_node); 421 422 /* 423 Start with port 1 to skip the switch's management port. 424 */ 425 for (port_num = 1; port_num < num_ports; port_num++) { 426 p_remote_node = osm_node_get_remote_node(p_node, 427 (uint8_t) port_num, 428 &remote_port_num); 429 if (p_remote_node && p_remote_node->sw 430 && (p_remote_node != p_node)) { 431 /* make sure the link is healthy. If it is not - don't 432 propagate through it. */ 433 p_physp = osm_node_get_physp_ptr(p_node, port_num); 434 if (!p_physp || !osm_link_is_healthy(p_physp)) 435 continue; 436 437 ucast_mgr_process_neighbor(p_mgr, p_sw, 438 p_remote_node->sw, 439 (uint8_t) port_num, 440 remote_port_num); 441 } 442 } 443 444 OSM_LOG_EXIT(p_mgr->p_log); 445 } 446 447 static int set_hop_wf(void *ctx, uint64_t guid, char *p) 448 { 449 osm_ucast_mgr_t *m = ctx; 450 osm_node_t *node = osm_get_node_by_guid(m->p_subn, cl_hton64(guid)); 451 osm_physp_t *physp; 452 unsigned port, hop_wf; 453 char *e; 454 455 if (!node || !node->sw) { 456 OSM_LOG(m->p_log, OSM_LOG_DEBUG, 457 "switch with guid 0x%016" PRIx64 " is not found\n", 458 guid); 459 return 0; 460 } 461 462 if (!p || !*p || !(port = strtoul(p, &e, 0)) || (p == e) || 463 port >= node->sw->num_ports) { 464 OSM_LOG(m->p_log, OSM_LOG_DEBUG, 465 "bad port specified for guid 0x%016" PRIx64 "\n", guid); 466 return 0; 467 } 468 469 p = e + 1; 470 471 if (!*p || !(hop_wf = strtoul(p, &e, 0)) || p == e || hop_wf >= 0x100) { 472 OSM_LOG(m->p_log, OSM_LOG_DEBUG, 473 "bad hop weight factor specified for guid 0x%016" PRIx64 474 "port %u\n", guid, port); 475 return 0; 476 } 477 478 physp = osm_node_get_physp_ptr(node, port); 479 if (!physp) 480 return 0; 481 482 physp->hop_wf = hop_wf; 483 484 return 0; 485 } 486 487 static void set_default_hop_wf(cl_map_item_t * p_map_item, void *ctx) 488 { 489 osm_switch_t *sw = (osm_switch_t *) p_map_item; 490 int i; 491 492 for (i = 1; i < sw->num_ports; i++) { 493 osm_physp_t *p = osm_node_get_physp_ptr(sw->p_node, i); 494 if (p) 495 p->hop_wf = 1; 496 } 497 } 498 499 static int set_search_ordering_ports(void *ctx, uint64_t guid, char *p) 500 { 501 osm_subn_t *p_subn = ctx; 502 osm_node_t *node = osm_get_node_by_guid(p_subn, cl_hton64(guid)); 503 osm_switch_t *sw; 504 uint8_t *search_ordering_ports = NULL; 505 uint8_t port; 506 unsigned int *ports = NULL; 507 const int bpw = sizeof(*ports)*8; 508 int words; 509 int i = 1; /* port 0 maps to port 0 */ 510 511 if (!node || !(sw = node->sw)) { 512 OSM_LOG(&p_subn->p_osm->log, OSM_LOG_VERBOSE, 513 "switch with guid 0x%016" PRIx64 " is not found\n", 514 guid); 515 return 0; 516 } 517 518 if (sw->search_ordering_ports) { 519 OSM_LOG(&p_subn->p_osm->log, OSM_LOG_VERBOSE, 520 "switch with guid 0x%016" PRIx64 " already listed\n", 521 guid); 522 return 0; 523 } 524 525 search_ordering_ports = malloc(sizeof(*search_ordering_ports)*sw->num_ports); 526 if (!search_ordering_ports) { 527 OSM_LOG(&p_subn->p_osm->log, OSM_LOG_ERROR, 528 "ERR 3A07: cannot allocate memory for search_ordering_ports\n"); 529 return -1; 530 } 531 memset(search_ordering_ports, 0, sizeof(*search_ordering_ports)*sw->num_ports); 532 533 /* the ports array is for record keeping of which ports have 534 * been seen */ 535 words = (sw->num_ports + bpw - 1)/bpw; 536 ports = malloc(words*sizeof(*ports)); 537 if (!ports) { 538 OSM_LOG(&p_subn->p_osm->log, OSM_LOG_ERROR, 539 "ERR 3A08: cannot allocate memory for ports\n"); 540 free(search_ordering_ports); 541 return -1; 542 } 543 memset(ports, 0, words*sizeof(*ports)); 544 545 while ((*p != '\0') && (*p != '#')) { 546 char *e; 547 548 port = strtoul(p, &e, 0); 549 if ((p == e) || (port == 0) || (port >= sw->num_ports) || 550 !osm_node_get_physp_ptr(node, port)) { 551 OSM_LOG(&p_subn->p_osm->log, OSM_LOG_VERBOSE, 552 "bad port %d specified for guid 0x%016" PRIx64 "\n", 553 port, guid); 554 free(search_ordering_ports); 555 free(ports); 556 return 0; 557 } 558 559 if (ports[port/bpw] & (1u << (port%bpw))) { 560 OSM_LOG(&p_subn->p_osm->log, OSM_LOG_VERBOSE, 561 "port %d already specified for guid 0x%016" PRIx64 "\n", 562 port, guid); 563 free(search_ordering_ports); 564 free(ports); 565 return 0; 566 } 567 568 ports[port/bpw] |= (1u << (port%bpw)); 569 search_ordering_ports[i++] = port; 570 571 p = e; 572 while (isspace(*p)) { 573 p++; 574 } 575 } 576 577 if (i > 1) { 578 for (port = 1; port < sw->num_ports; port++) { 579 /* fill out the rest of the search_ordering_ports array 580 * in sequence using the remaining unspecified 581 * ports. 582 */ 583 if (!(ports[port/bpw] & (1u << (port%bpw)))) { 584 search_ordering_ports[i++] = port; 585 } 586 } 587 sw->search_ordering_ports = search_ordering_ports; 588 } else { 589 free(search_ordering_ports); 590 } 591 592 free(ports); 593 return 0; 594 } 595 596 int osm_ucast_mgr_build_lid_matrices(IN osm_ucast_mgr_t * p_mgr) 597 { 598 uint32_t i; 599 uint32_t iteration_max; 600 cl_qmap_t *p_sw_guid_tbl; 601 602 p_sw_guid_tbl = &p_mgr->p_subn->sw_guid_tbl; 603 604 OSM_LOG(p_mgr->p_log, OSM_LOG_VERBOSE, 605 "Starting switches' Min Hop Table Assignment\n"); 606 607 /* 608 Set up the weighting factors for the routing. 609 */ 610 cl_qmap_apply_func(p_sw_guid_tbl, set_default_hop_wf, NULL); 611 if (p_mgr->p_subn->opt.hop_weights_file) { 612 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG, 613 "Fetching hop weight factor file \'%s\'\n", 614 p_mgr->p_subn->opt.hop_weights_file); 615 if (parse_node_map(p_mgr->p_subn->opt.hop_weights_file, 616 set_hop_wf, p_mgr)) { 617 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A05: " 618 "cannot parse hop_weights_file \'%s\'\n", 619 p_mgr->p_subn->opt.hop_weights_file); 620 } 621 } 622 623 /* 624 Set the switch matrices for each switch's own port 0 LID(s) 625 then set the lid matrices for the each switch's leaf nodes. 626 */ 627 cl_qmap_apply_func(p_sw_guid_tbl, ucast_mgr_process_hop_0_1, p_mgr); 628 629 /* 630 Get the switch matrices for each switch's neighbors. 631 This process requires a number of iterations equal to 632 the number of switches in the subnet minus 1. 633 634 In each iteration, a switch learns the lid/port/hop 635 information (as contained by a switch's lid matrix) from 636 its immediate neighbors. After each iteration, a switch 637 (and it's neighbors) know more routing information than 638 it did on the previous iteration. 639 Thus, by repeatedly absorbing the routing information of 640 neighbor switches, every switch eventually learns how to 641 route all LIDs on the subnet. 642 643 Note that there may not be any switches in the subnet if 644 we are in simple p2p configuration. 645 */ 646 iteration_max = cl_qmap_count(p_sw_guid_tbl); 647 648 /* 649 If there are switches in the subnet, iterate until the lid 650 matrix has been constructed. Otherwise, just immediately 651 indicate we're done if no switches exist. 652 */ 653 if (iteration_max) { 654 iteration_max--; 655 656 /* 657 we need to find out when the propagation of 658 hop counts has relaxed. So this global variable 659 is preset to 0 on each iteration and if 660 if non of the switches was set will exit the 661 while loop 662 */ 663 p_mgr->some_hop_count_set = TRUE; 664 for (i = 0; (i < iteration_max) && p_mgr->some_hop_count_set; 665 i++) { 666 p_mgr->some_hop_count_set = FALSE; 667 cl_qmap_apply_func(p_sw_guid_tbl, 668 ucast_mgr_process_neighbors, p_mgr); 669 } 670 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG, 671 "Min-hop propagated in %d steps\n", i); 672 } 673 674 return 0; 675 } 676 677 static int ucast_mgr_setup_all_switches(osm_subn_t * p_subn) 678 { 679 osm_switch_t *p_sw; 680 uint16_t lids; 681 682 lids = (uint16_t) cl_ptr_vector_get_size(&p_subn->port_lid_tbl); 683 lids = lids ? lids - 1 : 0; 684 685 for (p_sw = (osm_switch_t *) cl_qmap_head(&p_subn->sw_guid_tbl); 686 p_sw != (osm_switch_t *) cl_qmap_end(&p_subn->sw_guid_tbl); 687 p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item)) { 688 if (osm_switch_prepare_path_rebuild(p_sw, lids)) { 689 OSM_LOG(&p_subn->p_osm->log, OSM_LOG_ERROR, "ERR 3A0B: " 690 "cannot setup switch 0x%016" PRIx64 "\n", 691 cl_ntoh64(osm_node_get_node_guid 692 (p_sw->p_node))); 693 return -1; 694 } 695 if (p_sw->search_ordering_ports) { 696 free(p_sw->search_ordering_ports); 697 p_sw->search_ordering_ports = NULL; 698 } 699 } 700 701 if (p_subn->opt.port_search_ordering_file) { 702 OSM_LOG(&p_subn->p_osm->log, OSM_LOG_DEBUG, 703 "Fetching dimension ports file \'%s\'\n", 704 p_subn->opt.port_search_ordering_file); 705 if (parse_node_map(p_subn->opt.port_search_ordering_file, 706 set_search_ordering_ports, p_subn)) { 707 OSM_LOG(&p_subn->p_osm->log, OSM_LOG_ERROR, "ERR 3A0F: " 708 "cannot parse port_search_ordering_file \'%s\'\n", 709 p_subn->opt.port_search_ordering_file); 710 } 711 } 712 713 return 0; 714 } 715 716 static int add_guid_to_order_list(void *ctx, uint64_t guid, char *p) 717 { 718 osm_ucast_mgr_t *m = ctx; 719 osm_port_t *port = osm_get_port_by_guid(m->p_subn, cl_hton64(guid)); 720 721 if (!port) { 722 OSM_LOG(m->p_log, OSM_LOG_DEBUG, 723 "port guid not found: 0x%016" PRIx64 "\n", guid); 724 return 0; 725 } 726 727 if (port->flag) { 728 OSM_LOG(m->p_log, OSM_LOG_DEBUG, 729 "port guid specified multiple times 0x%016" PRIx64 "\n", 730 guid); 731 return 0; 732 } 733 734 cl_qlist_insert_tail(&m->port_order_list, &port->list_item); 735 port->flag = 1; 736 port->use_scatter = (m->p_subn->opt.guid_routing_order_no_scatter == TRUE) ? 0 : m->p_subn->opt.scatter_ports; 737 738 return 0; 739 } 740 741 static void add_port_to_order_list(cl_map_item_t * p_map_item, void *ctx) 742 { 743 osm_port_t *port = (osm_port_t *) p_map_item; 744 osm_ucast_mgr_t *m = ctx; 745 746 if (!port->flag) { 747 port->use_scatter = m->p_subn->opt.scatter_ports; 748 cl_qlist_insert_tail(&m->port_order_list, &port->list_item); 749 } else 750 port->flag = 0; 751 } 752 753 static int mark_ignored_port(void *ctx, uint64_t guid, char *p) 754 { 755 osm_ucast_mgr_t *m = ctx; 756 osm_node_t *node = osm_get_node_by_guid(m->p_subn, cl_hton64(guid)); 757 osm_physp_t *physp; 758 unsigned port; 759 760 if (!node || !node->sw) { 761 OSM_LOG(m->p_log, OSM_LOG_DEBUG, 762 "switch with guid 0x%016" PRIx64 " is not found\n", 763 guid); 764 return 0; 765 } 766 767 if (!p || !*p || !(port = strtoul(p, NULL, 0)) || 768 port >= node->sw->num_ports) { 769 OSM_LOG(m->p_log, OSM_LOG_DEBUG, 770 "bad port specified for guid 0x%016" PRIx64 "\n", guid); 771 return 0; 772 } 773 774 physp = osm_node_get_physp_ptr(node, port); 775 if (!physp) 776 return 0; 777 778 physp->is_prof_ignored = 1; 779 780 return 0; 781 } 782 783 static void clear_prof_ignore_flag(cl_map_item_t * p_map_item, void *ctx) 784 { 785 osm_switch_t *sw = (osm_switch_t *) p_map_item; 786 int i; 787 788 for (i = 1; i < sw->num_ports; i++) { 789 osm_physp_t *p = osm_node_get_physp_ptr(sw->p_node, i); 790 if (p) 791 p->is_prof_ignored = 0; 792 } 793 } 794 795 static void add_sw_endports_to_order_list(osm_switch_t * sw, 796 osm_ucast_mgr_t * m) 797 { 798 osm_port_t *port; 799 osm_physp_t *p; 800 int i; 801 802 for (i = 1; i < sw->num_ports; i++) { 803 p = osm_node_get_physp_ptr(sw->p_node, i); 804 if (p && p->p_remote_physp && !p->p_remote_physp->p_node->sw) { 805 port = osm_get_port_by_guid(m->p_subn, 806 p->p_remote_physp-> 807 port_guid); 808 if (!port || port->flag) 809 continue; 810 cl_qlist_insert_tail(&m->port_order_list, 811 &port->list_item); 812 port->flag = 1; 813 port->use_scatter = m->p_subn->opt.scatter_ports; 814 } 815 } 816 } 817 818 static void sw_count_endport_links(osm_switch_t * sw) 819 { 820 osm_physp_t *p; 821 int i; 822 823 sw->endport_links = 0; 824 for (i = 1; i < sw->num_ports; i++) { 825 p = osm_node_get_physp_ptr(sw->p_node, i); 826 if (p && p->p_remote_physp && !p->p_remote_physp->p_node->sw) 827 sw->endport_links++; 828 } 829 } 830 831 static int compar_sw_load(const void *s1, const void *s2) 832 { 833 #define get_sw_endport_links(s) (*(osm_switch_t **)s)->endport_links 834 return get_sw_endport_links(s2) - get_sw_endport_links(s1); 835 } 836 837 static void sort_ports_by_switch_load(osm_ucast_mgr_t * m) 838 { 839 int i, num = cl_qmap_count(&m->p_subn->sw_guid_tbl); 840 void **s = malloc(num * sizeof(*s)); 841 if (!s) { 842 OSM_LOG(m->p_log, OSM_LOG_ERROR, "ERR 3A0C: " 843 "No memory, skip by switch load sorting.\n"); 844 return; 845 } 846 s[0] = cl_qmap_head(&m->p_subn->sw_guid_tbl); 847 for (i = 1; i < num; i++) 848 s[i] = cl_qmap_next(s[i - 1]); 849 850 for (i = 0; i < num; i++) 851 sw_count_endport_links(s[i]); 852 853 qsort(s, num, sizeof(*s), compar_sw_load); 854 855 for (i = 0; i < num; i++) 856 add_sw_endports_to_order_list(s[i], m); 857 free(s); 858 } 859 860 static int ucast_mgr_build_lfts(osm_ucast_mgr_t * p_mgr) 861 { 862 cl_qlist_init(&p_mgr->port_order_list); 863 864 if (p_mgr->p_subn->opt.guid_routing_order_file) { 865 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG, 866 "Fetching guid routing order file \'%s\'\n", 867 p_mgr->p_subn->opt.guid_routing_order_file); 868 869 if (parse_node_map(p_mgr->p_subn->opt.guid_routing_order_file, 870 add_guid_to_order_list, p_mgr)) 871 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A0D: " 872 "cannot parse guid routing order file \'%s\'\n", 873 p_mgr->p_subn->opt.guid_routing_order_file); 874 } 875 sort_ports_by_switch_load(p_mgr); 876 877 if (p_mgr->p_subn->opt.port_prof_ignore_file) { 878 cl_qmap_apply_func(&p_mgr->p_subn->sw_guid_tbl, 879 clear_prof_ignore_flag, NULL); 880 if (parse_node_map(p_mgr->p_subn->opt.port_prof_ignore_file, 881 mark_ignored_port, p_mgr)) { 882 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A0E: " 883 "cannot parse port prof ignore file \'%s\'\n", 884 p_mgr->p_subn->opt.port_prof_ignore_file); 885 } 886 } 887 888 cl_qmap_apply_func(&p_mgr->p_subn->port_guid_tbl, 889 add_port_to_order_list, p_mgr); 890 891 cl_qmap_apply_func(&p_mgr->p_subn->sw_guid_tbl, ucast_mgr_process_tbl, 892 p_mgr); 893 894 cl_qlist_remove_all(&p_mgr->port_order_list); 895 896 return 0; 897 } 898 899 static void ucast_mgr_set_fwd_top(IN cl_map_item_t * p_map_item, 900 IN void *cxt) 901 { 902 osm_ucast_mgr_t *p_mgr = cxt; 903 osm_switch_t * p_sw = (osm_switch_t *) p_map_item; 904 osm_node_t *p_node; 905 osm_physp_t *p_physp; 906 osm_dr_path_t *p_path; 907 osm_madw_context_t context; 908 ib_api_status_t status; 909 ib_switch_info_t si; 910 boolean_t set_swinfo_require = FALSE; 911 uint16_t lin_top; 912 uint8_t life_state; 913 914 CL_ASSERT(p_mgr); 915 916 OSM_LOG_ENTER(p_mgr->p_log); 917 918 CL_ASSERT(p_sw && p_sw->max_lid_ho); 919 920 p_node = p_sw->p_node; 921 922 CL_ASSERT(p_node); 923 924 if (p_mgr->max_lid < p_sw->max_lid_ho) 925 p_mgr->max_lid = p_sw->max_lid_ho; 926 927 p_physp = osm_node_get_physp_ptr(p_node, 0); 928 929 CL_ASSERT(p_physp); 930 931 p_path = osm_physp_get_dr_path_ptr(p_physp); 932 933 /* 934 Set the top of the unicast forwarding table. 935 */ 936 si = p_sw->switch_info; 937 lin_top = cl_hton16(p_sw->max_lid_ho); 938 if (lin_top != si.lin_top) { 939 set_swinfo_require = TRUE; 940 si.lin_top = lin_top; 941 context.si_context.lft_top_change = TRUE; 942 } else 943 context.si_context.lft_top_change = FALSE; 944 945 life_state = si.life_state; 946 ib_switch_info_set_life_time(&si, p_mgr->p_subn->opt.packet_life_time); 947 948 if (life_state != si.life_state) 949 set_swinfo_require = TRUE; 950 951 if (set_swinfo_require) { 952 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG, 953 "Setting switch FT top to LID %u\n", p_sw->max_lid_ho); 954 955 context.si_context.light_sweep = FALSE; 956 context.si_context.node_guid = osm_node_get_node_guid(p_node); 957 context.si_context.set_method = TRUE; 958 959 status = osm_req_set(p_mgr->sm, p_path, (uint8_t *) & si, 960 sizeof(si), IB_MAD_ATTR_SWITCH_INFO, 961 0, FALSE, 962 ib_port_info_get_m_key(&p_physp->port_info), 963 CL_DISP_MSGID_NONE, &context); 964 965 if (status != IB_SUCCESS) 966 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A06: " 967 "Sending SwitchInfo attribute failed (%s)\n", 968 ib_get_err_str(status)); 969 } 970 971 OSM_LOG_EXIT(p_mgr->p_log); 972 } 973 974 static int set_lft_block(IN osm_switch_t *p_sw, IN osm_ucast_mgr_t *p_mgr, 975 IN uint16_t block_id_ho) 976 { 977 osm_madw_context_t context; 978 osm_dr_path_t *p_path; 979 osm_physp_t *p_physp; 980 ib_api_status_t status; 981 982 /* 983 Send linear forwarding table blocks to the switch 984 as long as the switch indicates it has blocks needing 985 configuration. 986 */ 987 if (!p_sw->new_lft) { 988 /* any routing should provide the new_lft */ 989 CL_ASSERT(p_mgr->p_subn->opt.use_ucast_cache && 990 p_mgr->cache_valid && !p_sw->need_update); 991 return -1; 992 } 993 994 p_physp = osm_node_get_physp_ptr(p_sw->p_node, 0); 995 if (!p_physp) 996 return -1; 997 998 p_path = osm_physp_get_dr_path_ptr(p_physp); 999 1000 context.lft_context.node_guid = osm_node_get_node_guid(p_sw->p_node); 1001 context.lft_context.set_method = TRUE; 1002 1003 if (!p_sw->need_update && !p_mgr->p_subn->need_update && 1004 !memcmp(p_sw->new_lft + block_id_ho * IB_SMP_DATA_SIZE, 1005 p_sw->lft + block_id_ho * IB_SMP_DATA_SIZE, 1006 IB_SMP_DATA_SIZE)) 1007 return 0; 1008 1009 /* 1010 * Zero the stored LFT block, so in case the MAD will end up 1011 * with error, we will resend it in the next sweep. 1012 */ 1013 memset(p_sw->lft + block_id_ho * IB_SMP_DATA_SIZE, 0, 1014 IB_SMP_DATA_SIZE); 1015 1016 OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG, 1017 "Writing FT block %u to switch 0x%" PRIx64 "\n", block_id_ho, 1018 cl_ntoh64(context.lft_context.node_guid)); 1019 1020 status = osm_req_set(p_mgr->sm, p_path, 1021 p_sw->new_lft + block_id_ho * IB_SMP_DATA_SIZE, 1022 IB_SMP_DATA_SIZE, IB_MAD_ATTR_LIN_FWD_TBL, 1023 cl_hton32(block_id_ho), FALSE, 1024 ib_port_info_get_m_key(&p_physp->port_info), 1025 CL_DISP_MSGID_NONE, &context); 1026 1027 if (status != IB_SUCCESS) { 1028 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A10: " 1029 "Sending linear fwd. tbl. block failed (%s)\n", 1030 ib_get_err_str(status)); 1031 return -1; 1032 } 1033 1034 return 0; 1035 } 1036 1037 static void ucast_mgr_pipeline_fwd_tbl(osm_ucast_mgr_t * p_mgr) 1038 { 1039 cl_qmap_t *tbl; 1040 cl_map_item_t *item; 1041 unsigned i, max_block = p_mgr->max_lid / IB_SMP_DATA_SIZE + 1; 1042 1043 tbl = &p_mgr->p_subn->sw_guid_tbl; 1044 for (i = 0; i < max_block; i++) 1045 for (item = cl_qmap_head(tbl); item != cl_qmap_end(tbl); 1046 item = cl_qmap_next(item)) 1047 set_lft_block((osm_switch_t *)item, p_mgr, i); 1048 } 1049 1050 void osm_ucast_mgr_set_fwd_tables(osm_ucast_mgr_t * p_mgr) 1051 { 1052 p_mgr->max_lid = 0; 1053 1054 cl_qmap_apply_func(&p_mgr->p_subn->sw_guid_tbl, ucast_mgr_set_fwd_top, 1055 p_mgr); 1056 1057 ucast_mgr_pipeline_fwd_tbl(p_mgr); 1058 } 1059 1060 static int ucast_mgr_route(struct osm_routing_engine *r, osm_opensm_t * osm) 1061 { 1062 int ret; 1063 1064 OSM_LOG(&osm->log, OSM_LOG_VERBOSE, 1065 "building routing with \'%s\' routing algorithm...\n", r->name); 1066 1067 /* Set the before each lft build to keep the routes in place between sweeps */ 1068 if (osm->subn.opt.scatter_ports) 1069 srandom(osm->subn.opt.scatter_ports); 1070 1071 if (!r->build_lid_matrices || 1072 (ret = r->build_lid_matrices(r->context)) > 0) 1073 ret = osm_ucast_mgr_build_lid_matrices(&osm->sm.ucast_mgr); 1074 1075 if (ret < 0) { 1076 OSM_LOG(&osm->log, OSM_LOG_ERROR, 1077 "%s: cannot build lid matrices\n", r->name); 1078 return ret; 1079 } 1080 1081 if (!r->ucast_build_fwd_tables || 1082 (ret = r->ucast_build_fwd_tables(r->context)) > 0) 1083 ret = ucast_mgr_build_lfts(&osm->sm.ucast_mgr); 1084 1085 if (ret < 0) { 1086 OSM_LOG(&osm->log, OSM_LOG_ERROR, 1087 "%s: cannot build fwd tables\n", r->name); 1088 return ret; 1089 } 1090 1091 osm->routing_engine_used = r; 1092 1093 osm_ucast_mgr_set_fwd_tables(&osm->sm.ucast_mgr); 1094 1095 return 0; 1096 } 1097 1098 int osm_ucast_mgr_process(IN osm_ucast_mgr_t * p_mgr) 1099 { 1100 osm_opensm_t *p_osm; 1101 struct osm_routing_engine *p_routing_eng; 1102 cl_qmap_t *p_sw_guid_tbl; 1103 int failed = 0; 1104 1105 OSM_LOG_ENTER(p_mgr->p_log); 1106 1107 p_sw_guid_tbl = &p_mgr->p_subn->sw_guid_tbl; 1108 p_osm = p_mgr->p_subn->p_osm; 1109 p_routing_eng = p_osm->routing_engine_list; 1110 1111 CL_PLOCK_EXCL_ACQUIRE(p_mgr->p_lock); 1112 1113 /* 1114 If there are no switches in the subnet, we are done. 1115 */ 1116 if (cl_qmap_count(p_sw_guid_tbl) == 0 || 1117 ucast_mgr_setup_all_switches(p_mgr->p_subn) < 0) 1118 goto Exit; 1119 1120 failed = -1; 1121 p_osm->routing_engine_used = NULL; 1122 while (p_routing_eng) { 1123 failed = ucast_mgr_route(p_routing_eng, p_osm); 1124 if (!failed) 1125 break; 1126 p_routing_eng = p_routing_eng->next; 1127 } 1128 1129 if (!p_osm->routing_engine_used && 1130 p_osm->no_fallback_routing_engine != TRUE) { 1131 /* If configured routing algorithm failed, use default MinHop */ 1132 failed = ucast_mgr_route(p_osm->default_routing_engine, p_osm); 1133 } 1134 1135 if (p_osm->routing_engine_used) { 1136 OSM_LOG(p_mgr->p_log, OSM_LOG_INFO, 1137 "%s tables configured on all switches\n", 1138 osm_routing_engine_type_str(p_osm-> 1139 routing_engine_used->type)); 1140 1141 if (p_mgr->p_subn->opt.use_ucast_cache) 1142 p_mgr->cache_valid = TRUE; 1143 } else { 1144 p_mgr->p_subn->subnet_initialization_error = TRUE; 1145 OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, 1146 "No routing engine able to successfully configure " 1147 " switch tables on current fabric\n"); 1148 } 1149 Exit: 1150 CL_PLOCK_RELEASE(p_mgr->p_lock); 1151 OSM_LOG_EXIT(p_mgr->p_log); 1152 return failed; 1153 } 1154 1155 static int ucast_build_lid_matrices(void *context) 1156 { 1157 return osm_ucast_mgr_build_lid_matrices(context); 1158 } 1159 1160 static int ucast_build_lfts(void *context) 1161 { 1162 return ucast_mgr_build_lfts(context); 1163 } 1164 1165 int osm_ucast_minhop_setup(struct osm_routing_engine *r, osm_opensm_t * osm) 1166 { 1167 r->context = &osm->sm.ucast_mgr; 1168 r->build_lid_matrices = ucast_build_lid_matrices; 1169 r->ucast_build_fwd_tables = ucast_build_lfts; 1170 return 0; 1171 } 1172 1173 static int ucast_dor_build_lfts(void *context) 1174 { 1175 osm_ucast_mgr_t *mgr = context; 1176 int ret; 1177 1178 mgr->is_dor = 1; 1179 ret = ucast_mgr_build_lfts(mgr); 1180 mgr->is_dor = 0; 1181 1182 return ret; 1183 } 1184 1185 int osm_ucast_dor_setup(struct osm_routing_engine *r, osm_opensm_t * osm) 1186 { 1187 r->context = &osm->sm.ucast_mgr; 1188 r->build_lid_matrices = ucast_build_lid_matrices; 1189 r->ucast_build_fwd_tables = ucast_dor_build_lfts; 1190 return 0; 1191 } 1192 1193 int ucast_dummy_build_lid_matrices(void *context) 1194 { 1195 return 0; 1196 } 1197