1 /* 2 * Copyright (c) 2009 Simula Research Laboratory. All rights reserved. 3 * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. 4 * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved. 5 * Copyright (c) 2002-2011 Mellanox Technologies LTD. All rights reserved. 6 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. 7 * 8 * This software is available to you under a choice of one of two 9 * licenses. You may choose to be licensed under the terms of the GNU 10 * General Public License (GPL) Version 2, available from the file 11 * COPYING in the main directory of this source tree, or the 12 * OpenIB.org BSD license below: 13 * 14 * Redistribution and use in source and binary forms, with or 15 * without modification, are permitted provided that the following 16 * conditions are met: 17 * 18 * - Redistributions of source code must retain the above 19 * copyright notice, this list of conditions and the following 20 * disclaimer. 21 * 22 * - Redistributions in binary form must reproduce the above 23 * copyright notice, this list of conditions and the following 24 * disclaimer in the documentation and/or other materials 25 * provided with the distribution. 26 * 27 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 28 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 29 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 30 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 31 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 32 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 33 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 34 * SOFTWARE. 
35 * 36 */ 37 38 /* 39 * Abstract: 40 * Implementation of OpenSM FatTree routing 41 */ 42 43 #if HAVE_CONFIG_H 44 # include <config.h> 45 #endif 46 47 #include <stdlib.h> 48 #include <string.h> 49 #include <ctype.h> 50 #include <errno.h> 51 #include <iba/ib_types.h> 52 #include <complib/cl_qmap.h> 53 #include <complib/cl_debug.h> 54 #include <opensm/osm_file_ids.h> 55 #define FILE_ID OSM_FILE_UCAST_FTREE_C 56 #include <opensm/osm_opensm.h> 57 #include <opensm/osm_switch.h> 58 59 /* 60 * FatTree rank is bounded between 2 and 8: 61 * - Tree of rank 1 has only trivial routing paths, 62 * so no need to use FatTree routing. 63 * - Why maximum rank is 8: 64 * Each node (switch) is assigned a unique tuple. 65 * Switches are stored in two cl_qmaps - one is 66 * ordered by guid, and the other by a key that is 67 * generated from tuple. Since cl_qmap supports only 68 * a 64-bit key, the maximal tuple length is 8 bytes, 69 * which means that maximal tree rank is 8. 70 * Note that the above also implies that each switch 71 * can have at most 255 up/down ports.
72 */ 73 74 #define FAT_TREE_MIN_RANK 2 75 #define FAT_TREE_MAX_RANK 8 76 77 typedef enum { 78 FTREE_DIRECTION_DOWN = -1, 79 FTREE_DIRECTION_SAME, 80 FTREE_DIRECTION_UP 81 } ftree_direction_t; 82 83 /*************************************************** 84 ** 85 ** Forward references 86 ** 87 ***************************************************/ 88 struct ftree_sw_t_; 89 struct ftree_hca_t_; 90 struct ftree_port_t_; 91 struct ftree_port_group_t_; 92 struct ftree_fabric_t_; 93 94 /*************************************************** 95 ** 96 ** ftree_tuple_t definition 97 ** 98 ***************************************************/ 99 100 #define FTREE_TUPLE_BUFF_LEN 1024 101 #define FTREE_TUPLE_LEN 8 102 103 typedef uint8_t ftree_tuple_t[FTREE_TUPLE_LEN]; 104 typedef uint64_t ftree_tuple_key_t; 105 106 /*************************************************** 107 ** 108 ** ftree_sw_table_element_t definition 109 ** 110 ***************************************************/ 111 112 typedef struct { 113 cl_map_item_t map_item; 114 struct ftree_sw_t_ *p_sw; 115 } ftree_sw_tbl_element_t; 116 117 /*************************************************** 118 ** 119 ** ftree_port_t definition 120 ** 121 ***************************************************/ 122 123 typedef struct ftree_port_t_ { 124 cl_map_item_t map_item; 125 uint8_t port_num; /* port number on the current node */ 126 uint8_t remote_port_num; /* port number on the remote node */ 127 uint32_t counter_up; /* number of allocated routes upwards */ 128 uint32_t counter_down; /* number of allocated routes downwards */ 129 } ftree_port_t; 130 131 /*************************************************** 132 ** 133 ** ftree_port_group_t definition 134 ** 135 ***************************************************/ 136 137 typedef union ftree_hca_or_sw_ { 138 struct ftree_hca_t_ *p_hca; 139 struct ftree_sw_t_ *p_sw; 140 } ftree_hca_or_sw; 141 142 typedef struct ftree_port_group_t_ { 143 cl_map_item_t map_item; 144 uint16_t lid; /* lid of the 
current node */ 145 uint16_t remote_lid; /* lid of the remote node */ 146 ib_net64_t port_guid; /* port guid of this port */ 147 ib_net64_t node_guid; /* this node's guid */ 148 uint8_t node_type; /* this node's type */ 149 ib_net64_t remote_port_guid; /* port guid of the remote port */ 150 ib_net64_t remote_node_guid; /* node guid of the remote node */ 151 uint8_t remote_node_type; /* IB_NODE_TYPE_{CA,SWITCH,ROUTER,...} */ 152 ftree_hca_or_sw hca_or_sw; /* pointer to this hca/switch */ 153 ftree_hca_or_sw remote_hca_or_sw; /* pointer to remote hca/switch */ 154 cl_ptr_vector_t ports; /* vector of ports to the same lid */ 155 boolean_t is_cn; /* whether this port is a compute node */ 156 boolean_t is_io; /* whether this port is an I/O node */ 157 uint32_t counter_down; /* number of allocated routes downwards */ 158 uint32_t counter_up; /* number of allocated routes upwards */ 159 } ftree_port_group_t; 160 161 /*************************************************** 162 ** 163 ** ftree_sw_t definition 164 ** 165 ***************************************************/ 166 167 typedef struct ftree_sw_t_ { 168 cl_map_item_t map_item; 169 osm_switch_t *p_osm_sw; 170 uint32_t rank; 171 ftree_tuple_t tuple; 172 uint16_t lid; 173 ftree_port_group_t **down_port_groups; 174 uint8_t down_port_groups_num; 175 ftree_port_group_t **sibling_port_groups; 176 uint8_t sibling_port_groups_num; 177 ftree_port_group_t **up_port_groups; 178 uint8_t up_port_groups_num; 179 boolean_t is_leaf; 180 unsigned down_port_groups_idx; 181 uint8_t *hops; 182 uint32_t min_counter_down; 183 boolean_t counter_up_changed; 184 } ftree_sw_t; 185 186 /*************************************************** 187 ** 188 ** ftree_hca_t definition 189 ** 190 ***************************************************/ 191 192 typedef struct ftree_hca_t_ { 193 cl_map_item_t map_item; 194 osm_node_t *p_osm_node; 195 ftree_port_group_t **up_port_groups; 196 uint8_t *disconnected_ports; 197 uint16_t up_port_groups_num; 198 unsigned 
cn_num; 199 } ftree_hca_t; 200 201 /*************************************************** 202 ** 203 ** ftree_fabric_t definition 204 ** 205 ***************************************************/ 206 207 typedef struct ftree_fabric_t_ { 208 osm_opensm_t *p_osm; 209 osm_subn_t *p_subn; 210 cl_qmap_t hca_tbl; 211 cl_qmap_t sw_tbl; 212 cl_qmap_t sw_by_tuple_tbl; 213 cl_qmap_t cn_guid_tbl; 214 cl_qmap_t io_guid_tbl; 215 unsigned cn_num; 216 unsigned ca_ports; 217 uint8_t leaf_switch_rank; 218 uint8_t max_switch_rank; 219 ftree_sw_t **leaf_switches; 220 uint32_t leaf_switches_num; 221 uint16_t max_cn_per_leaf; 222 uint16_t lft_max_lid; 223 boolean_t fabric_built; 224 } ftree_fabric_t; 225 226 static inline osm_subn_t *ftree_get_subnet(IN ftree_fabric_t * p_ftree) 227 { 228 return p_ftree->p_subn; 229 } 230 231 /*************************************************** 232 ** 233 ** comparators 234 ** 235 ***************************************************/ 236 237 static int compare_switches_by_index(IN const void *p1, IN const void *p2) 238 { 239 ftree_sw_t **pp_sw1 = (ftree_sw_t **) p1; 240 ftree_sw_t **pp_sw2 = (ftree_sw_t **) p2; 241 242 uint16_t i; 243 for (i = 0; i < FTREE_TUPLE_LEN; i++) { 244 if ((*pp_sw1)->tuple[i] > (*pp_sw2)->tuple[i]) 245 return 1; 246 if ((*pp_sw1)->tuple[i] < (*pp_sw2)->tuple[i]) 247 return -1; 248 } 249 return 0; 250 } 251 252 /***************************************************/ 253 254 static int 255 compare_port_groups_by_remote_switch_index(IN const void *p1, IN const void *p2) 256 { 257 ftree_port_group_t **pp_g1 = (ftree_port_group_t **) p1; 258 ftree_port_group_t **pp_g2 = (ftree_port_group_t **) p2; 259 260 return 261 compare_switches_by_index(&((*pp_g1)->remote_hca_or_sw.p_sw), 262 &((*pp_g2)->remote_hca_or_sw.p_sw)); 263 } 264 265 /*************************************************** 266 ** 267 ** ftree_tuple_t functions 268 ** 269 ***************************************************/ 270 271 static void tuple_init(IN ftree_tuple_t tuple) 
272 { 273 memset(tuple, 0xFF, FTREE_TUPLE_LEN); 274 } 275 276 /***************************************************/ 277 278 static inline boolean_t tuple_assigned(IN ftree_tuple_t tuple) 279 { 280 return (tuple[0] != 0xFF); 281 } 282 283 /***************************************************/ 284 285 #define FTREE_TUPLE_BUFFERS_NUM 6 286 287 static const char *tuple_to_str(IN ftree_tuple_t tuple) 288 { 289 static char buffer[FTREE_TUPLE_BUFFERS_NUM][FTREE_TUPLE_BUFF_LEN]; 290 static uint8_t ind = 0; 291 char *ret_buffer; 292 uint32_t i; 293 294 if (!tuple_assigned(tuple)) 295 return "INDEX.NOT.ASSIGNED"; 296 297 buffer[ind][0] = '\0'; 298 299 for (i = 0; (i < FTREE_TUPLE_LEN) && (tuple[i] != 0xFF); i++) { 300 if ((strlen(buffer[ind]) + 10) > FTREE_TUPLE_BUFF_LEN) 301 return "INDEX.TOO.LONG"; 302 if (i != 0) 303 strcat(buffer[ind], "."); 304 sprintf(&buffer[ind][strlen(buffer[ind])], "%u", tuple[i]); 305 } 306 307 ret_buffer = buffer[ind]; 308 ind = (ind + 1) % FTREE_TUPLE_BUFFERS_NUM; 309 return ret_buffer; 310 } /* tuple_to_str() */ 311 312 /***************************************************/ 313 314 static inline ftree_tuple_key_t tuple_to_key(IN ftree_tuple_t tuple) 315 { 316 ftree_tuple_key_t key; 317 memcpy(&key, tuple, FTREE_TUPLE_LEN); 318 return key; 319 } 320 321 /***************************************************/ 322 323 static inline void tuple_from_key(IN ftree_tuple_t tuple, 324 IN ftree_tuple_key_t key) 325 { 326 memcpy(tuple, &key, FTREE_TUPLE_LEN); 327 } 328 329 /*************************************************** 330 ** 331 ** ftree_sw_tbl_element_t functions 332 ** 333 ***************************************************/ 334 335 static ftree_sw_tbl_element_t *sw_tbl_element_create(IN ftree_sw_t * p_sw) 336 { 337 ftree_sw_tbl_element_t *p_element = 338 (ftree_sw_tbl_element_t *) malloc(sizeof(ftree_sw_tbl_element_t)); 339 if (!p_element) 340 return NULL; 341 memset(p_element, 0, sizeof(ftree_sw_tbl_element_t)); 342 343 p_element->p_sw = p_sw; 344 
return p_element; 345 } 346 347 /***************************************************/ 348 349 static void sw_tbl_element_destroy(IN ftree_sw_tbl_element_t * p_element) 350 { 351 free(p_element); 352 } 353 354 /*************************************************** 355 ** 356 ** ftree_port_t functions 357 ** 358 ***************************************************/ 359 360 static ftree_port_t *port_create(IN uint8_t port_num, 361 IN uint8_t remote_port_num) 362 { 363 ftree_port_t *p_port = (ftree_port_t *) malloc(sizeof(ftree_port_t)); 364 if (!p_port) 365 return NULL; 366 memset(p_port, 0, sizeof(ftree_port_t)); 367 368 p_port->port_num = port_num; 369 p_port->remote_port_num = remote_port_num; 370 371 return p_port; 372 } 373 374 /***************************************************/ 375 376 static void port_destroy(IN ftree_port_t * p_port) 377 { 378 free(p_port); 379 } 380 381 /*************************************************** 382 ** 383 ** ftree_port_group_t functions 384 ** 385 ***************************************************/ 386 387 static ftree_port_group_t *port_group_create(IN uint16_t lid, 388 IN uint16_t remote_lid, 389 IN ib_net64_t port_guid, 390 IN ib_net64_t node_guid, 391 IN uint8_t node_type, 392 IN void *p_hca_or_sw, 393 IN ib_net64_t remote_port_guid, 394 IN ib_net64_t remote_node_guid, 395 IN uint8_t remote_node_type, 396 IN void *p_remote_hca_or_sw, 397 IN boolean_t is_cn, 398 IN boolean_t is_io) 399 { 400 ftree_port_group_t *p_group = 401 (ftree_port_group_t *) malloc(sizeof(ftree_port_group_t)); 402 if (p_group == NULL) 403 return NULL; 404 memset(p_group, 0, sizeof(ftree_port_group_t)); 405 406 p_group->lid = lid; 407 p_group->remote_lid = remote_lid; 408 memcpy(&p_group->port_guid, &port_guid, sizeof(ib_net64_t)); 409 memcpy(&p_group->node_guid, &node_guid, sizeof(ib_net64_t)); 410 memcpy(&p_group->remote_port_guid, &remote_port_guid, 411 sizeof(ib_net64_t)); 412 memcpy(&p_group->remote_node_guid, &remote_node_guid, 413 
sizeof(ib_net64_t)); 414 415 p_group->node_type = node_type; 416 switch (node_type) { 417 case IB_NODE_TYPE_CA: 418 p_group->hca_or_sw.p_hca = (ftree_hca_t *) p_hca_or_sw; 419 break; 420 case IB_NODE_TYPE_SWITCH: 421 p_group->hca_or_sw.p_sw = (ftree_sw_t *) p_hca_or_sw; 422 break; 423 default: 424 /* we shouldn't get here - port is created only in hca or switch */ 425 CL_ASSERT(0); 426 } 427 428 p_group->remote_node_type = remote_node_type; 429 switch (remote_node_type) { 430 case IB_NODE_TYPE_CA: 431 p_group->remote_hca_or_sw.p_hca = 432 (ftree_hca_t *) p_remote_hca_or_sw; 433 break; 434 case IB_NODE_TYPE_SWITCH: 435 p_group->remote_hca_or_sw.p_sw = 436 (ftree_sw_t *) p_remote_hca_or_sw; 437 break; 438 default: 439 /* we shouldn't get here - port is created only in hca or switch */ 440 CL_ASSERT(0); 441 } 442 443 cl_ptr_vector_init(&p_group->ports, 0, /* min size */ 444 8); /* grow size */ 445 p_group->is_cn = is_cn; 446 p_group->is_io = is_io; 447 return p_group; 448 } /* port_group_create() */ 449 450 /***************************************************/ 451 452 static void port_group_destroy(IN ftree_port_group_t * p_group) 453 { 454 uint32_t i; 455 uint32_t size; 456 ftree_port_t *p_port; 457 458 if (!p_group) 459 return; 460 461 /* remove all the elements of p_group->ports vector */ 462 size = cl_ptr_vector_get_size(&p_group->ports); 463 for (i = 0; i < size; i++) 464 if (cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port) == CL_SUCCESS) 465 port_destroy(p_port); 466 467 cl_ptr_vector_destroy(&p_group->ports); 468 free(p_group); 469 } /* port_group_destroy() */ 470 471 /***************************************************/ 472 473 static void port_group_dump(IN ftree_fabric_t * p_ftree, 474 IN ftree_port_group_t * p_group, 475 IN ftree_direction_t direction) 476 { 477 ftree_port_t *p_port; 478 uint32_t size; 479 uint32_t i; 480 char *buff; 481 482 if (!p_group) 483 return; 484 485 if (!OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG)) 486 return; 
487 488 size = cl_ptr_vector_get_size(&p_group->ports); 489 490 buff = calloc(10, 1024); 491 if (!buff) { 492 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB33: " 493 "Failed to allocate buffer\n"); 494 return; 495 } 496 497 for (i = 0; i < size; i++) { 498 cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port); 499 CL_ASSERT(p_port); 500 501 if (i != 0) 502 strcat(buff, ", "); 503 sprintf(buff + strlen(buff), "%u", p_port->port_num); 504 } 505 506 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 507 " Port Group of size %u, port(s): %s, direction: %s\n" 508 " Local <--> Remote GUID (LID):" 509 "0x%016" PRIx64 " (0x%04x) <--> 0x%016" PRIx64 " (0x%04x)\n", 510 size, buff, 511 (direction == FTREE_DIRECTION_DOWN) ? "DOWN" : (direction == 512 FTREE_DIRECTION_SAME) 513 ? "SIBLING" : "UP", cl_ntoh64(p_group->port_guid), 514 p_group->lid, cl_ntoh64(p_group->remote_port_guid), 515 p_group->remote_lid); 516 517 free(buff); 518 519 } /* port_group_dump() */ 520 521 /***************************************************/ 522 523 static void port_group_add_port(IN ftree_port_group_t * p_group, 524 IN uint8_t port_num, IN uint8_t remote_port_num) 525 { 526 uint16_t i; 527 ftree_port_t *p_port; 528 529 for (i = 0; i < cl_ptr_vector_get_size(&p_group->ports); i++) { 530 cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port); 531 if (p_port->port_num == port_num) 532 return; 533 } 534 535 p_port = port_create(port_num, remote_port_num); 536 CL_ASSERT(p_port); 537 cl_ptr_vector_insert(&p_group->ports, p_port, NULL); 538 } 539 540 /*************************************************** 541 ** 542 ** ftree_sw_t functions 543 ** 544 ***************************************************/ 545 546 static ftree_sw_t *sw_create(IN osm_switch_t * p_osm_sw) 547 { 548 ftree_sw_t *p_sw; 549 uint8_t ports_num; 550 551 /* make sure that the switch has ports */ 552 if (p_osm_sw->num_ports == 1) 553 return NULL; 554 555 p_sw = (ftree_sw_t *) malloc(sizeof(ftree_sw_t)); 556 if (p_sw == NULL) 557 return 
NULL; 558 memset(p_sw, 0, sizeof(ftree_sw_t)); 559 560 p_sw->p_osm_sw = p_osm_sw; 561 p_sw->rank = 0xFFFFFFFF; 562 tuple_init(p_sw->tuple); 563 564 p_sw->lid = 565 cl_ntoh16(osm_node_get_base_lid(p_sw->p_osm_sw->p_node, 0)); 566 567 ports_num = osm_node_get_num_physp(p_sw->p_osm_sw->p_node); 568 p_sw->down_port_groups = 569 (ftree_port_group_t **) malloc(ports_num * 570 sizeof(ftree_port_group_t *)); 571 if (p_sw->down_port_groups == NULL) 572 goto FREE_P_SW; 573 memset(p_sw->down_port_groups, 0, ports_num * sizeof(ftree_port_group_t *)); 574 575 p_sw->up_port_groups = 576 (ftree_port_group_t **) malloc(ports_num * 577 sizeof(ftree_port_group_t *)); 578 if (p_sw->up_port_groups == NULL) 579 goto FREE_DOWN; 580 memset(p_sw->up_port_groups, 0, ports_num * sizeof(ftree_port_group_t *)); 581 582 p_sw->sibling_port_groups = 583 (ftree_port_group_t **) malloc(ports_num * 584 sizeof(ftree_port_group_t *)); 585 if (p_sw->sibling_port_groups == NULL) 586 goto FREE_UP; 587 memset(p_sw->sibling_port_groups, 0, ports_num * sizeof(ftree_port_group_t *)); 588 589 /* initialize lft buffer */ 590 memset(p_osm_sw->new_lft, OSM_NO_PATH, p_osm_sw->lft_size); 591 p_sw->hops = malloc((p_osm_sw->max_lid_ho + 1) * sizeof(*(p_sw->hops))); 592 if (p_sw->hops == NULL) 593 goto FREE_SIBLING; 594 595 memset(p_sw->hops, OSM_NO_PATH, p_osm_sw->max_lid_ho + 1); 596 597 return p_sw; 598 599 FREE_SIBLING: 600 free(p_sw->sibling_port_groups); 601 FREE_UP: 602 free(p_sw->up_port_groups); 603 FREE_DOWN: 604 free(p_sw->down_port_groups); 605 FREE_P_SW: 606 free(p_sw); 607 return NULL; 608 } /* sw_create() */ 609 610 /***************************************************/ 611 612 static void sw_destroy(IN ftree_sw_t * p_sw) 613 { 614 uint8_t i; 615 616 if (!p_sw) 617 return; 618 free(p_sw->hops); 619 620 for (i = 0; i < p_sw->down_port_groups_num; i++) 621 port_group_destroy(p_sw->down_port_groups[i]); 622 for (i = 0; i < p_sw->sibling_port_groups_num; i++) 623 
port_group_destroy(p_sw->sibling_port_groups[i]); 624 for (i = 0; i < p_sw->up_port_groups_num; i++) 625 port_group_destroy(p_sw->up_port_groups[i]); 626 free(p_sw->down_port_groups); 627 free(p_sw->sibling_port_groups); 628 free(p_sw->up_port_groups); 629 630 free(p_sw); 631 } /* sw_destroy() */ 632 633 /***************************************************/ 634 635 static uint64_t sw_get_guid_no(IN ftree_sw_t * p_sw) 636 { 637 if (!p_sw) 638 return 0; 639 return osm_node_get_node_guid(p_sw->p_osm_sw->p_node); 640 } 641 642 /***************************************************/ 643 644 static uint64_t sw_get_guid_ho(IN ftree_sw_t * p_sw) 645 { 646 return cl_ntoh64(sw_get_guid_no(p_sw)); 647 } 648 649 /***************************************************/ 650 651 static void sw_dump(IN ftree_fabric_t * p_ftree, IN ftree_sw_t * p_sw) 652 { 653 uint32_t i; 654 655 if (!p_sw) 656 return; 657 658 if (!OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG)) 659 return; 660 661 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 662 "Switch index: %s, GUID: 0x%016" PRIx64 663 ", Ports: %u DOWN, %u SIBLINGS, %u UP\n", 664 tuple_to_str(p_sw->tuple), sw_get_guid_ho(p_sw), 665 p_sw->down_port_groups_num, p_sw->sibling_port_groups_num, 666 p_sw->up_port_groups_num); 667 668 for (i = 0; i < p_sw->down_port_groups_num; i++) 669 port_group_dump(p_ftree, p_sw->down_port_groups[i], 670 FTREE_DIRECTION_DOWN); 671 for (i = 0; i < p_sw->sibling_port_groups_num; i++) 672 port_group_dump(p_ftree, p_sw->sibling_port_groups[i], 673 FTREE_DIRECTION_SAME); 674 for (i = 0; i < p_sw->up_port_groups_num; i++) 675 port_group_dump(p_ftree, p_sw->up_port_groups[i], 676 FTREE_DIRECTION_UP); 677 678 } /* sw_dump() */ 679 680 /***************************************************/ 681 682 static boolean_t sw_ranked(IN ftree_sw_t * p_sw) 683 { 684 return (p_sw->rank != 0xFFFFFFFF); 685 } 686 687 /***************************************************/ 688 689 static ftree_port_group_t 
*sw_get_port_group_by_remote_lid(IN ftree_sw_t * p_sw, 690 IN uint16_t 691 remote_lid, 692 IN ftree_direction_t 693 direction) 694 { 695 uint32_t i; 696 uint32_t size; 697 ftree_port_group_t **port_groups; 698 699 if (direction == FTREE_DIRECTION_UP) { 700 port_groups = p_sw->up_port_groups; 701 size = p_sw->up_port_groups_num; 702 } else if (direction == FTREE_DIRECTION_SAME) { 703 port_groups = p_sw->sibling_port_groups; 704 size = p_sw->sibling_port_groups_num; 705 } else { 706 port_groups = p_sw->down_port_groups; 707 size = p_sw->down_port_groups_num; 708 } 709 710 for (i = 0; i < size; i++) 711 if (remote_lid == port_groups[i]->remote_lid) 712 return port_groups[i]; 713 714 return NULL; 715 } /* sw_get_port_group_by_remote_lid() */ 716 717 /***************************************************/ 718 719 static void sw_add_port(IN ftree_sw_t * p_sw, IN uint8_t port_num, 720 IN uint8_t remote_port_num, IN uint16_t lid, 721 IN uint16_t remote_lid, IN ib_net64_t port_guid, 722 IN ib_net64_t remote_port_guid, 723 IN ib_net64_t remote_node_guid, 724 IN uint8_t remote_node_type, 725 IN void *p_remote_hca_or_sw, 726 IN ftree_direction_t direction) 727 { 728 ftree_port_group_t *p_group = 729 sw_get_port_group_by_remote_lid(p_sw, remote_lid, direction); 730 731 if (!p_group) { 732 p_group = port_group_create(lid, remote_lid, 733 port_guid, sw_get_guid_no(p_sw), 734 IB_NODE_TYPE_SWITCH, p_sw, 735 remote_port_guid, remote_node_guid, 736 remote_node_type, 737 p_remote_hca_or_sw, FALSE, FALSE); 738 CL_ASSERT(p_group); 739 740 if (direction == FTREE_DIRECTION_UP) { 741 p_sw->up_port_groups[p_sw->up_port_groups_num++] = 742 p_group; 743 } else if (direction == FTREE_DIRECTION_SAME) { 744 p_sw-> 745 sibling_port_groups[p_sw->sibling_port_groups_num++] 746 = p_group; 747 } else 748 p_sw->down_port_groups[p_sw->down_port_groups_num++] = 749 p_group; 750 } 751 port_group_add_port(p_group, port_num, remote_port_num); 752 753 } /* sw_add_port() */ 754 755 
/***************************************************/ 756 757 static inline cl_status_t sw_set_hops(IN ftree_sw_t * p_sw, IN uint16_t lid, 758 IN uint8_t port_num, IN uint8_t hops, 759 IN boolean_t is_target_sw) 760 { 761 /* set local min hop table(LID) */ 762 p_sw->hops[lid] = hops; 763 if (is_target_sw) 764 return osm_switch_set_hops(p_sw->p_osm_sw, lid, port_num, hops); 765 return 0; 766 } 767 768 /***************************************************/ 769 770 static int set_hops_on_remote_sw(IN ftree_port_group_t * p_group, 771 IN uint16_t target_lid, IN uint8_t hops, 772 IN boolean_t is_target_sw) 773 { 774 ftree_port_t *p_port; 775 uint8_t i, ports_num; 776 ftree_sw_t *p_remote_sw = p_group->remote_hca_or_sw.p_sw; 777 778 /* if lid is a switch, we set the min hop table in the osm_switch struct */ 779 CL_ASSERT(p_group->remote_node_type == IB_NODE_TYPE_SWITCH); 780 p_remote_sw->hops[target_lid] = hops; 781 782 /* If target lid is a switch we set the min hop table values 783 * for each port on the associated osm_sw struct */ 784 if (!is_target_sw) 785 return 0; 786 787 ports_num = (uint8_t) cl_ptr_vector_get_size(&p_group->ports); 788 for (i = 0; i < ports_num; i++) { 789 cl_ptr_vector_at(&p_group->ports, i, (void *)&p_port); 790 if (sw_set_hops(p_remote_sw, target_lid, 791 p_port->remote_port_num, hops, is_target_sw)) 792 return -1; 793 } 794 return 0; 795 } 796 797 /***************************************************/ 798 799 static inline uint8_t 800 sw_get_least_hops(IN ftree_sw_t * p_sw, IN uint16_t target_lid) 801 { 802 CL_ASSERT(p_sw->hops != NULL); 803 return p_sw->hops[target_lid]; 804 } 805 806 /*************************************************** 807 ** 808 ** ftree_hca_t functions 809 ** 810 ***************************************************/ 811 812 static ftree_hca_t *hca_create(IN osm_node_t * p_osm_node) 813 { 814 ftree_hca_t *p_hca = (ftree_hca_t *) malloc(sizeof(ftree_hca_t)); 815 if (p_hca == NULL) 816 return NULL; 817 memset(p_hca, 0, 
sizeof(ftree_hca_t)); 818 819 p_hca->p_osm_node = p_osm_node; 820 p_hca->up_port_groups = (ftree_port_group_t **) 821 malloc(osm_node_get_num_physp(p_hca->p_osm_node) * 822 sizeof(ftree_port_group_t *)); 823 if (!p_hca->up_port_groups) { 824 free(p_hca); 825 return NULL; 826 } 827 memset(p_hca->up_port_groups, 0, osm_node_get_num_physp(p_hca->p_osm_node) * 828 sizeof(ftree_port_group_t *)); 829 830 p_hca->disconnected_ports = (uint8_t *) 831 calloc(osm_node_get_num_physp(p_hca->p_osm_node) + 1, sizeof(uint8_t)); 832 if (!p_hca->disconnected_ports) { 833 free(p_hca->up_port_groups); 834 free(p_hca); 835 return NULL; 836 } 837 p_hca->up_port_groups_num = 0; 838 return p_hca; 839 } 840 841 /***************************************************/ 842 843 static void hca_destroy(IN ftree_hca_t * p_hca) 844 { 845 uint32_t i; 846 847 if (!p_hca) 848 return; 849 850 for (i = 0; i < p_hca->up_port_groups_num; i++) 851 port_group_destroy(p_hca->up_port_groups[i]); 852 853 free(p_hca->up_port_groups); 854 free(p_hca->disconnected_ports); 855 856 free(p_hca); 857 } 858 859 /***************************************************/ 860 861 static uint64_t hca_get_guid_no(IN ftree_hca_t * p_hca) 862 { 863 if (!p_hca) 864 return 0; 865 return osm_node_get_node_guid(p_hca->p_osm_node); 866 } 867 868 /***************************************************/ 869 870 static uint64_t hca_get_guid_ho(IN ftree_hca_t * p_hca) 871 { 872 return cl_ntoh64(hca_get_guid_no(p_hca)); 873 } 874 875 /***************************************************/ 876 877 static void hca_dump(IN ftree_fabric_t * p_ftree, IN ftree_hca_t * p_hca) 878 { 879 uint32_t i; 880 881 if (!p_hca) 882 return; 883 884 if (!OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG)) 885 return; 886 887 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 888 "CA GUID: 0x%016" PRIx64 ", Ports: %u UP\n", 889 hca_get_guid_ho(p_hca), p_hca->up_port_groups_num); 890 891 for (i = 0; i < p_hca->up_port_groups_num; i++) 892 port_group_dump(p_ftree, 
p_hca->up_port_groups[i], 893 FTREE_DIRECTION_UP); 894 } 895 896 static ftree_port_group_t *hca_get_port_group_by_lid(IN ftree_hca_t * 897 p_hca, 898 IN uint16_t 899 lid) 900 { 901 uint32_t i; 902 for (i = 0; i < p_hca->up_port_groups_num; i++) 903 if (lid == 904 p_hca->up_port_groups[i]->lid) 905 return p_hca->up_port_groups[i]; 906 907 return NULL; 908 } 909 /***************************************************/ 910 911 static void hca_add_port(IN ftree_fabric_t * p_ftree, 912 IN ftree_hca_t * p_hca, IN uint8_t port_num, 913 IN uint8_t remote_port_num, IN uint16_t lid, 914 IN uint16_t remote_lid, IN ib_net64_t port_guid, 915 IN ib_net64_t remote_port_guid, 916 IN ib_net64_t remote_node_guid, 917 IN uint8_t remote_node_type, 918 IN void *p_remote_hca_or_sw, IN boolean_t is_cn, 919 IN boolean_t is_io) 920 { 921 ftree_port_group_t *p_group; 922 923 /* this function is supposed to be called only for adding ports 924 in hca's that lead to switches */ 925 CL_ASSERT(remote_node_type == IB_NODE_TYPE_SWITCH); 926 927 p_group = hca_get_port_group_by_lid(p_hca, lid); 928 929 if (!p_group) { 930 p_group = port_group_create(lid, remote_lid, 931 port_guid, hca_get_guid_no(p_hca), 932 IB_NODE_TYPE_CA, p_hca, 933 remote_port_guid, remote_node_guid, 934 remote_node_type, 935 p_remote_hca_or_sw, is_cn, is_io); 936 CL_ASSERT(p_group); 937 p_hca->up_port_groups[p_hca->up_port_groups_num++] = p_group; 938 port_group_add_port(p_group, port_num, remote_port_num); 939 } else 940 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, 941 "ERR AB32: Duplicated LID for CA GUID: 0x%016" PRIx64 "\n", 942 cl_ntoh64(port_guid)); 943 } /* hca_add_port() */ 944 945 /*************************************************** 946 ** 947 ** ftree_fabric_t functions 948 ** 949 ***************************************************/ 950 951 static ftree_fabric_t *fabric_create() 952 { 953 ftree_fabric_t *p_ftree = 954 (ftree_fabric_t *) malloc(sizeof(ftree_fabric_t)); 955 if (p_ftree == NULL) 956 return NULL; 957 958 
memset(p_ftree, 0, sizeof(ftree_fabric_t)); 959 960 cl_qmap_init(&p_ftree->hca_tbl); 961 cl_qmap_init(&p_ftree->sw_tbl); 962 cl_qmap_init(&p_ftree->sw_by_tuple_tbl); 963 cl_qmap_init(&p_ftree->cn_guid_tbl); 964 cl_qmap_init(&p_ftree->io_guid_tbl); 965 966 return p_ftree; 967 } 968 969 /***************************************************/ 970 971 static void fabric_clear(ftree_fabric_t * p_ftree) 972 { 973 ftree_hca_t *p_hca; 974 ftree_hca_t *p_next_hca; 975 ftree_sw_t *p_sw; 976 ftree_sw_t *p_next_sw; 977 ftree_sw_tbl_element_t *p_element; 978 ftree_sw_tbl_element_t *p_next_element; 979 name_map_item_t *p_guid_element, *p_next_guid_element; 980 981 if (!p_ftree) 982 return; 983 984 /* remove all the elements of hca_tbl */ 985 986 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl); 987 while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) { 988 p_hca = p_next_hca; 989 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item); 990 hca_destroy(p_hca); 991 } 992 cl_qmap_remove_all(&p_ftree->hca_tbl); 993 994 /* remove all the elements of sw_tbl */ 995 996 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl); 997 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) { 998 p_sw = p_next_sw; 999 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item); 1000 sw_destroy(p_sw); 1001 } 1002 cl_qmap_remove_all(&p_ftree->sw_tbl); 1003 1004 /* remove all the elements of sw_by_tuple_tbl */ 1005 1006 p_next_element = 1007 (ftree_sw_tbl_element_t *) cl_qmap_head(&p_ftree->sw_by_tuple_tbl); 1008 while (p_next_element != (ftree_sw_tbl_element_t *) 1009 cl_qmap_end(&p_ftree->sw_by_tuple_tbl)) { 1010 p_element = p_next_element; 1011 p_next_element = (ftree_sw_tbl_element_t *) 1012 cl_qmap_next(&p_element->map_item); 1013 sw_tbl_element_destroy(p_element); 1014 } 1015 cl_qmap_remove_all(&p_ftree->sw_by_tuple_tbl); 1016 1017 /* remove all the elements of cn_guid_tbl */ 1018 p_next_guid_element = 1019 (name_map_item_t *) 
cl_qmap_head(&p_ftree->cn_guid_tbl); 1020 while (p_next_guid_element != 1021 (name_map_item_t *) cl_qmap_end(&p_ftree->cn_guid_tbl)) { 1022 p_guid_element = p_next_guid_element; 1023 p_next_guid_element = 1024 (name_map_item_t *) cl_qmap_next(&p_guid_element->item); 1025 free(p_guid_element); 1026 } 1027 cl_qmap_remove_all(&p_ftree->cn_guid_tbl); 1028 1029 /* remove all the elements of io_guid_tbl */ 1030 p_next_guid_element = 1031 (name_map_item_t *) cl_qmap_head(&p_ftree->io_guid_tbl); 1032 while (p_next_guid_element != 1033 (name_map_item_t *) cl_qmap_end(&p_ftree->io_guid_tbl)) { 1034 p_guid_element = p_next_guid_element; 1035 p_next_guid_element = 1036 (name_map_item_t *) cl_qmap_next(&p_guid_element->item); 1037 free(p_guid_element); 1038 } 1039 cl_qmap_remove_all(&p_ftree->io_guid_tbl); 1040 1041 /* free the leaf switches array */ 1042 if ((p_ftree->leaf_switches_num > 0) && (p_ftree->leaf_switches)) 1043 free(p_ftree->leaf_switches); 1044 1045 p_ftree->leaf_switches_num = 0; 1046 p_ftree->cn_num = 0; 1047 p_ftree->ca_ports = 0; 1048 p_ftree->leaf_switch_rank = 0; 1049 p_ftree->max_switch_rank = 0; 1050 p_ftree->max_cn_per_leaf = 0; 1051 p_ftree->lft_max_lid = 0; 1052 p_ftree->leaf_switches = NULL; 1053 p_ftree->fabric_built = FALSE; 1054 1055 } /* fabric_destroy() */ 1056 1057 /***************************************************/ 1058 1059 static void fabric_destroy(ftree_fabric_t * p_ftree) 1060 { 1061 if (!p_ftree) 1062 return; 1063 fabric_clear(p_ftree); 1064 free(p_ftree); 1065 } 1066 1067 /***************************************************/ 1068 1069 static uint8_t fabric_get_rank(ftree_fabric_t * p_ftree) 1070 { 1071 return p_ftree->leaf_switch_rank + 1; 1072 } 1073 1074 /***************************************************/ 1075 1076 static void fabric_add_hca(ftree_fabric_t * p_ftree, osm_node_t * p_osm_node) 1077 { 1078 ftree_hca_t *p_hca; 1079 1080 CL_ASSERT(osm_node_get_type(p_osm_node) == IB_NODE_TYPE_CA); 1081 1082 p_hca = 
hca_create(p_osm_node); 1083 if (!p_hca) 1084 return; 1085 1086 cl_qmap_insert(&p_ftree->hca_tbl, p_osm_node->node_info.node_guid, 1087 &p_hca->map_item); 1088 } 1089 1090 /***************************************************/ 1091 1092 static void fabric_add_sw(ftree_fabric_t * p_ftree, osm_switch_t * p_osm_sw) 1093 { 1094 ftree_sw_t *p_sw; 1095 1096 CL_ASSERT(osm_node_get_type(p_osm_sw->p_node) == IB_NODE_TYPE_SWITCH); 1097 1098 p_sw = sw_create(p_osm_sw); 1099 if (!p_sw) 1100 return; 1101 1102 cl_qmap_insert(&p_ftree->sw_tbl, p_osm_sw->p_node->node_info.node_guid, 1103 &p_sw->map_item); 1104 1105 /* track the max lid (in host order) that exists in the fabric */ 1106 if (p_sw->lid > p_ftree->lft_max_lid) 1107 p_ftree->lft_max_lid = p_sw->lid; 1108 } 1109 1110 /***************************************************/ 1111 1112 static void fabric_add_sw_by_tuple(IN ftree_fabric_t * p_ftree, 1113 IN ftree_sw_t * p_sw) 1114 { 1115 CL_ASSERT(tuple_assigned(p_sw->tuple)); 1116 1117 cl_qmap_insert(&p_ftree->sw_by_tuple_tbl, tuple_to_key(p_sw->tuple), 1118 &sw_tbl_element_create(p_sw)->map_item); 1119 } 1120 1121 /***************************************************/ 1122 1123 static ftree_sw_t *fabric_get_sw_by_tuple(IN ftree_fabric_t * p_ftree, 1124 IN ftree_tuple_t tuple) 1125 { 1126 ftree_sw_tbl_element_t *p_element; 1127 1128 CL_ASSERT(tuple_assigned(tuple)); 1129 1130 tuple_to_key(tuple); 1131 1132 p_element = 1133 (ftree_sw_tbl_element_t *) cl_qmap_get(&p_ftree->sw_by_tuple_tbl, 1134 tuple_to_key(tuple)); 1135 if (p_element == 1136 (ftree_sw_tbl_element_t *) cl_qmap_end(&p_ftree->sw_by_tuple_tbl)) 1137 return NULL; 1138 1139 return p_element->p_sw; 1140 } 1141 1142 /***************************************************/ 1143 1144 static ftree_sw_t *fabric_get_sw_by_guid(IN ftree_fabric_t * p_ftree, 1145 IN uint64_t guid) 1146 { 1147 ftree_sw_t *p_sw; 1148 p_sw = (ftree_sw_t *) cl_qmap_get(&p_ftree->sw_tbl, guid); 1149 if (p_sw == (ftree_sw_t *) 
cl_qmap_end(&p_ftree->sw_tbl)) 1150 return NULL; 1151 return p_sw; 1152 } 1153 1154 /***************************************************/ 1155 1156 static ftree_hca_t *fabric_get_hca_by_guid(IN ftree_fabric_t * p_ftree, 1157 IN uint64_t guid) 1158 { 1159 ftree_hca_t *p_hca; 1160 p_hca = (ftree_hca_t *) cl_qmap_get(&p_ftree->hca_tbl, guid); 1161 if (p_hca == (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) 1162 return NULL; 1163 return p_hca; 1164 } 1165 1166 /***************************************************/ 1167 1168 static void fabric_dump(ftree_fabric_t * p_ftree) 1169 { 1170 uint32_t i; 1171 ftree_hca_t *p_hca; 1172 ftree_sw_t *p_sw; 1173 1174 if (!OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG)) 1175 return; 1176 1177 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "\n" 1178 " |-------------------------------|\n" 1179 " |- Full fabric topology dump -|\n" 1180 " |-------------------------------|\n\n"); 1181 1182 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "-- CAs:\n"); 1183 1184 for (p_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl); 1185 p_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl); 1186 p_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item)) { 1187 hca_dump(p_ftree, p_hca); 1188 } 1189 1190 for (i = 0; i <= p_ftree->max_switch_rank; i++) { 1191 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 1192 "-- Rank %u switches\n", i); 1193 for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl); 1194 p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl); 1195 p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) { 1196 if (p_sw->rank == i) 1197 sw_dump(p_ftree, p_sw); 1198 } 1199 } 1200 1201 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, "\n" 1202 " |---------------------------------------|\n" 1203 " |- Full fabric topology dump completed -|\n" 1204 " |---------------------------------------|\n\n"); 1205 } /* fabric_dump() */ 1206 1207 /***************************************************/ 1208 1209 static void fabric_dump_general_info(IN 
ftree_fabric_t * p_ftree) 1210 { 1211 uint32_t i, j; 1212 ftree_sw_t *p_sw; 1213 1214 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO, 1215 "General fabric topology info\n"); 1216 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO, 1217 "============================\n"); 1218 1219 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO, 1220 " - FatTree rank (roots to leaf switches): %u\n", 1221 p_ftree->leaf_switch_rank + 1); 1222 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO, 1223 " - FatTree max switch rank: %u\n", p_ftree->max_switch_rank); 1224 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO, 1225 " - Fabric has %u CAs, %u CA ports (%u of them CNs), %u switches\n", 1226 cl_qmap_count(&p_ftree->hca_tbl), p_ftree->ca_ports, 1227 p_ftree->cn_num, cl_qmap_count(&p_ftree->sw_tbl)); 1228 1229 CL_ASSERT(p_ftree->ca_ports >= p_ftree->cn_num); 1230 1231 for (i = 0; i <= p_ftree->max_switch_rank; i++) { 1232 j = 0; 1233 for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl); 1234 p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl); 1235 p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) { 1236 if (p_sw->rank == i) 1237 j++; 1238 } 1239 if (i == 0) 1240 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO, 1241 " - Fabric has %u switches at rank %u (roots)\n", 1242 j, i); 1243 else if (i == p_ftree->leaf_switch_rank) 1244 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO, 1245 " - Fabric has %u switches at rank %u (%u of them leafs)\n", 1246 j, i, p_ftree->leaf_switches_num); 1247 else 1248 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_INFO, 1249 " - Fabric has %u switches at rank %u\n", j, 1250 i); 1251 } 1252 1253 if (OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_VERBOSE)) { 1254 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 1255 " - Root switches:\n"); 1256 for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl); 1257 p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl); 1258 p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) { 1259 if (p_sw->rank == 0) 1260 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 1261 
" GUID: 0x%016" PRIx64 1262 ", LID: %u, Index %s\n", 1263 sw_get_guid_ho(p_sw), 1264 p_sw->lid, 1265 tuple_to_str(p_sw->tuple)); 1266 } 1267 1268 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 1269 " - Leaf switches (sorted by index):\n"); 1270 for (i = 0; i < p_ftree->leaf_switches_num; i++) { 1271 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 1272 " GUID: 0x%016" PRIx64 1273 ", LID: %u, Index %s\n", 1274 sw_get_guid_ho(p_ftree->leaf_switches[i]), 1275 p_ftree->leaf_switches[i]->lid, 1276 tuple_to_str(p_ftree->leaf_switches[i]->tuple)); 1277 } 1278 } 1279 } /* fabric_dump_general_info() */ 1280 1281 /***************************************************/ 1282 1283 static void fabric_dump_hca_ordering(IN ftree_fabric_t * p_ftree) 1284 { 1285 ftree_hca_t *p_hca; 1286 ftree_sw_t *p_sw; 1287 ftree_port_group_t *p_group_on_sw; 1288 ftree_port_group_t *p_group_on_hca; 1289 int rename_status = 0; 1290 uint32_t i; 1291 uint32_t j; 1292 unsigned printed_hcas_on_leaf; 1293 1294 char path[1024], path_tmp[1032]; 1295 FILE *p_hca_ordering_file; 1296 const char *filename = "opensm-ftree-ca-order.dump"; 1297 1298 snprintf(path, sizeof(path), "%s/%s", 1299 p_ftree->p_osm->subn.opt.dump_files_dir, filename); 1300 1301 snprintf(path_tmp, sizeof(path_tmp), "%s.tmp", path); 1302 1303 p_hca_ordering_file = fopen(path_tmp, "w"); 1304 if (!p_hca_ordering_file) { 1305 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB01: " 1306 "cannot open file \'%s\': %s\n", path_tmp, 1307 strerror(errno)); 1308 return; 1309 } 1310 1311 /* for each leaf switch (in indexing order) */ 1312 for (i = 0; i < p_ftree->leaf_switches_num; i++) { 1313 p_sw = p_ftree->leaf_switches[i]; 1314 printed_hcas_on_leaf = 0; 1315 1316 /* for each real CA (CNs and not) connected to this switch */ 1317 for (j = 0; j < p_sw->down_port_groups_num; j++) { 1318 p_group_on_sw = p_sw->down_port_groups[j]; 1319 1320 if (p_group_on_sw->remote_node_type != IB_NODE_TYPE_CA) 1321 continue; 1322 1323 p_hca = 
p_group_on_sw->remote_hca_or_sw.p_hca; 1324 p_group_on_hca = 1325 hca_get_port_group_by_lid(p_hca, 1326 p_group_on_sw-> 1327 remote_lid); 1328 1329 /* treat non-compute nodes as dummies */ 1330 if (!p_group_on_hca->is_cn) 1331 continue; 1332 1333 fprintf(p_hca_ordering_file, "0x%04x\t%s\n", 1334 p_group_on_hca->lid, 1335 p_hca->p_osm_node->print_desc); 1336 1337 printed_hcas_on_leaf++; 1338 } 1339 1340 /* now print missing HCAs */ 1341 for (j = 0; 1342 j < (p_ftree->max_cn_per_leaf - printed_hcas_on_leaf); j++) 1343 fprintf(p_hca_ordering_file, "0xFFFF\tDUMMY\n"); 1344 1345 } 1346 /* done going through all the leaf switches */ 1347 1348 fclose(p_hca_ordering_file); 1349 1350 rename_status = rename(path_tmp, path); 1351 if (rename_status) { 1352 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB03: " 1353 "cannot rename file \'%s\': %s\n", path_tmp, 1354 strerror(errno)); 1355 } 1356 } /* fabric_dump_hca_ordering() */ 1357 1358 /***************************************************/ 1359 1360 static void fabric_assign_tuple(IN ftree_fabric_t * p_ftree, 1361 IN ftree_sw_t * p_sw, 1362 IN ftree_tuple_t new_tuple) 1363 { 1364 memcpy(p_sw->tuple, new_tuple, FTREE_TUPLE_LEN); 1365 fabric_add_sw_by_tuple(p_ftree, p_sw); 1366 } 1367 1368 /***************************************************/ 1369 1370 static void fabric_assign_first_tuple(IN ftree_fabric_t * p_ftree, 1371 IN ftree_sw_t * p_sw, 1372 IN unsigned int subtree) 1373 { 1374 uint8_t i; 1375 ftree_tuple_t new_tuple; 1376 1377 if (p_ftree->leaf_switch_rank >= FTREE_TUPLE_LEN) 1378 return; 1379 1380 tuple_init(new_tuple); 1381 new_tuple[0] = (uint8_t) p_sw->rank; 1382 1383 for (i = 1; i <= p_ftree->leaf_switch_rank; i++) 1384 new_tuple[i] = 0; 1385 1386 if (p_sw->rank == 0) { 1387 if (p_ftree->leaf_switch_rank > 1) 1388 new_tuple[p_ftree->leaf_switch_rank] = subtree; 1389 1390 for (i = 0; i < 0xFF; i++) { 1391 new_tuple[1] = i; 1392 if (fabric_get_sw_by_tuple(p_ftree, new_tuple) == NULL) 1393 break; 1394 } 1395 if (i 
== 0xFF) { 1396 /* new tuple not found - there are more than 255 ports in one direction */ 1397 return; 1398 } 1399 } 1400 fabric_assign_tuple(p_ftree, p_sw, new_tuple); 1401 } 1402 1403 /***************************************************/ 1404 1405 static void fabric_get_new_tuple(IN ftree_fabric_t * p_ftree, 1406 OUT ftree_tuple_t new_tuple, 1407 IN ftree_tuple_t from_tuple, 1408 IN ftree_direction_t direction) 1409 { 1410 ftree_sw_t *p_sw; 1411 ftree_tuple_t temp_tuple; 1412 uint8_t var_index; 1413 uint8_t i; 1414 1415 tuple_init(new_tuple); 1416 memcpy(temp_tuple, from_tuple, FTREE_TUPLE_LEN); 1417 1418 if (direction == FTREE_DIRECTION_DOWN) { 1419 temp_tuple[0]++; 1420 var_index = from_tuple[0] + 1; 1421 } else { 1422 temp_tuple[0]--; 1423 var_index = from_tuple[0]; 1424 } 1425 1426 for (i = 0; i < 0xFF; i++) { 1427 temp_tuple[var_index] = i; 1428 p_sw = fabric_get_sw_by_tuple(p_ftree, temp_tuple); 1429 if (p_sw == NULL) /* found free tuple */ 1430 break; 1431 } 1432 1433 if (i == 0xFF) { 1434 /* new tuple not found - there are more than 255 ports in one direction */ 1435 return; 1436 } 1437 memcpy(new_tuple, temp_tuple, FTREE_TUPLE_LEN); 1438 1439 } /* fabric_get_new_tuple() */ 1440 1441 /***************************************************/ 1442 1443 static inline boolean_t fabric_roots_provided(IN ftree_fabric_t * p_ftree) 1444 { 1445 return (p_ftree->p_osm->subn.opt.root_guid_file != NULL); 1446 } 1447 1448 /***************************************************/ 1449 1450 static inline boolean_t fabric_cns_provided(IN ftree_fabric_t * p_ftree) 1451 { 1452 return (p_ftree->p_osm->subn.opt.cn_guid_file != NULL); 1453 } 1454 1455 /***************************************************/ 1456 1457 static inline boolean_t fabric_ios_provided(IN ftree_fabric_t * p_ftree) 1458 { 1459 return (p_ftree->p_osm->subn.opt.io_guid_file != NULL); 1460 } 1461 1462 /***************************************************/ 1463 1464 static int fabric_mark_leaf_switches(IN 
ftree_fabric_t * p_ftree) 1465 { 1466 ftree_sw_t *p_sw; 1467 ftree_hca_t *p_hca; 1468 ftree_hca_t *p_next_hca; 1469 unsigned i; 1470 int res = 0; 1471 1472 OSM_LOG_ENTER(&p_ftree->p_osm->log); 1473 1474 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 1475 "Marking leaf switches in fabric\n"); 1476 1477 /* Scan all the CAs, if they have CNs - find CN port and mark switch 1478 that is connected to this port as leaf switch. 1479 Also, ensure that this marked leaf has rank of p_ftree->leaf_switch_rank. */ 1480 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl); 1481 while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) { 1482 p_hca = p_next_hca; 1483 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item); 1484 if (!p_hca->cn_num) 1485 continue; 1486 1487 for (i = 0; i < p_hca->up_port_groups_num; i++) { 1488 if (!p_hca->up_port_groups[i]->is_cn) 1489 continue; 1490 1491 /* In CAs, port group alway has one port, and since this 1492 port group is CN, we know that this port is compute node */ 1493 CL_ASSERT(p_hca->up_port_groups[i]->remote_node_type == 1494 IB_NODE_TYPE_SWITCH); 1495 p_sw = p_hca->up_port_groups[i]->remote_hca_or_sw.p_sw; 1496 1497 /* check if this switch was already processed */ 1498 if (p_sw->is_leaf) 1499 continue; 1500 p_sw->is_leaf = TRUE; 1501 1502 /* ensure that this leaf switch is at the correct tree level */ 1503 if (p_sw->rank != p_ftree->leaf_switch_rank) { 1504 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, 1505 "ERR AB26: CN port 0x%" PRIx64 1506 " is connected to switch 0x%" PRIx64 1507 " with rank %u, " 1508 "while FatTree leaf rank is %u\n", 1509 cl_ntoh64(p_hca-> 1510 up_port_groups[i]->port_guid), 1511 sw_get_guid_ho(p_sw), p_sw->rank, 1512 p_ftree->leaf_switch_rank); 1513 res = -1; 1514 goto Exit; 1515 1516 } 1517 } 1518 } 1519 1520 Exit: 1521 OSM_LOG_EXIT(&p_ftree->p_osm->log); 1522 return res; 1523 } /* fabric_mark_leaf_switches() */ 1524 1525 /***************************************************/ 1526 static 
void bfs_fabric_indexing(IN ftree_fabric_t * p_ftree, 1527 IN ftree_sw_t *p_first_sw) 1528 { 1529 ftree_sw_t *p_remote_sw; 1530 ftree_sw_t *p_sw = NULL; 1531 ftree_tuple_t new_tuple; 1532 uint32_t i; 1533 cl_list_t bfs_list; 1534 1535 OSM_LOG_ENTER(&p_ftree->p_osm->log); 1536 cl_list_init(&bfs_list, cl_qmap_count(&p_ftree->sw_tbl)); 1537 /* 1538 * Now run BFS and assign indexes to all switches 1539 * Pseudo code of the algorithm is as follows: 1540 * 1541 * * Add first switch to BFS queue 1542 * * While (BFS queue not empty) 1543 * - Pop the switch from the head of the queue 1544 * - Scan all the downward and upward ports 1545 * - For each port 1546 * + Get the remote switch 1547 * + Assign index to the remote switch 1548 * + Add remote switch to the BFS queue 1549 */ 1550 1551 cl_list_insert_tail(&bfs_list, p_first_sw); 1552 1553 while (!cl_is_list_empty(&bfs_list)) { 1554 p_sw = (ftree_sw_t *) cl_list_remove_head(&bfs_list); 1555 1556 /* Discover all the nodes from ports that are pointing down */ 1557 1558 if (p_sw->rank >= p_ftree->leaf_switch_rank) { 1559 /* whether downward ports are pointing to CAs or switches, 1560 we don't assign indexes to switches that are located 1561 lower than leaf switches */ 1562 } else { 1563 /* This is not the leaf switch */ 1564 for (i = 0; i < p_sw->down_port_groups_num; i++) { 1565 /* Work with port groups that are pointing to switches only. 1566 No need to assign indexing to HCAs */ 1567 if (p_sw-> 1568 down_port_groups[i]->remote_node_type != 1569 IB_NODE_TYPE_SWITCH) 1570 continue; 1571 1572 p_remote_sw = 1573 p_sw->down_port_groups[i]-> 1574 remote_hca_or_sw.p_sw; 1575 if (tuple_assigned(p_remote_sw->tuple)) { 1576 /* this switch has been already indexed */ 1577 continue; 1578 } 1579 /* allocate new tuple */ 1580 fabric_get_new_tuple(p_ftree, new_tuple, 1581 p_sw->tuple, 1582 FTREE_DIRECTION_DOWN); 1583 /* Assign the new tuple to the remote switch. 1584 This fuction also adds the switch into the switch_by_tuple table. 
*/ 1585 fabric_assign_tuple(p_ftree, p_remote_sw, 1586 new_tuple); 1587 1588 /* add the newly discovered switch to the BFS queue */ 1589 cl_list_insert_tail(&bfs_list, p_remote_sw); 1590 } 1591 /* Done assigning indexes to all the remote switches 1592 that are pointed by the downgoing ports. 1593 Now sort port groups according to remote index. */ 1594 qsort(p_sw->down_port_groups, /* array */ 1595 p_sw->down_port_groups_num, /* number of elements */ 1596 sizeof(ftree_port_group_t *), /* size of each element */ 1597 compare_port_groups_by_remote_switch_index); /* comparator */ 1598 } 1599 1600 /* Done indexing switches from ports that go down. 1601 Now do the same with ports that are pointing up. 1602 if we started from root (rank == 0), the leaf is bsf termination point */ 1603 1604 if (p_sw->rank != 0 && (p_first_sw->rank != 0 || !p_sw->is_leaf)) { 1605 /* This is not the root switch, which means that all the ports 1606 that are pointing up are taking us to another switches. */ 1607 for (i = 0; i < p_sw->up_port_groups_num; i++) { 1608 p_remote_sw = 1609 p_sw->up_port_groups[i]-> 1610 remote_hca_or_sw.p_sw; 1611 if (tuple_assigned(p_remote_sw->tuple)) 1612 continue; 1613 /* allocate new tuple */ 1614 fabric_get_new_tuple(p_ftree, new_tuple, 1615 p_sw->tuple, 1616 FTREE_DIRECTION_UP); 1617 /* Assign the new tuple to the remote switch. 1618 This fuction also adds the switch to the 1619 switch_by_tuple table. */ 1620 fabric_assign_tuple(p_ftree, 1621 p_remote_sw, new_tuple); 1622 /* add the newly discovered switch to the BFS queue */ 1623 cl_list_insert_tail(&bfs_list, p_remote_sw); 1624 } 1625 /* Done assigning indexes to all the remote switches 1626 that are pointed by the upgoing ports. 1627 Now sort port groups according to remote index. 
*/ 1628 qsort(p_sw->up_port_groups, /* array */ 1629 p_sw->up_port_groups_num, /* number of elements */ 1630 sizeof(ftree_port_group_t *), /* size of each element */ 1631 compare_port_groups_by_remote_switch_index); /* comparator */ 1632 } 1633 /* Done assigning indexes to all the switches that are directly connected 1634 to the current switch - go to the next switch in the BFS queue */ 1635 } 1636 cl_list_destroy(&bfs_list); 1637 1638 OSM_LOG_EXIT(&p_ftree->p_osm->log); 1639 } 1640 1641 static void fabric_make_indexing(IN ftree_fabric_t * p_ftree) 1642 { 1643 ftree_sw_t *p_sw = NULL; 1644 unsigned int subtree = 0; 1645 OSM_LOG_ENTER(&p_ftree->p_osm->log); 1646 1647 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 1648 "Starting FatTree indexing\n"); 1649 1650 /* using the first switch as a starting point for indexing algorithm. */ 1651 for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl); 1652 p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl); 1653 p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) { 1654 if (ftree_get_subnet(p_ftree)->opt.quasi_ftree_indexing) { 1655 /* find first root switch */ 1656 if (p_sw->rank != 0) 1657 continue; 1658 } else { 1659 /* find first leaf switch */ 1660 if (!p_sw->is_leaf) 1661 continue; 1662 } 1663 /* Assign the first tuple to the switch that is used as BFS starting point 1664 in the subtree. 1665 The tuple will be as follows: [rank].0...0.subtree 1666 This fuction also adds the switch it into the switch_by_tuple table. 
*/ 1667 if (!tuple_assigned(p_sw->tuple)) { 1668 fabric_assign_first_tuple(p_ftree, p_sw, subtree++); 1669 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 1670 "Indexing starting point:\n" 1671 " - Switch rank : %u\n" 1672 " - Switch index : %s\n" 1673 " - Node LID : %u\n" 1674 " - Node GUID : 0x%016" 1675 PRIx64 "\n", p_sw->rank, tuple_to_str(p_sw->tuple), 1676 p_sw->lid, sw_get_guid_ho(p_sw)); 1677 } 1678 1679 bfs_fabric_indexing(p_ftree, p_sw); 1680 1681 if (ftree_get_subnet(p_ftree)->opt.quasi_ftree_indexing == FALSE) 1682 goto Exit; 1683 } 1684 p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl); 1685 while (p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) { 1686 if (p_sw->is_leaf) { 1687 qsort(p_sw->up_port_groups, /* array */ 1688 p_sw->up_port_groups_num, /* number of elements */ 1689 sizeof(ftree_port_group_t *), /* size of each element */ 1690 compare_port_groups_by_remote_switch_index); /* comparator */ 1691 } 1692 p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item); 1693 1694 } 1695 Exit: 1696 OSM_LOG_EXIT(&p_ftree->p_osm->log); 1697 } /* fabric_make_indexing() */ 1698 /***************************************************/ 1699 1700 static int fabric_create_leaf_switch_array(IN ftree_fabric_t * p_ftree) 1701 { 1702 ftree_sw_t *p_sw; 1703 ftree_sw_t *p_next_sw; 1704 ftree_sw_t **all_switches_at_leaf_level; 1705 unsigned i; 1706 unsigned all_leaf_idx = 0; 1707 unsigned first_leaf_idx; 1708 unsigned last_leaf_idx; 1709 int res = 0; 1710 1711 OSM_LOG_ENTER(&p_ftree->p_osm->log); 1712 1713 /* create array of ALL the switches that have leaf rank */ 1714 all_switches_at_leaf_level = (ftree_sw_t **) 1715 malloc(cl_qmap_count(&p_ftree->sw_tbl) * sizeof(ftree_sw_t *)); 1716 if (!all_switches_at_leaf_level) { 1717 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_SYS, FILE_ID, 1718 "Fat-tree routing: Memory allocation failed\n"); 1719 res = -1; 1720 goto Exit; 1721 } 1722 memset(all_switches_at_leaf_level, 0, 1723 cl_qmap_count(&p_ftree->sw_tbl) * sizeof(ftree_sw_t 
*)); 1724 1725 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl); 1726 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) { 1727 p_sw = p_next_sw; 1728 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item); 1729 if (p_sw->rank == p_ftree->leaf_switch_rank) { 1730 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 1731 "Adding switch 0x%" PRIx64 1732 " to full leaf switch array\n", 1733 sw_get_guid_ho(p_sw)); 1734 all_switches_at_leaf_level[all_leaf_idx++] = p_sw; 1735 } 1736 } 1737 1738 /* quick-sort array of leaf switches by index */ 1739 qsort(all_switches_at_leaf_level, /* array */ 1740 all_leaf_idx, /* number of elements */ 1741 sizeof(ftree_sw_t *), /* size of each element */ 1742 compare_switches_by_index); /* comparator */ 1743 1744 /* check the first and the last REAL leaf (the one 1745 that has CNs) in the array of all the leafs */ 1746 1747 first_leaf_idx = all_leaf_idx; 1748 last_leaf_idx = 0; 1749 for (i = 0; i < all_leaf_idx; i++) { 1750 if (all_switches_at_leaf_level[i]->is_leaf) { 1751 if (i < first_leaf_idx) 1752 first_leaf_idx = i; 1753 last_leaf_idx = i; 1754 } 1755 } 1756 1757 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 1758 "Full leaf array info: first_leaf_idx = %u, last_leaf_idx = %u\n", 1759 first_leaf_idx, last_leaf_idx); 1760 1761 if (first_leaf_idx >= last_leaf_idx) { 1762 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID, 1763 "Failed to find leaf switches - topology is not " 1764 "fat-tree\n"); 1765 res = -1; 1766 goto Exit; 1767 } 1768 1769 /* Create array of REAL leaf switches, sorted by index. 1770 This array may contain switches at the same rank w/o CNs, 1771 in case this is the order of indexing. 
*/ 1772 p_ftree->leaf_switches_num = last_leaf_idx - first_leaf_idx + 1; 1773 p_ftree->leaf_switches = (ftree_sw_t **) 1774 malloc(p_ftree->leaf_switches_num * sizeof(ftree_sw_t *)); 1775 if (!p_ftree->leaf_switches) { 1776 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_SYS, FILE_ID, 1777 "Fat-tree routing: Memory allocation failed\n"); 1778 res = -1; 1779 goto Exit; 1780 } 1781 1782 memcpy(p_ftree->leaf_switches, 1783 &(all_switches_at_leaf_level[first_leaf_idx]), 1784 p_ftree->leaf_switches_num * sizeof(ftree_sw_t *)); 1785 1786 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 1787 "Created array of %u leaf switches\n", 1788 p_ftree->leaf_switches_num); 1789 1790 Exit: 1791 free(all_switches_at_leaf_level); 1792 OSM_LOG_EXIT(&p_ftree->p_osm->log); 1793 return res; 1794 } /* fabric_create_leaf_switch_array() */ 1795 1796 /***************************************************/ 1797 1798 static void fabric_set_max_cn_per_leaf(IN ftree_fabric_t * p_ftree) 1799 { 1800 unsigned i; 1801 unsigned j; 1802 unsigned cns_on_this_leaf; 1803 ftree_sw_t *p_sw; 1804 ftree_port_group_t *p_group, *p_up_group; 1805 ftree_hca_t *p_hca; 1806 1807 for (i = 0; i < p_ftree->leaf_switches_num; i++) { 1808 p_sw = p_ftree->leaf_switches[i]; 1809 cns_on_this_leaf = 0; 1810 for (j = 0; j < p_sw->down_port_groups_num; j++) { 1811 p_group = p_sw->down_port_groups[j]; 1812 if (p_group->remote_node_type != IB_NODE_TYPE_CA) 1813 continue; 1814 p_hca = p_group->remote_hca_or_sw.p_hca; 1815 /* 1816 * Get the hca port group corresponding 1817 * to the LID of remote HCA port 1818 */ 1819 p_up_group = hca_get_port_group_by_lid(p_hca, 1820 p_group->remote_lid); 1821 1822 CL_ASSERT(p_up_group); 1823 1824 if (p_up_group->is_cn) 1825 cns_on_this_leaf++; 1826 } 1827 if (cns_on_this_leaf > p_ftree->max_cn_per_leaf) 1828 p_ftree->max_cn_per_leaf = cns_on_this_leaf; 1829 } 1830 } /* fabric_set_max_cn_per_leaf() */ 1831 1832 /***************************************************/ 1833 1834 static boolean_t 
fabric_validate_topology(IN ftree_fabric_t * p_ftree)
{
	/*
	 * Checks that the fabric is a pure fat-tree: all the switches of
	 * the same rank must agree with the first switch seen at that rank
	 * (the reference switch) on
	 *   - the number of upward port groups,
	 *   - the number of downward port groups (not checked on the
	 *     leaf level, where some HCAs may be missing),
	 *   - the number of ports in every upward port group,
	 *   - the number of ports in every downward port group (again
	 *     not checked on the leaf level).
	 * Port counts are compared against the first port group of the
	 * reference switch. The scan stops at the first mismatch, which
	 * is logged.
	 * Returns TRUE when no mismatch was found, FALSE on a mismatch
	 * or on allocation failure.
	 */
	ftree_port_group_t *p_group;
	ftree_port_group_t *p_ref_group;
	ftree_sw_t *p_sw;
	ftree_sw_t *p_next_sw;
	ftree_sw_t *p_ref_sw;
	ftree_sw_t **reference_sw_arr;
	uint16_t tree_rank = fabric_get_rank(p_ftree);
	uint32_t ref_arr_len;
	boolean_t res = TRUE;
	uint32_t i;

	OSM_LOG_ENTER(&p_ftree->p_osm->log);

	OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
		"Validating fabric topology\n");

	/* The reference array is indexed by switch rank, and switches may
	   be ranked below the leaf level, so tree_rank entries are not
	   always enough.
	   NOTE(review): max_switch_rank is assumed to bound every
	   p_sw->rank in sw_tbl - confirm. */
	ref_arr_len = tree_rank;
	if (p_ftree->max_switch_rank >= ref_arr_len)
		ref_arr_len = (uint32_t) p_ftree->max_switch_rank + 1;

	reference_sw_arr =
	    (ftree_sw_t **) calloc(ref_arr_len, sizeof(ftree_sw_t *));
	if (reference_sw_arr == NULL) {
		osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_SYS, FILE_ID,
			   "Fat-tree routing: Memory allocation failed\n");
		return FALSE;
	}

	p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl);
	while (res && p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) {
		p_sw = p_next_sw;
		p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item);

		/* defensive: never index past the reference array */
		if (p_sw->rank >= ref_arr_len)
			continue;

		p_ref_sw = reference_sw_arr[p_sw->rank];
		if (!p_ref_sw) {
			/* This is the first switch in the current level that
			   we're checking - use it as a reference */
			reference_sw_arr[p_sw->rank] = p_sw;
			continue;
		}

		/* compare this switch properties to the reference switch */

		if (p_ref_sw->up_port_groups_num != p_sw->up_port_groups_num) {
			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
				"ERR AB09: Different number of upward port groups on switches:\n"
				" GUID 0x%016" PRIx64
				", LID %u, Index %s - %u groups\n"
				" GUID 0x%016" PRIx64
				", LID %u, Index %s - %u groups\n",
				sw_get_guid_ho(p_ref_sw),
				p_ref_sw->lid,
				tuple_to_str(p_ref_sw->tuple),
				p_ref_sw->up_port_groups_num,
				sw_get_guid_ho(p_sw), p_sw->lid,
				tuple_to_str(p_sw->tuple),
				p_sw->up_port_groups_num);
			res = FALSE;
			break;
		}

		if (p_sw->rank != (tree_rank - 1) &&
		    p_ref_sw->down_port_groups_num !=
		    p_sw->down_port_groups_num) {
			/* we're allowing some hca's to be missing */
			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
				"ERR AB0A: Different number of downward port groups on switches:\n"
				" GUID 0x%016" PRIx64
				", LID %u, Index %s - %u port groups\n"
				" GUID 0x%016" PRIx64
				", LID %u, Index %s - %u port groups\n",
				sw_get_guid_ho(p_ref_sw),
				p_ref_sw->lid,
				tuple_to_str(p_ref_sw->tuple),
				p_ref_sw->down_port_groups_num,
				sw_get_guid_ho(p_sw), p_sw->lid,
				tuple_to_str(p_sw->tuple),
				p_sw->down_port_groups_num);
			res = FALSE;
			break;
		}

		if (p_ref_sw->up_port_groups_num != 0) {
			p_ref_group = p_ref_sw->up_port_groups[0];
			for (i = 0; i < p_sw->up_port_groups_num; i++) {
				p_group = p_sw->up_port_groups[i];
				if (cl_ptr_vector_get_size(&p_ref_group->ports)
				    ==
				    cl_ptr_vector_get_size(&p_group->ports))
					continue;

				/* the sizes are cast to match the %u
				   conversions of the message */
				OSM_LOG(&p_ftree->p_osm->log,
					OSM_LOG_ERROR,
					"ERR AB0B: Different number of ports in an upward port group on switches:\n"
					" GUID 0x%016"
					PRIx64
					", LID %u, Index %s - %u ports\n"
					" GUID 0x%016"
					PRIx64
					", LID %u, Index %s - %u ports\n",
					sw_get_guid_ho(p_ref_sw),
					p_ref_sw->lid,
					tuple_to_str(p_ref_sw->tuple),
					(unsigned)
					cl_ptr_vector_get_size
					(&p_ref_group->ports),
					sw_get_guid_ho(p_sw),
					p_sw->lid,
					tuple_to_str(p_sw->tuple),
					(unsigned)
					cl_ptr_vector_get_size
					(&p_group->ports));
				res = FALSE;
				break;
			}
			/* stop at the first mismatch, like the checks above */
			if (!res)
				break;
		}

		if (p_ref_sw->down_port_groups_num != 0
		    && p_sw->rank != (tree_rank - 1)) {
			/* we're allowing some hca's to be missing */
			p_ref_group = p_ref_sw->down_port_groups[0];
			for (i = 0; i < p_sw->down_port_groups_num; i++) {
				/* every downward group is checked, not only
				   the first one */
				p_group = p_sw->down_port_groups[i];
				if (cl_ptr_vector_get_size(&p_ref_group->ports)
				    ==
				    cl_ptr_vector_get_size(&p_group->ports))
					continue;

				OSM_LOG(&p_ftree->p_osm->log,
					OSM_LOG_ERROR,
					"ERR AB0C: Different number of ports in an downward port group on switches:\n"
					" GUID 0x%016"
					PRIx64
					", LID %u, Index %s - %u ports\n"
					" GUID 0x%016"
					PRIx64
					", LID %u, Index %s - %u ports\n",
					sw_get_guid_ho(p_ref_sw),
					p_ref_sw->lid,
					tuple_to_str(p_ref_sw->tuple),
					(unsigned)
					cl_ptr_vector_get_size
					(&p_ref_group->ports),
					sw_get_guid_ho(p_sw),
					p_sw->lid,
					tuple_to_str(p_sw->tuple),
					(unsigned)
					cl_ptr_vector_get_size
					(&p_group->ports));
				res = FALSE;
				break;
			}
		}
	}			/* end of while */

	if (res == TRUE)
		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE,
			"Fabric topology has been identified as FatTree\n");
	else
		OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR,
			"ERR AB0D: Fabric topology hasn't been identified as FatTree\n");

	free(reference_sw_arr);
	OSM_LOG_EXIT(&p_ftree->p_osm->log);
	return res;
}				/* fabric_validate_topology() */

/***************************************************
 ***************************************************/

/*
 * Map-item callback: stores the maximal LID of the fabric
 * (p_ftree->lft_max_lid) as max_lid_ho of the OpenSM switch object.
 * context is the ftree_fabric_t the switch belongs to.
 */
static void set_sw_fwd_table(IN cl_map_item_t * const p_map_item,
			     IN void *context)
{
	ftree_sw_t *p_sw = (ftree_sw_t * const)p_map_item;
	ftree_fabric_t *p_ftree = (ftree_fabric_t *) context;

	p_sw->p_osm_sw->max_lid_ho = p_ftree->lft_max_lid;
}

/*************************************************** 2033 ***************************************************/ 2034 2035 /* 2036 * Function: Finds the least loaded port group and stores its counter 2037 * Given : A switch 2038 */ 2039 static inline void recalculate_min_counter_down(ftree_sw_t * p_sw) 2040 { 2041 uint32_t min = (1 << 30); 2042 uint32_t i; 2043 for (i = 0; i < p_sw->down_port_groups_num; i++) { 2044 if (p_sw->down_port_groups[i]->counter_down < min) { 2045 min = p_sw->down_port_groups[i]->counter_down; 2046 } 2047 } 2048 p_sw->min_counter_down = min; 2049 return; 2050 } 2051 2052 /* 2053 * Function: Return the counter value of the least loaded down port group 2054 * Given : A switch 2055 */ 2056 static inline uint32_t find_lowest_loaded_group_on_sw(ftree_sw_t * p_sw) 2057 { 2058 return p_sw->min_counter_down; 2059 } 2060 2061 /* 2062 * Function: Compare the load of two port groups and return which is the least loaded 2063 * Given : Two port groups with remote switch 2064 * When both port groups are equally loaded, it picks the one whom 2065 * remote switch down ports are least loaded. 2066 * This way, it prefers the switch from where it will be easier to go down (creating upward routes). 2067 * If both are equal, it picks the lowest INDEX to be deterministic. 
2068 */ 2069 static inline int port_group_compare_load_down(const ftree_port_group_t * p1, 2070 const ftree_port_group_t * p2) 2071 { 2072 int temp = p1->counter_down - p2->counter_down; 2073 if (temp > 0) 2074 return 1; 2075 if (temp < 0) 2076 return -1; 2077 2078 /* Find the less loaded remote sw and choose this one */ 2079 do { 2080 uint32_t load1 = 2081 find_lowest_loaded_group_on_sw(p1->remote_hca_or_sw.p_sw); 2082 uint32_t load2 = 2083 find_lowest_loaded_group_on_sw(p2->remote_hca_or_sw.p_sw); 2084 temp = load1 - load2; 2085 if (temp > 0) 2086 return 1; 2087 } while (0); 2088 /* If they are both equal, choose the lowest index */ 2089 return compare_port_groups_by_remote_switch_index(&p1, &p2); 2090 } 2091 2092 static inline int port_group_compare_load_up(const ftree_port_group_t * p1, 2093 const ftree_port_group_t * p2) 2094 { 2095 int temp = p1->counter_up - p2->counter_up; 2096 if (temp > 0) 2097 return 1; 2098 if (temp < 0) 2099 return -1; 2100 2101 /* If they are both equal, choose the lowest index */ 2102 return compare_port_groups_by_remote_switch_index (&p1,&p2); 2103 } 2104 2105 /* 2106 * Function: Sorts an array of port group by up load order 2107 * Given : A port group array and its length 2108 * As the list is mostly sorted, we used a bubble sort instead of qsort 2109 * as it is much faster. 2110 * 2111 * Important note: 2112 * This function and bubble_sort_down must NOT be factorized. 2113 * Although most of the code is the same and a function pointer could be used 2114 * for the compareason function, it would prevent the compareason function to be inlined 2115 * and cost a great deal to performances. 
2116 */ 2117 static inline void 2118 bubble_sort_up(ftree_port_group_t ** p_group_array, uint32_t nmemb) 2119 { 2120 uint32_t i = 0; 2121 uint32_t j = 0; 2122 ftree_port_group_t *tmp = p_group_array[0]; 2123 2124 /* As this function is a great number of times, we only go into the loop 2125 * if one of the port counters has changed, thus saving some tests */ 2126 if (tmp->hca_or_sw.p_sw->counter_up_changed == FALSE) { 2127 return; 2128 } 2129 /* While we did modifications on the array order */ 2130 /* i may grew above array length but next loop will fail and tmp will be null for the next time 2131 * this way we save a test i < nmemb for each pass through the loop */ 2132 for (i = 0; tmp; i++) { 2133 /* Assume the array is orderd */ 2134 tmp = NULL; 2135 /* Comparing elements j and j-1 */ 2136 for (j = 1; j < (nmemb - i); j++) { 2137 /* If they are the wrong way around */ 2138 if (port_group_compare_load_up(p_group_array[j], 2139 p_group_array[j - 1]) < 0) { 2140 /* We invert them */ 2141 tmp = p_group_array[j - 1]; 2142 p_group_array[j - 1] = p_group_array[j]; 2143 p_group_array[j] = tmp; 2144 /* This sets tmp != NULL so the main loop will make another pass */ 2145 } 2146 } 2147 } 2148 2149 /* We have reordered the array so as long noone changes the counter 2150 * it's not necessary to do it again */ 2151 p_group_array[0]->hca_or_sw.p_sw->counter_up_changed = FALSE; 2152 } 2153 2154 static inline void 2155 bubble_sort_siblings(ftree_port_group_t ** p_group_array, uint32_t nmemb) 2156 { 2157 uint32_t i = 0; 2158 uint32_t j = 0; 2159 ftree_port_group_t *tmp = p_group_array[0]; 2160 2161 /* While we did modifications on the array order */ 2162 /* i may grew above array length but next loop will fail and tmp will be null for the next time 2163 * this way we save a test i < nmemb for each pass through the loop */ 2164 for (i = 0; tmp != NULL; i++) { 2165 /* Assume the array is orderd */ 2166 tmp = NULL; 2167 /* Comparing elements j and j-1 */ 2168 for (j = 1; j < (nmemb 
- i); j++) { 2169 /* If they are the wrong way around */ 2170 if (port_group_compare_load_up(p_group_array[j], 2171 p_group_array[j - 1]) < 0) { 2172 /* We invert them */ 2173 tmp = p_group_array[j - 1]; 2174 p_group_array[j - 1] = p_group_array[j]; 2175 p_group_array[j] = tmp; 2176 } 2177 } 2178 } 2179 } 2180 2181 /* 2182 * Function: Sorts an array of port group. Order is decide through 2183 * port_group_compare_load_down ( up counters, least load remote switch, biggest GUID) 2184 * Given : A port group array and its length. Each port group points to a remote switch (not a HCA) 2185 * As the list is mostly sorted, we used a bubble sort instead of qsort 2186 * as it is much faster. 2187 * 2188 * Important note: 2189 * This function and bubble_sort_up must NOT be factorized. 2190 * Although most of the code is the same and a function pointer could be used 2191 * for the compareason function, it would prevent the compareason function to be inlined 2192 * and cost a great deal to performances. 
2193 */ 2194 static inline void 2195 bubble_sort_down(ftree_port_group_t ** p_group_array, uint32_t nmemb) 2196 { 2197 uint32_t i = 0; 2198 uint32_t j = 0; 2199 ftree_port_group_t *tmp = p_group_array[0]; 2200 2201 /* While we did modifications on the array order */ 2202 /* i may grew above array length but next loop will fail and tmp will be null for the next time 2203 * this way we save a test i < nmemb for each pass through the loop */ 2204 for (i = 0; tmp; i++) { 2205 /* Assume the array is orderd */ 2206 tmp = NULL; 2207 /* Comparing elements j and j-1 */ 2208 for (j = 1; j < (nmemb - i); j++) { 2209 /* If they are the wrong way around */ 2210 if (port_group_compare_load_down 2211 (p_group_array[j], p_group_array[j - 1]) < 0) { 2212 /* We invert them */ 2213 tmp = p_group_array[j - 1]; 2214 p_group_array[j - 1] = p_group_array[j]; 2215 p_group_array[j] = tmp; 2216 2217 } 2218 } 2219 } 2220 } 2221 2222 /*************************************************** 2223 ***************************************************/ 2224 2225 /* 2226 * Function: assign-up-going-port-by-descending-down 2227 * Given : a switch and a LID 2228 * Pseudo code: 2229 * foreach down-going-port-group (in indexing order) 2230 * skip this group if the LFT(LID) port is part of this group 2231 * find the least loaded port of the group (scan in indexing order) 2232 * r-port is the remote port connected to it 2233 * assign the remote switch node LFT(LID) to r-port 2234 * increase r-port usage counter 2235 * assign-up-going-port-by-descending-down to r-port node (recursion) 2236 */ 2237 2238 static boolean_t 2239 fabric_route_upgoing_by_going_down(IN ftree_fabric_t * p_ftree, 2240 IN ftree_sw_t * p_sw, 2241 IN ftree_sw_t * p_prev_sw, 2242 IN uint16_t target_lid, 2243 IN boolean_t is_main_path, 2244 IN boolean_t is_target_a_sw, 2245 IN uint8_t current_hops) 2246 { 2247 ftree_sw_t *p_remote_sw; 2248 uint16_t ports_num; 2249 ftree_port_group_t *p_group; 2250 ftree_port_t *p_port; 2251 ftree_port_t 
*p_min_port; 2252 uint16_t j; 2253 uint16_t k; 2254 boolean_t created_route = FALSE; 2255 boolean_t routed = 0; 2256 uint8_t least_hops; 2257 2258 /* if there is no down-going ports */ 2259 if (p_sw->down_port_groups_num == 0) 2260 return FALSE; 2261 2262 /* foreach down-going port group (in load order) */ 2263 bubble_sort_up(p_sw->down_port_groups, p_sw->down_port_groups_num); 2264 2265 if (p_sw->sibling_port_groups_num > 0) 2266 bubble_sort_siblings(p_sw->sibling_port_groups, 2267 p_sw->sibling_port_groups_num); 2268 2269 for (k = 0; 2270 k < 2271 (p_sw->down_port_groups_num + 2272 ((target_lid != 0) ? p_sw->sibling_port_groups_num : 0)); k++) { 2273 2274 if (k < p_sw->down_port_groups_num) { 2275 p_group = p_sw->down_port_groups[k]; 2276 } else { 2277 p_group = 2278 p_sw->sibling_port_groups[k - 2279 p_sw-> 2280 down_port_groups_num]; 2281 } 2282 2283 /* If this port group doesn't point to a switch, mark 2284 that the route was created and skip to the next group */ 2285 if (p_group->remote_node_type != IB_NODE_TYPE_SWITCH) { 2286 created_route = TRUE; 2287 continue; 2288 } 2289 2290 if (p_prev_sw 2291 && p_group->remote_lid == p_prev_sw->lid) { 2292 /* This port group has a port that was used when we entered this switch, 2293 which means that the current group points to the switch where we were 2294 at the previous step of the algorithm (before going up). 2295 Skipping this group. 
*/ 2296 continue; 2297 } 2298 2299 /* find the least loaded port of the group (in indexing order) */ 2300 p_min_port = NULL; 2301 ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports); 2302 if(ports_num == 0) 2303 continue; 2304 2305 for (j = 0; j < ports_num; j++) { 2306 cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port); 2307 /* first port that we're checking - set as port with the lowest load */ 2308 /* or this port is less loaded - use it as min */ 2309 if (!p_min_port || 2310 p_port->counter_up < p_min_port->counter_up) 2311 p_min_port = p_port; 2312 } 2313 /* At this point we have selected a port in this group with the 2314 lowest load of upgoing routes. 2315 Set on the remote switch how to get to the target_lid - 2316 set LFT(target_lid) on the remote switch to the remote port */ 2317 p_remote_sw = p_group->remote_hca_or_sw.p_sw; 2318 least_hops = sw_get_least_hops(p_remote_sw, target_lid); 2319 2320 if (least_hops != OSM_NO_PATH) { 2321 /* Loop in the fabric - we already routed the remote switch 2322 on our way UP, and now we see it again on our way DOWN */ 2323 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 2324 "Loop of length %d in the fabric:\n " 2325 "Switch %s (LID %u) closes loop through switch %s (LID %u)\n", 2326 current_hops, 2327 tuple_to_str(p_remote_sw->tuple), 2328 p_group->lid, 2329 tuple_to_str(p_sw->tuple), 2330 p_group->remote_lid); 2331 /* We skip only if we have come through a longer path */ 2332 if (current_hops + 1 >= least_hops) 2333 continue; 2334 } 2335 2336 /* Four possible cases: 2337 * 2338 * 1. is_main_path == TRUE: 2339 * - going DOWN(TRUE,TRUE) through ALL the groups 2340 * + promoting port counter 2341 * + setting path in remote switch fwd tbl 2342 * + setting hops in remote switch on all the ports of each group 2343 * 2344 * 2. 
is_main_path == FALSE: 2345 * - going DOWN(TRUE,FALSE) through ALL the groups but only if 2346 * the remote (lower) switch hasn't been already configured 2347 * for this target LID (or with a longer path) 2348 * + promoting port counter 2349 * + setting path in remote switch fwd tbl if it hasn't been set yet 2350 * + setting hops in remote switch on all the ports of each group 2351 * if it hasn't been set yet 2352 */ 2353 2354 /* setting fwd tbl port only */ 2355 p_remote_sw->p_osm_sw->new_lft[target_lid] = 2356 p_min_port->remote_port_num; 2357 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 2358 "Switch %s: set path to CA LID %u through port %u\n", 2359 tuple_to_str(p_remote_sw->tuple), 2360 target_lid, p_min_port->remote_port_num); 2361 2362 /* On the remote switch that is pointed by the p_group, 2363 set hops for ALL the ports in the remote group. */ 2364 2365 set_hops_on_remote_sw(p_group, target_lid, 2366 current_hops + 1, is_target_a_sw); 2367 2368 /* Recursion step: 2369 Assign upgoing ports by stepping down, starting on REMOTE switch */ 2370 routed = fabric_route_upgoing_by_going_down(p_ftree, p_remote_sw, /* remote switch - used as a route-upgoing alg. start point */ 2371 NULL, /* prev. 
position - NULL to mark that we went down and not up */ 2372 target_lid, /* LID that we're routing to */ 2373 is_main_path, /* whether this is path to HCA that should by tracked by counters */ 2374 is_target_a_sw, /* Whether target lid is a switch or not */ 2375 current_hops + 1); /* Number of hops done to this point */ 2376 created_route |= routed; 2377 /* Counters are promoted only if a route toward a node is created */ 2378 if (routed) { 2379 p_min_port->counter_up++; 2380 p_group->counter_up++; 2381 p_group->hca_or_sw.p_sw->counter_up_changed = TRUE; 2382 } 2383 } 2384 /* done scanning all the down-going port groups */ 2385 2386 /* if the route was created, promote the index that 2387 indicates which group should we start with when 2388 going through all the downgoing groups */ 2389 if (created_route) 2390 p_sw->down_port_groups_idx = (p_sw->down_port_groups_idx + 1) 2391 % p_sw->down_port_groups_num; 2392 2393 return created_route; 2394 } /* fabric_route_upgoing_by_going_down() */ 2395 2396 /***************************************************/ 2397 2398 /* 2399 * Function: assign-down-going-port-by-ascending-up 2400 * Given : a switch and a LID 2401 * Pseudo code: 2402 * find the least loaded port of all the upgoing groups (scan in indexing order) 2403 * assign the LFT(LID) of remote switch to that port 2404 * track that port usage 2405 * assign-up-going-port-by-descending-down on CURRENT switch 2406 * assign-down-going-port-by-ascending-up on REMOTE switch (recursion) 2407 */ 2408 2409 static boolean_t 2410 fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree, 2411 IN ftree_sw_t * p_sw, 2412 IN ftree_sw_t * p_prev_sw, 2413 IN uint16_t target_lid, 2414 IN boolean_t is_main_path, 2415 IN boolean_t is_target_a_sw, 2416 IN uint16_t reverse_hop_credit, 2417 IN uint16_t reverse_hops, 2418 IN uint8_t current_hops) 2419 { 2420 ftree_sw_t *p_remote_sw; 2421 uint16_t ports_num; 2422 ftree_port_group_t *p_group; 2423 ftree_port_t *p_port; 2424 
ftree_port_group_t *p_min_group; 2425 ftree_port_t *p_min_port; 2426 uint16_t i; 2427 uint16_t j; 2428 boolean_t created_route = FALSE; 2429 boolean_t routed = FALSE; 2430 2431 2432 /* Assign upgoing ports by stepping down, starting on THIS switch */ 2433 created_route = fabric_route_upgoing_by_going_down(p_ftree, p_sw, /* local switch - used as a route-upgoing alg. start point */ 2434 p_prev_sw, /* switch that we went up from (NULL means that we went down) */ 2435 target_lid, /* LID that we're routing to */ 2436 is_main_path, /* whether this path to HCA should by tracked by counters */ 2437 is_target_a_sw, /* Whether target lid is a switch or not */ 2438 current_hops); /* Number of hops done up to this point */ 2439 2440 /* recursion stop condition - if it's a root switch, */ 2441 if (p_sw->rank == 0) { 2442 if (reverse_hop_credit > 0) { 2443 /* We go up by going down as we have some reverse_hop_credit left */ 2444 /* We use the index to scatter a bit the reverse up routes */ 2445 p_sw->down_port_groups_idx = 2446 (p_sw->down_port_groups_idx + 2447 1) % p_sw->down_port_groups_num; 2448 i = p_sw->down_port_groups_idx; 2449 for (j = 0; j < p_sw->down_port_groups_num; j++) { 2450 2451 p_group = p_sw->down_port_groups[i]; 2452 i = (i + 1) % p_sw->down_port_groups_num; 2453 2454 /* Skip this port group unless it points to a switch */ 2455 if (p_group->remote_node_type != 2456 IB_NODE_TYPE_SWITCH) 2457 continue; 2458 p_remote_sw = p_group->remote_hca_or_sw.p_sw; 2459 2460 created_route |= fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */ 2461 p_sw, /* this switch - prev. 
position switch for the function */ 2462 target_lid, /* LID that we're routing to */ 2463 is_main_path, /* whether this is path to HCA that should by tracked by counters */ 2464 is_target_a_sw, /* Whether target lid is a switch or not */ 2465 reverse_hop_credit - 1, /* Remaining reverse_hops allowed */ 2466 reverse_hops + 1, /* Number of reverse_hops done up to this point */ 2467 current_hops 2468 + 2469 1); 2470 } 2471 2472 } 2473 return created_route; 2474 } 2475 2476 /* We should generate a list of port sorted by load so we can find easily the least 2477 * going port and explore the other pots on secondary routes more easily (and quickly) */ 2478 bubble_sort_down(p_sw->up_port_groups, p_sw->up_port_groups_num); 2479 2480 p_min_group = p_sw->up_port_groups[0]; 2481 /* Find the least loaded upgoing port in the selected group */ 2482 p_min_port = NULL; 2483 ports_num = (uint16_t) cl_ptr_vector_get_size(&p_min_group->ports); 2484 for (j = 0; j < ports_num; j++) { 2485 cl_ptr_vector_at(&p_min_group->ports, j, (void *)&p_port); 2486 if (!p_min_port) { 2487 /* first port that we're checking - use 2488 it as a port with the lowest load */ 2489 p_min_port = p_port; 2490 } else if (p_port->counter_down < p_min_port->counter_down) { 2491 /* this port is less loaded - use it as min */ 2492 p_min_port = p_port; 2493 } 2494 } 2495 2496 /* At this point we have selected a group and port with the 2497 lowest load of downgoing routes. 2498 Set on the remote switch how to get to the target_lid - 2499 set LFT(target_lid) on the remote switch to the remote port */ 2500 p_remote_sw = p_min_group->remote_hca_or_sw.p_sw; 2501 2502 /* Four possible cases: 2503 * 2504 * 1. 
is_main_path == TRUE: 2505 * - going UP(TRUE,TRUE) on selected min_group and min_port 2506 * + promoting port counter 2507 * + setting path in remote switch fwd tbl 2508 * + setting hops in remote switch on all the ports of selected group 2509 * - going UP(TRUE,FALSE) on rest of the groups, each time on port 0 2510 * + NOT promoting port counter 2511 * + setting path in remote switch fwd tbl if it hasn't been set yet 2512 * + setting hops in remote switch on all the ports of each group 2513 * if it hasn't been set yet 2514 * 2515 * 2. is_main_path == FALSE: 2516 * - going UP(TRUE,FALSE) on ALL the groups, each time on port 0, 2517 * but only if the remote (upper) switch hasn't been already 2518 * configured for this target LID 2519 * + NOT promoting port counter 2520 * + setting path in remote switch fwd tbl if it hasn't been set yet 2521 * + setting hops in remote switch on all the ports of each group 2522 * if it hasn't been set yet 2523 */ 2524 2525 /* covering first half of case 1, and case 3 */ 2526 if (is_main_path) { 2527 if (p_sw->is_leaf) { 2528 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 2529 " - Routing MAIN path for %s CA LID %u: %s --> %s\n", 2530 (target_lid != 0) ? "real" : "DUMMY", 2531 target_lid, 2532 tuple_to_str(p_sw->tuple), 2533 tuple_to_str(p_remote_sw->tuple)); 2534 } 2535 /* The number of downgoing routes is tracked in the 2536 p_group->counter_down p_port->counter_down counters of the 2537 group and port that belong to the lower side of the link 2538 (on switch with higher rank) */ 2539 p_min_group->counter_down++; 2540 p_min_port->counter_down++; 2541 if (p_min_group->counter_down == 2542 (p_min_group->remote_hca_or_sw.p_sw->min_counter_down + 2543 1)) { 2544 recalculate_min_counter_down 2545 (p_min_group->remote_hca_or_sw.p_sw); 2546 } 2547 2548 /* This LID may already be in the LFT in the reverse_hop feature is used */ 2549 /* We update the LFT only if this LID isn't already present. 
*/ 2550 2551 /* skip if target lid has been already set on remote switch fwd tbl (with a bigger hop count) */ 2552 if ((p_remote_sw->p_osm_sw->new_lft[target_lid] == OSM_NO_PATH) 2553 || 2554 (current_hops + 1 < 2555 sw_get_least_hops(p_remote_sw, target_lid))) { 2556 2557 p_remote_sw->p_osm_sw->new_lft[target_lid] = 2558 p_min_port->remote_port_num; 2559 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 2560 "Switch %s: set path to CA LID %u through port %u\n", 2561 tuple_to_str(p_remote_sw->tuple), 2562 target_lid, 2563 p_min_port->remote_port_num); 2564 2565 /* On the remote switch that is pointed by the min_group, 2566 set hops for ALL the ports in the remote group. */ 2567 2568 set_hops_on_remote_sw(p_min_group, target_lid, 2569 current_hops + 1, 2570 is_target_a_sw); 2571 } 2572 /* Recursion step: Assign downgoing ports by stepping up, starting on REMOTE switch. */ 2573 created_route |= fabric_route_downgoing_by_going_up(p_ftree, 2574 p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */ 2575 p_sw, /* this switch - prev. position switch for the function */ 2576 target_lid, /* LID that we're routing to */ 2577 is_main_path, /* whether this is path to HCA that should by tracked by counters */ 2578 is_target_a_sw, /* Whether target lid is a switch or not */ 2579 reverse_hop_credit, /* Remaining reverse_hops allowed */ 2580 reverse_hops, /* Number of reverse_hops done up to this point */ 2581 current_hops + 1); 2582 } 2583 2584 /* What's left to do at this point: 2585 * 2586 * 1. is_main_path == TRUE: 2587 * - going UP(TRUE,FALSE) on rest of the groups, each time on port 0, 2588 * but only if the remote (upper) switch hasn't been already 2589 * configured for this target LID 2590 * + NOT promoting port counter 2591 * + setting path in remote switch fwd tbl if it hasn't been set yet 2592 * + setting hops in remote switch on all the ports of each group 2593 * if it hasn't been set yet 2594 * 2595 * 2. 
is_main_path == FALSE: 2596 * - going UP(TRUE,FALSE) on ALL the groups, each time on port 0, 2597 * but only if the remote (upper) switch hasn't been already 2598 * configured for this target LID 2599 * + NOT promoting port counter 2600 * + setting path in remote switch fwd tbl if it hasn't been set yet 2601 * + setting hops in remote switch on all the ports of each group 2602 * if it hasn't been set yet 2603 * 2604 * These two rules can be rephrased this way: 2605 * - foreach UP port group 2606 * + if remote switch has been set with the target LID 2607 * - skip this port group 2608 * + else 2609 * - select port 0 2610 * - do NOT promote port counter 2611 * - set path in remote switch fwd tbl 2612 * - set hops in remote switch on all the ports of this group 2613 * - go UP(TRUE,FALSE) to the remote switch 2614 */ 2615 2616 for (i = is_main_path ? 1 : 0; i < p_sw->up_port_groups_num; i++) { 2617 p_group = p_sw->up_port_groups[i]; 2618 p_remote_sw = p_group->remote_hca_or_sw.p_sw; 2619 2620 /* skip if target lid has been already set on remote switch fwd tbl (with a bigger hop count) */ 2621 if (p_remote_sw->p_osm_sw->new_lft[target_lid] != OSM_NO_PATH) 2622 if (current_hops + 1 >= 2623 sw_get_least_hops(p_remote_sw, target_lid)) 2624 continue; 2625 2626 if (p_sw->is_leaf) { 2627 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 2628 " - Routing SECONDARY path for LID %u: %s --> %s\n", 2629 target_lid, 2630 tuple_to_str(p_sw->tuple), 2631 tuple_to_str(p_remote_sw->tuple)); 2632 } 2633 2634 /* Routing REAL lids on SECONDARY path means routing 2635 switch-to-switch or switch-to-CA paths. 2636 We can safely assume that switch will initiate very 2637 few traffic, so there's no point wasting runtime on 2638 trying to balance these routes - always pick port 0. 
*/ 2639 p_min_port = NULL; 2640 ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports); 2641 if(ports_num == 0) 2642 continue; 2643 for (j = 0; j < ports_num; j++) { 2644 cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port); 2645 if (!p_min_port) { 2646 /* first port that we're checking - use 2647 it as a port with the lowest load */ 2648 p_min_port = p_port; 2649 } else if (p_port->counter_down < 2650 p_min_port->counter_down) { 2651 /* this port is less loaded - use it as min */ 2652 p_min_port = p_port; 2653 } 2654 } 2655 2656 p_port = p_min_port; 2657 p_remote_sw->p_osm_sw->new_lft[target_lid] = 2658 p_port->remote_port_num; 2659 2660 /* On the remote switch that is pointed by the p_group, 2661 set hops for ALL the ports in the remote group. */ 2662 2663 set_hops_on_remote_sw(p_group, target_lid, 2664 current_hops + 1, is_target_a_sw); 2665 2666 /* Recursion step: 2667 Assign downgoing ports by stepping up, starting on REMOTE switch. */ 2668 routed = fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */ 2669 p_sw, /* this switch - prev. 
position switch for the function */ 2670 target_lid, /* LID that we're routing to */ 2671 FALSE, /* whether this is path to HCA that should by tracked by counters */ 2672 is_target_a_sw, /* Whether target lid is a switch or not */ 2673 reverse_hop_credit, /* Remaining reverse_hops allowed */ 2674 reverse_hops, /* Number of reverse_hops done up to this point */ 2675 current_hops + 1); 2676 created_route |= routed; 2677 } 2678 2679 /* Now doing the same thing with horizontal links */ 2680 if (p_sw->sibling_port_groups_num > 0) 2681 bubble_sort_down(p_sw->sibling_port_groups, 2682 p_sw->sibling_port_groups_num); 2683 2684 for (i = 0; i < p_sw->sibling_port_groups_num; i++) { 2685 p_group = p_sw->sibling_port_groups[i]; 2686 p_remote_sw = p_group->remote_hca_or_sw.p_sw; 2687 2688 /* skip if target lid has been already set on remote switch fwd tbl (with a bigger hop count) */ 2689 if (p_remote_sw->p_osm_sw->new_lft[target_lid] != OSM_NO_PATH) 2690 if (current_hops + 1 >= 2691 sw_get_least_hops(p_remote_sw, target_lid)) 2692 continue; 2693 2694 if (p_sw->is_leaf) { 2695 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 2696 " - Routing SECONDARY path for LID %u: %s --> %s\n", 2697 target_lid, 2698 tuple_to_str(p_sw->tuple), 2699 tuple_to_str(p_remote_sw->tuple)); 2700 } 2701 2702 /* Routing REAL lids on SECONDARY path means routing 2703 switch-to-switch or switch-to-CA paths. 2704 We can safely assume that switch will initiate very 2705 few traffic, so there's no point wasting runtime on 2706 trying to balance these routes - always pick port 0. 
*/ 2707 2708 p_min_port = NULL; 2709 ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports); 2710 for (j = 0; j < ports_num; j++) { 2711 cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port); 2712 if (!p_min_port) { 2713 /* first port that we're checking - use 2714 it as a port with the lowest load */ 2715 p_min_port = p_port; 2716 } else if (p_port->counter_down < 2717 p_min_port->counter_down) { 2718 /* this port is less loaded - use it as min */ 2719 p_min_port = p_port; 2720 } 2721 } 2722 2723 p_port = p_min_port; 2724 p_remote_sw->p_osm_sw->new_lft[target_lid] = 2725 p_port->remote_port_num; 2726 2727 /* On the remote switch that is pointed by the p_group, 2728 set hops for ALL the ports in the remote group. */ 2729 2730 set_hops_on_remote_sw(p_group, target_lid, 2731 current_hops + 1, is_target_a_sw); 2732 2733 /* Recursion step: 2734 Assign downgoing ports by stepping up, starting on REMOTE switch. */ 2735 routed = fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */ 2736 p_sw, /* this switch - prev. 
position switch for the function */ 2737 target_lid, /* LID that we're routing to */ 2738 FALSE, /* whether this is path to HCA that should by tracked by counters */ 2739 is_target_a_sw, /* Whether target lid is a switch or not */ 2740 reverse_hop_credit, /* Remaining reverse_hops allowed */ 2741 reverse_hops, /* Number of reverse_hops done up to this point */ 2742 current_hops + 1); 2743 created_route |= routed; 2744 if (routed) { 2745 p_min_group->counter_down++; 2746 p_min_port->counter_down++; 2747 } 2748 } 2749 2750 /* If we don't have any reverse hop credits, we are done */ 2751 if (reverse_hop_credit == 0) 2752 return created_route; 2753 2754 if (p_sw->is_leaf) 2755 return created_route; 2756 2757 /* We explore all the down group ports */ 2758 /* We try to reverse jump for each of them */ 2759 /* They already have a route to us from the upgoing_by_going_down started earlier */ 2760 /* This is only so it'll continue exploring up, after this step backwards */ 2761 for (i = 0; i < p_sw->down_port_groups_num; i++) { 2762 p_group = p_sw->down_port_groups[i]; 2763 p_remote_sw = p_group->remote_hca_or_sw.p_sw; 2764 2765 /* Skip this port group unless it points to a switch */ 2766 if (p_group->remote_node_type != IB_NODE_TYPE_SWITCH) 2767 continue; 2768 2769 /* Recursion step: 2770 Assign downgoing ports by stepping up, fter doing one step down starting on REMOTE switch. */ 2771 created_route |= fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */ 2772 p_sw, /* this switch - prev. 
position switch for the function */ 2773 target_lid, /* LID that we're routing to */ 2774 TRUE, /* whether this is path to HCA that should by tracked by counters */ 2775 is_target_a_sw, /* Whether target lid is a switch or not */ 2776 reverse_hop_credit - 1, /* Remaining reverse_hops allowed */ 2777 reverse_hops + 1, /* Number of reverse_hops done up to this point */ 2778 current_hops 2779 + 1); 2780 } 2781 return created_route; 2782 2783 } /* ftree_fabric_route_downgoing_by_going_up() */ 2784 2785 /***************************************************/ 2786 2787 /* 2788 * Pseudo code: 2789 * foreach leaf switch (in indexing order) 2790 * for each compute node (in indexing order) 2791 * obtain the LID of the compute node 2792 * set local LFT(LID) of the port connecting to compute node 2793 * call assign-down-going-port-by-ascending-up(TRUE,TRUE) on CURRENT switch 2794 * for each MISSING compute node 2795 * call assign-down-going-port-by-ascending-up(FALSE,TRUE) on CURRENT switch 2796 */ 2797 2798 static void fabric_route_to_cns(IN ftree_fabric_t * p_ftree) 2799 { 2800 ftree_sw_t *p_sw; 2801 ftree_hca_t *p_hca; 2802 ftree_port_group_t *p_leaf_port_group; 2803 ftree_port_group_t *p_hca_port_group; 2804 ftree_port_t *p_port; 2805 unsigned int i, j; 2806 uint16_t hca_lid; 2807 unsigned routed_targets_on_leaf; 2808 2809 OSM_LOG_ENTER(&p_ftree->p_osm->log); 2810 2811 /* for each leaf switch (in indexing order) */ 2812 for (i = 0; i < p_ftree->leaf_switches_num; i++) { 2813 p_sw = p_ftree->leaf_switches[i]; 2814 routed_targets_on_leaf = 0; 2815 2816 /* for each HCA connected to this switch */ 2817 for (j = 0; j < p_sw->down_port_groups_num; j++) { 2818 p_leaf_port_group = p_sw->down_port_groups[j]; 2819 2820 /* work with this port group only if the remote node is CA */ 2821 if (p_leaf_port_group->remote_node_type != 2822 IB_NODE_TYPE_CA) 2823 continue; 2824 2825 p_hca = p_leaf_port_group->remote_hca_or_sw.p_hca; 2826 2827 /* work with this port group only if remote HCA has 
CNs */ 2828 if (!p_hca->cn_num) 2829 continue; 2830 2831 p_hca_port_group = 2832 hca_get_port_group_by_lid(p_hca, 2833 p_leaf_port_group-> 2834 remote_lid); 2835 CL_ASSERT(p_hca_port_group); 2836 2837 /* work with this port group only if remote port is CN */ 2838 if (!p_hca_port_group->is_cn) 2839 continue; 2840 2841 /* obtain the LID of HCA port */ 2842 hca_lid = p_leaf_port_group->remote_lid; 2843 2844 /* set local LFT(LID) to the port that is connected to HCA */ 2845 cl_ptr_vector_at(&p_leaf_port_group->ports, 0, 2846 (void *)&p_port); 2847 p_sw->p_osm_sw->new_lft[hca_lid] = p_port->port_num; 2848 2849 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 2850 "Switch %s: set path to CN LID %u through port %u\n", 2851 tuple_to_str(p_sw->tuple), 2852 hca_lid, p_port->port_num); 2853 2854 /* set local min hop table(LID) to route to the CA */ 2855 sw_set_hops(p_sw, hca_lid, p_port->port_num, 1, FALSE); 2856 2857 /* Assign downgoing ports by stepping up. 2858 Since we're routing here only CNs, we're routing it as REAL 2859 LID and updating fat-tree balancing counters. */ 2860 fabric_route_downgoing_by_going_up(p_ftree, p_sw, /* local switch - used as a route-downgoing alg. start point */ 2861 NULL, /* prev. position switch */ 2862 hca_lid, /* LID that we're routing to */ 2863 TRUE, /* whether this path to HCA should by tracked by counters */ 2864 FALSE, /* whether target lid is a switch or not */ 2865 0, /* Number of reverse hops allowed */ 2866 0, /* Number of reverse hops done yet */ 2867 1); /* Number of hops done yet */ 2868 2869 /* count how many real targets have been routed from this leaf switch */ 2870 routed_targets_on_leaf++; 2871 } 2872 2873 /* We're done with the real targets (all CNs) of this leaf switch. 2874 Now route the dummy HCAs that are missing or that are non-CNs. 2875 When routing to dummy HCAs we don't fill lid matrices. 
*/ 2876 if (p_ftree->max_cn_per_leaf > routed_targets_on_leaf) { 2877 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 2878 "Routing %u dummy CAs\n", 2879 p_ftree->max_cn_per_leaf - 2880 p_sw->down_port_groups_num); 2881 for (j = 0; j < 2882 p_ftree->max_cn_per_leaf - routed_targets_on_leaf; 2883 j++) { 2884 ftree_sw_t *p_next_sw, *p_ftree_sw; 2885 sw_set_hops(p_sw, 0, 0xFF, 1, FALSE); 2886 /* assign downgoing ports by stepping up */ 2887 fabric_route_downgoing_by_going_up(p_ftree, p_sw, /* local switch - used as a route-downgoing alg. start point */ 2888 NULL, /* prev. position switch */ 2889 0, /* LID that we're routing to - ignored for dummy HCA */ 2890 TRUE, /* whether this path to HCA should by tracked by counters */ 2891 FALSE, /* Whether the target LID is a switch or not */ 2892 0, /* Number of reverse hops allowed */ 2893 0, /* Number of reverse hops done yet */ 2894 1); /* Number of hops done yet */ 2895 2896 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl); 2897 /* need to clean the LID 0 hops for dummy node */ 2898 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) { 2899 p_ftree_sw = p_next_sw; 2900 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_ftree_sw->map_item); 2901 p_ftree_sw->hops[0] = OSM_NO_PATH; 2902 p_ftree_sw->p_osm_sw->new_lft[0] = OSM_NO_PATH; 2903 } 2904 2905 } 2906 } 2907 } 2908 /* done going through all the leaf switches */ 2909 OSM_LOG_EXIT(&p_ftree->p_osm->log); 2910 } /* fabric_route_to_cns() */ 2911 2912 /***************************************************/ 2913 2914 /* 2915 * Pseudo code: 2916 * foreach HCA non-CN port in fabric 2917 * obtain the LID of the HCA port 2918 * get switch that is connected to this HCA port 2919 * set switch LFT(LID) to the port connected to the HCA port 2920 * call assign-down-going-port-by-ascending-up(TRUE,TRUE) on the switch 2921 * 2922 * Routing to these HCAs is routing a REAL hca lid on MAIN path. 
2923 * We want to allow load-leveling of the traffic to the non-CNs, 2924 * because such nodes may include IO nodes with heavy usage 2925 * - we should set fwd tables 2926 * - we should update port counters 2927 * Routing to non-CNs is done after routing to CNs, so updated port 2928 * counters will not affect CN-to-CN routing. 2929 */ 2930 2931 static void fabric_route_to_non_cns(IN ftree_fabric_t * p_ftree) 2932 { 2933 ftree_sw_t *p_sw; 2934 ftree_hca_t *p_hca; 2935 ftree_hca_t *p_next_hca; 2936 ftree_port_t *p_hca_port; 2937 ftree_port_group_t *p_hca_port_group; 2938 uint16_t hca_lid; 2939 unsigned port_num_on_switch; 2940 unsigned i; 2941 2942 OSM_LOG_ENTER(&p_ftree->p_osm->log); 2943 2944 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl); 2945 while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) { 2946 p_hca = p_next_hca; 2947 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item); 2948 2949 for (i = 0; i < p_hca->up_port_groups_num; i++) { 2950 p_hca_port_group = p_hca->up_port_groups[i]; 2951 2952 /* skip this port if it's CN, in which case it has been already routed */ 2953 if (p_hca_port_group->is_cn) 2954 continue; 2955 2956 /* skip this port if it is not connected to switch */ 2957 if (p_hca_port_group->remote_node_type != 2958 IB_NODE_TYPE_SWITCH) 2959 continue; 2960 2961 p_sw = p_hca_port_group->remote_hca_or_sw.p_sw; 2962 hca_lid = p_hca_port_group->lid; 2963 2964 /* set switches LFT(LID) to the port that is connected to HCA */ 2965 cl_ptr_vector_at(&p_hca_port_group->ports, 0, 2966 (void *)&p_hca_port); 2967 port_num_on_switch = p_hca_port->remote_port_num; 2968 p_sw->p_osm_sw->new_lft[hca_lid] = port_num_on_switch; 2969 2970 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 2971 "Switch %s: set path to non-CN HCA LID %u through port %u\n", 2972 tuple_to_str(p_sw->tuple), 2973 hca_lid, port_num_on_switch); 2974 2975 /* set local min hop table(LID) to route to the CA */ 2976 sw_set_hops(p_sw, hca_lid, port_num_on_switch, /* 
port num */ 2977 1, FALSE); /* hops */ 2978 2979 /* Assign downgoing ports by stepping up. 2980 We're routing REAL targets. They are not CNs and not included 2981 in the leafs array, but we treat them as MAIN path to allow load 2982 leveling, which means that the counters will be updated. */ 2983 fabric_route_downgoing_by_going_up(p_ftree, p_sw, /* local switch - used as a route-downgoing alg. start point */ 2984 NULL, /* prev. position switch */ 2985 hca_lid, /* LID that we're routing to */ 2986 TRUE, /* whether this path to HCA should by tracked by counters */ 2987 FALSE, /* Whether the target LID is a switch or not */ 2988 p_hca_port_group->is_io ? p_ftree->p_osm->subn.opt.max_reverse_hops : 0, /* Number or reverse hops allowed */ 2989 0, /* Number or reverse hops done yet */ 2990 1); /* Number of hops done yet */ 2991 } 2992 /* done with all the port groups of this HCA - go to next HCA */ 2993 } 2994 2995 OSM_LOG_EXIT(&p_ftree->p_osm->log); 2996 } /* fabric_route_to_non_cns() */ 2997 2998 /***************************************************/ 2999 3000 /* 3001 * Pseudo code: 3002 * foreach switch in fabric 3003 * obtain its LID 3004 * set local LFT(LID) to port 0 3005 * call assign-down-going-port-by-ascending-up(TRUE,FALSE) on CURRENT switch 3006 * 3007 * Routing to switch is similar to routing a REAL hca lid on SECONDARY path: 3008 * - we should set fwd tables 3009 * - we should NOT update port counters 3010 */ 3011 3012 static void fabric_route_to_switches(IN ftree_fabric_t * p_ftree) 3013 { 3014 ftree_sw_t *p_sw; 3015 ftree_sw_t *p_next_sw; 3016 3017 OSM_LOG_ENTER(&p_ftree->p_osm->log); 3018 3019 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl); 3020 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) { 3021 p_sw = p_next_sw; 3022 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item); 3023 3024 /* set local LFT(LID) to 0 (route to itself) */ 3025 p_sw->p_osm_sw->new_lft[p_sw->lid] = 0; 3026 3027 OSM_LOG(&p_ftree->p_osm->log, 
OSM_LOG_DEBUG, 3028 "Switch %s (LID %u): routing switch-to-switch paths\n", 3029 tuple_to_str(p_sw->tuple), p_sw->lid); 3030 3031 /* set min hop table of the switch to itself */ 3032 sw_set_hops(p_sw, p_sw->lid, 0, /* port_num */ 3033 0, TRUE); /* hops */ 3034 3035 fabric_route_downgoing_by_going_up(p_ftree, p_sw, /* local switch - used as a route-downgoing alg. start point */ 3036 NULL, /* prev. position switch */ 3037 p_sw->lid, /* LID that we're routing to */ 3038 FALSE, /* whether this path to HCA should by tracked by counters */ 3039 TRUE, /* Whether the target LID is a switch or not */ 3040 0, /* Number of reverse hops allowed */ 3041 0, /* Number of reverse hops done yet */ 3042 0); /* Number of hops done yet */ 3043 } 3044 3045 OSM_LOG_EXIT(&p_ftree->p_osm->log); 3046 } /* fabric_route_to_switches() */ 3047 3048 /*************************************************** 3049 ***************************************************/ 3050 3051 static void fabric_route_roots(IN ftree_fabric_t * p_ftree) 3052 { 3053 uint16_t lid; 3054 uint8_t port_num; 3055 osm_port_t *p_port; 3056 ftree_sw_t *p_sw; 3057 ftree_sw_t *p_leaf_sw; 3058 3059 OSM_LOG_ENTER(&p_ftree->p_osm->log); 3060 3061 /* 3062 * We need a switch that will accomodate all the down/up turns in 3063 * the fabric. Having these turn in a single place in the fabric 3064 * will not create credit loops. 3065 * So we need to select this switch. 3066 * The idea here is to chose leaf with the highest index. I don't 3067 * have any theory to back me up on this. It's just a general thought 3068 * that this way the switch that might be a bottleneck for many mcast 3069 * groups will be far away from the OpenSM, so it will draw the 3070 * multicast traffic away from the SM. 3071 */ 3072 3073 p_leaf_sw = p_ftree->leaf_switches[p_ftree->leaf_switches_num-1]; 3074 3075 /* 3076 * Now go over all the switches in the fabric that 3077 * have lower rank, and route the missing LIDs to 3078 * the selected leaf switch. 
3079 * In short, this leaf switch now poses a target 3080 * for all those missing LIDs. 3081 */ 3082 3083 for (p_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl); 3084 p_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl); 3085 p_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item)) { 3086 3087 if (p_sw->rank >= p_ftree->leaf_switch_rank) 3088 continue; 3089 3090 for (lid = 1; lid <= p_leaf_sw->p_osm_sw->max_lid_ho; lid ++) { 3091 3092 if (p_sw->p_osm_sw->new_lft[lid] != OSM_NO_PATH || 3093 p_leaf_sw->hops[lid] == OSM_NO_PATH) 3094 continue; 3095 3096 p_port = osm_get_port_by_lid_ho(&p_ftree->p_osm->subn, 3097 lid); 3098 3099 /* we're interested only in switches */ 3100 if (!p_port || !p_port->p_node->sw) 3101 continue; 3102 3103 /* 3104 * the missing LID will be routed through the same 3105 * port that routes to the selected leaf switch 3106 */ 3107 port_num = p_sw->p_osm_sw->new_lft[p_leaf_sw->lid]; 3108 3109 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 3110 "Switch %s: setting path to LID %u " 3111 "through port %u\n", 3112 tuple_to_str(p_sw->tuple), lid, port_num); 3113 3114 /* set local lft */ 3115 p_sw->p_osm_sw->new_lft[lid] = port_num; 3116 3117 /* 3118 * Set local min hop table. 3119 * The distance to the target LID is a distance 3120 * to the selected leaf switch plus the distance 3121 * from the leaf to the target LID. 
3122 */ 3123 sw_set_hops(p_sw, lid, port_num, 3124 p_sw->hops[p_leaf_sw->lid] + 3125 p_leaf_sw->hops[lid], TRUE); 3126 } 3127 } 3128 3129 OSM_LOG_EXIT(&p_ftree->p_osm->log); 3130 } /* fabric_route_roots() */ 3131 3132 /***************************************************/ 3133 3134 static int fabric_populate_nodes(IN ftree_fabric_t * p_ftree) 3135 { 3136 osm_node_t *p_osm_node; 3137 osm_node_t *p_next_osm_node; 3138 3139 OSM_LOG_ENTER(&p_ftree->p_osm->log); 3140 3141 p_next_osm_node = 3142 (osm_node_t *) cl_qmap_head(&p_ftree->p_osm->subn.node_guid_tbl); 3143 while (p_next_osm_node != 3144 (osm_node_t *) cl_qmap_end(&p_ftree->p_osm-> 3145 subn.node_guid_tbl)) { 3146 p_osm_node = p_next_osm_node; 3147 p_next_osm_node = 3148 (osm_node_t *) cl_qmap_next(&p_osm_node->map_item); 3149 switch (osm_node_get_type(p_osm_node)) { 3150 case IB_NODE_TYPE_CA: 3151 fabric_add_hca(p_ftree, p_osm_node); 3152 break; 3153 case IB_NODE_TYPE_ROUTER: 3154 break; 3155 case IB_NODE_TYPE_SWITCH: 3156 fabric_add_sw(p_ftree, p_osm_node->sw); 3157 break; 3158 default: 3159 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, 3160 "ERR AB0E: " "Node GUID 0x%016" PRIx64 3161 " - Unknown node type: %s\n", 3162 cl_ntoh64(osm_node_get_node_guid(p_osm_node)), 3163 ib_get_node_type_str(osm_node_get_type 3164 (p_osm_node))); 3165 OSM_LOG_EXIT(&p_ftree->p_osm->log); 3166 return -1; 3167 } 3168 } 3169 3170 OSM_LOG_EXIT(&p_ftree->p_osm->log); 3171 return 0; 3172 } /* fabric_populate_nodes() */ 3173 3174 /*************************************************** 3175 ***************************************************/ 3176 3177 static boolean_t sw_update_rank(IN ftree_sw_t * p_sw, IN uint32_t new_rank) 3178 { 3179 if (sw_ranked(p_sw) && p_sw->rank <= new_rank) 3180 return FALSE; 3181 p_sw->rank = new_rank; 3182 return TRUE; 3183 3184 } 3185 3186 /***************************************************/ 3187 3188 static void rank_switches_from_leafs(IN ftree_fabric_t * p_ftree, 3189 IN cl_list_t * p_ranking_bfs_list) 3190 
{ 3191 ftree_sw_t *p_sw; 3192 ftree_sw_t *p_remote_sw; 3193 osm_node_t *p_node; 3194 osm_node_t *p_remote_node; 3195 osm_physp_t *p_osm_port; 3196 uint8_t i; 3197 unsigned max_rank = 0; 3198 3199 while (!cl_is_list_empty(p_ranking_bfs_list)) { 3200 p_sw = (ftree_sw_t *) cl_list_remove_head(p_ranking_bfs_list); 3201 p_node = p_sw->p_osm_sw->p_node; 3202 3203 /* note: skipping port 0 on switches */ 3204 for (i = 1; i < osm_node_get_num_physp(p_node); i++) { 3205 p_osm_port = osm_node_get_physp_ptr(p_node, i); 3206 if (!p_osm_port || !osm_link_is_healthy(p_osm_port)) 3207 continue; 3208 3209 p_remote_node = 3210 osm_node_get_remote_node(p_node, i, NULL); 3211 if (!p_remote_node) 3212 continue; 3213 if (osm_node_get_type(p_remote_node) != 3214 IB_NODE_TYPE_SWITCH) 3215 continue; 3216 3217 p_remote_sw = fabric_get_sw_by_guid(p_ftree, 3218 osm_node_get_node_guid 3219 (p_remote_node)); 3220 if (!p_remote_sw) { 3221 /* remote node is not a switch */ 3222 continue; 3223 } 3224 3225 /* if needed, rank the remote switch and add it to the BFS list */ 3226 if (sw_update_rank(p_remote_sw, p_sw->rank + 1)) { 3227 max_rank = p_remote_sw->rank; 3228 cl_list_insert_tail(p_ranking_bfs_list, 3229 p_remote_sw); 3230 } 3231 } 3232 } 3233 3234 /* set FatTree maximal switch rank */ 3235 p_ftree->max_switch_rank = max_rank; 3236 3237 } /* rank_switches_from_leafs() */ 3238 3239 /***************************************************/ 3240 3241 static int rank_leaf_switches(IN ftree_fabric_t * p_ftree, 3242 IN ftree_hca_t * p_hca, 3243 IN cl_list_t * p_ranking_bfs_list) 3244 { 3245 ftree_sw_t *p_sw; 3246 osm_node_t *p_osm_node = p_hca->p_osm_node; 3247 osm_node_t *p_remote_osm_node; 3248 osm_physp_t *p_osm_port; 3249 static uint8_t i = 0; 3250 int res = 0; 3251 3252 OSM_LOG_ENTER(&p_ftree->p_osm->log); 3253 3254 for (i = 0; i < osm_node_get_num_physp(p_osm_node); i++) { 3255 p_osm_port = osm_node_get_physp_ptr(p_osm_node, i); 3256 if (!p_osm_port || !osm_link_is_healthy(p_osm_port)) 3257 
continue; 3258 3259 p_remote_osm_node = 3260 osm_node_get_remote_node(p_osm_node, i, NULL); 3261 if (!p_remote_osm_node) 3262 continue; 3263 3264 switch (osm_node_get_type(p_remote_osm_node)) { 3265 case IB_NODE_TYPE_CA: 3266 /* HCA connected directly to another HCA - not FatTree */ 3267 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, 3268 "ERR AB0F: " 3269 "CA conected directly to another CA: " "0x%016" 3270 PRIx64 " <---> 0x%016" PRIx64 "\n", 3271 hca_get_guid_ho(p_hca), 3272 cl_ntoh64(osm_node_get_node_guid 3273 (p_remote_osm_node))); 3274 res = -1; 3275 goto Exit; 3276 3277 case IB_NODE_TYPE_ROUTER: 3278 /* leaving this port - proceeding to the next one */ 3279 continue; 3280 3281 case IB_NODE_TYPE_SWITCH: 3282 /* continue with this port */ 3283 break; 3284 3285 default: 3286 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, 3287 "ERR AB10: Node GUID 0x%016" PRIx64 3288 " - Unknown node type: %s\n", 3289 cl_ntoh64(osm_node_get_node_guid 3290 (p_remote_osm_node)), 3291 ib_get_node_type_str(osm_node_get_type 3292 (p_remote_osm_node))); 3293 res = -1; 3294 goto Exit; 3295 } 3296 3297 /* remote node is switch */ 3298 3299 p_sw = fabric_get_sw_by_guid(p_ftree, 3300 osm_node_get_node_guid 3301 (p_osm_port->p_remote_physp-> 3302 p_node)); 3303 CL_ASSERT(p_sw); 3304 3305 /* if needed, rank the remote switch and add it to the BFS list */ 3306 3307 if (!sw_update_rank(p_sw, 0)) 3308 continue; 3309 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 3310 "Marking rank of switch that is directly connected to CA:\n" 3311 " - CA guid : 0x%016" 3312 PRIx64 "\n" 3313 " - Switch guid: 0x%016" 3314 PRIx64 "\n" 3315 " - Switch LID : %u\n", 3316 hca_get_guid_ho(p_hca), 3317 sw_get_guid_ho(p_sw), p_sw->lid); 3318 cl_list_insert_tail(p_ranking_bfs_list, p_sw); 3319 } 3320 3321 Exit: 3322 OSM_LOG_EXIT(&p_ftree->p_osm->log); 3323 return res; 3324 } /* rank_leaf_switches() */ 3325 3326 /***************************************************/ 3327 3328 static void sw_reverse_rank(IN cl_map_item_t * const 
p_map_item, 3329 IN void *context) 3330 { 3331 ftree_fabric_t *p_ftree = (ftree_fabric_t *) context; 3332 ftree_sw_t *p_sw = (ftree_sw_t * const)p_map_item; 3333 if (p_sw->rank != 0xFFFFFFFF) 3334 p_sw->rank = p_ftree->max_switch_rank - p_sw->rank; 3335 } 3336 3337 /*************************************************** 3338 ***************************************************/ 3339 3340 static int 3341 fabric_construct_hca_ports(IN ftree_fabric_t * p_ftree, IN ftree_hca_t * p_hca) 3342 { 3343 ftree_sw_t *p_remote_sw; 3344 osm_node_t *p_node = p_hca->p_osm_node; 3345 osm_node_t *p_remote_node; 3346 uint8_t remote_node_type; 3347 ib_net64_t remote_node_guid; 3348 osm_physp_t *p_remote_osm_port; 3349 uint8_t i; 3350 uint8_t remote_port_num; 3351 boolean_t is_cn; 3352 boolean_t is_in_cn_file; 3353 boolean_t is_io; 3354 boolean_t is_cns_file_provided = fabric_cns_provided(p_ftree); 3355 boolean_t is_ios_file_provided = fabric_ios_provided(p_ftree); 3356 int res = 0; 3357 3358 for (i = 0; i < osm_node_get_num_physp(p_node); i++) { 3359 osm_physp_t *p_osm_port = osm_node_get_physp_ptr(p_node, i); 3360 is_io = FALSE; 3361 is_cn = TRUE; 3362 is_in_cn_file = FALSE; 3363 3364 if (!p_osm_port || !osm_link_is_healthy(p_osm_port)) 3365 continue; 3366 3367 if (p_hca->disconnected_ports[i]) 3368 continue; 3369 3370 p_remote_osm_port = osm_physp_get_remote(p_osm_port); 3371 p_remote_node = 3372 osm_node_get_remote_node(p_node, i, &remote_port_num); 3373 3374 if (!p_remote_osm_port || !p_remote_node) 3375 continue; 3376 3377 remote_node_type = osm_node_get_type(p_remote_node); 3378 remote_node_guid = osm_node_get_node_guid(p_remote_node); 3379 3380 switch (remote_node_type) { 3381 case IB_NODE_TYPE_ROUTER: 3382 /* leaving this port - proceeding to the next one */ 3383 continue; 3384 3385 case IB_NODE_TYPE_CA: 3386 /* HCA connected directly to another HCA - not FatTree */ 3387 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, 3388 "ERR AB11: " 3389 "CA conected directly to another CA: " 
"0x%016" 3390 PRIx64 " <---> 0x%016" PRIx64 "\n", 3391 cl_ntoh64(osm_node_get_node_guid(p_node)), 3392 cl_ntoh64(remote_node_guid)); 3393 res = -1; 3394 goto Exit; 3395 3396 case IB_NODE_TYPE_SWITCH: 3397 /* continue with this port */ 3398 break; 3399 3400 default: 3401 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, 3402 "ERR AB12: Node GUID 0x%016" PRIx64 3403 " - Unknown node type: %s\n", 3404 cl_ntoh64(remote_node_guid), 3405 ib_get_node_type_str(remote_node_type)); 3406 res = -1; 3407 goto Exit; 3408 } 3409 3410 /* remote node is switch */ 3411 3412 p_remote_sw = fabric_get_sw_by_guid(p_ftree, remote_node_guid); 3413 CL_ASSERT(p_remote_sw); 3414 3415 /* If CN file is not supplied, then all the CAs considered as Compute Nodes. 3416 Otherwise all the CAs are not CNs, and only guids that are present in the 3417 CN file will be marked as compute nodes. */ 3418 if (is_cns_file_provided == TRUE) { 3419 name_map_item_t *p_elem = (name_map_item_t *) 3420 cl_qmap_get(&p_ftree->cn_guid_tbl, 3421 cl_ntoh64(osm_physp_get_port_guid 3422 (p_osm_port))); 3423 if (p_elem == (name_map_item_t *) 3424 cl_qmap_end(&p_ftree->cn_guid_tbl)) 3425 is_cn = FALSE; 3426 else 3427 is_in_cn_file = TRUE; 3428 } 3429 if (is_in_cn_file == FALSE && is_ios_file_provided == TRUE) { 3430 name_map_item_t *p_elem = (name_map_item_t *) 3431 cl_qmap_get(&p_ftree->io_guid_tbl, 3432 cl_ntoh64(osm_physp_get_port_guid 3433 (p_osm_port))); 3434 if (p_elem != (name_map_item_t *) 3435 cl_qmap_end(&p_ftree->io_guid_tbl)) { 3436 is_io = TRUE; 3437 is_cn = FALSE; 3438 } 3439 } 3440 3441 if (is_cn) { 3442 p_ftree->cn_num++; 3443 p_hca->cn_num++; 3444 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 3445 "Marking CN port GUID 0x%016" PRIx64 "\n", 3446 cl_ntoh64(osm_physp_get_port_guid(p_osm_port))); 3447 } else if (is_io) { 3448 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 3449 "Marking I/O port GUID 0x%016" PRIx64 "\n", 3450 cl_ntoh64(osm_physp_get_port_guid(p_osm_port))); 3451 } else { 3452 
OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 3453 "Marking non-CN port GUID 0x%016" PRIx64 "\n", 3454 cl_ntoh64(osm_physp_get_port_guid(p_osm_port))); 3455 } 3456 p_ftree->ca_ports++; 3457 3458 hca_add_port(p_ftree, 3459 p_hca, /* local ftree_hca object */ 3460 i, /* local port number */ 3461 remote_port_num, /* remote port number */ 3462 cl_ntoh16(osm_node_get_base_lid(p_node, i)), /* local lid */ 3463 cl_ntoh16(osm_node_get_base_lid(p_remote_node, 0)), /* remote lid */ 3464 osm_physp_get_port_guid(p_osm_port), /* local port guid */ 3465 osm_physp_get_port_guid(p_remote_osm_port), /* remote port guid */ 3466 remote_node_guid, /* remote node guid */ 3467 remote_node_type, /* remote node type */ 3468 (void *)p_remote_sw, /* remote ftree_hca/sw object */ 3469 is_cn, is_io); /* whether this port is compute node */ 3470 } 3471 3472 Exit: 3473 return res; 3474 } /* fabric_construct_hca_ports() */ 3475 3476 /*************************************************** 3477 ***************************************************/ 3478 3479 static int fabric_construct_sw_ports(IN ftree_fabric_t * p_ftree, 3480 IN ftree_sw_t * p_sw) 3481 { 3482 ftree_hca_t *p_remote_hca; 3483 ftree_sw_t *p_remote_sw; 3484 osm_node_t *p_node = p_sw->p_osm_sw->p_node; 3485 osm_node_t *p_remote_node; 3486 uint16_t remote_lid; 3487 uint8_t remote_node_type; 3488 ib_net64_t remote_node_guid; 3489 osm_physp_t *p_remote_osm_port; 3490 ftree_direction_t direction; 3491 void *p_remote_hca_or_sw; 3492 uint8_t i; 3493 uint8_t remote_port_num; 3494 int res = 0; 3495 3496 CL_ASSERT(osm_node_get_type(p_node) == IB_NODE_TYPE_SWITCH); 3497 3498 for (i = 1; i < osm_node_get_num_physp(p_node); i++) { 3499 osm_physp_t *p_osm_port = osm_node_get_physp_ptr(p_node, i); 3500 if (!p_osm_port || !osm_link_is_healthy(p_osm_port)) 3501 continue; 3502 3503 p_remote_osm_port = osm_physp_get_remote(p_osm_port); 3504 if (!p_remote_osm_port) 3505 continue; 3506 3507 p_remote_node = 3508 osm_node_get_remote_node(p_node, i, 
&remote_port_num); 3509 if (!p_remote_node) 3510 continue; 3511 3512 /* ignore any loopback connection on switch */ 3513 if (p_node == p_remote_node) { 3514 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 3515 "Ignoring loopback on switch GUID 0x%016" PRIx64 3516 ", LID %u, rank %u\n", 3517 sw_get_guid_ho(p_sw), 3518 p_sw->lid, p_sw->rank); 3519 continue; 3520 } 3521 3522 remote_node_type = osm_node_get_type(p_remote_node); 3523 remote_node_guid = osm_node_get_node_guid(p_remote_node); 3524 3525 switch (remote_node_type) { 3526 case IB_NODE_TYPE_ROUTER: 3527 /* leaving this port - proceeding to the next one */ 3528 continue; 3529 3530 case IB_NODE_TYPE_CA: 3531 /* switch connected to hca */ 3532 3533 p_remote_hca = 3534 fabric_get_hca_by_guid(p_ftree, remote_node_guid); 3535 CL_ASSERT(p_remote_hca); 3536 3537 p_remote_hca_or_sw = (void *)p_remote_hca; 3538 direction = FTREE_DIRECTION_DOWN; 3539 3540 remote_lid = 3541 cl_ntoh16(osm_physp_get_base_lid(p_remote_osm_port)); 3542 break; 3543 3544 case IB_NODE_TYPE_SWITCH: 3545 /* switch connected to another switch */ 3546 3547 p_remote_sw = 3548 fabric_get_sw_by_guid(p_ftree, remote_node_guid); 3549 CL_ASSERT(p_remote_sw); 3550 3551 p_remote_hca_or_sw = (void *)p_remote_sw; 3552 3553 if (p_sw->rank > p_remote_sw->rank) { 3554 direction = FTREE_DIRECTION_UP; 3555 } else if (p_sw->rank == p_remote_sw->rank) { 3556 direction = FTREE_DIRECTION_SAME; 3557 } else 3558 direction = FTREE_DIRECTION_DOWN; 3559 3560 /* switch LID is only in port 0 port_info structure */ 3561 remote_lid = 3562 cl_ntoh16(osm_node_get_base_lid(p_remote_node, 0)); 3563 3564 break; 3565 3566 default: 3567 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, 3568 "ERR AB13: Node GUID 0x%016" PRIx64 3569 " - Unknown node type: %s\n", 3570 cl_ntoh64(remote_node_guid), 3571 ib_get_node_type_str(remote_node_type)); 3572 res = -1; 3573 goto Exit; 3574 } 3575 sw_add_port(p_sw, /* local ftree_sw object */ 3576 i, /* local port number */ 3577 remote_port_num, /* remote 
port number */ 3578 p_sw->lid, /* local lid */ 3579 remote_lid, /* remote lid */ 3580 osm_physp_get_port_guid(p_osm_port), /* local port guid */ 3581 osm_physp_get_port_guid(p_remote_osm_port), /* remote port guid */ 3582 remote_node_guid, /* remote node guid */ 3583 remote_node_type, /* remote node type */ 3584 p_remote_hca_or_sw, /* remote ftree_hca/sw object */ 3585 direction); /* port direction (up or down) */ 3586 3587 /* Track the max lid (in host order) that exists in the fabric */ 3588 if (remote_lid > p_ftree->lft_max_lid) 3589 p_ftree->lft_max_lid = remote_lid; 3590 } 3591 3592 Exit: 3593 return res; 3594 } /* fabric_construct_sw_ports() */ 3595 3596 /*************************************************** 3597 ***************************************************/ 3598 struct rank_root_cxt { 3599 ftree_fabric_t *fabric; 3600 cl_list_t *list; 3601 }; 3602 /*************************************************** 3603 ***************************************************/ 3604 static int rank_root_sw_by_guid(void *cxt, uint64_t guid, char *p) 3605 { 3606 struct rank_root_cxt *c = cxt; 3607 ftree_sw_t *sw; 3608 3609 sw = fabric_get_sw_by_guid(c->fabric, cl_hton64(guid)); 3610 if (!sw) { 3611 /* the specified root guid wasn't found in the fabric */ 3612 OSM_LOG(&c->fabric->p_osm->log, OSM_LOG_ERROR, "ERR AB24: " 3613 "Root switch GUID 0x%" PRIx64 " not found\n", guid); 3614 return 0; 3615 } 3616 3617 OSM_LOG(&c->fabric->p_osm->log, OSM_LOG_DEBUG, 3618 "Ranking root switch with GUID 0x%" PRIx64 "\n", guid); 3619 sw->rank = 0; 3620 cl_list_insert_tail(c->list, sw); 3621 3622 return 0; 3623 } 3624 /*************************************************** 3625 ***************************************************/ 3626 static boolean_t fabric_load_roots(IN ftree_fabric_t * p_ftree, 3627 IN cl_list_t* p_ranking_bfs_list) 3628 { 3629 struct rank_root_cxt context; 3630 unsigned num_roots; 3631 3632 if (p_ranking_bfs_list) { 3633 3634 /* Rank all the roots and add them to list */ 3635 
OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 3636 "Fetching root nodes from file %s\n", 3637 p_ftree->p_osm->subn.opt.root_guid_file); 3638 3639 context.fabric = p_ftree; 3640 context.list = p_ranking_bfs_list; 3641 if (parse_node_map(p_ftree->p_osm->subn.opt.root_guid_file, 3642 rank_root_sw_by_guid, &context)) { 3643 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB2A: " 3644 "cannot parse root guids file \'%s\'\n", 3645 p_ftree->p_osm->subn.opt.root_guid_file); 3646 return FALSE; 3647 } 3648 3649 num_roots = cl_list_count(p_ranking_bfs_list); 3650 if (!num_roots) { 3651 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, "ERR AB25: " 3652 "No valid roots supplied\n"); 3653 return FALSE; 3654 } 3655 3656 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 3657 "Ranked %u valid root switches\n", num_roots); 3658 } 3659 return TRUE; 3660 } 3661 /*************************************************** 3662 ***************************************************/ 3663 static int fabric_rank_from_roots(IN ftree_fabric_t * p_ftree, 3664 IN cl_list_t* p_ranking_bfs_list) 3665 { 3666 osm_node_t *p_osm_node; 3667 osm_node_t *p_remote_osm_node; 3668 osm_physp_t *p_osm_physp; 3669 ftree_sw_t *p_sw; 3670 ftree_sw_t *p_remote_sw; 3671 int res = 0; 3672 unsigned max_rank = 0; 3673 unsigned i; 3674 3675 OSM_LOG_ENTER(&p_ftree->p_osm->log); 3676 3677 if (!p_ranking_bfs_list) { 3678 res = -1; 3679 goto Exit; 3680 } 3681 while (!cl_is_list_empty(p_ranking_bfs_list)) { 3682 p_sw = (ftree_sw_t *) cl_list_remove_head(p_ranking_bfs_list); 3683 p_osm_node = p_sw->p_osm_sw->p_node; 3684 3685 /* note: skipping port 0 on switches */ 3686 for (i = 1; i < osm_node_get_num_physp(p_osm_node); i++) { 3687 p_osm_physp = osm_node_get_physp_ptr(p_osm_node, i); 3688 if (!p_osm_physp || !osm_link_is_healthy(p_osm_physp)) 3689 continue; 3690 3691 p_remote_osm_node = 3692 osm_node_get_remote_node(p_osm_node, i, NULL); 3693 if (!p_remote_osm_node) 3694 continue; 3695 3696 if (osm_node_get_type(p_remote_osm_node) != 
3697 IB_NODE_TYPE_SWITCH) 3698 continue; 3699 3700 p_remote_sw = fabric_get_sw_by_guid(p_ftree, 3701 osm_node_get_node_guid 3702 (p_remote_osm_node)); 3703 CL_ASSERT(p_remote_sw); 3704 3705 /* if needed, rank the remote switch and add it to the BFS list */ 3706 if (sw_update_rank(p_remote_sw, p_sw->rank + 1)) { 3707 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 3708 "Ranking switch 0x%" PRIx64 3709 " with rank %u\n", 3710 sw_get_guid_ho(p_remote_sw), 3711 p_remote_sw->rank); 3712 max_rank = p_remote_sw->rank; 3713 cl_list_insert_tail(p_ranking_bfs_list, 3714 p_remote_sw); 3715 } 3716 } 3717 /* done with ports of this switch - go to the next switch in the list */ 3718 } 3719 3720 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 3721 "Subnet ranking completed. Max Node Rank = %u\n", max_rank); 3722 3723 /* set FatTree maximal switch rank */ 3724 p_ftree->max_switch_rank = max_rank; 3725 3726 Exit: 3727 OSM_LOG_EXIT(&p_ftree->p_osm->log); 3728 return res; 3729 } /* fabric_rank_from_roots() */ 3730 3731 /*************************************************** 3732 ***************************************************/ 3733 3734 static int fabric_rank_from_hcas(IN ftree_fabric_t * p_ftree) 3735 { 3736 ftree_hca_t *p_hca; 3737 ftree_hca_t *p_next_hca; 3738 cl_list_t ranking_bfs_list; 3739 int res = 0; 3740 3741 OSM_LOG_ENTER(&p_ftree->p_osm->log); 3742 3743 cl_list_init(&ranking_bfs_list, 10); 3744 3745 /* Mark REVERSED rank of all the switches in the subnet. 3746 Start from switches that are connected to hca's, and 3747 scan all the switches in the subnet. 
*/ 3748 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl); 3749 while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) { 3750 p_hca = p_next_hca; 3751 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item); 3752 if (rank_leaf_switches(p_ftree, p_hca, &ranking_bfs_list) != 0) { 3753 res = -1; 3754 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, 3755 "ERR AB14: " 3756 "Subnet ranking failed - subnet is not FatTree"); 3757 goto Exit; 3758 } 3759 } 3760 3761 /* Now rank rest of the switches in the fabric, while the 3762 list already contains all the ranked leaf switches */ 3763 rank_switches_from_leafs(p_ftree, &ranking_bfs_list); 3764 3765 /* fix ranking of the switches by reversing the ranking direction */ 3766 cl_qmap_apply_func(&p_ftree->sw_tbl, sw_reverse_rank, (void *)p_ftree); 3767 3768 Exit: 3769 cl_list_destroy(&ranking_bfs_list); 3770 OSM_LOG_EXIT(&p_ftree->p_osm->log); 3771 return res; 3772 } /* fabric_rank_from_hcas() */ 3773 3774 /*************************************************** 3775 * After ranking from HCA's we want to re-rank using 3776 * the roots 3777 ***************************************************/ 3778 static int fabric_rerank_using_root(IN ftree_fabric_t * p_ftree, 3779 IN cl_list_t* p_ranking_bfs_list) 3780 { 3781 ftree_sw_t *p_sw = NULL; 3782 ftree_sw_t *p_next_sw; 3783 int res; 3784 3785 OSM_LOG_ENTER(&p_ftree->p_osm->log); 3786 3787 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl); 3788 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) { 3789 p_sw = p_next_sw; 3790 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item); 3791 if (p_sw->rank == 0) 3792 cl_list_insert_tail(p_ranking_bfs_list, p_sw); 3793 else 3794 p_sw->rank = 0xFFFFFFFF; 3795 } 3796 res = fabric_rank_from_roots(p_ftree, p_ranking_bfs_list); 3797 OSM_LOG_EXIT(&p_ftree->p_osm->log); 3798 return res; 3799 } 3800 /*************************************************** 3801 ***************************************************/ 
3802 static int fabric_rank(IN ftree_fabric_t * p_ftree) 3803 { 3804 int res = -1; 3805 cl_list_t ranking_bfs_list; 3806 3807 OSM_LOG_ENTER(&p_ftree->p_osm->log); 3808 cl_list_init(&ranking_bfs_list, 10); 3809 3810 if (fabric_roots_provided(p_ftree) && 3811 fabric_load_roots(p_ftree, &ranking_bfs_list)) 3812 res = fabric_rank_from_roots(p_ftree, &ranking_bfs_list); 3813 else { 3814 res = fabric_rank_from_hcas(p_ftree); 3815 if (!res) 3816 res = fabric_rerank_using_root(p_ftree, &ranking_bfs_list); 3817 } 3818 3819 if (res) 3820 goto Exit; 3821 3822 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 3823 "FatTree max switch rank is %u\n", p_ftree->max_switch_rank); 3824 3825 Exit: 3826 cl_list_destroy(&ranking_bfs_list); 3827 OSM_LOG_EXIT(&p_ftree->p_osm->log); 3828 return res; 3829 } /* fabric_rank() */ 3830 3831 /*************************************************** 3832 ***************************************************/ 3833 3834 static void fabric_set_leaf_rank(IN ftree_fabric_t * p_ftree) 3835 { 3836 unsigned i; 3837 ftree_sw_t *p_sw; 3838 ftree_hca_t *p_hca = NULL; 3839 ftree_hca_t *p_next_hca; 3840 3841 OSM_LOG_ENTER(&p_ftree->p_osm->log); 3842 3843 if (!fabric_roots_provided(p_ftree)) { 3844 /* If root file is not provided, the fabric has to be pure fat-tree 3845 in terms of ranking. Thus, leaf switches rank is the max rank. */ 3846 p_ftree->leaf_switch_rank = p_ftree->max_switch_rank; 3847 } else { 3848 /* Find the first CN and set the leaf_switch_rank to the rank 3849 of the switch that is connected to this CN. Later we will 3850 ensure that all the leaf switches have the same rank. */ 3851 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl); 3852 while (p_next_hca != 3853 (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) { 3854 p_hca = p_next_hca; 3855 if (p_hca->cn_num) 3856 break; 3857 p_next_hca = 3858 (ftree_hca_t *) cl_qmap_next(&p_hca->map_item); 3859 } 3860 /* we know that there are CNs in the fabric, so just to be sure... 
*/ 3861 CL_ASSERT(p_next_hca != 3862 (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)); 3863 3864 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 3865 "Selected CN port GUID 0x%" PRIx64 "\n", 3866 hca_get_guid_ho(p_hca)); 3867 3868 for (i = 0; (i < p_hca->up_port_groups_num) 3869 && (!p_hca->up_port_groups[i]->is_cn); i++) 3870 ; 3871 CL_ASSERT(i < p_hca->up_port_groups_num); 3872 CL_ASSERT(p_hca->up_port_groups[i]->remote_node_type == 3873 IB_NODE_TYPE_SWITCH); 3874 3875 p_sw = p_hca->up_port_groups[i]->remote_hca_or_sw.p_sw; 3876 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 3877 "Selected leaf switch GUID 0x%" PRIx64 ", rank %u\n", 3878 sw_get_guid_ho(p_sw), p_sw->rank); 3879 p_ftree->leaf_switch_rank = p_sw->rank; 3880 } 3881 3882 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 3883 "FatTree leaf switch rank is %u\n", p_ftree->leaf_switch_rank); 3884 OSM_LOG_EXIT(&p_ftree->p_osm->log); 3885 } /* fabric_set_leaf_rank() */ 3886 3887 /*************************************************** 3888 ***************************************************/ 3889 3890 static int fabric_populate_ports(IN ftree_fabric_t * p_ftree) 3891 { 3892 ftree_hca_t *p_hca; 3893 ftree_hca_t *p_next_hca; 3894 ftree_sw_t *p_sw; 3895 ftree_sw_t *p_next_sw; 3896 int res = 0; 3897 3898 OSM_LOG_ENTER(&p_ftree->p_osm->log); 3899 3900 p_next_hca = (ftree_hca_t *) cl_qmap_head(&p_ftree->hca_tbl); 3901 while (p_next_hca != (ftree_hca_t *) cl_qmap_end(&p_ftree->hca_tbl)) { 3902 p_hca = p_next_hca; 3903 p_next_hca = (ftree_hca_t *) cl_qmap_next(&p_hca->map_item); 3904 if (fabric_construct_hca_ports(p_ftree, p_hca) != 0) { 3905 res = -1; 3906 goto Exit; 3907 } 3908 } 3909 3910 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl); 3911 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) { 3912 p_sw = p_next_sw; 3913 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item); 3914 if (fabric_construct_sw_ports(p_ftree, p_sw) != 0) { 3915 res = -1; 3916 goto Exit; 3917 } 3918 } 3919 Exit: 3920 
OSM_LOG_EXIT(&p_ftree->p_osm->log); 3921 return res; 3922 } /* fabric_populate_ports() */ 3923 3924 /*************************************************** 3925 ***************************************************/ 3926 static int add_guid_item_to_map(void *cxt, uint64_t guid, char *p) 3927 { 3928 cl_qmap_t *map = cxt; 3929 name_map_item_t *item; 3930 name_map_item_t *inserted_item; 3931 3932 item = malloc(sizeof(*item)); 3933 if (!item) 3934 return -1; 3935 3936 item->guid = guid; 3937 inserted_item = (name_map_item_t *) cl_qmap_insert(map, guid, &item->item); 3938 if (inserted_item != item) 3939 free(item); 3940 3941 return 0; 3942 } 3943 3944 static int fabric_read_guid_files(IN ftree_fabric_t * p_ftree) 3945 { 3946 int status = 0; 3947 3948 OSM_LOG_ENTER(&p_ftree->p_osm->log); 3949 3950 if (fabric_cns_provided(p_ftree)) { 3951 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 3952 "Fetching compute nodes from file %s\n", 3953 p_ftree->p_osm->subn.opt.cn_guid_file); 3954 3955 if (parse_node_map(p_ftree->p_osm->subn.opt.cn_guid_file, 3956 add_guid_item_to_map, 3957 &p_ftree->cn_guid_tbl)) { 3958 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, 3959 "ERR AB23: " "Problem parsing CN guid file\n"); 3960 status = -1; 3961 goto Exit; 3962 } 3963 3964 if (!cl_qmap_count(&p_ftree->cn_guid_tbl)) { 3965 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, 3966 "ERR AB27: " 3967 "Compute node guids file has no valid guids\n"); 3968 status = -1; 3969 goto Exit; 3970 } 3971 } 3972 3973 if (fabric_ios_provided(p_ftree)) { 3974 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 3975 "Fetching I/O nodes from file %s\n", 3976 p_ftree->p_osm->subn.opt.io_guid_file); 3977 3978 if (parse_node_map(p_ftree->p_osm->subn.opt.io_guid_file, 3979 add_guid_item_to_map, 3980 &p_ftree->io_guid_tbl)) { 3981 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, 3982 "ERR AB28: Problem parsing I/O guid file\n"); 3983 status = -1; 3984 goto Exit; 3985 } 3986 3987 if (!cl_qmap_count(&p_ftree->io_guid_tbl)) { 3988 
OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, 3989 "ERR AB29: " 3990 "I/O node guids file has no valid guids\n"); 3991 status = -1; 3992 goto Exit; 3993 } 3994 } 3995 Exit: 3996 OSM_LOG_EXIT(&p_ftree->p_osm->log); 3997 return status; 3998 } /*fabric_read_guid_files() */ 3999 4000 /*************************************************** 4001 ***************************************************/ 4002 /* Get a Sw and remove all depended HCA's, meaning all 4003 * HCA's which this is the only switch they are connected 4004 * to */ 4005 static int remove_depended_hca(IN ftree_fabric_t *p_ftree, IN ftree_sw_t *p_sw) 4006 { 4007 ftree_hca_t *p_hca; 4008 int counter = 0; 4009 int port_num; 4010 uint8_t remote_port_num; 4011 osm_physp_t* physp; 4012 osm_node_t* sw_node; 4013 uint64_t remote_hca_guid; 4014 4015 sw_node = p_sw->p_osm_sw->p_node; 4016 for (port_num = 0; port_num < sw_node->physp_tbl_size; port_num++) { 4017 physp = osm_node_get_physp_ptr(sw_node, port_num); 4018 if (physp && physp->p_remote_physp) { 4019 if (osm_node_get_type(physp->p_remote_physp->p_node) == IB_NODE_TYPE_CA) { 4020 remote_hca_guid = 4021 osm_node_get_node_guid(physp->p_remote_physp->p_node); 4022 p_hca = fabric_get_hca_by_guid(p_ftree, remote_hca_guid); 4023 if (!p_hca) 4024 continue; 4025 4026 remote_port_num = 4027 osm_physp_get_port_num(physp->p_remote_physp); 4028 p_hca->disconnected_ports[remote_port_num] = 1; 4029 } 4030 } 4031 } 4032 return counter; 4033 } 4034 /*************************************************** 4035 ***************************************************/ 4036 static void fabric_remove_unranked_sw(IN ftree_fabric_t *p_ftree) 4037 { 4038 ftree_sw_t *p_sw = NULL; 4039 ftree_sw_t *p_next_sw; 4040 int removed_hca; 4041 int count = 0; 4042 4043 p_next_sw = (ftree_sw_t *) cl_qmap_head(&p_ftree->sw_tbl); 4044 while (p_next_sw != (ftree_sw_t *) cl_qmap_end(&p_ftree->sw_tbl)) { 4045 p_sw = p_next_sw; 4046 p_next_sw = (ftree_sw_t *) cl_qmap_next(&p_sw->map_item); 4047 if 
(!sw_ranked(p_sw)) { 4048 cl_qmap_remove_item(&p_ftree->sw_tbl,&p_sw->map_item); 4049 removed_hca = remove_depended_hca(p_ftree, p_sw); 4050 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 4051 "Removing Unranked sw 0x%" PRIx64 " (with %d dependent hca's)\n", 4052 sw_get_guid_ho(p_sw),removed_hca); 4053 sw_destroy(p_sw); 4054 count++; 4055 } 4056 } 4057 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG, 4058 "Removed %d invalid switches\n", count); 4059 } 4060 /*************************************************** 4061 ***************************************************/ 4062 static int construct_fabric(IN void *context) 4063 { 4064 ftree_fabric_t *p_ftree = context; 4065 int status = 0; 4066 4067 OSM_LOG_ENTER(&p_ftree->p_osm->log); 4068 4069 fabric_clear(p_ftree); 4070 4071 if (p_ftree->p_osm->subn.opt.lmc > 0) { 4072 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID, 4073 "LMC > 0 is not supported by fat-tree routing.\n" 4074 "Falling back to default routing\n"); 4075 status = -1; 4076 goto Exit; 4077 } 4078 4079 if (cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl) < 2) { 4080 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID, 4081 "Fabric has %u switches - topology is not fat-tree.\n" 4082 "Falling back to default routing\n", 4083 cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl)); 4084 status = -1; 4085 goto Exit; 4086 } 4087 4088 if ((cl_qmap_count(&p_ftree->p_osm->subn.node_guid_tbl) - 4089 cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl)) < 2) { 4090 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID, 4091 "Fabric has %u nodes (%u switches) - topology is not fat-tree.\n" 4092 "Falling back to default routing\n", 4093 cl_qmap_count(&p_ftree->p_osm->subn.node_guid_tbl), 4094 cl_qmap_count(&p_ftree->p_osm->subn.sw_guid_tbl)); 4095 status = -1; 4096 goto Exit; 4097 } 4098 4099 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "\n" 4100 " |----------------------------------------|\n" 4101 " |- Starting FatTree fabric construction -|\n" 4102 " 
|----------------------------------------|\n\n"); 4103 4104 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 4105 "Populating FatTree Switch and CA tables\n"); 4106 if (fabric_populate_nodes(p_ftree) != 0) { 4107 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID, 4108 "Fabric topology is not fat-tree - " 4109 "falling back to default routing\n"); 4110 status = -1; 4111 goto Exit; 4112 } 4113 4114 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 4115 "Reading guid files provided by user\n"); 4116 if (fabric_read_guid_files(p_ftree) != 0) { 4117 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID, 4118 "Failed reading guid files - " 4119 "falling back to default routing\n"); 4120 status = -1; 4121 goto Exit; 4122 } 4123 4124 if (cl_qmap_count(&p_ftree->hca_tbl) < 2) { 4125 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID, 4126 "Fabric has %u CAs - topology is not fat-tree.\n" 4127 "Falling back to default routing\n", 4128 cl_qmap_count(&p_ftree->hca_tbl)); 4129 status = -1; 4130 goto Exit; 4131 } 4132 4133 /* Rank all the switches in the fabric. 4134 After that we will know only fabric max switch rank. 4135 We will be able to check leaf switches rank and the 4136 whole tree rank after filling ports and marking CNs. */ 4137 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "Ranking FatTree\n"); 4138 if (fabric_rank(p_ftree) != 0) { 4139 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID, 4140 "Failed ranking the tree\n"); 4141 status = -1; 4142 goto Exit; 4143 } 4144 fabric_remove_unranked_sw(p_ftree); 4145 4146 if (p_ftree->max_switch_rank == 0 && 4147 cl_qmap_count(&p_ftree->sw_tbl) > 1) { 4148 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_ERROR, 4149 "ERR AB2B: Found more than one root on fabric with " 4150 "maximum rank 0\n"); 4151 status = -1; 4152 goto Exit; 4153 } 4154 4155 /* For each hca and switch, construct array of ports. 
4156 This is done after the whole FatTree data structure is ready, 4157 because we want the ports to have pointers to ftree_{sw,hca}_t 4158 objects, and we need the switches to be already ranked because 4159 that's how the port direction is determined. */ 4160 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 4161 "Populating CA & switch ports\n"); 4162 if (fabric_populate_ports(p_ftree) != 0) { 4163 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID, 4164 "Fabric topology is not a fat-tree\n"); 4165 status = -1; 4166 goto Exit; 4167 } else if (p_ftree->cn_num == 0) { 4168 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID, 4169 "Fabric has no valid compute nodes\n"); 4170 status = -1; 4171 goto Exit; 4172 } 4173 4174 /* Now that the CA ports have been created and CNs were marked, 4175 we can complete the fabric ranking - set leaf switches rank. */ 4176 fabric_set_leaf_rank(p_ftree); 4177 4178 if (fabric_get_rank(p_ftree) > FAT_TREE_MAX_RANK || 4179 fabric_get_rank(p_ftree) < FAT_TREE_MIN_RANK) { 4180 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID, 4181 "Fabric rank is %u (should be between %u and %u)\n", 4182 fabric_get_rank(p_ftree), FAT_TREE_MIN_RANK, 4183 FAT_TREE_MAX_RANK); 4184 status = -1; 4185 goto Exit; 4186 } 4187 4188 /* Mark all the switches in the fabric with rank equal to 4189 p_ftree->leaf_switch_rank and that are also connected to CNs. 4190 As a by-product, this function also runs basic topology 4191 validation - it checks that all the CNs are at the same rank. */ 4192 if (fabric_mark_leaf_switches(p_ftree)) { 4193 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID, 4194 "Fabric topology is not a fat-tree\n"); 4195 status = -1; 4196 goto Exit; 4197 } 4198 4199 /* Assign index to all the switches in the fabric. 
4200 This function also sorts leaf switch array by the switch index, 4201 sorts all the port arrays of the indexed switches by remote 4202 switch index, and creates switch-by-tuple table (sw_by_tuple_tbl) */ 4203 fabric_make_indexing(p_ftree); 4204 4205 /* Create leaf switch array sorted by index. 4206 This array contains switches with rank equal to p_ftree->leaf_switch_rank 4207 and that are also connected to CNs (REAL leafs), and it may contain 4208 switches at the same leaf rank w/o CNs, if this is the order of indexing. 4209 In any case, the first and the last switches in the array are REAL leafs. */ 4210 if (fabric_create_leaf_switch_array(p_ftree)) { 4211 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID, 4212 "Fabric topology is not a fat-tree\n"); 4213 status = -1; 4214 goto Exit; 4215 } 4216 4217 /* calculate and set ftree.max_cn_per_leaf field */ 4218 fabric_set_max_cn_per_leaf(p_ftree); 4219 4220 /* print general info about fabric topology */ 4221 fabric_dump_general_info(p_ftree); 4222 4223 /* dump full tree topology */ 4224 if (OSM_LOG_IS_ACTIVE_V2(&p_ftree->p_osm->log, OSM_LOG_DEBUG)) 4225 fabric_dump(p_ftree); 4226 4227 /* the fabric is required to be PURE fat-tree only if the root 4228 guid file hasn't been provided by user */ 4229 if (!fabric_roots_provided(p_ftree) && 4230 !fabric_validate_topology(p_ftree)) { 4231 osm_log_v2(&p_ftree->p_osm->log, OSM_LOG_INFO, FILE_ID, 4232 "Fabric topology is not a fat-tree\n"); 4233 status = -1; 4234 goto Exit; 4235 } 4236 4237 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 4238 "Max LID in switch LFTs: %u\n", p_ftree->lft_max_lid); 4239 4240 /* Build the full lid matrices needed for multicast routing */ 4241 osm_ucast_mgr_build_lid_matrices(&p_ftree->p_osm->sm.ucast_mgr); 4242 4243 Exit: 4244 if (status != 0) { 4245 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 4246 "Clearing FatTree Fabric data structures\n"); 4247 fabric_clear(p_ftree); 4248 } else 4249 p_ftree->fabric_built = TRUE; 4250 4251 
OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, "\n" 4252 " |--------------------------------------------------|\n" 4253 " |- Done constructing FatTree fabric (status = %d) -|\n" 4254 " |--------------------------------------------------|\n\n", 4255 status); 4256 4257 OSM_LOG_EXIT(&p_ftree->p_osm->log); 4258 return status; 4259 } /* construct_fabric() */ 4260 4261 /*************************************************** 4262 ***************************************************/ 4263 4264 static int do_routing(IN void *context) 4265 { 4266 ftree_fabric_t *p_ftree = context; 4267 int status = 0; 4268 4269 OSM_LOG_ENTER(&p_ftree->p_osm->log); 4270 4271 if (!p_ftree->fabric_built) { 4272 status = -1; 4273 goto Exit; 4274 } 4275 4276 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 4277 "Starting FatTree routing\n"); 4278 4279 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 4280 "Filling switch forwarding tables for Compute Nodes\n"); 4281 fabric_route_to_cns(p_ftree); 4282 4283 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 4284 "Filling switch forwarding tables for non-CN targets\n"); 4285 fabric_route_to_non_cns(p_ftree); 4286 4287 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 4288 "Filling switch forwarding tables for switch-to-switch paths\n"); 4289 fabric_route_to_switches(p_ftree); 4290 4291 if (p_ftree->p_osm->subn.opt.connect_roots) { 4292 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 4293 "Connecting switches that are unreachable within " 4294 "Up/Down rules\n"); 4295 fabric_route_roots(p_ftree); 4296 } 4297 4298 /* for each switch, set its fwd table */ 4299 cl_qmap_apply_func(&p_ftree->sw_tbl, set_sw_fwd_table, (void *)p_ftree); 4300 4301 /* write out hca ordering file */ 4302 fabric_dump_hca_ordering(p_ftree); 4303 4304 OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_VERBOSE, 4305 "FatTree routing is done\n"); 4306 4307 Exit: 4308 OSM_LOG_EXIT(&p_ftree->p_osm->log); 4309 return status; 4310 } 4311 4312 /*************************************************** 4313 
***************************************************/ 4314 4315 static void delete(IN void *context) 4316 { 4317 if (!context) 4318 return; 4319 fabric_destroy((ftree_fabric_t *) context); 4320 } 4321 4322 /*************************************************** 4323 ***************************************************/ 4324 4325 int osm_ucast_ftree_setup(struct osm_routing_engine *r, osm_opensm_t * p_osm) 4326 { 4327 ftree_fabric_t *p_ftree = fabric_create(); 4328 if (!p_ftree) 4329 return -1; 4330 4331 p_ftree->p_osm = p_osm; 4332 p_ftree->p_subn = p_osm->sm.ucast_mgr.p_subn; 4333 4334 r->context = (void *)p_ftree; 4335 r->build_lid_matrices = construct_fabric; 4336 r->ucast_build_fwd_tables = do_routing; 4337 r->destroy = delete; 4338 4339 return 0; 4340 } 4341