1 /* 2 * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. 3 * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved. 4 * Copyright (c) 2002-2015 Mellanox Technologies LTD. All rights reserved. 5 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. 6 * Copyright (c) 2009 HNR Consulting. All rights reserved. 7 * Copyright (c) 2013 Oracle and/or its affiliates. All rights reserved. 8 * 9 * This software is available to you under a choice of one of two 10 * licenses. You may choose to be licensed under the terms of the GNU 11 * General Public License (GPL) Version 2, available from the file 12 * COPYING in the main directory of this source tree, or the 13 * OpenIB.org BSD license below: 14 * 15 * Redistribution and use in source and binary forms, with or 16 * without modification, are permitted provided that the following 17 * conditions are met: 18 * 19 * - Redistributions of source code must retain the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer. 22 * 23 * - Redistributions in binary form must reproduce the above 24 * copyright notice, this list of conditions and the following 25 * disclaimer in the documentation and/or other materials 26 * provided with the distribution. 27 * 28 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 29 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 30 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 31 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 32 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 33 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 34 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 35 * SOFTWARE. 36 * 37 */ 38 39 /* 40 * Abstract: 41 * Implementation of osm_state_mgr_t. 42 * This file implements the State Manager object. 
43 */ 44 45 #if HAVE_CONFIG_H 46 # include <config.h> 47 #endif /* HAVE_CONFIG_H */ 48 49 #include <unistd.h> 50 #include <stdlib.h> 51 #include <string.h> 52 #include <iba/ib_types.h> 53 #include <complib/cl_passivelock.h> 54 #include <complib/cl_debug.h> 55 #include <complib/cl_qmap.h> 56 #include <opensm/osm_file_ids.h> 57 #define FILE_ID OSM_FILE_STATE_MGR_C 58 #include <opensm/osm_sm.h> 59 #include <opensm/osm_madw.h> 60 #include <opensm/osm_switch.h> 61 #include <opensm/osm_log.h> 62 #include <opensm/osm_subnet.h> 63 #include <opensm/osm_helper.h> 64 #include <opensm/osm_msgdef.h> 65 #include <opensm/osm_node.h> 66 #include <opensm/osm_port.h> 67 #include <vendor/osm_vendor_api.h> 68 #include <opensm/osm_inform.h> 69 #include <opensm/osm_opensm.h> 70 #include <opensm/osm_congestion_control.h> 71 #include <opensm/osm_db.h> 72 #include <opensm/osm_service.h> 73 #include <opensm/osm_guid.h> 74 75 extern void osm_drop_mgr_process(IN osm_sm_t * sm); 76 extern int osm_qos_setup(IN osm_opensm_t * p_osm); 77 extern int osm_pkey_mgr_process(IN osm_opensm_t * p_osm); 78 extern int osm_mcast_mgr_process(IN osm_sm_t * sm, boolean_t config_all); 79 extern int osm_link_mgr_process(IN osm_sm_t * sm, IN uint8_t state); 80 extern void osm_guid_mgr_process(IN osm_sm_t * sm); 81 82 static void state_mgr_up_msg(IN const osm_sm_t * sm) 83 { 84 /* 85 * This message should be written only once - when the 86 * SM moves to Master state and the subnet is up for 87 * the first time. 88 */ 89 osm_log_v2(sm->p_log, sm->p_subn->first_time_master_sweep ? 90 OSM_LOG_SYS : OSM_LOG_INFO, FILE_ID, "SUBNET UP\n"); 91 92 OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE, 93 sm->p_subn->opt.sweep_interval ? 
94 "SUBNET UP" : "SUBNET UP (sweep disabled)"); 95 } 96 97 static void state_mgr_reset_node_count(IN cl_map_item_t * p_map_item, 98 IN void *context) 99 { 100 osm_node_t *p_node = (osm_node_t *) p_map_item; 101 102 p_node->discovery_count = 0; 103 104 memset(p_node->physp_discovered, 0, sizeof(uint8_t) * p_node->physp_tbl_size); 105 } 106 107 static void state_mgr_reset_port_count(IN cl_map_item_t * p_map_item, 108 IN void *context) 109 { 110 osm_port_t *p_port = (osm_port_t *) p_map_item; 111 112 p_port->discovery_count = 0; 113 } 114 115 static void state_mgr_reset_switch_count(IN cl_map_item_t * p_map_item, 116 IN void *context) 117 { 118 osm_switch_t *p_sw = (osm_switch_t *) p_map_item; 119 120 if (p_sw->max_lid_ho != 0) 121 p_sw->need_update = 1; 122 } 123 124 static void state_mgr_get_sw_info(IN cl_map_item_t * p_object, IN void *context) 125 { 126 osm_node_t *p_node; 127 osm_physp_t *p_physp; 128 osm_dr_path_t *p_dr_path; 129 osm_madw_context_t mad_context; 130 osm_switch_t *const p_sw = (osm_switch_t *) p_object; 131 osm_sm_t *sm = context; 132 ib_api_status_t status; 133 134 OSM_LOG_ENTER(sm->p_log); 135 136 p_node = p_sw->p_node; 137 p_physp = osm_node_get_physp_ptr(p_node, 0); 138 p_dr_path = osm_physp_get_dr_path_ptr(p_physp); 139 140 memset(&mad_context, 0, sizeof(mad_context)); 141 142 mad_context.si_context.node_guid = osm_node_get_node_guid(p_node); 143 mad_context.si_context.set_method = FALSE; 144 mad_context.si_context.light_sweep = TRUE; 145 mad_context.si_context.lft_top_change = FALSE; 146 147 status = osm_req_get(sm, p_dr_path, IB_MAD_ATTR_SWITCH_INFO, 0, 148 FALSE, ib_port_info_get_m_key(&p_physp->port_info), 149 OSM_MSG_LIGHT_SWEEP_FAIL, &mad_context); 150 if (status != IB_SUCCESS) 151 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3304: " 152 "Request for SwitchInfo from 0x%" PRIx64 " failed (%s)\n", 153 cl_ntoh64(osm_node_get_node_guid(p_node)), 154 ib_get_err_str(status)); 155 156 OSM_LOG_EXIT(sm->p_log); 157 } 158 159 
/********************************************************************** 160 Initiate a remote port info request for the given physical port 161 **********************************************************************/ 162 static void state_mgr_get_remote_port_info(IN osm_sm_t * sm, 163 IN osm_physp_t * p_physp) 164 { 165 osm_dr_path_t *p_dr_path; 166 osm_dr_path_t rem_node_dr_path; 167 osm_madw_context_t mad_context; 168 ib_api_status_t status; 169 170 OSM_LOG_ENTER(sm->p_log); 171 172 /* generate a dr path leaving on the physp to the remote node */ 173 p_dr_path = osm_physp_get_dr_path_ptr(p_physp); 174 memcpy(&rem_node_dr_path, p_dr_path, sizeof(osm_dr_path_t)); 175 if (osm_dr_path_extend(&rem_node_dr_path, osm_physp_get_port_num(p_physp))) { 176 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 332D: " 177 "DR path with hop count %d couldn't be extended " 178 "so skipping PortInfo query\n", 179 p_dr_path->hop_count); 180 goto Exit; 181 } 182 183 memset(&mad_context, 0, sizeof(mad_context)); 184 185 mad_context.pi_context.node_guid = 186 osm_node_get_node_guid(osm_physp_get_node_ptr(p_physp)); 187 mad_context.pi_context.port_guid = p_physp->port_guid; 188 mad_context.pi_context.set_method = FALSE; 189 mad_context.pi_context.light_sweep = TRUE; 190 mad_context.pi_context.active_transition = FALSE; 191 mad_context.pi_context.client_rereg = FALSE; 192 193 /* note that with some negative logic - if the query failed it means 194 * that there is no point in going to heavy sweep */ 195 status = osm_req_get(sm, &rem_node_dr_path, IB_MAD_ATTR_PORT_INFO, 0, 196 TRUE, 0, CL_DISP_MSGID_NONE, &mad_context); 197 if (status != IB_SUCCESS) 198 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 332E: " 199 "Request for remote PortInfo from 0x%" PRIx64 " failed (%s)\n", 200 cl_ntoh64(p_physp->port_guid), ib_get_err_str(status)); 201 202 Exit: 203 OSM_LOG_EXIT(sm->p_log); 204 } 205 206 /********************************************************************** 207 Initiates a thorough sweep of the subnet. 
208 Used when there is suspicion that something on the subnet has changed. 209 **********************************************************************/ 210 static ib_api_status_t state_mgr_sweep_hop_0(IN osm_sm_t * sm) 211 { 212 ib_api_status_t status; 213 osm_dr_path_t dr_path; 214 osm_bind_handle_t h_bind; 215 uint8_t path_array[IB_SUBNET_PATH_HOPS_MAX]; 216 217 OSM_LOG_ENTER(sm->p_log); 218 219 memset(path_array, 0, sizeof(path_array)); 220 221 /* 222 * First, get the bind handle. 223 */ 224 h_bind = osm_sm_mad_ctrl_get_bind_handle(&sm->mad_ctrl); 225 if (h_bind != OSM_BIND_INVALID_HANDLE) { 226 OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE, 227 "INITIATING HEAVY SWEEP"); 228 /* 229 * Start the sweep by clearing the port counts, then 230 * get our own NodeInfo at 0 hops. 231 */ 232 CL_PLOCK_ACQUIRE(sm->p_lock); 233 234 cl_qmap_apply_func(&sm->p_subn->node_guid_tbl, 235 state_mgr_reset_node_count, sm); 236 237 cl_qmap_apply_func(&sm->p_subn->port_guid_tbl, 238 state_mgr_reset_port_count, sm); 239 240 cl_qmap_apply_func(&sm->p_subn->sw_guid_tbl, 241 state_mgr_reset_switch_count, sm); 242 243 /* Set the in_sweep_hop_0 flag in subn to be TRUE. 244 * This will indicate the sweeping not to continue beyond the 245 * the current node. 246 * This is relevant for the case of SM on switch, since in the 247 * switch info we need to signal somehow not to continue 248 * the sweeping. */ 249 sm->p_subn->in_sweep_hop_0 = TRUE; 250 251 CL_PLOCK_RELEASE(sm->p_lock); 252 253 osm_dr_path_init(&dr_path, 0, path_array); 254 CL_PLOCK_ACQUIRE(sm->p_lock); 255 status = osm_req_get(sm, &dr_path, IB_MAD_ATTR_NODE_INFO, 0, 256 TRUE, 0, CL_DISP_MSGID_NONE, NULL); 257 CL_PLOCK_RELEASE(sm->p_lock); 258 if (status != IB_SUCCESS) 259 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3305: " 260 "Request for NodeInfo failed (%s)\n", 261 ib_get_err_str(status)); 262 } else { 263 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 264 "No bound ports. 
Deferring sweep...\n"); 265 status = IB_INVALID_STATE; 266 } 267 268 OSM_LOG_EXIT(sm->p_log); 269 return status; 270 } 271 272 /********************************************************************** 273 Clear out all existing port lid assignments 274 **********************************************************************/ 275 static ib_api_status_t state_mgr_clean_known_lids(IN osm_sm_t * sm) 276 { 277 ib_api_status_t status = IB_SUCCESS; 278 cl_ptr_vector_t *p_vec = &(sm->p_subn->port_lid_tbl); 279 uint32_t i; 280 281 OSM_LOG_ENTER(sm->p_log); 282 283 /* we need a lock here! */ 284 CL_PLOCK_ACQUIRE(sm->p_lock); 285 286 for (i = 0; i < cl_ptr_vector_get_size(p_vec); i++) 287 cl_ptr_vector_set(p_vec, i, NULL); 288 289 CL_PLOCK_RELEASE(sm->p_lock); 290 291 OSM_LOG_EXIT(sm->p_log); 292 return status; 293 } 294 295 /********************************************************************** 296 Clear SA cache 297 **********************************************************************/ 298 static ib_api_status_t state_mgr_sa_clean(IN osm_sm_t * sm) 299 { 300 ib_api_status_t status = IB_SUCCESS; 301 cl_qmap_t *p_port_guid_tbl; 302 osm_assigned_guids_t *p_assigned_guids, *p_next_assigned_guids; 303 osm_alias_guid_t *p_alias_guid, *p_next_alias_guid; 304 osm_mcm_port_t *mcm_port; 305 osm_subn_t * p_subn; 306 osm_port_t *p_port; 307 osm_infr_t *p_infr; 308 osm_svcr_t *p_svcr; 309 310 OSM_LOG_ENTER(sm->p_log); 311 312 p_subn = sm->p_subn; 313 314 /* we need a lock here! 
*/ 315 CL_PLOCK_EXCL_ACQUIRE(sm->p_lock); 316 317 if (p_subn->opt.drop_event_subscriptions) { 318 /* Clean InformInfo records */ 319 p_infr = (osm_infr_t *) cl_qlist_remove_head(&p_subn->sa_infr_list); 320 while (p_infr != 321 (osm_infr_t *) cl_qlist_end(&p_subn->sa_infr_list)) { 322 osm_infr_delete(p_infr); 323 p_infr = (osm_infr_t *) cl_qlist_remove_head(&p_subn->sa_infr_list); 324 } 325 326 /* For now, treat Service Records in same category as InformInfos */ 327 /* Clean Service records */ 328 p_svcr = (osm_svcr_t *) cl_qlist_remove_head(&p_subn->sa_sr_list); 329 while (p_svcr != 330 (osm_svcr_t *) cl_qlist_end(&p_subn->sa_sr_list)) { 331 osm_svcr_delete(p_svcr); 332 p_svcr = (osm_svcr_t *) cl_qlist_remove_head(&p_subn->sa_sr_list); 333 } 334 } 335 336 /* Clean Multicast member list on each port */ 337 p_port_guid_tbl = &p_subn->port_guid_tbl; 338 for (p_port = (osm_port_t *) cl_qmap_head(p_port_guid_tbl); 339 p_port != (osm_port_t *) cl_qmap_end(p_port_guid_tbl); 340 p_port = (osm_port_t *) cl_qmap_next(&p_port->map_item)) { 341 while (!cl_is_qlist_empty(&p_port->mcm_list)) { 342 mcm_port = cl_item_obj(cl_qlist_head(&p_port->mcm_list), 343 mcm_port, list_item); 344 osm_mgrp_delete_port(p_subn, sm->p_log, mcm_port->mgrp, 345 p_port); 346 } 347 /* Hack - clean alias guid table from physp */ 348 free(p_port->p_physp->p_guids); 349 p_port->p_physp->p_guids = NULL; 350 } 351 352 /* Clean Alias Guid work objects */ 353 while (cl_qlist_count(&p_subn->alias_guid_list)) 354 osm_guid_work_obj_delete((osm_guidinfo_work_obj_t *) 355 cl_qlist_remove_head(&p_subn->alias_guid_list)); 356 357 /* Clean Assigned GUIDs table */ 358 p_next_assigned_guids = (osm_assigned_guids_t *) 359 cl_qmap_head(&p_subn->assigned_guids_tbl); 360 while (p_next_assigned_guids != 361 (osm_assigned_guids_t *) cl_qmap_end(&p_subn->assigned_guids_tbl)) { 362 p_assigned_guids = p_next_assigned_guids; 363 p_next_assigned_guids = (osm_assigned_guids_t *) 364 cl_qmap_next(&p_assigned_guids->map_item); 365 
cl_qmap_remove_item(&p_subn->assigned_guids_tbl, 366 &p_assigned_guids->map_item); 367 osm_assigned_guids_delete(&p_assigned_guids); 368 } 369 370 /* Clean Alias GUIDs table */ 371 p_next_alias_guid = (osm_alias_guid_t *) 372 cl_qmap_head(&p_subn->alias_port_guid_tbl); 373 while (p_next_alias_guid != 374 (osm_alias_guid_t *) cl_qmap_end(&p_subn->alias_port_guid_tbl)) { 375 p_alias_guid = p_next_alias_guid; 376 p_next_alias_guid = (osm_alias_guid_t *) 377 cl_qmap_next(&p_alias_guid->map_item); 378 if (osm_alias_guid_get_alias_guid(p_alias_guid) != 379 osm_alias_guid_get_base_guid(p_alias_guid)) { 380 /* Clean if it's not base port GUID */ 381 cl_qmap_remove_item(&p_subn->alias_port_guid_tbl, 382 &p_alias_guid->map_item); 383 osm_alias_guid_delete(&p_alias_guid); 384 } 385 } 386 387 p_subn->p_osm->sa.dirty = TRUE; 388 389 CL_PLOCK_RELEASE(sm->p_lock); 390 OSM_LOG_EXIT(sm->p_log); 391 return status; 392 } 393 394 /********************************************************************** 395 Notifies the transport layer that the local LID has changed, 396 which give it a chance to update address vectors, etc.. 397 **********************************************************************/ 398 static ib_api_status_t state_mgr_notify_lid_change(IN osm_sm_t * sm) 399 { 400 ib_api_status_t status; 401 osm_bind_handle_t h_bind; 402 403 OSM_LOG_ENTER(sm->p_log); 404 405 /* 406 * First, get the bind handle. 407 */ 408 h_bind = osm_sm_mad_ctrl_get_bind_handle(&sm->mad_ctrl); 409 if (h_bind == OSM_BIND_INVALID_HANDLE) { 410 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3306: " 411 "No bound ports\n"); 412 status = IB_ERROR; 413 goto Exit; 414 } 415 416 /* 417 * Notify the transport layer that we changed the local LID. 
418 */ 419 status = osm_vendor_local_lid_change(h_bind); 420 if (status != IB_SUCCESS) 421 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3307: " 422 "Vendor LID update failed (%s)\n", 423 ib_get_err_str(status)); 424 425 Exit: 426 OSM_LOG_EXIT(sm->p_log); 427 return status; 428 } 429 430 /********************************************************************** 431 Returns true if the SM port is down. 432 The SM's port object must exist in the port_guid table. 433 **********************************************************************/ 434 static boolean_t state_mgr_is_sm_port_down(IN osm_sm_t * sm) 435 { 436 ib_net64_t port_guid; 437 osm_port_t *p_port; 438 osm_physp_t *p_physp; 439 uint8_t state; 440 441 OSM_LOG_ENTER(sm->p_log); 442 443 port_guid = sm->p_subn->sm_port_guid; 444 445 /* 446 * If we don't know our own port guid yet, assume the port is down. 447 */ 448 if (port_guid == 0) { 449 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3308: " 450 "SM port GUID unknown\n"); 451 state = IB_LINK_DOWN; 452 goto Exit; 453 } 454 455 CL_ASSERT(port_guid); 456 457 CL_PLOCK_ACQUIRE(sm->p_lock); 458 p_port = osm_get_port_by_guid(sm->p_subn, port_guid); 459 if (!p_port) { 460 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3309: " 461 "SM port with GUID:%016" PRIx64 " is unknown\n", 462 cl_ntoh64(port_guid)); 463 state = IB_LINK_DOWN; 464 CL_PLOCK_RELEASE(sm->p_lock); 465 goto Exit; 466 } 467 468 p_physp = p_port->p_physp; 469 470 CL_ASSERT(p_physp); 471 472 if (p_port->p_node->sw && 473 !ib_switch_info_is_enhanced_port0(&p_port->p_node->sw->switch_info)) 474 state = IB_LINK_ACTIVE; /* base SP0 */ 475 else 476 state = osm_physp_get_port_state(p_physp); 477 478 if (!p_port->discovery_count) { 479 OSM_LOG(sm->p_log, OSM_LOG_ERROR, 480 "ERR 330A: Failed to discover SM port\n"); 481 state = IB_LINK_DOWN; 482 } 483 484 CL_PLOCK_RELEASE(sm->p_lock); 485 486 Exit: 487 OSM_LOG_EXIT(sm->p_log); 488 return (state == IB_LINK_DOWN); 489 } 490 491 
/********************************************************************** 492 Sweeps the node 1 hop away. 493 This sets off a "chain reaction" that causes discovery of the subnet. 494 Used when there is suspicion that something on the subnet has changed. 495 **********************************************************************/ 496 static ib_api_status_t state_mgr_sweep_hop_1(IN osm_sm_t * sm) 497 { 498 ib_api_status_t status = IB_SUCCESS; 499 osm_madw_context_t context; 500 osm_node_t *p_node; 501 osm_port_t *p_port; 502 osm_dr_path_t hop_1_path; 503 ib_net64_t port_guid; 504 uint8_t port_num; 505 uint8_t path_array[IB_SUBNET_PATH_HOPS_MAX]; 506 uint8_t num_ports; 507 osm_physp_t *p_ext_physp; 508 509 OSM_LOG_ENTER(sm->p_log); 510 511 /* 512 * First, get our own port and node objects. 513 */ 514 port_guid = sm->p_subn->sm_port_guid; 515 516 CL_ASSERT(port_guid); 517 518 /* Set the in_sweep_hop_0 flag in subn to be FALSE. 519 * This will indicate the sweeping to continue beyond the 520 * the current node. 521 * This is relevant for the case of SM on switch, since in the 522 * switch info we need to signal that the sweeping should 523 * continue through the switch. */ 524 sm->p_subn->in_sweep_hop_0 = FALSE; 525 526 p_port = osm_get_port_by_guid(sm->p_subn, port_guid); 527 if (!p_port) { 528 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3310: " 529 "No SM port object\n"); 530 status = IB_ERROR; 531 goto Exit; 532 } 533 534 p_node = p_port->p_node; 535 CL_ASSERT(p_node); 536 537 port_num = ib_node_info_get_local_port_num(&p_node->node_info); 538 539 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 540 "Probing hop 1 on local port %u\n", port_num); 541 542 memset(path_array, 0, sizeof(path_array)); 543 /* the hop_1 operations depend on the type of our node. 
544 * Currently - legal nodes that can host SM are SW and CA */ 545 switch (osm_node_get_type(p_node)) { 546 case IB_NODE_TYPE_CA: 547 case IB_NODE_TYPE_ROUTER: 548 memset(&context, 0, sizeof(context)); 549 context.ni_context.node_guid = osm_node_get_node_guid(p_node); 550 context.ni_context.port_num = port_num; 551 552 path_array[1] = port_num; 553 554 osm_dr_path_init(&hop_1_path, 1, path_array); 555 CL_PLOCK_ACQUIRE(sm->p_lock); 556 status = osm_req_get(sm, &hop_1_path, IB_MAD_ATTR_NODE_INFO, 0, 557 TRUE, 0, CL_DISP_MSGID_NONE, &context); 558 CL_PLOCK_RELEASE(sm->p_lock); 559 if (status != IB_SUCCESS) 560 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3311: " 561 "Request for NodeInfo failed (%s)\n", 562 ib_get_err_str(status)); 563 break; 564 565 case IB_NODE_TYPE_SWITCH: 566 /* Need to go over all the ports of the switch, and send a 567 * node_info from them. This doesn't include the port 0 of the 568 * switch, which hosts the SM. 569 * Note: We'll send another switchInfo on port 0, since if no 570 * ports are connected, we still want to get some response, and 571 * have the subnet come up. 
572 */ 573 num_ports = osm_node_get_num_physp(p_node); 574 for (port_num = 1; port_num < num_ports; port_num++) { 575 /* go through the port only if the port is not DOWN */ 576 p_ext_physp = osm_node_get_physp_ptr(p_node, port_num); 577 if (p_ext_physp && ib_port_info_get_port_state 578 (&(p_ext_physp->port_info)) > IB_LINK_DOWN) { 579 memset(&context, 0, sizeof(context)); 580 context.ni_context.node_guid = 581 osm_node_get_node_guid(p_node); 582 context.ni_context.port_num = port_num; 583 584 path_array[1] = port_num; 585 osm_dr_path_init(&hop_1_path, 1, path_array); 586 CL_PLOCK_ACQUIRE(sm->p_lock); 587 status = osm_req_get(sm, &hop_1_path, 588 IB_MAD_ATTR_NODE_INFO, 0, 589 TRUE, 0, 590 CL_DISP_MSGID_NONE, 591 &context); 592 CL_PLOCK_RELEASE(sm->p_lock); 593 if (status != IB_SUCCESS) 594 OSM_LOG(sm->p_log, OSM_LOG_ERROR, 595 "ERR 3312: " 596 "Request for NodeInfo failed (%s)\n", 597 ib_get_err_str(status)); 598 } 599 } 600 break; 601 602 default: 603 OSM_LOG(sm->p_log, OSM_LOG_ERROR, 604 "ERR 3313: Unknown node type %d (%s)\n", 605 osm_node_get_type(p_node), p_node->print_desc); 606 } 607 608 Exit: 609 OSM_LOG_EXIT(sm->p_log); 610 return status; 611 } 612 613 static void query_sm_info(cl_map_item_t * item, void *cxt) 614 { 615 osm_madw_context_t context; 616 osm_remote_sm_t *r_sm = cl_item_obj(item, r_sm, map_item); 617 osm_sm_t *sm = cxt; 618 ib_api_status_t ret; 619 osm_port_t *p_port; 620 621 p_port= osm_get_port_by_guid(sm->p_subn, r_sm->smi.guid); 622 if (p_port == NULL) { 623 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3340: " 624 "No port object on given sm object\n"); 625 return; 626 } 627 628 context.smi_context.port_guid = r_sm->smi.guid; 629 context.smi_context.set_method = FALSE; 630 context.smi_context.light_sweep = TRUE; 631 632 ret = osm_req_get(sm, osm_physp_get_dr_path_ptr(p_port->p_physp), 633 IB_MAD_ATTR_SM_INFO, 0, FALSE, 634 ib_port_info_get_m_key(&p_port->p_physp->port_info), 635 CL_DISP_MSGID_NONE, &context); 636 if (ret != IB_SUCCESS) 637 
OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3314: " 638 "Failure requesting SMInfo (%s)\n", 639 ib_get_err_str(ret)); 640 } 641 642 static void state_mgr_reset_state_change_bit(IN cl_map_item_t * obj, 643 IN void *context) 644 { 645 osm_madw_context_t mad_context; 646 osm_switch_t *p_sw = (osm_switch_t *) obj; 647 osm_sm_t *sm = context; 648 osm_node_t *p_node; 649 osm_physp_t *p_physp; 650 osm_dr_path_t *p_path; 651 ib_api_status_t status; 652 ib_switch_info_t si; 653 654 OSM_LOG_ENTER(sm->p_log); 655 656 CL_ASSERT(p_sw); 657 658 p_node = p_sw->p_node; 659 660 CL_ASSERT(p_node); 661 662 p_physp = osm_node_get_physp_ptr(p_node, 0); 663 p_path = osm_physp_get_dr_path_ptr(p_physp); 664 665 if (!ib_switch_info_get_state_change(&p_sw->switch_info)) 666 goto exit; 667 668 si = p_sw->switch_info; 669 670 ib_switch_info_state_change_set(&si); 671 672 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 673 "Resetting PortStateChange on switch GUID 0x%016" PRIx64 "\n", 674 cl_ntoh64(osm_node_get_node_guid(p_node))); 675 676 mad_context.si_context.light_sweep = FALSE; 677 mad_context.si_context.node_guid = osm_node_get_node_guid(p_node); 678 mad_context.si_context.set_method = TRUE; 679 mad_context.si_context.lft_top_change = FALSE; 680 681 status = osm_req_set(sm, p_path, (uint8_t *) &si, 682 sizeof(si), IB_MAD_ATTR_SWITCH_INFO, 683 0, FALSE, 684 ib_port_info_get_m_key(&p_physp->port_info), 685 CL_DISP_MSGID_NONE, &mad_context); 686 687 if (status != IB_SUCCESS) 688 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 332A: " 689 "Sending SwitchInfo attribute failed (%s)\n", 690 ib_get_err_str(status)); 691 692 exit: 693 OSM_LOG_EXIT(sm->p_log); 694 } 695 696 static void state_mgr_update_node_desc(IN cl_map_item_t * obj, IN void *context) 697 { 698 osm_madw_context_t mad_context; 699 osm_node_t *p_node = (osm_node_t *) obj; 700 osm_sm_t *sm = context; 701 osm_physp_t *p_physp = NULL; 702 unsigned i, num_ports; 703 ib_api_status_t status; 704 705 OSM_LOG_ENTER(sm->p_log); 706 707 CL_ASSERT(p_node); 708 709 
OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 710 "Updating NodeDesc for 0x%016" PRIx64 "\n", 711 cl_ntoh64(osm_node_get_node_guid(p_node))); 712 713 /* get a physp to request from. */ 714 num_ports = osm_node_get_num_physp(p_node); 715 for (i = 0; i < num_ports; i++) 716 if ((p_physp = osm_node_get_physp_ptr(p_node, i))) 717 break; 718 719 if (!p_physp) { 720 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 331C: " 721 "Failed to find any valid physical port object.\n"); 722 goto exit; 723 } 724 725 mad_context.nd_context.node_guid = osm_node_get_node_guid(p_node); 726 727 status = osm_req_get(sm, osm_physp_get_dr_path_ptr(p_physp), 728 IB_MAD_ATTR_NODE_DESC, 0, FALSE, 729 ib_port_info_get_m_key(&p_physp->port_info), 730 CL_DISP_MSGID_NONE, &mad_context); 731 if (status != IB_SUCCESS) 732 OSM_LOG(sm->p_log, OSM_LOG_ERROR, 733 "ERR 331B: Failure initiating NodeDescription request " 734 "(%s) to 0x%016" PRIx64 "\n", ib_get_err_str(status), 735 cl_ntoh64(osm_node_get_node_guid(p_node))); 736 737 exit: 738 OSM_LOG_EXIT(sm->p_log); 739 } 740 741 void osm_reset_switch_state_change_bit(IN osm_opensm_t *osm) 742 { 743 CL_PLOCK_ACQUIRE(&osm->lock); 744 cl_qmap_apply_func(&osm->subn.sw_guid_tbl, state_mgr_reset_state_change_bit, 745 &osm->sm); 746 CL_PLOCK_RELEASE(&osm->lock); 747 } 748 749 void osm_update_node_desc(IN osm_opensm_t *osm) 750 { 751 CL_PLOCK_ACQUIRE(&osm->lock); 752 cl_qmap_apply_func(&osm->subn.node_guid_tbl, state_mgr_update_node_desc, 753 &osm->sm); 754 CL_PLOCK_RELEASE(&osm->lock); 755 } 756 757 /********************************************************************** 758 During a light sweep, check each node to see if the node description 759 is valid and if not issue a ND query. 
760 **********************************************************************/ 761 static void state_mgr_get_node_desc(IN cl_map_item_t * obj, IN void *context) 762 { 763 osm_node_t *p_node = (osm_node_t *) obj; 764 osm_sm_t *sm = context; 765 766 OSM_LOG_ENTER(sm->p_log); 767 768 CL_ASSERT(p_node); 769 770 if (p_node->print_desc 771 && strcmp(p_node->print_desc, OSM_NODE_DESC_UNKNOWN)) 772 /* if ND is valid, do nothing */ 773 goto exit; 774 775 OSM_LOG(sm->p_log, OSM_LOG_ERROR, 776 "ERR 3319: Unknown node description for node GUID " 777 "0x%016" PRIx64 ". Reissuing ND query\n", 778 cl_ntoh64(osm_node_get_node_guid(p_node))); 779 780 state_mgr_update_node_desc(obj, context); 781 782 exit: 783 OSM_LOG_EXIT(sm->p_log); 784 } 785 786 /********************************************************************** 787 Initiates a lightweight sweep of the subnet. 788 Used during normal sweeps after the subnet is up. 789 **********************************************************************/ 790 static ib_api_status_t state_mgr_light_sweep_start(IN osm_sm_t * sm) 791 { 792 ib_api_status_t status = IB_SUCCESS; 793 osm_bind_handle_t h_bind; 794 cl_qmap_t *p_sw_tbl; 795 cl_map_item_t *p_next; 796 osm_node_t *p_node; 797 osm_physp_t *p_physp; 798 uint8_t port_num; 799 800 OSM_LOG_ENTER(sm->p_log); 801 802 p_sw_tbl = &sm->p_subn->sw_guid_tbl; 803 804 /* 805 * First, get the bind handle. 806 */ 807 h_bind = osm_sm_mad_ctrl_get_bind_handle(&sm->mad_ctrl); 808 if (h_bind == OSM_BIND_INVALID_HANDLE) { 809 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 810 "No bound ports. 
Deferring sweep...\n"); 811 status = IB_INVALID_STATE; 812 goto _exit; 813 } 814 815 OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE, "INITIATING LIGHT SWEEP"); 816 CL_PLOCK_ACQUIRE(sm->p_lock); 817 cl_qmap_apply_func(p_sw_tbl, state_mgr_get_sw_info, sm); 818 CL_PLOCK_RELEASE(sm->p_lock); 819 820 CL_PLOCK_ACQUIRE(sm->p_lock); 821 cl_qmap_apply_func(&sm->p_subn->node_guid_tbl, state_mgr_get_node_desc, 822 sm); 823 CL_PLOCK_RELEASE(sm->p_lock); 824 825 /* now scan the list of physical ports that were not down but have no remote port */ 826 CL_PLOCK_ACQUIRE(sm->p_lock); 827 p_next = cl_qmap_head(&sm->p_subn->node_guid_tbl); 828 while (p_next != cl_qmap_end(&sm->p_subn->node_guid_tbl)) { 829 p_node = (osm_node_t *) p_next; 830 p_next = cl_qmap_next(p_next); 831 832 for (port_num = 1; port_num < osm_node_get_num_physp(p_node); 833 port_num++) { 834 p_physp = osm_node_get_physp_ptr(p_node, port_num); 835 if (p_physp && (osm_physp_get_port_state(p_physp) != 836 IB_LINK_DOWN) 837 && !osm_physp_get_remote(p_physp)) { 838 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3315: " 839 "Unknown remote side for node 0x%016" 840 PRIx64 841 " (%s) port %u. Adding to light sweep sampling list\n", 842 cl_ntoh64(osm_node_get_node_guid 843 (p_node)), 844 p_node->print_desc, port_num); 845 846 osm_dump_dr_path_v2(sm->p_log, 847 osm_physp_get_dr_path_ptr 848 (p_physp), FILE_ID, OSM_LOG_ERROR); 849 850 state_mgr_get_remote_port_info(sm, p_physp); 851 } 852 } 853 } 854 855 cl_qmap_apply_func(&sm->p_subn->sm_guid_tbl, query_sm_info, sm); 856 857 CL_PLOCK_RELEASE(sm->p_lock); 858 859 _exit: 860 OSM_LOG_EXIT(sm->p_log); 861 return status; 862 } 863 864 /********************************************************************** 865 * Go over all the remote SMs (as updated in the sm_guid_tbl). 866 * Find if there is a remote sm that is a master SM. 867 * If there is a remote master SM - return a pointer to it, 868 * else - return NULL. 
869 **********************************************************************/ 870 static osm_remote_sm_t *state_mgr_exists_other_master_sm(IN osm_sm_t * sm) 871 { 872 cl_qmap_t *p_sm_tbl; 873 osm_remote_sm_t *p_sm; 874 osm_remote_sm_t *p_sm_res = NULL; 875 osm_node_t *p_node; 876 877 OSM_LOG_ENTER(sm->p_log); 878 879 p_sm_tbl = &sm->p_subn->sm_guid_tbl; 880 881 /* go over all the remote SMs */ 882 for (p_sm = (osm_remote_sm_t *) cl_qmap_head(p_sm_tbl); 883 p_sm != (osm_remote_sm_t *) cl_qmap_end(p_sm_tbl); 884 p_sm = (osm_remote_sm_t *) cl_qmap_next(&p_sm->map_item)) { 885 /* If the sm is in MASTER state - return a pointer to it */ 886 p_node = osm_get_node_by_guid(sm->p_subn, p_sm->smi.guid); 887 if (ib_sminfo_get_state(&p_sm->smi) == IB_SMINFO_STATE_MASTER) { 888 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE, 889 "Found remote master SM with guid:0x%016" PRIx64 890 " (node %s)\n", cl_ntoh64(p_sm->smi.guid), 891 p_node ? p_node->print_desc : "UNKNOWN"); 892 p_sm_res = p_sm; 893 goto Exit; 894 } 895 } 896 897 Exit: 898 OSM_LOG_EXIT(sm->p_log); 899 return p_sm_res; 900 } 901 902 /********************************************************************** 903 * Go over all remote SMs (as updated in the sm_guid_tbl). 904 * Find the one with the highest priority and lowest guid. 905 * Compare this SM to the local SM. If the local SM is higher - 906 * return NULL, if the remote SM is higher - return a pointer to it. 
907 **********************************************************************/ 908 static osm_remote_sm_t *state_mgr_get_highest_sm(IN osm_sm_t * sm) 909 { 910 cl_qmap_t *p_sm_tbl; 911 osm_remote_sm_t *p_sm = NULL; 912 osm_remote_sm_t *p_highest_sm; 913 uint8_t highest_sm_priority; 914 ib_net64_t highest_sm_guid; 915 osm_node_t *p_node; 916 917 OSM_LOG_ENTER(sm->p_log); 918 919 p_sm_tbl = &sm->p_subn->sm_guid_tbl; 920 921 /* Start with the local sm as the standard */ 922 p_highest_sm = NULL; 923 highest_sm_priority = sm->p_subn->opt.sm_priority; 924 highest_sm_guid = sm->p_subn->sm_port_guid; 925 926 /* go over all the remote SMs */ 927 for (p_sm = (osm_remote_sm_t *) cl_qmap_head(p_sm_tbl); 928 p_sm != (osm_remote_sm_t *) cl_qmap_end(p_sm_tbl); 929 p_sm = (osm_remote_sm_t *) cl_qmap_next(&p_sm->map_item)) { 930 931 /* If the sm is in NOTACTIVE state - continue */ 932 if (ib_sminfo_get_state(&p_sm->smi) == 933 IB_SMINFO_STATE_NOTACTIVE) 934 continue; 935 936 if (osm_sm_is_greater_than(ib_sminfo_get_priority(&p_sm->smi), 937 p_sm->smi.guid, highest_sm_priority, 938 highest_sm_guid)) { 939 /* the new p_sm is with higher priority - update the highest_sm */ 940 /* to this sm */ 941 p_highest_sm = p_sm; 942 highest_sm_priority = 943 ib_sminfo_get_priority(&p_sm->smi); 944 highest_sm_guid = p_sm->smi.guid; 945 } 946 } 947 948 if (p_highest_sm != NULL) { 949 p_node = osm_get_node_by_guid(sm->p_subn, p_highest_sm->smi.guid); 950 OSM_LOG(sm->p_log, OSM_LOG_DEBUG, 951 "Found higher priority SM with guid: %016" PRIx64 " (node %s)\n", 952 cl_ntoh64(p_highest_sm->smi.guid), 953 p_node ? p_node->print_desc : "UNKNOWN"); 954 } 955 OSM_LOG_EXIT(sm->p_log); 956 return p_highest_sm; 957 } 958 959 /********************************************************************** 960 * Send SubnSet(SMInfo) SMP with HANDOVER attribute to the 961 * remote_sm indicated. 
962 **********************************************************************/ 963 static void state_mgr_send_handover(IN osm_sm_t * sm, IN osm_remote_sm_t * p_sm) 964 { 965 uint8_t payload[IB_SMP_DATA_SIZE]; 966 ib_sm_info_t *p_smi = (ib_sm_info_t *) payload; 967 osm_madw_context_t context; 968 const osm_port_t *p_port; 969 ib_api_status_t status; 970 971 OSM_LOG_ENTER(sm->p_log); 972 973 /* 974 * Send a query of SubnSet(SMInfo) HANDOVER to the remote sm given. 975 */ 976 977 memset(&context, 0, sizeof(context)); 978 p_port = osm_get_port_by_guid(sm->p_subn, p_sm->smi.guid); 979 if (p_port == NULL) { 980 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3316: " 981 "No port object on given remote_sm object\n"); 982 goto Exit; 983 } 984 985 /* update the master_guid in the sm_state_mgr object according to */ 986 /* the guid of the port where the new Master SM should reside. */ 987 OSM_LOG(sm->p_log, OSM_LOG_VERBOSE, 988 "Handing over mastership. Updating sm_state_mgr master_guid: %016" 989 PRIx64 " (node %s)\n", cl_ntoh64(p_port->guid), 990 p_port->p_node ? 
p_port->p_node->print_desc : "UNKNOWN"); 991 sm->master_sm_guid = p_port->guid; 992 993 context.smi_context.port_guid = p_port->guid; 994 context.smi_context.set_method = TRUE; 995 996 memset(payload, 0, sizeof(payload)); 997 p_smi->guid = sm->p_subn->sm_port_guid; 998 p_smi->act_count = cl_hton32(sm->p_subn->p_osm->stats.qp0_mads_sent); 999 p_smi->pri_state = (uint8_t) (sm->p_subn->sm_state | 1000 sm->p_subn->opt.sm_priority << 4); 1001 p_smi->sm_key = sm->p_subn->opt.sm_key; 1002 1003 CL_PLOCK_ACQUIRE(sm->p_lock); 1004 status = osm_req_set(sm, osm_physp_get_dr_path_ptr(p_port->p_physp), 1005 payload, sizeof(payload), IB_MAD_ATTR_SM_INFO, 1006 IB_SMINFO_ATTR_MOD_HANDOVER, FALSE, 1007 ib_port_info_get_m_key(&p_port->p_physp->port_info), 1008 CL_DISP_MSGID_NONE, &context); 1009 CL_PLOCK_RELEASE(sm->p_lock); 1010 1011 if (status != IB_SUCCESS) 1012 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3317: " 1013 "Failure requesting SMInfo (%s), remote SM at 0x%" 1014 PRIx64 " (node %s)\n", 1015 ib_get_err_str(status), cl_ntoh64(p_port->guid), 1016 p_port->p_node ? p_port->p_node->print_desc : "UNKNOWN"); 1017 1018 Exit: 1019 OSM_LOG_EXIT(sm->p_log); 1020 } 1021 1022 /********************************************************************** 1023 * Send Trap 64 on all new ports. 
1024 **********************************************************************/ 1025 static void state_mgr_report_new_ports(IN osm_sm_t * sm) 1026 { 1027 ib_gid_t port_gid; 1028 ib_mad_notice_attr_t notice; 1029 ib_api_status_t status; 1030 ib_net64_t port_guid; 1031 cl_map_item_t *p_next; 1032 osm_port_t *p_port; 1033 uint16_t min_lid_ho; 1034 uint16_t max_lid_ho; 1035 1036 OSM_LOG_ENTER(sm->p_log); 1037 1038 CL_PLOCK_ACQUIRE(sm->p_lock); 1039 p_next = cl_qmap_head(&sm->p_subn->port_guid_tbl); 1040 while (p_next != cl_qmap_end(&sm->p_subn->port_guid_tbl)) { 1041 p_port = (osm_port_t *) p_next; 1042 p_next = cl_qmap_next(p_next); 1043 1044 if (!p_port->is_new) 1045 continue; 1046 1047 port_guid = osm_port_get_guid(p_port); 1048 /* issue a notice - trap 64 (SM_GID_IN_SERVICE_TRAP) */ 1049 1050 /* details of the notice */ 1051 notice.generic_type = 0x80 | IB_NOTICE_TYPE_SUBN_MGMT; /* is generic subn mgt type */ 1052 ib_notice_set_prod_type_ho(¬ice, 4); /* A Class Manager generator */ 1053 /* endport becomes reachable */ 1054 notice.g_or_v.generic.trap_num = CL_HTON16(SM_GID_IN_SERVICE_TRAP); /* 64 */ 1055 /* The sm_base_lid is saved in network order already. */ 1056 notice.issuer_lid = sm->p_subn->sm_base_lid; 1057 /* following C14-72.1.1 and table 119 p739 */ 1058 /* we need to provide the GID */ 1059 port_gid.unicast.prefix = sm->p_subn->opt.subnet_prefix; 1060 port_gid.unicast.interface_id = port_guid; 1061 memcpy(&(notice.data_details.ntc_64_67.gid), &(port_gid), 1062 sizeof(ib_gid_t)); 1063 1064 /* According to page 653 - the issuer gid in this case of trap 1065 * is the SM gid, since the SM is the initiator of this trap. 
*/ 1066 notice.issuer_gid.unicast.prefix = 1067 sm->p_subn->opt.subnet_prefix; 1068 notice.issuer_gid.unicast.interface_id = 1069 sm->p_subn->sm_port_guid; 1070 1071 status = osm_report_notice(sm->p_log, sm->p_subn, ¬ice); 1072 if (status != IB_SUCCESS) 1073 OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3318: " 1074 "Error sending trap reports on GUID:0x%016" 1075 PRIx64 " (%s)\n", port_gid.unicast.interface_id, 1076 ib_get_err_str(status)); 1077 osm_port_get_lid_range_ho(p_port, &min_lid_ho, &max_lid_ho); 1078 OSM_LOG(sm->p_log, OSM_LOG_INFO, 1079 "Discovered new port with GUID:0x%016" PRIx64 1080 " LID range [%u,%u] of node: %s\n", 1081 cl_ntoh64(port_gid.unicast.interface_id), 1082 min_lid_ho, max_lid_ho, 1083 p_port->p_node ? p_port->p_node-> 1084 print_desc : "UNKNOWN"); 1085 1086 p_port->is_new = 0; 1087 } 1088 CL_PLOCK_RELEASE(sm->p_lock); 1089 1090 OSM_LOG_EXIT(sm->p_log); 1091 } 1092 1093 /********************************************************************** 1094 * Make sure that the lid_port_tbl of the subnet has only the ports 1095 * that are recognized, and in the correct lid place. There could be 1096 * errors if we wanted to assign a certain port with lid X, but that 1097 * request didn't reach the port. In this case port_lid_tbl will have 1098 * the port under lid X, though the port isn't updated with this lid. 1099 * We will run a new heavy sweep (since there were errors in the 1100 * initialization), but here we'll clean the database from incorrect 1101 * information. 
**********************************************************************/
/*
 * Runs only when sm->lid_mgr.dirty is set (and clears that flag).
 * Builds a temporary reference LID table from port_guid_tbl, compares it
 * entry by entry with the subnet's port_lid_tbl, logs every mismatch
 * (ERR 3322/3323/3324), clears the stale entry from port_lid_tbl and
 * flags subnet_initialization_error so that another heavy sweep follows.
 */
static void state_mgr_check_tbl_consistency(IN osm_sm_t * sm)
{
	cl_qmap_t *p_port_guid_tbl;
	osm_port_t *p_port;
	osm_port_t *p_next_port;
	cl_ptr_vector_t *p_port_lid_tbl;
	size_t max_lid, ref_size, curr_size, lid;
	osm_port_t *p_port_ref, *p_port_stored;
	cl_ptr_vector_t ref_port_lid_tbl;
	uint16_t min_lid_ho;
	uint16_t max_lid_ho;
	uint16_t lid_ho;

	OSM_LOG_ENTER(sm->p_log);

	/* Nothing to verify unless the lid manager marked itself dirty */
	if (sm->lid_mgr.dirty == FALSE)
		goto Exit;

	sm->lid_mgr.dirty = FALSE;

	/* The reference table starts with the same size as port_lid_tbl */
	cl_ptr_vector_construct(&ref_port_lid_tbl);
	cl_ptr_vector_init(&ref_port_lid_tbl,
			   cl_ptr_vector_get_size(&sm->p_subn->port_lid_tbl),
			   OSM_SUBNET_VECTOR_GROW_SIZE);

	p_port_guid_tbl = &sm->p_subn->port_guid_tbl;

	/* Let's go over all the ports according to port_guid_tbl,
	 * and add the port to a reference port_lid_tbl. */
	p_next_port = (osm_port_t *) cl_qmap_head(p_port_guid_tbl);
	while (p_next_port != (osm_port_t *) cl_qmap_end(p_port_guid_tbl)) {
		p_port = p_next_port;
		p_next_port =
		    (osm_port_t *) cl_qmap_next(&p_next_port->map_item);

		/* every LID in the port's range maps back to the port */
		osm_port_get_lid_range_ho(p_port, &min_lid_ho, &max_lid_ho);
		for (lid_ho = min_lid_ho; lid_ho <= max_lid_ho; lid_ho++)
			cl_ptr_vector_set(&ref_port_lid_tbl, lid_ho, p_port);
	}

	p_port_lid_tbl = &sm->p_subn->port_lid_tbl;

	ref_size = cl_ptr_vector_get_size(&ref_port_lid_tbl);
	curr_size = cl_ptr_vector_get_size(p_port_lid_tbl);
	/* They should be the same, but compare it anyway */
	max_lid = (ref_size > curr_size) ? ref_size : curr_size;

	/* LID 0 is skipped; the comparison starts at LID 1 */
	for (lid = 1; lid < max_lid; lid++) {
		/* Both pointers are reset first; presumably a lookup past the
		 * end of the shorter vector then leaves NULL in place -
		 * TODO confirm against cl_ptr_vector_at() */
		p_port_ref = NULL;
		p_port_stored = NULL;
		cl_ptr_vector_at(p_port_lid_tbl, lid, (void *)&p_port_stored);
		cl_ptr_vector_at(&ref_port_lid_tbl, lid, (void *)&p_port_ref);

		if (p_port_stored == p_port_ref)
			/* This is the "good" case - both entries are the
			 * same for this lid. Nothing to do. */
			continue;

		if (p_port_ref == NULL) {
			/* There is an object in the subnet database for this
			 * lid, but no such object exists in the reference
			 * port_lid_tbl. This can occur if we wanted to assign
			 * a certain port with some lid (different than the one
			 * pre-assigned to it), and the port didn't get the
			 * PortInfo Set request. Due to this, the port is
			 * updated with its original lid in our database
			 * rather than the new lid we wanted to give it. */
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3322: "
				"lid %zu is wrongly assigned to port 0x%016"
				PRIx64 " (\'%s\' port %u) in port_lid_tbl\n",
				lid,
				cl_ntoh64(osm_port_get_guid(p_port_stored)),
				p_port_stored->p_node->print_desc,
				p_port_stored->p_physp->port_num);
		} else if (p_port_stored == NULL)
			/* There is an object in the new database, but no
			 * object in our subnet database. This is the matching
			 * case of the prior check - the port still has its
			 * original lid. */
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3323: "
				"port 0x%016" PRIx64 " (\'%s\' port %u)"
				" exists in new port_lid_tbl under lid %zu,"
				" but missing in subnet port_lid_tbl db\n",
				cl_ntoh64(osm_port_get_guid(p_port_ref)),
				p_port_ref->p_node->print_desc,
				p_port_ref->p_physp->port_num, lid);
		else {
			/* if we reached here then p_port_stored != p_port_ref.
			 * We were trying to set a lid to p_port_stored, but
			 * it didn't reach it, and p_port_ref also didn't get
			 * the lid update. */
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3324: "
				"lid %zu has port 0x%016" PRIx64
				" (\'%s\' port %u) in new port_lid_tbl db, "
				"and port 0x%016" PRIx64 " (\'%s\' port %u)"
				" in subnet port_lid_tbl db\n", lid,
				cl_ntoh64(osm_port_get_guid(p_port_ref)),
				p_port_ref->p_node->print_desc,
				p_port_ref->p_physp->port_num,
				cl_ntoh64(osm_port_get_guid(p_port_stored)),
				p_port_stored->p_node->print_desc,
				p_port_stored->p_physp->port_num);
		}

		/*
		 * Clear the lid of the port in order to ignore it
		 * in routing phase
		 */
		if (p_port_stored) {
			OSM_LOG(sm->p_log, OSM_LOG_INFO, "Clearing Lid for "
				"port 0x%016" PRIx64 "\n",
				cl_ntoh64(osm_port_get_guid(p_port_stored)));
			osm_port_clear_base_lid(p_port_stored);
			cl_ptr_vector_set(p_port_lid_tbl, lid, NULL);
		}

		/* Make sure we'll do another heavy sweep. */
		sm->p_subn->subnet_initialization_error = TRUE;
	}

	cl_ptr_vector_destroy(&ref_port_lid_tbl);

Exit:
	OSM_LOG_EXIT(sm->p_log);
}

/*
 * Per-switch callback used with cl_qmap_apply_func() over sw_guid_tbl.
 * "log" is the osm log object. Switches without a new_lft buffer are
 * skipped; otherwise the first max_lid_ho + 1 bytes of lft and new_lft
 * are compared and ERR 331D is logged when they differ.
 */
static void check_switch_lft(cl_map_item_t * item, void *log)
{
	osm_switch_t *sw = (osm_switch_t *) item;

	if (!sw->new_lft)
		return;

	if (memcmp(sw->lft, sw->new_lft, sw->max_lid_ho + 1))
		osm_log_v2(log, OSM_LOG_ERROR, FILE_ID, "ERR 331D: "
			   "LFT of switch 0x%016" PRIx64 " (%s) is not up to date\n",
			   cl_ntoh64(sw->p_node->node_info.node_guid),
			   sw->p_node->print_desc);
}

/*
 * Block until stats->qp0_mads_outstanding drops to zero or osm_exit_flag
 * becomes set. With HAVE_LIBPTHREAD the wait uses the stats mutex and
 * condition variable; otherwise it waits on the stats event.
 *
 * Returns the value of osm_exit_flag, i.e. non-zero when the caller
 * should stop what it is doing because exit was requested.
 */
int wait_for_pending_transactions(osm_stats_t * stats)
{
#ifdef HAVE_LIBPTHREAD
	pthread_mutex_lock(&stats->mutex);
	while (stats->qp0_mads_outstanding && !osm_exit_flag)
		pthread_cond_wait(&stats->cond, &stats->mutex);
	pthread_mutex_unlock(&stats->mutex);
#else
	while (1) {
		unsigned count = stats->qp0_mads_outstanding;
		if (!count || osm_exit_flag)
			break;
		cl_event_wait_on(&stats->event, EVENT_NO_TIMEOUT, TRUE);
	}
#endif
	return osm_exit_flag;
}

/*
 * Run one sweep of the subnet.
 *
 * Depending on the subnet flags and the SM state this ends up as one of:
 *   - a light sweep (when switches are already known and nothing forces
 *     more work),
 *   - a reroute-only pass (force_reroute set, no heavy sweep needed),
 *   - a heavy sweep: discovery, check for other SMs (standby, handover
 *     or wait-for-handover), PKEY setup, LID assignment, unicast and
 *     multicast routing, alias GUIDs, link state transitions and
 *     congestion control, ending with the subnet-up event.
 *
 * The function does nothing beyond the optional configuration rescan
 * unless the SM state is MASTER or DISCOVERING. Most stages are followed
 * by wait_for_pending_transactions(); a non-zero return from it (exit
 * requested) ends the sweep immediately. Nothing is returned; results
 * are reported via logs, events and subnet flags.
 */
static void do_sweep(osm_sm_t * sm)
{
	ib_api_status_t status;
	osm_remote_sm_t *p_remote_sm;
	/* set once the configuration files were successfully rescanned, so
	 * the heavy sweep below does not rescan them a second time */
	unsigned config_parsed = 0;

	/* A forced "first time master" sweep is a heavy sweep that also
	 * behaves as if the SM had just come out of standby */
	if (sm->p_subn->force_first_time_master_sweep) {
		sm->p_subn->force_heavy_sweep = TRUE;
		sm->p_subn->coming_out_of_standby = TRUE;
		sm->p_subn->first_time_master_sweep = TRUE;
		sm->p_subn->force_first_time_master_sweep = FALSE;
	}

	/* after subnet initialization error, run heavy sweep */
	if (sm->p_subn->subnet_initialization_error)
		sm->p_subn->force_heavy_sweep = TRUE;

	if (sm->p_subn->force_heavy_sweep) {
		if (osm_subn_rescan_conf_files(sm->p_subn) < 0)
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 331A: "
				"osm_subn_rescan_conf_file failed\n");
		else
			config_parsed = 1;
	}

	/* Only MASTER and DISCOVERING states sweep */
	if (sm->p_subn->sm_state != IB_SMINFO_STATE_MASTER &&
	    sm->p_subn->sm_state != IB_SMINFO_STATE_DISCOVERING)
		return;

	if (sm->p_subn->coming_out_of_standby) {
		/*
		 * Need to force re-write of sm_base_lid to all ports
		 * to do that we want all the ports to be considered
		 * foreign
		 */
		state_mgr_clean_known_lids(sm);

		/*
		 * Need to clean SA cache when state changes to STANDBY
		 * after handover.
		 */
		state_mgr_sa_clean(sm);

		/*
		 * Need to reconfigure LFTs, PKEYs, and QoS on all switches
		 * when coming out of STANDBY
		 */
		sm->p_subn->need_update = 1;
	}

	sm->master_sm_found = 0;

	/*
	 * If we already have switches, then try a light sweep.
	 * Otherwise, this is probably our first discovery pass
	 * or we are connected in loopback. In both cases do a
	 * heavy sweep.
	 * Note: If we are connected in loopback we want a heavy
	 * sweep, since we will not be getting any traps if there is
	 * a lost connection.
	 */
	/* if we are in DISCOVERING state - this means it is either in
	 * initializing or wake up from STANDBY - run the heavy sweep */
	if (cl_qmap_count(&sm->p_subn->sw_guid_tbl)
	    && sm->p_subn->sm_state != IB_SMINFO_STATE_DISCOVERING
	    && sm->p_subn->opt.force_heavy_sweep == FALSE
	    && sm->p_subn->force_heavy_sweep == FALSE
	    && sm->p_subn->force_reroute == FALSE
	    && sm->p_subn->subnet_initialization_error == FALSE
	    && (state_mgr_light_sweep_start(sm) == IB_SUCCESS)) {
		if (wait_for_pending_transactions(&sm->p_subn->p_osm->stats))
			return;
		/* force_heavy_sweep is re-read here: if it got set while
		 * the light sweep was running we fall through to the heavy
		 * sweep instead of finishing */
		if (!sm->p_subn->force_heavy_sweep) {
			if (sm->p_subn->opt.sa_db_dump &&
			    !osm_sa_db_file_dump(sm->p_subn->p_osm))
				osm_opensm_report_event(sm->p_subn->p_osm,
							OSM_EVENT_ID_SA_DB_DUMPED,
							NULL);
			OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE,
					"LIGHT SWEEP COMPLETE");
			return;
		}
	}

	/*
	 * Unicast cache should be invalidated when subnet re-route is
	 * requested, and when OpenSM comes out of standby state.
	 */
	if (sm->p_subn->opt.use_ucast_cache &&
	    (sm->p_subn->force_reroute || sm->p_subn->coming_out_of_standby))
		osm_ucast_cache_invalidate(&sm->ucast_mgr);

	/*
	 * If we don't need to do a heavy sweep and we want to do a reroute,
	 * just reroute only.
	 */
	if (cl_qmap_count(&sm->p_subn->sw_guid_tbl)
	    && sm->p_subn->sm_state != IB_SMINFO_STATE_DISCOVERING
	    && sm->p_subn->opt.force_heavy_sweep == FALSE
	    && sm->p_subn->force_heavy_sweep == FALSE
	    && sm->p_subn->force_reroute == TRUE
	    && sm->p_subn->subnet_initialization_error == FALSE) {
		/* Reset flag */
		sm->p_subn->force_reroute = FALSE;

		/* Re-program the switches fully */
		sm->p_subn->ignore_existing_lfts = TRUE;

		/* NOTE(review): on this failure path ignore_existing_lfts
		 * is left TRUE - confirm that this is intended */
		if (osm_ucast_mgr_process(&sm->ucast_mgr)) {
			OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE,
					"REROUTE FAILED");
			return;
		}
		osm_qos_setup(sm->p_subn->p_osm);

		/* Reset flag */
		sm->p_subn->ignore_existing_lfts = FALSE;

		if (wait_for_pending_transactions(&sm->p_subn->p_osm->stats))
			return;

		osm_congestion_control_setup(sm->p_subn->p_osm);

		if (osm_congestion_control_wait_pending_transactions(sm->p_subn->p_osm))
			return;

		/* An initialization error raised during the reroute makes
		 * us fall through to the heavy sweep below */
		if (!sm->p_subn->subnet_initialization_error) {
			OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE,
					"REROUTE COMPLETE");
			osm_opensm_report_event(sm->p_subn->p_osm,
						OSM_EVENT_ID_UCAST_ROUTING_DONE,
						(void *) UCAST_ROUTING_REROUTE);
			return;
		}
	}

	osm_opensm_report_event(sm->p_subn->p_osm,
				OSM_EVENT_ID_HEAVY_SWEEP_START, NULL);

	/* go to heavy sweep */
repeat_discovery:

	/* First of all - unset all flags */
	sm->p_subn->force_heavy_sweep = FALSE;
	sm->p_subn->force_reroute = FALSE;
	sm->p_subn->subnet_initialization_error = FALSE;

	/* Reset tracking values in case limiting component got removed
	 * from fabric. */
	sm->p_subn->min_ca_mtu = IB_MAX_MTU;
	sm->p_subn->min_ca_rate = IB_PATH_RECORD_RATE_300_GBS;
	sm->p_subn->min_data_vls = IB_MAX_NUM_VLS - 1;
	sm->p_subn->min_sw_data_vls = IB_MAX_NUM_VLS - 1;

	/* rescan configuration updates (skipped when already done above) */
	if (!config_parsed && osm_subn_rescan_conf_files(sm->p_subn) < 0)
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 331A: "
			"osm_subn_rescan_conf_file failed\n");

	if (sm->p_subn->sm_state != IB_SMINFO_STATE_MASTER)
		sm->p_subn->need_update = 1;

	status = state_mgr_sweep_hop_0(sm);
	if (status != IB_SUCCESS ||
	    wait_for_pending_transactions(&sm->p_subn->p_osm->stats))
		return;

	if (state_mgr_is_sm_port_down(sm) == TRUE) {
		/* last_sm_port_state makes the up/down messages appear
		 * only on a state transition */
		if (sm->p_subn->last_sm_port_state) {
			sm->p_subn->last_sm_port_state = 0;
			state_mgr_sa_clean(sm);
			osm_log_v2(sm->p_log, OSM_LOG_SYS, FILE_ID,
				   "SM port is down\n");
			OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE,
					"SM PORT DOWN");
		}

		/* Run the drop manager - we want to clear all records */
		osm_drop_mgr_process(sm);

		/* Move to DISCOVERING state */
		if (sm->p_subn->sm_state != IB_SMINFO_STATE_DISCOVERING)
			osm_sm_state_mgr_process(sm, OSM_SM_SIGNAL_DISCOVER);
		osm_opensm_report_event(sm->p_subn->p_osm,
					OSM_EVENT_ID_STATE_CHANGE, NULL);
		return;
	} else {
		if (!sm->p_subn->last_sm_port_state) {
			sm->p_subn->last_sm_port_state = 1;
			osm_log_v2(sm->p_log, OSM_LOG_SYS, FILE_ID,
				   "SM port is up\n");
			OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE,
					"SM PORT UP");
		}
	}

	status = state_mgr_sweep_hop_1(sm);
	if (status != IB_SUCCESS ||
	    wait_for_pending_transactions(&sm->p_subn->p_osm->stats))
		return;

	/* discovery completed - check other sm presence */
	if (sm->master_sm_found) {
		/*
		 * Call the sm_state_mgr with signal
		 * MASTER_OR_HIGHER_SM_DETECTED
		 */
		osm_sm_state_mgr_process(sm,
					 OSM_SM_SIGNAL_MASTER_OR_HIGHER_SM_DETECTED);
		OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE,
				"ENTERING STANDBY STATE");
		/* notify master SM about us */
		osm_send_trap144(sm, 0);
		osm_opensm_report_event(sm->p_subn->p_osm,
					OSM_EVENT_ID_STATE_CHANGE, NULL);
		return;
	}

	/* if new sweep requested - don't bother with the rest */
	if (sm->p_subn->force_heavy_sweep) {
		/* make the repeated discovery rescan the configuration */
		config_parsed = 0;
		goto repeat_discovery;
	}

	osm_opensm_report_event(sm->p_subn->p_osm,
				OSM_EVENT_ID_HEAVY_SWEEP_DONE, NULL);

	OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE, "HEAVY SWEEP COMPLETE");

	osm_drop_mgr_process(sm);

	/* If we are MASTER - get the highest remote_sm, and
	 * see if it is higher than our local sm.
	 */
	if (sm->p_subn->sm_state == IB_SMINFO_STATE_MASTER) {
		p_remote_sm = state_mgr_get_highest_sm(sm);
		if (p_remote_sm != NULL) {
			/* report new ports (trap 64) before leaving MASTER */
			state_mgr_report_new_ports(sm);

			/* need to handover the mastership
			 * to the remote sm, and move to standby */
			state_mgr_send_handover(sm, p_remote_sm);
			osm_sm_state_mgr_process(sm,
						 OSM_SM_SIGNAL_HANDOVER_SENT);
			return;
		} else {
			/* We are the highest sm - check to see if there is
			 * a remote SM that is in master state. */
			p_remote_sm = state_mgr_exists_other_master_sm(sm);
			if (p_remote_sm != NULL) {
				/* There is a remote SM that is master.
				 * need to wait for that SM to relinquish control
				 * of its portion of the subnet. C14-60.2.1.
				 * Also - need to start polling on that SM. */
				CL_PLOCK_EXCL_ACQUIRE(sm->p_lock);
				sm->polling_sm_guid = p_remote_sm->smi.guid;
				CL_PLOCK_RELEASE(sm->p_lock);
				osm_sm_state_mgr_process(sm,
							 OSM_SM_SIGNAL_WAIT_FOR_HANDOVER);
				return;
			} else if (sm->polling_sm_guid) {
				/* Stop polling SM if it's not found */
				osm_sm_state_mgr_process(sm,
							 OSM_SM_SIGNAL_POLLING_TIMEOUT);
				return;
			}
		}
	}

	/*
	 * If we are not MASTER already - this means that we are
	 * in discovery state. call osm_sm_state_mgr with signal
	 * DISCOVERY_COMPLETED
	 */
	if (sm->p_subn->sm_state == IB_SMINFO_STATE_DISCOVERING)
		osm_sm_state_mgr_process(sm, OSM_SM_SIGNAL_DISCOVERY_COMPLETED);

	osm_reset_switch_state_change_bit(sm->p_subn->p_osm);
	if (wait_for_pending_transactions(&sm->p_subn->p_osm->stats))
		return;

	osm_pkey_mgr_process(sm->p_subn->p_osm);

	/* try to restore SA DB (this should be before lid_mgr
	   because we may want to disable clients reregistration
	   when SA DB is restored) */
	osm_sa_db_file_load(sm->p_subn->p_osm);

	if (wait_for_pending_transactions(&sm->p_subn->p_osm->stats))
		return;

	OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE,
			"PKEY setup completed - STARTING SM LID CONFIG");

	osm_lid_mgr_process_sm(&sm->lid_mgr);
	if (wait_for_pending_transactions(&sm->p_subn->p_osm->stats))
		return;

	OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE,
			"SM LID ASSIGNMENT COMPLETE - STARTING SUBNET LID CONFIG");
	state_mgr_notify_lid_change(sm);

	osm_lid_mgr_process_subnet(&sm->lid_mgr);
	if (wait_for_pending_transactions(&sm->p_subn->p_osm->stats))
		return;

	/* At this point we need to check the consistency of
	 * the port_lid_tbl under the subnet. There might be
	 * errors in it if PortInfo Set requests didn't reach
	 * their destination. */
	state_mgr_check_tbl_consistency(sm);

	OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE,
			"LID ASSIGNMENT COMPLETE - STARTING SWITCH TABLE CONFIG");

	/*
	 * Proceed with unicast forwarding table configuration; if it fails
	 * return early to wait for a trap or the next sweep interval.
	 */

	/* Full routing runs when the cache is not valid or when
	 * osm_ucast_cache_process() returns non-zero */
	if (!sm->ucast_mgr.cache_valid ||
	    osm_ucast_cache_process(&sm->ucast_mgr)) {
		if (osm_ucast_mgr_process(&sm->ucast_mgr)) {
			osm_ucast_cache_invalidate(&sm->ucast_mgr);
			return;
		}
	}

	osm_qos_setup(sm->p_subn->p_osm);

	if (wait_for_pending_transactions(&sm->p_subn->p_osm->stats))
		return;

	/* We are done setting all LFTs so clear the ignore existing.
	 * From now on, as long as we are still master, we want to
	 * take into account these lfts. */
	sm->p_subn->ignore_existing_lfts = FALSE;

	OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE,
			"SWITCHES CONFIGURED FOR UNICAST");
	osm_opensm_report_event(sm->p_subn->p_osm,
				OSM_EVENT_ID_UCAST_ROUTING_DONE,
				(void *) UCAST_ROUTING_HEAVY_SWEEP);

	if (!sm->p_subn->opt.disable_multicast) {
		osm_mcast_mgr_process(sm, TRUE);
		if (wait_for_pending_transactions(&sm->p_subn->p_osm->stats))
			return;
		OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE,
				"SWITCHES CONFIGURED FOR MULTICAST");
	}

	osm_guid_mgr_process(sm);
	if (wait_for_pending_transactions(&sm->p_subn->p_osm->stats))
		return;
	OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE, "ALIAS GUIDS CONFIGURED");

	/*
	 * The LINK_PORTS state is required since we cannot count on
	 * the port state change MADs to succeed. This is an artifact
	 * of the spec defining state change from state X to state X
	 * as an error. The hardware then is not required to process
	 * other parameters provided by the Set(PortInfo) Packet.
	 */

	osm_link_mgr_process(sm, IB_LINK_NO_CHANGE);
	if (wait_for_pending_transactions(&sm->p_subn->p_osm->stats))
		return;

	OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE,
			"LINKS PORTS CONFIGURED - SET LINKS TO ARMED STATE");

	osm_link_mgr_process(sm, IB_LINK_ARMED);
	if (wait_for_pending_transactions(&sm->p_subn->p_osm->stats))
		return;

	OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE,
			"LINKS ARMED - SET LINKS TO ACTIVE STATE");

	osm_link_mgr_process(sm, IB_LINK_ACTIVE);
	if (wait_for_pending_transactions(&sm->p_subn->p_osm->stats))
		return;

	/*
	 * The sweep completed!
	 */

	/* Now do GSI configuration */

	osm_congestion_control_setup(sm->p_subn->p_osm);

	if (osm_congestion_control_wait_pending_transactions(sm->p_subn->p_osm))
		return;

	/*
	 * Send trap 64 on newly discovered endports
	 */
	state_mgr_report_new_ports(sm);

	/* check switch lft buffers assignments */
	cl_qmap_apply_func(&sm->p_subn->sw_guid_tbl, check_switch_lft,
			   sm->p_log);

	/* in any case we zero this flag */
	sm->p_subn->coming_out_of_standby = FALSE;
	sm->p_subn->first_time_master_sweep = FALSE;

	/* If there were errors - then the subnet is not really up */
	if (sm->p_subn->subnet_initialization_error == TRUE) {
		osm_log_v2(sm->p_log, OSM_LOG_SYS, FILE_ID,
			   "Errors during initialization\n");
		OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_ERROR,
				"ERRORS DURING INITIALIZATION");
	} else {
		sm->p_subn->need_update = 0;
		osm_dump_all(sm->p_subn->p_osm);
		state_mgr_up_msg(sm);

		/* The SA DB is dumped when verbose logging is active or the
		 * sa_db_dump option is set; the event is reported only when
		 * the dump returns zero */
		if ((OSM_LOG_IS_ACTIVE_V2(sm->p_log, OSM_LOG_VERBOSE) ||
		     sm->p_subn->opt.sa_db_dump) &&
		    !osm_sa_db_file_dump(sm->p_subn->p_osm))
			osm_opensm_report_event(sm->p_subn->p_osm,
						OSM_EVENT_ID_SA_DB_DUMPED,
						NULL);
	}

	/*
	 * Finally signal the subnet up event
	 */
	cl_event_signal(&sm->subnet_up_event);

	/* if we got a signal to force heavy sweep or errors
	 * in the middle of the sweep - try another sweep. */
	if (sm->p_subn->force_heavy_sweep)
		osm_sm_signal(sm, OSM_SIGNAL_SWEEP);

	/* Write a new copy of our persistent guid2mkey and neighbor
	 * databases */
	osm_db_store(sm->p_subn->p_g2m, sm->p_subn->opt.fsync_high_avail_files);
	osm_db_store(sm->p_subn->p_neighbor,
		     sm->p_subn->opt.fsync_high_avail_files);

	osm_opensm_report_event(sm->p_subn->p_osm, OSM_EVENT_ID_SUBNET_UP,
				NULL);
}

/*
 * Handler for OSM_SIGNAL_IDLE_TIME_PROCESS_REQUEST. Does nothing unless
 * the SM is MASTER. When multicast is not disabled it runs the multicast
 * manager with config_all == FALSE and waits for the outstanding QP0
 * MADs; the result of the wait is ignored.
 */
static void do_process_mgrp_queue(osm_sm_t * sm)
{
	if (sm->p_subn->sm_state != IB_SMINFO_STATE_MASTER)
		return;
	if (!sm->p_subn->opt.disable_multicast) {
		osm_mcast_mgr_process(sm, FALSE);
		wait_for_pending_transactions(&sm->p_subn->p_osm->stats);
	}
}

/*
 * Handler for OSM_SIGNAL_GUID_PROCESS_REQUEST: runs the GUID manager and
 * waits for the outstanding QP0 MADs; the result of the wait is ignored.
 * NOTE(review): unlike do_process_mgrp_queue() there is no SM state
 * check here - confirm that this is intended.
 */
static void do_process_guid_queue(osm_sm_t *sm)
{
	osm_guid_mgr_process(sm);
	wait_for_pending_transactions(&sm->p_subn->p_osm->stats);
}

/*
 * Entry point of the state manager: dispatch a signal to its handler.
 *   OSM_SIGNAL_SWEEP                      - do_sweep(), unless sweeping
 *                                           is disabled on the subnet
 *   OSM_SIGNAL_IDLE_TIME_PROCESS_REQUEST  - do_process_mgrp_queue()
 *   OSM_SIGNAL_GUID_PROCESS_REQUEST       - do_process_guid_queue()
 * Any other signal trips CL_ASSERT and is logged as ERR 3320.
 */
void osm_state_mgr_process(IN osm_sm_t * sm, IN osm_signal_t signal)
{
	CL_ASSERT(sm);

	OSM_LOG_ENTER(sm->p_log);

	OSM_LOG(sm->p_log, OSM_LOG_DEBUG, "Received signal %s in state %s\n",
		osm_get_sm_signal_str(signal),
		osm_get_sm_mgr_state_str(sm->p_subn->sm_state));

	switch (signal) {
	case OSM_SIGNAL_SWEEP:
		if (!sm->p_subn->sweeping_enabled) {
			OSM_LOG(sm->p_log, OSM_LOG_DEBUG, "sweeping disabled - "
				"ignoring signal %s in state %s\n",
				osm_get_sm_signal_str(signal),
				osm_get_sm_mgr_state_str(sm->p_subn->sm_state));
		} else
			do_sweep(sm);
		break;
	case OSM_SIGNAL_IDLE_TIME_PROCESS_REQUEST:
		do_process_mgrp_queue(sm);
		break;
	case OSM_SIGNAL_GUID_PROCESS_REQUEST:
		do_process_guid_queue(sm);
		break;
	default:
		CL_ASSERT(FALSE);
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3320: "
			"Invalid SM signal %u\n", signal);
		break;
	}

	OSM_LOG_EXIT(sm->p_log);
}