1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 * 21 * 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * PM8001 device state recovery routines 28 */ 29 30 #include <sys/scsi/adapters/pmcs/pmcs.h> 31 32 /* 33 * SAS Topology Configuration 34 */ 35 static void pmcs_ds_operational(pmcs_phy_t *pptr, pmcs_xscsi_t *tgt); 36 static void pmcs_handle_ds_recovery_error(pmcs_phy_t *phyp, 37 pmcs_xscsi_t *tgt, pmcs_hw_t *pwp, const char *func_name, 38 char *reason_string); 39 40 /* 41 * Get device state. Called with statlock and PHY lock held. 42 */ 43 static int 44 pmcs_get_dev_state(pmcs_hw_t *pwp, pmcs_phy_t *phyp, pmcs_xscsi_t *xp, 45 uint8_t *ds) 46 { 47 uint32_t htag, *ptr, msg[PMCS_MSG_SIZE]; 48 int result; 49 struct pmcwork *pwrk; 50 51 pmcs_prt(pwp, PMCS_PRT_DEBUG3, phyp, xp, "%s: tgt(0x%p)", __func__, 52 (void *)xp); 53 54 if (xp != NULL) { 55 ASSERT(mutex_owned(&xp->statlock)); 56 } 57 58 if (phyp == NULL) { 59 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, NULL, xp, 60 "%s: PHY is NULL", __func__); 61 return (-1); 62 } 63 ASSERT(mutex_owned(&phyp->phy_lock)); 64 65 pwrk = pmcs_gwork(pwp, PMCS_TAG_TYPE_WAIT, phyp); 66 if (pwrk == NULL) { 67 pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nowrk, __func__); 68 return (-1); 69 } 70 pwrk->arg = msg; 71 pwrk->dtype = phyp->dtype; 72 73 if (phyp->valid_device_id == 0) { 74 pmcs_pwork(pwp, pwrk); 75 pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, xp, 76 "%s: Invalid DeviceID", __func__); 77 return (-1); 78 } 79 htag = pwrk->htag; 80 msg[0] = LE_32(PMCS_HIPRI(pwp, PMCS_OQ_GENERAL, 81 PMCIN_GET_DEVICE_STATE)); 82 msg[1] = LE_32(pwrk->htag); 83 msg[2] = LE_32(phyp->device_id); 84 CLEAN_MESSAGE(msg, 3); 85 86 mutex_enter(&pwp->iqp_lock[PMCS_IQ_OTHER]); 87 ptr = GET_IQ_ENTRY(pwp, PMCS_IQ_OTHER); 88 if (ptr == NULL) { 89 mutex_exit(&pwp->iqp_lock[PMCS_IQ_OTHER]); 90 pmcs_pwork(pwp, pwrk); 91 pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nomsg, __func__); 92 return (-1); 93 } 94 COPY_MESSAGE(ptr, msg, PMCS_MSG_SIZE); 95 pwrk->state = PMCS_WORK_STATE_ONCHIP; 96 INC_IQ_ENTRY(pwp, PMCS_IQ_OTHER); 97 98 if (xp != NULL) { 99 mutex_exit(&xp->statlock); 100 } 101 pmcs_unlock_phy(phyp); 102 WAIT_FOR(pwrk, 1000, result); 103 pmcs_lock_phy(phyp); 104 pmcs_pwork(pwp, pwrk); 105 106 if (xp != NULL) { 107 mutex_enter(&xp->statlock); 108 } 109 110 if (result) { 111 pmcs_timed_out(pwp, htag, __func__); 112 pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, xp, 113 "%s: cmd timed out, returning", __func__); 114 return (-1); 115 } 116 if (LE_32(msg[2]) == 0) { 117 *ds = (uint8_t)(LE_32(msg[4])); 118 if (xp == NULL) { 119 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 120 "%s: retrieved_ds=0x%x", __func__, *ds); 121 } else if (*ds != xp->dev_state) { 122 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 123 "%s: retrieved_ds=0x%x, target_ds=0x%x", __func__, 124 *ds, xp->dev_state); 125 } 126 return (0); 127 } else { 128 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 129 "%s: cmd failed Status(0x%x), returning ", __func__, 130 LE_32(msg[2])); 131 return (-1); 132 } 133 } 134 135 /* 136 * Set device state. Called with target's statlock and PHY lock held. 137 */ 138 static int 139 pmcs_set_dev_state(pmcs_hw_t *pwp, pmcs_phy_t *phyp, pmcs_xscsi_t *xp, 140 uint8_t ds) 141 { 142 uint32_t htag, *ptr, msg[PMCS_MSG_SIZE]; 143 int result; 144 uint8_t pds, nds; 145 struct pmcwork *pwrk; 146 147 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 148 "%s: ds: 0x%x tgt: 0x%p phy: 0x%p", __func__, ds, (void *)xp, 149 (void *)phyp); 150 151 if (phyp == NULL) { 152 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, NULL, xp, 153 "%s: PHY is NULL", __func__); 154 return (-1); 155 } 156 157 pwrk = pmcs_gwork(pwp, PMCS_TAG_TYPE_WAIT, phyp); 158 if (pwrk == NULL) { 159 pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nowrk, __func__); 160 return (-1); 161 } 162 if (phyp->valid_device_id == 0) { 163 pmcs_pwork(pwp, pwrk); 164 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 165 "%s: Invalid DeviceID", __func__); 166 return (-1); 167 } 168 pwrk->arg = msg; 169 pwrk->dtype = phyp->dtype; 170 htag = pwrk->htag; 171 msg[0] = LE_32(PMCS_HIPRI(pwp, PMCS_OQ_GENERAL, 172 PMCIN_SET_DEVICE_STATE)); 173 msg[1] = LE_32(pwrk->htag); 174 msg[2] = LE_32(phyp->device_id); 175 msg[3] = LE_32(ds); 176 CLEAN_MESSAGE(msg, 4); 177 178 mutex_enter(&pwp->iqp_lock[PMCS_IQ_OTHER]); 179 ptr = GET_IQ_ENTRY(pwp, PMCS_IQ_OTHER); 180 if (ptr == NULL) { 181 mutex_exit(&pwp->iqp_lock[PMCS_IQ_OTHER]); 182 pmcs_pwork(pwp, pwrk); 183 pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nomsg, __func__); 184 return (-1); 185 } 186 COPY_MESSAGE(ptr, msg, PMCS_MSG_SIZE); 187 pwrk->state = PMCS_WORK_STATE_ONCHIP; 188 INC_IQ_ENTRY(pwp, PMCS_IQ_OTHER); 189 190 if (xp != NULL) { 191 mutex_exit(&xp->statlock); 192 } 193 pmcs_unlock_phy(phyp); 194 WAIT_FOR(pwrk, 1000, result); 195 pmcs_lock_phy(phyp); 196 pmcs_pwork(pwp, pwrk); 197 if (xp != NULL) { 198 mutex_enter(&xp->statlock); 199 } 200 201 if (result) { 202 pmcs_timed_out(pwp, htag, __func__); 203 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 204 "%s: cmd timed out, returning", __func__); 205 return (-1); 206 } 207 if (LE_32(msg[2]) == 0) { 208 pds = (uint8_t)(LE_32(msg[4]) >> 4); 209 nds = (uint8_t)(LE_32(msg[4]) & 0x0000000f); 210 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 211 "%s: previous_ds=0x%x, new_ds=0x%x", __func__, pds, nds); 212 if (xp != NULL) { 213 xp->dev_state = nds; 214 } 215 return (0); 216 } else { 217 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 218 "%s: cmd failed Status(0x%x), returning ", __func__, 219 LE_32(msg[2])); 220 return (-1); 221 } 222 } 223 224 static void 225 pmcs_ds_operational(pmcs_phy_t *pptr, pmcs_xscsi_t *tgt) 226 { 227 pmcs_hw_t *pwp; 228 229 ASSERT(pptr); 230 pwp = pptr->pwp; 231 232 if (tgt != NULL) { 233 tgt->recover_wait = 0; 234 } 235 pptr->ds_recovery_retries = 0; 236 237 if ((pptr->ds_prev_good_recoveries == 0) || 238 (ddi_get_lbolt() - pptr->last_good_recovery > 239 drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME))) { 240 pptr->last_good_recovery = ddi_get_lbolt(); 241 pptr->ds_prev_good_recoveries = 1; 242 } else if (ddi_get_lbolt() < pptr->last_good_recovery + 243 drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME)) { 244 pptr->ds_prev_good_recoveries++; 245 } else { 246 pmcs_handle_ds_recovery_error(pptr, tgt, pwp, __func__, 247 "Max recovery attempts reached. Declaring PHY dead"); 248 } 249 250 /* Don't bother to run the work queues if the PHY is dead */ 251 if (!pptr->dead) { 252 SCHEDULE_WORK(pwp, PMCS_WORK_RUN_QUEUES); 253 (void) ddi_taskq_dispatch(pwp->tq, pmcs_worker, 254 pwp, DDI_NOSLEEP); 255 } 256 } 257 258 void 259 pmcs_dev_state_recovery(pmcs_hw_t *pwp, pmcs_phy_t *phyp) 260 { 261 boolean_t reschedule = B_FALSE; 262 uint8_t ds, tgt_dev_state; 263 int rc; 264 pmcs_xscsi_t *tgt; 265 pmcs_phy_t *pptr, *pnext, *pchild; 266 267 /* 268 * First time, check to see if we're already performing recovery 269 */ 270 if (phyp == NULL) { 271 mutex_enter(&pwp->lock); 272 if (pwp->ds_err_recovering) { 273 mutex_exit(&pwp->lock); 274 SCHEDULE_WORK(pwp, PMCS_WORK_DS_ERR_RECOVERY); 275 return; 276 } 277 278 pwp->ds_err_recovering = 1; 279 pptr = pwp->root_phys; 280 mutex_exit(&pwp->lock); 281 } else { 282 pptr = phyp; 283 } 284 285 while (pptr) { 286 /* 287 * Since ds_err_recovering is set, we can be assured these 288 * PHYs won't disappear on us while we do this. 289 */ 290 pmcs_lock_phy(pptr); 291 pchild = pptr->children; 292 pnext = pptr->sibling; 293 pmcs_unlock_phy(pptr); 294 295 if (pchild) { 296 pmcs_dev_state_recovery(pwp, pchild); 297 } 298 299 tgt = NULL; 300 pmcs_lock_phy(pptr); 301 302 if (pptr->dead || !pptr->valid_device_id) { 303 goto next_phy; 304 } 305 306 if (pptr->iport && (pptr->iport->ua_state != UA_ACTIVE)) { 307 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, pptr->target, 308 "%s: No DS recovery on PHY %s, iport not active", 309 __func__, pptr->path); 310 goto next_phy; 311 } 312 313 tgt = pptr->target; 314 315 if (tgt != NULL) { 316 mutex_enter(&tgt->statlock); 317 if (tgt->recover_wait == 0) { 318 goto next_phy; 319 } 320 tgt_dev_state = tgt->dev_state; 321 } else { 322 tgt_dev_state = PMCS_DEVICE_STATE_NOT_AVAILABLE; 323 } 324 325 if (pptr->prev_recovery) { 326 if (ddi_get_lbolt() - pptr->prev_recovery < 327 drv_usectohz(PMCS_DS_RECOVERY_INTERVAL)) { 328 pmcs_prt(pwp, PMCS_PRT_DEBUG2, pptr, tgt, 329 "%s: DS recovery on PHY %s " 330 "re-invoked too soon. Skipping...", 331 __func__, pptr->path); 332 if ((tgt) && (tgt->recover_wait)) { 333 reschedule = B_TRUE; 334 } 335 goto next_phy; 336 } 337 } 338 pptr->prev_recovery = ddi_get_lbolt(); 339 340 /* 341 * Step 1: Put the device into the IN_RECOVERY state 342 */ 343 rc = pmcs_get_dev_state(pwp, pptr, tgt, &ds); 344 if (rc != 0) { 345 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 346 "%s: pmcs_get_dev_state on PHY %s " 347 "failed (rc=%d)", 348 __func__, pptr->path, rc); 349 350 pmcs_handle_ds_recovery_error(pptr, tgt, pwp, 351 __func__, "pmcs_get_dev_state"); 352 353 goto next_phy; 354 } 355 356 /* If the chip says it's operational, we're done */ 357 if (ds == PMCS_DEVICE_STATE_OPERATIONAL) { 358 pmcs_ds_operational(pptr, tgt); 359 goto next_phy; 360 } 361 362 if ((tgt_dev_state == ds) && 363 (ds == PMCS_DEVICE_STATE_IN_RECOVERY)) { 364 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt, 365 "%s: Target 0x%p already IN_RECOVERY", __func__, 366 (void *)tgt); 367 } else { 368 if (tgt != NULL) { 369 tgt->dev_state = ds; 370 } 371 tgt_dev_state = ds; 372 ds = PMCS_DEVICE_STATE_IN_RECOVERY; 373 rc = pmcs_send_err_recovery_cmd(pwp, ds, pptr, tgt); 374 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt, 375 "%s: pmcs_send_err_recovery_cmd " 376 "result(%d) tgt(0x%p) ds(0x%x) tgt->ds(0x%x)", 377 __func__, rc, (void *)tgt, ds, tgt_dev_state); 378 379 if (rc) { 380 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 381 "%s: pmcs_send_err_recovery_cmd to PHY %s " 382 "failed (rc=%d)", 383 __func__, pptr->path, rc); 384 385 pmcs_handle_ds_recovery_error(pptr, tgt, pwp, 386 __func__, "pmcs_send_err_recovery_cmd"); 387 388 goto next_phy; 389 } 390 } 391 392 /* 393 * Step 2: Perform a hard reset on the PHY. 394 */ 395 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt, 396 "%s: Issue HARD_RESET to PHY %s", __func__, 397 pptr->path); 398 /* 399 * Must release statlock here because pmcs_reset_phy 400 * will drop and reacquire the PHY lock. 401 */ 402 if (tgt != NULL) { 403 mutex_exit(&tgt->statlock); 404 } 405 rc = pmcs_reset_phy(pwp, pptr, PMCS_PHYOP_HARD_RESET); 406 if (tgt != NULL) { 407 mutex_enter(&tgt->statlock); 408 } 409 if (rc) { 410 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 411 "%s: HARD_RESET to PHY %s failed (rc=%d)", 412 __func__, pptr->path, rc); 413 414 pmcs_handle_ds_recovery_error(pptr, tgt, pwp, 415 __func__, "HARD_RESET"); 416 417 goto next_phy; 418 } 419 420 /* 421 * Step 3: Abort all I/Os to the device 422 */ 423 if (pptr->abort_all_start) { 424 while (pptr->abort_all_start) { 425 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 426 "%s: Waiting for outstanding ABORT_ALL on " 427 "PHY 0x%p", __func__, (void *)pptr); 428 cv_wait(&pptr->abort_all_cv, &pptr->phy_lock); 429 } 430 } else { 431 if (tgt != NULL) { 432 mutex_exit(&tgt->statlock); 433 } 434 rc = pmcs_abort(pwp, pptr, pptr->device_id, 1, 1); 435 if (tgt != NULL) { 436 mutex_enter(&tgt->statlock); 437 } 438 if (rc != 0) { 439 pptr->abort_pending = 1; 440 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 441 "%s: pmcs_abort to PHY %s failed (rc=%d)", 442 __func__, pptr->path, rc); 443 444 pmcs_handle_ds_recovery_error(pptr, tgt, 445 pwp, __func__, "pmcs_abort"); 446 447 goto next_phy; 448 } 449 } 450 451 /* 452 * Step 4: Set the device back to OPERATIONAL state 453 */ 454 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt, 455 "%s: Set PHY/tgt 0x%p/0x%p to OPERATIONAL state", 456 __func__, (void *)pptr, (void *)tgt); 457 rc = pmcs_set_dev_state(pwp, pptr, tgt, 458 PMCS_DEVICE_STATE_OPERATIONAL); 459 if (rc == 0) { 460 pmcs_ds_operational(pptr, tgt); 461 } else { 462 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt, 463 "%s: Failed to SET tgt 0x%p to OPERATIONAL state", 464 __func__, (void *)tgt); 465 466 pmcs_handle_ds_recovery_error(pptr, tgt, pwp, 467 __func__, "SET tgt to OPERATIONAL state"); 468 469 goto next_phy; 470 } 471 472 next_phy: 473 if (tgt) { 474 mutex_exit(&tgt->statlock); 475 } 476 pmcs_unlock_phy(pptr); 477 pptr = pnext; 478 } 479 480 /* 481 * Only clear ds_err_recovering if we're exiting for good and not 482 * just unwinding from recursion 483 */ 484 if (phyp == NULL) { 485 mutex_enter(&pwp->lock); 486 pwp->ds_err_recovering = 0; 487 mutex_exit(&pwp->lock); 488 } 489 490 if (reschedule) { 491 SCHEDULE_WORK(pwp, PMCS_WORK_DS_ERR_RECOVERY); 492 } 493 } 494 495 /* 496 * Called with target's statlock held (if target is non-NULL) and PHY lock held. 497 */ 498 int 499 pmcs_send_err_recovery_cmd(pmcs_hw_t *pwp, uint8_t dev_state, pmcs_phy_t *phyp, 500 pmcs_xscsi_t *tgt) 501 { 502 int rc = -1; 503 uint8_t tgt_dev_state = PMCS_DEVICE_STATE_NOT_AVAILABLE; 504 505 if (tgt != NULL) { 506 ASSERT(mutex_owned(&tgt->statlock)); 507 if (tgt->recovering) { 508 return (0); 509 } 510 511 tgt->recovering = 1; 512 tgt_dev_state = tgt->dev_state; 513 } 514 515 if (phyp == NULL) { 516 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, NULL, tgt, 517 "%s: PHY is NULL", __func__); 518 return (-1); 519 } 520 521 ASSERT(mutex_owned(&phyp->phy_lock)); 522 523 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 524 "%s: ds: 0x%x, tgt ds(0x%x)", __func__, dev_state, tgt_dev_state); 525 526 switch (dev_state) { 527 case PMCS_DEVICE_STATE_IN_RECOVERY: 528 if (tgt_dev_state == PMCS_DEVICE_STATE_IN_RECOVERY) { 529 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 530 "%s: Target 0x%p already IN_RECOVERY", __func__, 531 (void *)tgt); 532 rc = 0; /* This is not an error */ 533 goto no_action; 534 } 535 536 rc = pmcs_set_dev_state(pwp, phyp, tgt, 537 PMCS_DEVICE_STATE_IN_RECOVERY); 538 if (rc != 0) { 539 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 540 "%s(1): Failed to set tgt(0x%p) to IN_RECOVERY", 541 __func__, (void *)tgt); 542 } 543 544 break; 545 546 case PMCS_DEVICE_STATE_OPERATIONAL: 547 if (tgt_dev_state != PMCS_DEVICE_STATE_IN_RECOVERY) { 548 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 549 "%s: Target 0x%p not ready to go OPERATIONAL", 550 __func__, (void *)tgt); 551 goto no_action; 552 } 553 554 rc = pmcs_set_dev_state(pwp, phyp, tgt, 555 PMCS_DEVICE_STATE_OPERATIONAL); 556 if (tgt != NULL) { 557 tgt->reset_success = 1; 558 } 559 if (rc != 0) { 560 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 561 "%s(2): Failed to SET tgt(0x%p) to OPERATIONAL", 562 __func__, (void *)tgt); 563 if (tgt != NULL) { 564 tgt->reset_success = 0; 565 } 566 } 567 568 break; 569 570 case PMCS_DEVICE_STATE_NON_OPERATIONAL: 571 PHY_CHANGED(pwp, phyp); 572 RESTART_DISCOVERY(pwp); 573 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 574 "%s: Device at %s is non-operational", 575 __func__, phyp->path); 576 if (tgt != NULL) { 577 tgt->dev_state = PMCS_DEVICE_STATE_NON_OPERATIONAL; 578 } 579 rc = 0; 580 581 break; 582 583 default: 584 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 585 "%s: Invalid state requested (%d)", __func__, 586 dev_state); 587 break; 588 589 } 590 591 no_action: 592 if (tgt != NULL) { 593 tgt->recovering = 0; 594 } 595 return (rc); 596 } 597 598 /* 599 * Start ssp event recovery. We have to schedule recovery operation because 600 * it involves sending multiple commands to device and we should not do it 601 * in the interrupt context. 602 * If it is failure of a recovery command, let the recovery thread deal with it. 603 * Called with pmcwork lock held. 604 */ 605 void 606 pmcs_start_ssp_event_recovery(pmcs_hw_t *pwp, pmcwork_t *pwrk, uint32_t *iomb, 607 size_t amt) 608 { 609 pmcs_xscsi_t *tgt = pwrk->xp; 610 uint32_t event = LE_32(iomb[2]); 611 pmcs_phy_t *pptr = pwrk->phy; 612 uint32_t tag; 613 614 if (tgt != NULL) { 615 mutex_enter(&tgt->statlock); 616 if (!tgt->assigned) { 617 if (pptr) { 618 pmcs_dec_phy_ref_count(pptr); 619 } 620 pptr = NULL; 621 pwrk->phy = NULL; 622 } 623 mutex_exit(&tgt->statlock); 624 } 625 626 if (pptr == NULL) { 627 /* 628 * No target, need to run RE-DISCOVERY here. 629 */ 630 if (pwrk->state != PMCS_WORK_STATE_TIMED_OUT) { 631 pwrk->state = PMCS_WORK_STATE_INTR; 632 } 633 /* 634 * Although we cannot mark phy to force abort nor mark phy 635 * as changed, killing of a target would take care of aborting 636 * commands for the device. 637 */ 638 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 639 "%s: No valid target for event processing. Reconfigure.", 640 __func__); 641 pmcs_pwork(pwp, pwrk); 642 RESTART_DISCOVERY(pwp); 643 return; 644 } else { 645 pmcs_lock_phy(pptr); 646 if (tgt) { 647 mutex_enter(&tgt->statlock); 648 } 649 if (event == PMCOUT_STATUS_OPEN_CNX_ERROR_IT_NEXUS_LOSS) { 650 if (tgt && tgt->dev_state != 651 PMCS_DEVICE_STATE_NON_OPERATIONAL) { 652 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 653 "%s: Device at %s is non-operational", 654 __func__, pptr->path); 655 tgt->dev_state = 656 PMCS_DEVICE_STATE_NON_OPERATIONAL; 657 } 658 pptr->abort_pending = 1; 659 if (tgt) { 660 mutex_exit(&tgt->statlock); 661 } 662 pmcs_unlock_phy(pptr); 663 mutex_exit(&pwrk->lock); 664 SCHEDULE_WORK(pwp, PMCS_WORK_ABORT_HANDLE); 665 RESTART_DISCOVERY(pwp); 666 return; 667 } 668 669 /* 670 * If this command is run in WAIT mode, it is a failing recovery 671 * command. If so, just wake up recovery thread waiting for 672 * command completion. 673 */ 674 tag = PMCS_TAG_TYPE(pwrk->htag); 675 if (tag == PMCS_TAG_TYPE_WAIT) { 676 pwrk->htag |= PMCS_TAG_DONE; 677 if (pwrk->arg && amt) { 678 (void) memcpy(pwrk->arg, iomb, amt); 679 } 680 cv_signal(&pwrk->sleep_cv); 681 if (tgt) { 682 mutex_exit(&tgt->statlock); 683 } 684 pmcs_unlock_phy(pptr); 685 mutex_exit(&pwrk->lock); 686 return; 687 } 688 689 if (!tgt) { 690 pmcs_prt(pwp, PMCS_PRT_DEBUG1, pptr, NULL, 691 "%s: Not scheduling SSP event recovery for NULL tgt" 692 " pwrk(%p) tag(0x%x)", __func__, (void *)pwrk, 693 pwrk->htag); 694 return; 695 } 696 697 /* 698 * To recover from primary failures, 699 * we need to schedule handling events recovery. 700 */ 701 tgt->event_recovery = 1; 702 mutex_exit(&tgt->statlock); 703 pmcs_unlock_phy(pptr); 704 pwrk->ssp_event = event; 705 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 706 "%s: Scheduling SSP event recovery for tgt(0x%p) " 707 "pwrk(%p) tag(0x%x)", __func__, (void *)tgt, (void *)pwrk, 708 pwrk->htag); 709 mutex_exit(&pwrk->lock); 710 SCHEDULE_WORK(pwp, PMCS_WORK_SSP_EVT_RECOVERY); 711 } 712 713 /* Work cannot be completed until event recovery is completed. */ 714 } 715 716 /* 717 * SSP target event recovery 718 * Entered with a phy lock held 719 * Pwrk lock is not needed - pwrk is on the target aq and no other thread 720 * will do anything with it until this thread starts the chain of recovery. 721 * Statlock may be acquired and released. 722 */ 723 void 724 pmcs_tgt_event_recovery(pmcs_hw_t *pwp, pmcwork_t *pwrk) 725 { 726 pmcs_phy_t *pptr = pwrk->phy; 727 pmcs_cmd_t *sp = pwrk->arg; 728 pmcs_lun_t *lun = sp->cmd_lun; 729 pmcs_xscsi_t *tgt = pwrk->xp; 730 uint32_t event; 731 uint32_t htag; 732 uint32_t status; 733 uint8_t dstate; 734 int rv; 735 736 ASSERT(pwrk->arg != NULL); 737 ASSERT(pwrk->xp != NULL); 738 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 739 "%s: event recovery for target 0x%p", __func__, (void *)pwrk->xp); 740 htag = pwrk->htag; 741 event = pwrk->ssp_event; 742 pwrk->ssp_event = 0xffffffff; 743 if (event == PMCOUT_STATUS_XFER_ERR_BREAK || 744 event == PMCOUT_STATUS_XFER_ERR_PHY_NOT_READY || 745 event == PMCOUT_STATUS_XFER_ERROR_CMD_ISSUE_ACK_NAK_TIMEOUT) { 746 /* Command may be still pending on device */ 747 rv = pmcs_ssp_tmf(pwp, pptr, SAS_QUERY_TASK, htag, 748 lun->lun_num, &status); 749 if (rv != 0) { 750 goto out; 751 } 752 if (status == SAS_RSP_TMF_COMPLETE) { 753 /* Command NOT pending on a device */ 754 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 755 "%s: No pending command for tgt 0x%p", 756 __func__, (void *)tgt); 757 /* Nothing more to do, just abort it on chip */ 758 htag = 0; 759 } 760 } 761 /* 762 * All other events left the command pending in the host 763 * Send abort task and abort it on the chip 764 */ 765 if (htag != 0) { 766 if (pmcs_ssp_tmf(pwp, pptr, SAS_ABORT_TASK, htag, 767 lun->lun_num, &status)) 768 goto out; 769 } 770 (void) pmcs_abort(pwp, pptr, pwrk->htag, 0, 1); 771 /* 772 * Abort either took care of work completion, or put device in 773 * a recovery state 774 */ 775 return; 776 out: 777 /* Abort failed, do full device recovery */ 778 mutex_enter(&tgt->statlock); 779 if (!pmcs_get_dev_state(pwp, pptr, tgt, &dstate)) 780 tgt->dev_state = dstate; 781 782 if ((tgt->dev_state != PMCS_DEVICE_STATE_IN_RECOVERY) && 783 (tgt->dev_state != PMCS_DEVICE_STATE_NON_OPERATIONAL)) { 784 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 785 "%s: Setting IN_RECOVERY for tgt 0x%p", 786 __func__, (void *)tgt); 787 (void) pmcs_send_err_recovery_cmd(pwp, 788 PMCS_DEVICE_STATE_IN_RECOVERY, pptr, tgt); 789 } 790 mutex_exit(&tgt->statlock); 791 } 792 793 /* 794 * SSP event recovery task. 795 */ 796 void 797 pmcs_ssp_event_recovery(pmcs_hw_t *pwp) 798 { 799 int idx; 800 pmcs_xscsi_t *tgt; 801 pmcs_cmd_t *cp; 802 pmcwork_t *pwrk; 803 pmcs_phy_t *pphy; 804 int er_flag; 805 uint32_t idxpwrk; 806 807 restart: 808 for (idx = 0; idx < pwp->max_dev; idx++) { 809 mutex_enter(&pwp->lock); 810 tgt = pwp->targets[idx]; 811 mutex_exit(&pwp->lock); 812 if (tgt == NULL) { 813 continue; 814 } 815 816 mutex_enter(&tgt->statlock); 817 if (!tgt->assigned) { 818 mutex_exit(&tgt->statlock); 819 continue; 820 } 821 pphy = tgt->phy; 822 er_flag = tgt->event_recovery; 823 mutex_exit(&tgt->statlock); 824 825 if ((pphy == NULL) || (er_flag == 0)) { 826 continue; 827 } 828 829 pmcs_lock_phy(pphy); 830 mutex_enter(&tgt->statlock); 831 pmcs_prt(pwp, PMCS_PRT_DEBUG, pphy, tgt, 832 "%s: found target(0x%p)", __func__, (void *) tgt); 833 834 /* Check what cmd expects recovery */ 835 mutex_enter(&tgt->aqlock); 836 STAILQ_FOREACH(cp, &tgt->aq, cmd_next) { 837 /* 838 * Since work structure is on this target aq, and only 839 * this thread is accessing it now, we do not need 840 * to lock it 841 */ 842 idxpwrk = PMCS_TAG_INDEX(cp->cmd_tag); 843 pwrk = &pwp->work[idxpwrk]; 844 if (pwrk->htag != cp->cmd_tag) { 845 /* 846 * aq may contain TMF commands, so we 847 * may not find work structure with htag 848 */ 849 break; 850 } 851 if ((pwrk->ssp_event != 0) && 852 (pwrk->ssp_event != PMCS_REC_EVENT)) { 853 pmcs_prt(pwp, PMCS_PRT_DEBUG, pphy, tgt, 854 "%s: pwrk(%p) htag(0x%x)", 855 __func__, (void *) pwrk, cp->cmd_tag); 856 mutex_exit(&tgt->aqlock); 857 mutex_exit(&tgt->statlock); 858 pmcs_tgt_event_recovery(pwp, pwrk); 859 /* 860 * We dropped statlock, so restart the scan 861 */ 862 pmcs_unlock_phy(pphy); 863 goto restart; 864 } 865 } 866 mutex_exit(&tgt->aqlock); 867 tgt->event_recovery = 0; 868 pmcs_prt(pwp, PMCS_PRT_DEBUG, pphy, tgt, 869 "%s: end of SSP event recovery for target(0x%p)", 870 __func__, (void *) tgt); 871 mutex_exit(&tgt->statlock); 872 pmcs_unlock_phy(pphy); 873 } 874 pmcs_prt(pwp, PMCS_PRT_DEBUG, NULL, NULL, 875 "%s: end of SSP event recovery for pwp(0x%p)", __func__, 876 (void *) pwp); 877 } 878 879 void 880 pmcs_start_dev_state_recovery(pmcs_xscsi_t *xp, pmcs_phy_t *phyp) 881 { 882 ASSERT(mutex_owned(&xp->statlock)); 883 ASSERT(xp->pwp != NULL); 884 885 if (xp->recover_wait == 0) { 886 pmcs_prt(xp->pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 887 "%s: Start ds_recovery for tgt 0x%p/PHY 0x%p (%s)", 888 __func__, (void *)xp, (void *)phyp, phyp->path); 889 xp->recover_wait = 1; 890 891 /* 892 * Rather than waiting for the watchdog timer, we'll 893 * kick it right now. 894 */ 895 SCHEDULE_WORK(xp->pwp, PMCS_WORK_DS_ERR_RECOVERY); 896 (void) ddi_taskq_dispatch(xp->pwp->tq, pmcs_worker, xp->pwp, 897 DDI_NOSLEEP); 898 } 899 } 900 901 /* 902 * Increment the phy ds error retry count. 903 * If too many retries, mark phy dead and restart discovery; 904 * otherwise schedule ds recovery. 905 */ 906 static void 907 pmcs_handle_ds_recovery_error(pmcs_phy_t *phyp, pmcs_xscsi_t *tgt, 908 pmcs_hw_t *pwp, const char *func_name, char *reason_string) 909 { 910 ASSERT(mutex_owned(&phyp->phy_lock)); 911 ASSERT((tgt == NULL) || mutex_owned(&tgt->statlock)); 912 913 phyp->ds_recovery_retries++; 914 915 if (phyp->ds_recovery_retries > PMCS_MAX_DS_RECOVERY_RETRIES) { 916 pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, tgt, 917 "%s: retry limit reached after %s to PHY %s failed", 918 func_name, reason_string, phyp->path); 919 if (tgt != NULL) { 920 tgt->recover_wait = 0; 921 } 922 /* 923 * Mark the PHY as dead and it and its parent as changed, 924 * then restart discovery 925 */ 926 phyp->dead = 1; 927 PHY_CHANGED(pwp, phyp); 928 if (phyp->parent) 929 PHY_CHANGED(pwp, phyp->parent); 930 RESTART_DISCOVERY(pwp); 931 } else if ((phyp->ds_prev_good_recoveries > 932 PMCS_MAX_DS_RECOVERY_RETRIES) && 933 (phyp->last_good_recovery + drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME) 934 < ddi_get_lbolt())) { 935 pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, tgt, "%s: max number of " 936 "successful recoveries reached, declaring PHY %s dead", 937 __func__, phyp->path); 938 if (tgt != NULL) { 939 tgt->recover_wait = 0; 940 } 941 /* 942 * Mark the PHY as dead and its parent as changed, 943 * then restart discovery 944 */ 945 phyp->dead = 1; 946 PHY_CHANGED(pwp, phyp); 947 if (phyp->parent) 948 PHY_CHANGED(pwp, phyp->parent); 949 RESTART_DISCOVERY(pwp); 950 } else { 951 SCHEDULE_WORK(pwp, PMCS_WORK_DS_ERR_RECOVERY); 952 } 953 } 954