/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 *
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * PM8001 device state recovery routines
 */

#include <sys/scsi/adapters/pmcs/pmcs.h>

/*
 * Forward declarations for the file-local helpers defined below.
 */
static void pmcs_ds_operational(pmcs_phy_t *pptr, pmcs_xscsi_t *tgt);
static void pmcs_handle_ds_recovery_error(pmcs_phy_t *phyp,
    pmcs_xscsi_t *tgt, pmcs_hw_t *pwp, const char *func_name, int line,
    char *reason_string);

/*
 * Get device state. Called with statlock and PHY lock held.
42 */ 43 static int 44 pmcs_get_dev_state(pmcs_hw_t *pwp, pmcs_phy_t *phyp, pmcs_xscsi_t *xp, 45 uint8_t *ds) 46 { 47 uint32_t htag, *ptr, msg[PMCS_MSG_SIZE]; 48 int result; 49 struct pmcwork *pwrk; 50 51 pmcs_prt(pwp, PMCS_PRT_DEBUG3, phyp, xp, "%s: tgt(0x%p)", __func__, 52 (void *)xp); 53 54 if (xp != NULL) { 55 ASSERT(mutex_owned(&xp->statlock)); 56 } 57 58 if (phyp == NULL) { 59 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, NULL, xp, 60 "%s: PHY is NULL", __func__); 61 return (-1); 62 } 63 ASSERT(mutex_owned(&phyp->phy_lock)); 64 65 pwrk = pmcs_gwork(pwp, PMCS_TAG_TYPE_WAIT, phyp); 66 if (pwrk == NULL) { 67 pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nowrk, __func__); 68 return (-1); 69 } 70 pwrk->arg = msg; 71 pwrk->dtype = phyp->dtype; 72 73 if (phyp->valid_device_id == 0) { 74 pmcs_pwork(pwp, pwrk); 75 pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, xp, 76 "%s: Invalid DeviceID", __func__); 77 return (-1); 78 } 79 htag = pwrk->htag; 80 msg[0] = LE_32(PMCS_HIPRI(pwp, PMCS_OQ_GENERAL, 81 PMCIN_GET_DEVICE_STATE)); 82 msg[1] = LE_32(pwrk->htag); 83 msg[2] = LE_32(phyp->device_id); 84 85 mutex_enter(&pwp->iqp_lock[PMCS_IQ_OTHER]); 86 ptr = GET_IQ_ENTRY(pwp, PMCS_IQ_OTHER); 87 if (ptr == NULL) { 88 mutex_exit(&pwp->iqp_lock[PMCS_IQ_OTHER]); 89 pmcs_pwork(pwp, pwrk); 90 pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nomsg, __func__); 91 return (-1); 92 } 93 COPY_MESSAGE(ptr, msg, PMCS_MSG_SIZE); 94 pwrk->state = PMCS_WORK_STATE_ONCHIP; 95 INC_IQ_ENTRY(pwp, PMCS_IQ_OTHER); 96 97 if (xp != NULL) { 98 mutex_exit(&xp->statlock); 99 } 100 pmcs_unlock_phy(phyp); 101 WAIT_FOR(pwrk, 1000, result); 102 pmcs_lock_phy(phyp); 103 pmcs_pwork(pwp, pwrk); 104 105 if (xp != NULL) { 106 mutex_enter(&xp->statlock); 107 } 108 109 if (result) { 110 pmcs_timed_out(pwp, htag, __func__); 111 pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, xp, 112 "%s: cmd timed out, returning", __func__); 113 return (-1); 114 } 115 if (LE_32(msg[2]) == 0) { 116 *ds = (uint8_t)(LE_32(msg[4])); 117 if (xp == NULL) { 118 pmcs_prt(pwp, 
PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 119 "%s: retrieved_ds=0x%x", __func__, *ds); 120 } else if (*ds != xp->dev_state) { 121 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 122 "%s: retrieved_ds=0x%x, target_ds=0x%x", __func__, 123 *ds, xp->dev_state); 124 } 125 return (0); 126 } else { 127 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 128 "%s: cmd failed Status(0x%x), returning ", __func__, 129 LE_32(msg[2])); 130 return (-1); 131 } 132 } 133 134 /* 135 * Set device state. Called with target's statlock and PHY lock held. 136 */ 137 static int 138 pmcs_set_dev_state(pmcs_hw_t *pwp, pmcs_phy_t *phyp, pmcs_xscsi_t *xp, 139 uint8_t ds) 140 { 141 uint32_t htag, *ptr, msg[PMCS_MSG_SIZE]; 142 int result; 143 uint8_t pds, nds; 144 struct pmcwork *pwrk; 145 146 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 147 "%s: ds: 0x%x tgt: 0x%p phy: 0x%p", __func__, ds, (void *)xp, 148 (void *)phyp); 149 150 if (phyp == NULL) { 151 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, NULL, xp, 152 "%s: PHY is NULL", __func__); 153 return (-1); 154 } 155 156 pwrk = pmcs_gwork(pwp, PMCS_TAG_TYPE_WAIT, phyp); 157 if (pwrk == NULL) { 158 pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nowrk, __func__); 159 return (-1); 160 } 161 if (phyp->valid_device_id == 0) { 162 pmcs_pwork(pwp, pwrk); 163 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 164 "%s: Invalid DeviceID", __func__); 165 return (-1); 166 } 167 pwrk->arg = msg; 168 pwrk->dtype = phyp->dtype; 169 htag = pwrk->htag; 170 msg[0] = LE_32(PMCS_HIPRI(pwp, PMCS_OQ_GENERAL, 171 PMCIN_SET_DEVICE_STATE)); 172 msg[1] = LE_32(pwrk->htag); 173 msg[2] = LE_32(phyp->device_id); 174 msg[3] = LE_32(ds); 175 176 mutex_enter(&pwp->iqp_lock[PMCS_IQ_OTHER]); 177 ptr = GET_IQ_ENTRY(pwp, PMCS_IQ_OTHER); 178 if (ptr == NULL) { 179 mutex_exit(&pwp->iqp_lock[PMCS_IQ_OTHER]); 180 pmcs_pwork(pwp, pwrk); 181 pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nomsg, __func__); 182 return (-1); 183 } 184 COPY_MESSAGE(ptr, msg, PMCS_MSG_SIZE); 185 pwrk->state = 
PMCS_WORK_STATE_ONCHIP; 186 INC_IQ_ENTRY(pwp, PMCS_IQ_OTHER); 187 188 if (xp != NULL) { 189 mutex_exit(&xp->statlock); 190 } 191 pmcs_unlock_phy(phyp); 192 WAIT_FOR(pwrk, 1000, result); 193 pmcs_lock_phy(phyp); 194 pmcs_pwork(pwp, pwrk); 195 if (xp != NULL) { 196 mutex_enter(&xp->statlock); 197 } 198 199 if (result) { 200 pmcs_timed_out(pwp, htag, __func__); 201 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 202 "%s: cmd timed out, returning", __func__); 203 return (-1); 204 } 205 if (LE_32(msg[2]) == 0) { 206 pds = (uint8_t)(LE_32(msg[4]) >> 4); 207 nds = (uint8_t)(LE_32(msg[4]) & 0x0000000f); 208 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 209 "%s: previous_ds=0x%x, new_ds=0x%x", __func__, pds, nds); 210 if (xp != NULL) { 211 xp->dev_state = nds; 212 } 213 return (0); 214 } else { 215 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 216 "%s: cmd failed Status(0x%x), returning ", __func__, 217 LE_32(msg[2])); 218 return (-1); 219 } 220 } 221 222 static void 223 pmcs_ds_operational(pmcs_phy_t *pptr, pmcs_xscsi_t *tgt) 224 { 225 pmcs_hw_t *pwp; 226 227 ASSERT(pptr); 228 pwp = pptr->pwp; 229 230 if (tgt != NULL) { 231 tgt->recover_wait = 0; 232 } 233 pptr->ds_recovery_retries = 0; 234 235 if ((pptr->ds_prev_good_recoveries == 0) || 236 (ddi_get_lbolt() - pptr->last_good_recovery > 237 drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME))) { 238 pptr->last_good_recovery = ddi_get_lbolt(); 239 pptr->ds_prev_good_recoveries = 1; 240 } else if (ddi_get_lbolt() < pptr->last_good_recovery + 241 drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME)) { 242 pptr->ds_prev_good_recoveries++; 243 } else { 244 pmcs_handle_ds_recovery_error(pptr, tgt, pwp, 245 __func__, __LINE__, "Max recovery" 246 "attempts reached. 
Declaring PHY dead"); 247 } 248 249 /* Don't bother to run the work queues if the PHY is dead */ 250 if (!pptr->dead) { 251 SCHEDULE_WORK(pwp, PMCS_WORK_RUN_QUEUES); 252 (void) ddi_taskq_dispatch(pwp->tq, pmcs_worker, 253 pwp, DDI_NOSLEEP); 254 } 255 } 256 257 void 258 pmcs_dev_state_recovery(pmcs_hw_t *pwp, pmcs_phy_t *phyp) 259 { 260 boolean_t reschedule = B_FALSE; 261 uint8_t ds, tgt_dev_state; 262 int rc; 263 pmcs_xscsi_t *tgt; 264 pmcs_phy_t *pptr, *pnext, *pchild; 265 266 /* 267 * First time, check to see if we're already performing recovery 268 */ 269 if (phyp == NULL) { 270 mutex_enter(&pwp->lock); 271 if (pwp->ds_err_recovering) { 272 mutex_exit(&pwp->lock); 273 SCHEDULE_WORK(pwp, PMCS_WORK_DS_ERR_RECOVERY); 274 return; 275 } 276 277 pwp->ds_err_recovering = 1; 278 pptr = pwp->root_phys; 279 mutex_exit(&pwp->lock); 280 } else { 281 pptr = phyp; 282 } 283 284 while (pptr) { 285 /* 286 * Since ds_err_recovering is set, we can be assured these 287 * PHYs won't disappear on us while we do this. 288 */ 289 pmcs_lock_phy(pptr); 290 pchild = pptr->children; 291 pnext = pptr->sibling; 292 pmcs_unlock_phy(pptr); 293 294 if (pchild) { 295 pmcs_dev_state_recovery(pwp, pchild); 296 } 297 298 tgt = NULL; 299 pmcs_lock_phy(pptr); 300 301 if (pptr->dead) { 302 goto next_phy; 303 } 304 305 tgt = pptr->target; 306 307 if (tgt != NULL) { 308 mutex_enter(&tgt->statlock); 309 if (tgt->recover_wait == 0) { 310 goto next_phy; 311 } 312 tgt_dev_state = tgt->dev_state; 313 } else { 314 tgt_dev_state = PMCS_DEVICE_STATE_NOT_AVAILABLE; 315 } 316 317 if (pptr->prev_recovery) { 318 if (ddi_get_lbolt() - pptr->prev_recovery < 319 drv_usectohz(PMCS_DS_RECOVERY_INTERVAL)) { 320 pmcs_prt(pwp, PMCS_PRT_DEBUG2, pptr, tgt, 321 "%s: DS recovery on PHY %s " 322 "re-invoked too soon. 
Skipping...", 323 __func__, pptr->path); 324 if ((tgt) && (tgt->recover_wait)) { 325 reschedule = B_TRUE; 326 } 327 goto next_phy; 328 } 329 } 330 pptr->prev_recovery = ddi_get_lbolt(); 331 332 /* 333 * Step 1: Put the device into the IN_RECOVERY state 334 */ 335 rc = pmcs_get_dev_state(pwp, pptr, tgt, &ds); 336 if (rc != 0) { 337 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 338 "%s: pmcs_get_dev_state on PHY %s " 339 "failed (rc=%d)", 340 __func__, pptr->path, rc); 341 342 pmcs_handle_ds_recovery_error(pptr, tgt, pwp, 343 __func__, __LINE__, "pmcs_get_dev_state"); 344 345 goto next_phy; 346 } 347 348 /* If the chip says it's operational, we're done */ 349 if (ds == PMCS_DEVICE_STATE_OPERATIONAL) { 350 pmcs_ds_operational(pptr, tgt); 351 goto next_phy; 352 } 353 354 if ((tgt_dev_state == ds) && 355 (ds == PMCS_DEVICE_STATE_IN_RECOVERY)) { 356 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt, 357 "%s: Target 0x%p already IN_RECOVERY", __func__, 358 (void *)tgt); 359 } else { 360 if (tgt != NULL) { 361 tgt->dev_state = ds; 362 } 363 tgt_dev_state = ds; 364 ds = PMCS_DEVICE_STATE_IN_RECOVERY; 365 rc = pmcs_send_err_recovery_cmd(pwp, ds, pptr, tgt); 366 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt, 367 "%s: pmcs_send_err_recovery_cmd " 368 "result(%d) tgt(0x%p) ds(0x%x) tgt->ds(0x%x)", 369 __func__, rc, (void *)tgt, ds, tgt_dev_state); 370 371 if (rc) { 372 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 373 "%s: pmcs_send_err_recovery_cmd to PHY %s " 374 "failed (rc=%d)", 375 __func__, pptr->path, rc); 376 377 pmcs_handle_ds_recovery_error(pptr, tgt, pwp, 378 __func__, __LINE__, 379 "pmcs_send_err_recovery_cmd"); 380 381 goto next_phy; 382 } 383 } 384 385 /* 386 * Step 2: Perform a hard reset on the PHY. 387 * Note we do not reset HBA PHYs. 
388 */ 389 if (pptr->level > 0) { 390 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt, 391 "%s: Issue HARD_RESET to PHY %s", __func__, 392 pptr->path); 393 /* 394 * Must release statlock here because pmcs_reset_phy 395 * will drop and reacquire the PHY lock. 396 */ 397 if (tgt != NULL) { 398 mutex_exit(&tgt->statlock); 399 } 400 rc = pmcs_reset_phy(pwp, pptr, PMCS_PHYOP_HARD_RESET); 401 if (tgt != NULL) { 402 mutex_enter(&tgt->statlock); 403 } 404 if (rc) { 405 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 406 "%s: HARD_RESET to PHY %s failed (rc=%d)", 407 __func__, pptr->path, rc); 408 409 pmcs_handle_ds_recovery_error(pptr, tgt, pwp, 410 __func__, __LINE__, "HARD_RESET"); 411 412 goto next_phy; 413 } 414 } else { 415 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt, 416 "%s: Not resetting HBA PHY...", __func__); 417 } 418 419 /* 420 * Step 3: Abort all I/Os to the device 421 */ 422 if (pptr->abort_all_start) { 423 while (pptr->abort_all_start) { 424 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 425 "%s: Waiting for outstanding ABORT_ALL on " 426 "PHY 0x%p", __func__, (void *)pptr); 427 cv_wait(&pptr->abort_all_cv, &pptr->phy_lock); 428 } 429 } else { 430 if (tgt != NULL) { 431 mutex_exit(&tgt->statlock); 432 } 433 rc = pmcs_abort(pwp, pptr, pptr->device_id, 1, 1); 434 if (tgt != NULL) { 435 mutex_enter(&tgt->statlock); 436 } 437 if (rc != 0) { 438 pptr->abort_pending = 1; 439 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 440 "%s: pmcs_abort to PHY %s failed (rc=%d)", 441 __func__, pptr->path, rc); 442 443 pmcs_handle_ds_recovery_error(pptr, tgt, 444 pwp, __func__, __LINE__, "pmcs_abort"); 445 446 goto next_phy; 447 } 448 } 449 450 /* 451 * Step 4: Set the device back to OPERATIONAL state 452 */ 453 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt, 454 "%s: Set PHY/tgt 0x%p/0x%p to OPERATIONAL state", 455 __func__, (void *)pptr, (void *)tgt); 456 rc = pmcs_set_dev_state(pwp, pptr, tgt, 457 PMCS_DEVICE_STATE_OPERATIONAL); 458 if (rc == 0) { 459 pmcs_ds_operational(pptr, tgt); 
460 } else { 461 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt, 462 "%s: Failed to SET tgt 0x%p to OPERATIONAL state", 463 __func__, (void *)tgt); 464 465 pmcs_handle_ds_recovery_error(pptr, tgt, pwp, 466 __func__, __LINE__, "SET tgt to OPERATIONAL state"); 467 468 goto next_phy; 469 } 470 471 next_phy: 472 if (tgt) { 473 mutex_exit(&tgt->statlock); 474 } 475 pmcs_unlock_phy(pptr); 476 pptr = pnext; 477 } 478 479 /* 480 * Only clear ds_err_recovering if we're exiting for good and not 481 * just unwinding from recursion 482 */ 483 if (phyp == NULL) { 484 mutex_enter(&pwp->lock); 485 pwp->ds_err_recovering = 0; 486 mutex_exit(&pwp->lock); 487 } 488 489 if (reschedule) { 490 SCHEDULE_WORK(pwp, PMCS_WORK_DS_ERR_RECOVERY); 491 } 492 } 493 494 /* 495 * Called with target's statlock held (if target is non-NULL) and PHY lock held. 496 */ 497 int 498 pmcs_send_err_recovery_cmd(pmcs_hw_t *pwp, uint8_t dev_state, pmcs_phy_t *phyp, 499 pmcs_xscsi_t *tgt) 500 { 501 int rc = -1; 502 uint8_t tgt_dev_state = PMCS_DEVICE_STATE_NOT_AVAILABLE; 503 504 if (tgt != NULL) { 505 ASSERT(mutex_owned(&tgt->statlock)); 506 if (tgt->recovering) { 507 return (0); 508 } 509 510 tgt->recovering = 1; 511 tgt_dev_state = tgt->dev_state; 512 } 513 514 if (phyp == NULL) { 515 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, NULL, tgt, 516 "%s: PHY is NULL", __func__); 517 return (-1); 518 } 519 520 ASSERT(mutex_owned(&phyp->phy_lock)); 521 522 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 523 "%s: ds: 0x%x, tgt ds(0x%x)", __func__, dev_state, tgt_dev_state); 524 525 switch (dev_state) { 526 case PMCS_DEVICE_STATE_IN_RECOVERY: 527 if (tgt_dev_state == PMCS_DEVICE_STATE_IN_RECOVERY) { 528 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 529 "%s: Target 0x%p already IN_RECOVERY", __func__, 530 (void *)tgt); 531 rc = 0; /* This is not an error */ 532 goto no_action; 533 } 534 535 rc = pmcs_set_dev_state(pwp, phyp, tgt, 536 PMCS_DEVICE_STATE_IN_RECOVERY); 537 if (rc != 0) { 538 pmcs_prt(pwp, 
PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 539 "%s(1): Failed to set tgt(0x%p) to IN_RECOVERY", 540 __func__, (void *)tgt); 541 } 542 543 break; 544 545 case PMCS_DEVICE_STATE_OPERATIONAL: 546 if (tgt_dev_state != PMCS_DEVICE_STATE_IN_RECOVERY) { 547 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 548 "%s: Target 0x%p not ready to go OPERATIONAL", 549 __func__, (void *)tgt); 550 goto no_action; 551 } 552 553 rc = pmcs_set_dev_state(pwp, phyp, tgt, 554 PMCS_DEVICE_STATE_OPERATIONAL); 555 if (tgt != NULL) { 556 tgt->reset_success = 1; 557 } 558 if (rc != 0) { 559 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 560 "%s(2): Failed to SET tgt(0x%p) to OPERATIONAL", 561 __func__, (void *)tgt); 562 if (tgt != NULL) { 563 tgt->reset_success = 0; 564 } 565 } 566 567 break; 568 569 case PMCS_DEVICE_STATE_NON_OPERATIONAL: 570 PHY_CHANGED(pwp, phyp); 571 RESTART_DISCOVERY(pwp); 572 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 573 "%s: Device at %s is non-operational", 574 __func__, phyp->path); 575 if (tgt != NULL) { 576 tgt->dev_state = PMCS_DEVICE_STATE_NON_OPERATIONAL; 577 } 578 rc = 0; 579 580 break; 581 582 default: 583 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 584 "%s: Invalid state requested (%d)", __func__, 585 dev_state); 586 break; 587 588 } 589 590 no_action: 591 if (tgt != NULL) { 592 tgt->recovering = 0; 593 } 594 return (rc); 595 } 596 597 /* 598 * Start ssp event recovery. We have to schedule recovery operation because 599 * it involves sending multiple commands to device and we should not do it 600 * in the interrupt context. 601 * If it is failure of a recovery command, let the recovery thread deal with it. 602 * Called with pmcwork lock held. 
603 */ 604 605 void 606 pmcs_start_ssp_event_recovery(pmcs_hw_t *pwp, pmcwork_t *pwrk, uint32_t *iomb, 607 size_t amt) 608 { 609 pmcs_xscsi_t *tgt = pwrk->xp; 610 uint32_t event = LE_32(iomb[2]); 611 pmcs_phy_t *pptr = pwrk->phy; 612 uint32_t tag; 613 614 if (tgt != NULL) { 615 mutex_enter(&tgt->statlock); 616 if (!tgt->assigned) { 617 if (pptr) { 618 pmcs_dec_phy_ref_count(pptr); 619 } 620 pptr = NULL; 621 pwrk->phy = NULL; 622 } 623 mutex_exit(&tgt->statlock); 624 } 625 if (pptr == NULL) { 626 /* 627 * No target, need to run RE-DISCOVERY here. 628 */ 629 if (pwrk->state != PMCS_WORK_STATE_TIMED_OUT) { 630 pwrk->state = PMCS_WORK_STATE_INTR; 631 } 632 /* 633 * Although we cannot mark phy to force abort nor mark phy 634 * as changed, killing of a target would take care of aborting 635 * commands for the device. 636 */ 637 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 638 "%s: No valid target for event processing. Reconfigure.", 639 __func__); 640 pmcs_pwork(pwp, pwrk); 641 RESTART_DISCOVERY(pwp); 642 return; 643 } else { 644 pmcs_lock_phy(pptr); 645 mutex_enter(&tgt->statlock); 646 if (event == PMCOUT_STATUS_OPEN_CNX_ERROR_IT_NEXUS_LOSS) { 647 if (tgt->dev_state != 648 PMCS_DEVICE_STATE_NON_OPERATIONAL) { 649 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 650 "%s: Device at %s is non-operational", 651 __func__, pptr->path); 652 tgt->dev_state = 653 PMCS_DEVICE_STATE_NON_OPERATIONAL; 654 } 655 pptr->abort_pending = 1; 656 mutex_exit(&tgt->statlock); 657 pmcs_unlock_phy(pptr); 658 mutex_exit(&pwrk->lock); 659 SCHEDULE_WORK(pwp, PMCS_WORK_ABORT_HANDLE); 660 RESTART_DISCOVERY(pwp); 661 return; 662 } 663 664 /* 665 * If this command is run in WAIT mode, it is a failing recovery 666 * command. If so, just wake up recovery thread waiting for 667 * command completion. 
668 */ 669 tag = PMCS_TAG_TYPE(pwrk->htag); 670 if (tag == PMCS_TAG_TYPE_WAIT) { 671 pwrk->htag |= PMCS_TAG_DONE; 672 if (pwrk->arg && amt) { 673 (void) memcpy(pwrk->arg, iomb, amt); 674 } 675 cv_signal(&pwrk->sleep_cv); 676 mutex_exit(&tgt->statlock); 677 pmcs_unlock_phy(pptr); 678 mutex_exit(&pwrk->lock); 679 return; 680 } 681 682 /* 683 * To recover from primary failures, 684 * we need to schedule handling events recovery. 685 */ 686 tgt->event_recovery = 1; 687 mutex_exit(&tgt->statlock); 688 pmcs_unlock_phy(pptr); 689 pwrk->ssp_event = event; 690 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 691 "%s: Scheduling SSP event recovery for tgt(0x%p) " 692 "pwrk(%p) tag(0x%x)", __func__, (void *)tgt, (void *)pwrk, 693 pwrk->htag); 694 mutex_exit(&pwrk->lock); 695 SCHEDULE_WORK(pwp, PMCS_WORK_SSP_EVT_RECOVERY); 696 } 697 698 /* Work cannot be completed until event recovery is completed. */ 699 } 700 701 /* 702 * SSP target event recovery 703 * Entered with a phy lock held 704 * Pwrk lock is not needed - pwrk is on the target aq and no other thread 705 * will do anything with it until this thread starts the chain of recovery. 706 * Statlock may be acquired and released. 
707 */ 708 709 void 710 pmcs_tgt_event_recovery(pmcs_hw_t *pwp, pmcwork_t *pwrk) 711 { 712 pmcs_phy_t *pptr = pwrk->phy; 713 pmcs_cmd_t *sp = pwrk->arg; 714 pmcs_lun_t *lun = sp->cmd_lun; 715 pmcs_xscsi_t *tgt = pwrk->xp; 716 uint32_t event; 717 uint32_t htag; 718 uint32_t status; 719 uint8_t dstate; 720 int rv; 721 722 ASSERT(pwrk->arg != NULL); 723 ASSERT(pwrk->xp != NULL); 724 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 725 "%s: event recovery for target 0x%p", __func__, (void *)pwrk->xp); 726 htag = pwrk->htag; 727 event = pwrk->ssp_event; 728 pwrk->ssp_event = 0xffffffff; 729 if (event == PMCOUT_STATUS_XFER_ERR_BREAK || 730 event == PMCOUT_STATUS_XFER_ERR_PHY_NOT_READY || 731 event == PMCOUT_STATUS_XFER_ERROR_CMD_ISSUE_ACK_NAK_TIMEOUT) { 732 /* Command may be still pending on device */ 733 rv = pmcs_ssp_tmf(pwp, pptr, SAS_QUERY_TASK, htag, 734 lun->lun_num, &status); 735 if (rv != 0) { 736 goto out; 737 } 738 if (status == SAS_RSP_TMF_COMPLETE) { 739 /* Command NOT pending on a device */ 740 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 741 "%s: No pending command for tgt 0x%p", 742 __func__, (void *)tgt); 743 /* Nothing more to do, just abort it on chip */ 744 htag = 0; 745 } 746 } 747 /* 748 * All other events left the command pending in the host 749 * Send abort task and abort it on the chip 750 */ 751 if (htag != 0) { 752 if (pmcs_ssp_tmf(pwp, pptr, SAS_ABORT_TASK, htag, 753 lun->lun_num, &status)) 754 goto out; 755 } 756 (void) pmcs_abort(pwp, pptr, pwrk->htag, 0, 1); 757 /* 758 * Abort either took care of work completion, or put device in 759 * a recovery state 760 */ 761 return; 762 out: 763 /* Abort failed, do full device recovery */ 764 mutex_enter(&tgt->statlock); 765 if (!pmcs_get_dev_state(pwp, pptr, tgt, &dstate)) 766 tgt->dev_state = dstate; 767 768 if ((tgt->dev_state != PMCS_DEVICE_STATE_IN_RECOVERY) && 769 (tgt->dev_state != PMCS_DEVICE_STATE_NON_OPERATIONAL)) { 770 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 771 "%s: Setting IN_RECOVERY for tgt 0x%p", 
772 __func__, (void *)tgt); 773 (void) pmcs_send_err_recovery_cmd(pwp, 774 PMCS_DEVICE_STATE_IN_RECOVERY, pptr, tgt); 775 } 776 mutex_exit(&tgt->statlock); 777 } 778 779 /* 780 * SSP event recovery task. 781 */ 782 void 783 pmcs_ssp_event_recovery(pmcs_hw_t *pwp) 784 { 785 int idx; 786 pmcs_xscsi_t *tgt; 787 pmcs_cmd_t *cp; 788 pmcwork_t *pwrk; 789 pmcs_phy_t *pphy; 790 int er_flag; 791 uint32_t idxpwrk; 792 793 restart: 794 for (idx = 0; idx < pwp->max_dev; idx++) { 795 mutex_enter(&pwp->lock); 796 tgt = pwp->targets[idx]; 797 mutex_exit(&pwp->lock); 798 if (tgt != NULL) { 799 mutex_enter(&tgt->statlock); 800 if (!tgt->assigned) { 801 mutex_exit(&tgt->statlock); 802 continue; 803 } 804 pphy = tgt->phy; 805 er_flag = tgt->event_recovery; 806 mutex_exit(&tgt->statlock); 807 if (pphy != NULL && er_flag != 0) { 808 pmcs_lock_phy(pphy); 809 mutex_enter(&tgt->statlock); 810 pmcs_prt(pwp, PMCS_PRT_DEBUG, pphy, tgt, 811 "%s: found target(0x%p)", __func__, 812 (void *) tgt); 813 814 /* Check what cmd expects recovery */ 815 mutex_enter(&tgt->aqlock); 816 STAILQ_FOREACH(cp, &tgt->aq, cmd_next) { 817 /* 818 * Since work structure is on this 819 * target aq, and only this thread 820 * is accessing it now, we do not need 821 * to lock it 822 */ 823 idxpwrk = PMCS_TAG_INDEX(cp->cmd_tag); 824 pwrk = &pwp->work[idxpwrk]; 825 if (pwrk->htag != cp->cmd_tag) { 826 /* 827 * aq may contain TMF commands, 828 * so we may not find work 829 * structure with htag 830 */ 831 break; 832 } 833 if (pwrk->ssp_event != 0 && 834 pwrk->ssp_event != 835 PMCS_REC_EVENT) { 836 pmcs_prt(pwp, 837 PMCS_PRT_DEBUG, pphy, tgt, 838 "%s: pwrk(%p) ctag(0x%x)", 839 __func__, (void *) pwrk, 840 cp->cmd_tag); 841 mutex_exit(&tgt->aqlock); 842 mutex_exit(&tgt->statlock); 843 pmcs_tgt_event_recovery( 844 pwp, pwrk); 845 /* 846 * We dropped statlock, so 847 * restart scanning from scratch 848 */ 849 pmcs_unlock_phy(pphy); 850 goto restart; 851 } 852 } 853 mutex_exit(&tgt->aqlock); 854 tgt->event_recovery = 0; 855 
pmcs_prt(pwp, PMCS_PRT_DEBUG, pphy, tgt, 856 "%s: end of SSP event recovery for " 857 "target(0x%p)", __func__, (void *) tgt); 858 mutex_exit(&tgt->statlock); 859 pmcs_unlock_phy(pphy); 860 } 861 } 862 } 863 pmcs_prt(pwp, PMCS_PRT_DEBUG, NULL, NULL, 864 "%s: end of SSP event recovery for pwp(0x%p)", __func__, 865 (void *) pwp); 866 } 867 868 void 869 pmcs_start_dev_state_recovery(pmcs_xscsi_t *xp, pmcs_phy_t *phyp) 870 { 871 ASSERT(mutex_owned(&xp->statlock)); 872 ASSERT(xp->pwp != NULL); 873 874 if (xp->recover_wait == 0) { 875 pmcs_prt(xp->pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 876 "%s: Start ds_recovery for tgt 0x%p/PHY 0x%p (%s)", 877 __func__, (void *)xp, (void *)phyp, phyp->path); 878 xp->recover_wait = 1; 879 880 /* 881 * Rather than waiting for the watchdog timer, we'll 882 * kick it right now. 883 */ 884 SCHEDULE_WORK(xp->pwp, PMCS_WORK_DS_ERR_RECOVERY); 885 (void) ddi_taskq_dispatch(xp->pwp->tq, pmcs_worker, xp->pwp, 886 DDI_NOSLEEP); 887 } 888 } 889 890 /* 891 * Increment the phy ds error retry count. 892 * If too many retries, mark phy dead and restart discovery; 893 * otherwise schedule ds recovery. 
894 */ 895 static void 896 pmcs_handle_ds_recovery_error(pmcs_phy_t *phyp, pmcs_xscsi_t *tgt, 897 pmcs_hw_t *pwp, const char *func_name, int line, char *reason_string) 898 { 899 ASSERT(mutex_owned(&phyp->phy_lock)); 900 ASSERT((tgt == NULL) || mutex_owned(&tgt->statlock)); 901 902 phyp->ds_recovery_retries++; 903 904 if (phyp->ds_recovery_retries > PMCS_MAX_DS_RECOVERY_RETRIES) { 905 pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, tgt, 906 "%s: retry limit reached after %s to PHY %s failed", 907 func_name, reason_string, phyp->path); 908 if (tgt != NULL) { 909 tgt->recover_wait = 0; 910 } 911 phyp->dead = 1; 912 PHY_CHANGED_AT_LOCATION(pwp, phyp, func_name, line); 913 RESTART_DISCOVERY(pwp); 914 } else if ((phyp->ds_prev_good_recoveries > 915 PMCS_MAX_DS_RECOVERY_RETRIES) && 916 (phyp->last_good_recovery + drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME) 917 < ddi_get_lbolt())) { 918 pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, tgt, "%s: max number of " 919 "successful recoveries reached, declaring PHY %s dead", 920 __func__, phyp->path); 921 if (tgt != NULL) { 922 tgt->recover_wait = 0; 923 } 924 phyp->dead = 1; 925 PHY_CHANGED_AT_LOCATION(pwp, phyp, func_name, line); 926 RESTART_DISCOVERY(pwp); 927 } else { 928 SCHEDULE_WORK(pwp, PMCS_WORK_DS_ERR_RECOVERY); 929 } 930 } 931