1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 * 21 * 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * PM8001 device state recovery routines 28 */ 29 30 #include <sys/scsi/adapters/pmcs/pmcs.h> 31 32 /* 33 * SAS Topology Configuration 34 */ 35 static void pmcs_handle_ds_recovery_error(pmcs_phy_t *phyp, 36 pmcs_xscsi_t *tgt, pmcs_hw_t *pwp, const char *func_name, int line, 37 char *reason_string); 38 39 /* 40 * Get device state. Called with statlock and PHY lock held. 
41 */ 42 static int 43 pmcs_get_dev_state(pmcs_hw_t *pwp, pmcs_phy_t *phyp, pmcs_xscsi_t *xp, 44 uint8_t *ds) 45 { 46 uint32_t htag, *ptr, msg[PMCS_MSG_SIZE]; 47 int result; 48 struct pmcwork *pwrk; 49 50 pmcs_prt(pwp, PMCS_PRT_DEBUG3, phyp, xp, "%s: tgt(0x%p)", __func__, 51 (void *)xp); 52 53 if (xp != NULL) { 54 ASSERT(mutex_owned(&xp->statlock)); 55 } 56 ASSERT(mutex_owned(&phyp->phy_lock)); 57 58 pwrk = pmcs_gwork(pwp, PMCS_TAG_TYPE_WAIT, phyp); 59 if (pwrk == NULL) { 60 pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nowrk, __func__); 61 return (-1); 62 } 63 pwrk->arg = msg; 64 pwrk->dtype = phyp->dtype; 65 66 if (phyp->valid_device_id == 0) { 67 pmcs_pwork(pwp, pwrk); 68 pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, xp, 69 "%s: Invalid DeviceID", __func__); 70 return (-1); 71 } 72 htag = pwrk->htag; 73 msg[0] = LE_32(PMCS_HIPRI(pwp, PMCS_OQ_GENERAL, 74 PMCIN_GET_DEVICE_STATE)); 75 msg[1] = LE_32(pwrk->htag); 76 msg[2] = LE_32(phyp->device_id); 77 78 mutex_enter(&pwp->iqp_lock[PMCS_IQ_OTHER]); 79 ptr = GET_IQ_ENTRY(pwp, PMCS_IQ_OTHER); 80 if (ptr == NULL) { 81 mutex_exit(&pwp->iqp_lock[PMCS_IQ_OTHER]); 82 pmcs_pwork(pwp, pwrk); 83 pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nomsg, __func__); 84 return (-1); 85 } 86 COPY_MESSAGE(ptr, msg, PMCS_MSG_SIZE); 87 pwrk->state = PMCS_WORK_STATE_ONCHIP; 88 INC_IQ_ENTRY(pwp, PMCS_IQ_OTHER); 89 90 if (xp != NULL) { 91 mutex_exit(&xp->statlock); 92 } 93 pmcs_unlock_phy(phyp); 94 WAIT_FOR(pwrk, 1000, result); 95 pmcs_lock_phy(phyp); 96 pmcs_pwork(pwp, pwrk); 97 98 if (xp != NULL) { 99 mutex_enter(&xp->statlock); 100 } 101 102 if (result) { 103 pmcs_timed_out(pwp, htag, __func__); 104 pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, xp, 105 "%s: cmd timed out, returning", __func__); 106 return (-1); 107 } 108 if (LE_32(msg[2]) == 0) { 109 *ds = (uint8_t)(LE_32(msg[4])); 110 if (xp == NULL) { 111 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 112 "%s: retrieved_ds=0x%x", __func__, *ds); 113 } else if (*ds != xp->dev_state) { 114 pmcs_prt(pwp, 
PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 115 "%s: retrieved_ds=0x%x, target_ds=0x%x", __func__, 116 *ds, xp->dev_state); 117 } 118 return (0); 119 } else { 120 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 121 "%s: cmd failed Status(0x%x), returning ", __func__, 122 LE_32(msg[2])); 123 return (-1); 124 } 125 } 126 127 /* 128 * Set device state. Called with target's statlock and PHY lock held. 129 */ 130 static int 131 pmcs_set_dev_state(pmcs_hw_t *pwp, pmcs_phy_t *phyp, pmcs_xscsi_t *xp, 132 uint8_t ds) 133 { 134 uint32_t htag, *ptr, msg[PMCS_MSG_SIZE]; 135 int result; 136 uint8_t pds, nds; 137 struct pmcwork *pwrk; 138 139 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 140 "%s: ds: 0x%x tgt: 0x%p phy: 0x%p", __func__, ds, (void *)xp, 141 (void *)phyp); 142 143 if (phyp == NULL) { 144 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, NULL, xp, 145 "%s: PHY is NULL", __func__); 146 return (-1); 147 } 148 149 pwrk = pmcs_gwork(pwp, PMCS_TAG_TYPE_WAIT, phyp); 150 if (pwrk == NULL) { 151 pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nowrk, __func__); 152 return (-1); 153 } 154 if (phyp->valid_device_id == 0) { 155 pmcs_pwork(pwp, pwrk); 156 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 157 "%s: Invalid DeviceID", __func__); 158 return (-1); 159 } 160 pwrk->arg = msg; 161 pwrk->dtype = phyp->dtype; 162 htag = pwrk->htag; 163 msg[0] = LE_32(PMCS_HIPRI(pwp, PMCS_OQ_GENERAL, 164 PMCIN_SET_DEVICE_STATE)); 165 msg[1] = LE_32(pwrk->htag); 166 msg[2] = LE_32(phyp->device_id); 167 msg[3] = LE_32(ds); 168 169 mutex_enter(&pwp->iqp_lock[PMCS_IQ_OTHER]); 170 ptr = GET_IQ_ENTRY(pwp, PMCS_IQ_OTHER); 171 if (ptr == NULL) { 172 mutex_exit(&pwp->iqp_lock[PMCS_IQ_OTHER]); 173 pmcs_pwork(pwp, pwrk); 174 pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nomsg, __func__); 175 return (-1); 176 } 177 COPY_MESSAGE(ptr, msg, PMCS_MSG_SIZE); 178 pwrk->state = PMCS_WORK_STATE_ONCHIP; 179 INC_IQ_ENTRY(pwp, PMCS_IQ_OTHER); 180 181 if (xp != NULL) { 182 mutex_exit(&xp->statlock); 183 } 184 pmcs_unlock_phy(phyp); 
185 WAIT_FOR(pwrk, 1000, result); 186 pmcs_lock_phy(phyp); 187 pmcs_pwork(pwp, pwrk); 188 if (xp != NULL) { 189 mutex_enter(&xp->statlock); 190 } 191 192 if (result) { 193 pmcs_timed_out(pwp, htag, __func__); 194 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 195 "%s: cmd timed out, returning", __func__); 196 return (-1); 197 } 198 if (LE_32(msg[2]) == 0) { 199 pds = (uint8_t)(LE_32(msg[4]) >> 4); 200 nds = (uint8_t)(LE_32(msg[4]) & 0x0000000f); 201 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 202 "%s: previous_ds=0x%x, new_ds=0x%x", __func__, pds, nds); 203 if (xp != NULL) { 204 xp->dev_state = nds; 205 } 206 return (0); 207 } else { 208 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 209 "%s: cmd failed Status(0x%x), returning ", __func__, 210 LE_32(msg[2])); 211 return (-1); 212 } 213 } 214 215 void 216 pmcs_dev_state_recovery(pmcs_hw_t *pwp, pmcs_phy_t *phyp) 217 { 218 uint8_t ds, tgt_dev_state; 219 int rc; 220 pmcs_xscsi_t *tgt; 221 pmcs_phy_t *pptr, *pnext, *pchild; 222 223 /* 224 * First time, check to see if we're already performing recovery 225 */ 226 if (phyp == NULL) { 227 mutex_enter(&pwp->lock); 228 if (pwp->ds_err_recovering) { 229 mutex_exit(&pwp->lock); 230 SCHEDULE_WORK(pwp, PMCS_WORK_DS_ERR_RECOVERY); 231 return; 232 } 233 234 pwp->ds_err_recovering = 1; 235 pptr = pwp->root_phys; 236 mutex_exit(&pwp->lock); 237 } else { 238 pptr = phyp; 239 } 240 241 while (pptr) { 242 /* 243 * Since ds_err_recovering is set, we can be assured these 244 * PHYs won't disappear on us while we do this. 
245 */ 246 pmcs_lock_phy(pptr); 247 pchild = pptr->children; 248 pnext = pptr->sibling; 249 pmcs_unlock_phy(pptr); 250 251 if (pchild) { 252 pmcs_dev_state_recovery(pwp, pchild); 253 } 254 255 tgt = NULL; 256 pmcs_lock_phy(pptr); 257 258 if (pptr->dead) { 259 goto next_phy; 260 } 261 262 tgt = pptr->target; 263 264 if (tgt != NULL) { 265 mutex_enter(&tgt->statlock); 266 if (tgt->recover_wait == 0) { 267 goto next_phy; 268 } 269 tgt_dev_state = tgt->dev_state; 270 } else { 271 tgt_dev_state = PMCS_DEVICE_STATE_NOT_AVAILABLE; 272 } 273 274 if (pptr->prev_recovery) { 275 if (ddi_get_lbolt() - pptr->prev_recovery < 276 drv_usectohz(PMCS_DS_RECOVERY_INTERVAL)) { 277 pmcs_prt(pwp, PMCS_PRT_DEBUG2, pptr, tgt, 278 "%s: DS recovery on PHY %s " 279 "re-invoked too soon. Skipping...", 280 __func__, pptr->path); 281 goto next_phy; 282 } 283 } 284 pptr->prev_recovery = ddi_get_lbolt(); 285 286 /* 287 * Step 1: Put the device into the IN_RECOVERY state 288 */ 289 rc = pmcs_get_dev_state(pwp, pptr, tgt, &ds); 290 if (rc != 0) { 291 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 292 "%s: pmcs_get_dev_state on PHY %s " 293 "failed (rc=%d)", 294 __func__, pptr->path, rc); 295 296 pmcs_handle_ds_recovery_error(pptr, tgt, pwp, 297 __func__, __LINE__, "pmcs_get_dev_state"); 298 299 goto next_phy; 300 } 301 302 if ((tgt_dev_state == ds) && 303 (ds == PMCS_DEVICE_STATE_IN_RECOVERY)) { 304 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt, 305 "%s: Target 0x%p already IN_RECOVERY", __func__, 306 (void *)tgt); 307 } else { 308 if (tgt != NULL) { 309 tgt->dev_state = ds; 310 } 311 tgt_dev_state = ds; 312 ds = PMCS_DEVICE_STATE_IN_RECOVERY; 313 rc = pmcs_send_err_recovery_cmd(pwp, ds, pptr, tgt); 314 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt, 315 "%s: pmcs_send_err_recovery_cmd " 316 "result(%d) tgt(0x%p) ds(0x%x) tgt->ds(0x%x)", 317 __func__, rc, (void *)tgt, ds, tgt_dev_state); 318 319 if (rc) { 320 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 321 "%s: pmcs_send_err_recovery_cmd to PHY %s " 
322 "failed (rc=%d)", 323 __func__, pptr->path, rc); 324 325 pmcs_handle_ds_recovery_error(pptr, tgt, pwp, 326 __func__, __LINE__, 327 "pmcs_send_err_recovery_cmd"); 328 329 goto next_phy; 330 } 331 } 332 333 /* 334 * Step 2: Perform a hard reset on the PHY 335 */ 336 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt, 337 "%s: Issue HARD_RESET to PHY %s", __func__, pptr->path); 338 /* 339 * Must release statlock here because pmcs_reset_phy will 340 * drop and reacquire the PHY lock. 341 */ 342 if (tgt != NULL) { 343 mutex_exit(&tgt->statlock); 344 } 345 rc = pmcs_reset_phy(pwp, pptr, PMCS_PHYOP_HARD_RESET); 346 if (tgt != NULL) { 347 mutex_enter(&tgt->statlock); 348 } 349 if (rc) { 350 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 351 "%s: HARD_RESET to PHY %s failed (rc=%d)", 352 __func__, pptr->path, rc); 353 354 pmcs_handle_ds_recovery_error(pptr, tgt, pwp, 355 __func__, __LINE__, "HARD_RESET"); 356 357 goto next_phy; 358 } 359 360 /* 361 * Step 3: Abort all I/Os to the device 362 */ 363 if (pptr->abort_all_start) { 364 while (pptr->abort_all_start) { 365 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 366 "%s: Waiting for outstanding ABORT_ALL on " 367 "PHY 0x%p", __func__, (void *)pptr); 368 cv_wait(&pptr->abort_all_cv, &pptr->phy_lock); 369 } 370 } else { 371 if (tgt != NULL) { 372 mutex_exit(&tgt->statlock); 373 } 374 rc = pmcs_abort(pwp, pptr, pptr->device_id, 1, 1); 375 if (tgt != NULL) { 376 mutex_enter(&tgt->statlock); 377 } 378 if (rc != 0) { 379 pptr->abort_pending = 1; 380 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 381 "%s: pmcs_abort to PHY %s failed (rc=%d)", 382 __func__, pptr->path, rc); 383 384 pmcs_handle_ds_recovery_error(pptr, tgt, 385 pwp, __func__, __LINE__, "pmcs_abort"); 386 387 goto next_phy; 388 } 389 } 390 391 /* 392 * Step 4: Set the device back to OPERATIONAL state 393 */ 394 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt, 395 "%s: Set PHY/tgt 0x%p/0x%p to OPERATIONAL state", 396 __func__, (void *)pptr, (void *)tgt); 397 rc = 
pmcs_set_dev_state(pwp, pptr, tgt, 398 PMCS_DEVICE_STATE_OPERATIONAL); 399 if (rc == 0) { 400 if (tgt != NULL) { 401 tgt->recover_wait = 0; 402 } 403 pptr->ds_recovery_retries = 0; 404 405 if ((pptr->ds_prev_good_recoveries == 0) || 406 (ddi_get_lbolt() - pptr->last_good_recovery > 407 drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME))) { 408 pptr->last_good_recovery = ddi_get_lbolt(); 409 pptr->ds_prev_good_recoveries = 1; 410 } else if (ddi_get_lbolt() < pptr->last_good_recovery + 411 drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME)) { 412 pptr->ds_prev_good_recoveries++; 413 } else { 414 pmcs_handle_ds_recovery_error(pptr, tgt, pwp, 415 __func__, __LINE__, "Max recovery" 416 "attempts reached. Declaring PHY dead"); 417 } 418 419 /* 420 * Don't bother to run the work queues if the PHY 421 * is dead. 422 */ 423 if (tgt && tgt->phy && !tgt->phy->dead) { 424 SCHEDULE_WORK(pwp, PMCS_WORK_RUN_QUEUES); 425 (void) ddi_taskq_dispatch(pwp->tq, pmcs_worker, 426 pwp, DDI_NOSLEEP); 427 } 428 } else { 429 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt, 430 "%s: Failed to SET tgt 0x%p to OPERATIONAL state", 431 __func__, (void *)tgt); 432 433 pmcs_handle_ds_recovery_error(pptr, tgt, pwp, 434 __func__, __LINE__, "SET tgt to OPERATIONAL state"); 435 436 goto next_phy; 437 } 438 439 next_phy: 440 if (tgt) { 441 mutex_exit(&tgt->statlock); 442 } 443 pmcs_unlock_phy(pptr); 444 pptr = pnext; 445 } 446 447 /* 448 * Only clear ds_err_recovering if we're exiting for good and not 449 * just unwinding from recursion 450 */ 451 if (phyp == NULL) { 452 mutex_enter(&pwp->lock); 453 pwp->ds_err_recovering = 0; 454 mutex_exit(&pwp->lock); 455 } 456 } 457 458 /* 459 * Called with target's statlock held (if target is non-NULL) and PHY lock held. 
460 */ 461 int 462 pmcs_send_err_recovery_cmd(pmcs_hw_t *pwp, uint8_t dev_state, pmcs_phy_t *phyp, 463 pmcs_xscsi_t *tgt) 464 { 465 int rc = -1; 466 uint8_t tgt_dev_state = PMCS_DEVICE_STATE_NOT_AVAILABLE; 467 468 if (tgt != NULL) { 469 ASSERT(mutex_owned(&tgt->statlock)); 470 if (tgt->recovering) { 471 return (0); 472 } 473 474 tgt->recovering = 1; 475 tgt_dev_state = tgt->dev_state; 476 } 477 478 if (phyp == NULL) { 479 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, NULL, tgt, 480 "%s: PHY is NULL", __func__); 481 return (-1); 482 } 483 484 ASSERT(mutex_owned(&phyp->phy_lock)); 485 486 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 487 "%s: ds: 0x%x, tgt ds(0x%x)", __func__, dev_state, tgt_dev_state); 488 489 switch (dev_state) { 490 case PMCS_DEVICE_STATE_IN_RECOVERY: 491 if (tgt_dev_state == PMCS_DEVICE_STATE_IN_RECOVERY) { 492 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 493 "%s: Target 0x%p already IN_RECOVERY", __func__, 494 (void *)tgt); 495 rc = 0; /* This is not an error */ 496 goto no_action; 497 } 498 499 rc = pmcs_set_dev_state(pwp, phyp, tgt, 500 PMCS_DEVICE_STATE_IN_RECOVERY); 501 if (rc != 0) { 502 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 503 "%s(1): Failed to set tgt(0x%p) to IN_RECOVERY", 504 __func__, (void *)tgt); 505 } 506 507 break; 508 509 case PMCS_DEVICE_STATE_OPERATIONAL: 510 if (tgt_dev_state != PMCS_DEVICE_STATE_IN_RECOVERY) { 511 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 512 "%s: Target 0x%p not ready to go OPERATIONAL", 513 __func__, (void *)tgt); 514 goto no_action; 515 } 516 517 rc = pmcs_set_dev_state(pwp, phyp, tgt, 518 PMCS_DEVICE_STATE_OPERATIONAL); 519 if (tgt != NULL) { 520 tgt->reset_success = 1; 521 } 522 if (rc != 0) { 523 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 524 "%s(2): Failed to SET tgt(0x%p) to OPERATIONAL", 525 __func__, (void *)tgt); 526 if (tgt != NULL) { 527 tgt->reset_success = 0; 528 } 529 } 530 531 break; 532 533 case PMCS_DEVICE_STATE_NON_OPERATIONAL: 534 PHY_CHANGED(pwp, phyp); 
535 RESTART_DISCOVERY(pwp); 536 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 537 "%s: Device at %s is non-operational", 538 __func__, phyp->path); 539 if (tgt != NULL) { 540 tgt->dev_state = PMCS_DEVICE_STATE_NON_OPERATIONAL; 541 } 542 rc = 0; 543 544 break; 545 546 default: 547 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 548 "%s: Invalid state requested (%d)", __func__, 549 dev_state); 550 break; 551 552 } 553 554 no_action: 555 if (tgt != NULL) { 556 tgt->recovering = 0; 557 } 558 return (rc); 559 } 560 561 /* 562 * Start ssp event recovery. We have to schedule recovery operation because 563 * it involves sending multiple commands to device and we should not do it 564 * in the interrupt context. 565 * If it is failure of a recovery command, let the recovery thread deal with it. 566 * Called with pmcwork lock held. 567 */ 568 569 void 570 pmcs_start_ssp_event_recovery(pmcs_hw_t *pwp, pmcwork_t *pwrk, uint32_t *iomb, 571 size_t amt) 572 { 573 pmcs_xscsi_t *tgt = pwrk->xp; 574 uint32_t event = LE_32(iomb[2]); 575 pmcs_phy_t *pptr = pwrk->phy; 576 uint32_t tag; 577 578 if (tgt != NULL) { 579 mutex_enter(&tgt->statlock); 580 if (!tgt->assigned) { 581 if (pptr) { 582 pmcs_dec_phy_ref_count(pptr); 583 } 584 pptr = NULL; 585 pwrk->phy = NULL; 586 } 587 mutex_exit(&tgt->statlock); 588 } 589 if (pptr == NULL) { 590 /* 591 * No target, need to run RE-DISCOVERY here. 592 */ 593 if (pwrk->state != PMCS_WORK_STATE_TIMED_OUT) { 594 pwrk->state = PMCS_WORK_STATE_INTR; 595 } 596 /* 597 * Although we cannot mark phy to force abort nor mark phy 598 * as changed, killing of a target would take care of aborting 599 * commands for the device. 600 */ 601 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 602 "%s: No valid target for event processing. 
Reconfigure.", 603 __func__); 604 pmcs_pwork(pwp, pwrk); 605 RESTART_DISCOVERY(pwp); 606 return; 607 } else { 608 pmcs_lock_phy(pptr); 609 mutex_enter(&tgt->statlock); 610 if (event == PMCOUT_STATUS_OPEN_CNX_ERROR_IT_NEXUS_LOSS) { 611 if (tgt->dev_state != 612 PMCS_DEVICE_STATE_NON_OPERATIONAL) { 613 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 614 "%s: Device at %s is non-operational", 615 __func__, pptr->path); 616 tgt->dev_state = 617 PMCS_DEVICE_STATE_NON_OPERATIONAL; 618 } 619 pptr->abort_pending = 1; 620 mutex_exit(&tgt->statlock); 621 pmcs_unlock_phy(pptr); 622 mutex_exit(&pwrk->lock); 623 SCHEDULE_WORK(pwp, PMCS_WORK_ABORT_HANDLE); 624 RESTART_DISCOVERY(pwp); 625 return; 626 } 627 628 /* 629 * If this command is run in WAIT mode, it is a failing recovery 630 * command. If so, just wake up recovery thread waiting for 631 * command completion. 632 */ 633 tag = PMCS_TAG_TYPE(pwrk->htag); 634 if (tag == PMCS_TAG_TYPE_WAIT) { 635 pwrk->htag |= PMCS_TAG_DONE; 636 if (pwrk->arg && amt) { 637 (void) memcpy(pwrk->arg, iomb, amt); 638 } 639 cv_signal(&pwrk->sleep_cv); 640 mutex_exit(&tgt->statlock); 641 pmcs_unlock_phy(pptr); 642 mutex_exit(&pwrk->lock); 643 return; 644 } 645 646 /* 647 * To recover from primary failures, 648 * we need to schedule handling events recovery. 649 */ 650 tgt->event_recovery = 1; 651 mutex_exit(&tgt->statlock); 652 pmcs_unlock_phy(pptr); 653 pwrk->ssp_event = event; 654 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 655 "%s: Scheduling SSP event recovery for tgt(0x%p) " 656 "pwrk(%p) tag(0x%x)", __func__, (void *)tgt, (void *)pwrk, 657 pwrk->htag); 658 mutex_exit(&pwrk->lock); 659 SCHEDULE_WORK(pwp, PMCS_WORK_SSP_EVT_RECOVERY); 660 } 661 662 /* Work cannot be completed until event recovery is completed. */ 663 } 664 665 /* 666 * SSP target event recovery 667 * Entered with a phy lock held 668 * Pwrk lock is not needed - pwrk is on the target aq and no other thread 669 * will do anything with it until this thread starts the chain of recovery. 
 * Statlock may be acquired and released.
 */

void
pmcs_tgt_event_recovery(pmcs_hw_t *pwp, pmcwork_t *pwrk)
{
	pmcs_phy_t *pptr = pwrk->phy;
	pmcs_cmd_t *sp = pwrk->arg;
	pmcs_lun_t *lun = sp->cmd_lun;
	pmcs_xscsi_t *tgt = pwrk->xp;
	uint32_t event;
	uint32_t htag;
	uint32_t status;
	uint8_t dstate;
	int rv;

	ASSERT(pwrk->arg != NULL);
	ASSERT(pwrk->xp != NULL);
	pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
	    "%s: event recovery for target 0x%p", __func__, (void *)pwrk->xp);
	htag = pwrk->htag;
	event = pwrk->ssp_event;
	/* Consume the event so it isn't processed twice */
	pwrk->ssp_event = 0xffffffff;
	if (event == PMCOUT_STATUS_XFER_ERR_BREAK ||
	    event == PMCOUT_STATUS_XFER_ERR_PHY_NOT_READY ||
	    event == PMCOUT_STATUS_XFER_ERROR_CMD_ISSUE_ACK_NAK_TIMEOUT) {
		/* Command may be still pending on device */
		rv = pmcs_ssp_tmf(pwp, pptr, SAS_QUERY_TASK, htag,
		    lun->lun_num, &status);
		if (rv != 0) {
			goto out;
		}
		if (status == SAS_RSP_TMF_COMPLETE) {
			/* Command NOT pending on a device */
			pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
			    "%s: No pending command for tgt 0x%p",
			    __func__, (void *)tgt);
			/* Nothing more to do, just abort it on chip */
			htag = 0;
		}
	}
	/*
	 * All other events left the command pending in the host
	 * Send abort task and abort it on the chip
	 */
	if (htag != 0) {
		if (pmcs_ssp_tmf(pwp, pptr, SAS_ABORT_TASK, htag,
		    lun->lun_num, &status))
			goto out;
	}
	(void) pmcs_abort(pwp, pptr, pwrk->htag, 0, 1);
	/*
	 * Abort either took care of work completion, or put device in
	 * a recovery state
	 */
	return;
out:
	/* Abort failed, do full device recovery */
	mutex_enter(&tgt->statlock);
	/* Refresh our cached device state from the chip if we can */
	if (!pmcs_get_dev_state(pwp, pptr, tgt, &dstate))
		tgt->dev_state = dstate;

	if ((tgt->dev_state != PMCS_DEVICE_STATE_IN_RECOVERY) &&
	    (tgt->dev_state != PMCS_DEVICE_STATE_NON_OPERATIONAL)) {
		pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
		    "%s: Setting IN_RECOVERY for tgt 0x%p",
		    __func__, (void *)tgt);
		(void) pmcs_send_err_recovery_cmd(pwp,
		    PMCS_DEVICE_STATE_IN_RECOVERY, pptr, tgt);
	}
	mutex_exit(&tgt->statlock);
}

/*
 * SSP event recovery task.
 *
 * Scans all targets for ones flagged with event_recovery set, then walks
 * each such target's active queue looking for commands whose work
 * structures carry a pending SSP event, handing each to
 * pmcs_tgt_event_recovery().  Because statlock must be dropped to run
 * recovery on a command, the scan restarts from scratch afterwards.
 */
void
pmcs_ssp_event_recovery(pmcs_hw_t *pwp)
{
	int idx;
	pmcs_xscsi_t *tgt;
	pmcs_cmd_t *cp;
	pmcwork_t *pwrk;
	pmcs_phy_t *pphy;
	int er_flag;
	uint32_t idxpwrk;

restart:
	for (idx = 0; idx < pwp->max_dev; idx++) {
		mutex_enter(&pwp->lock);
		tgt = pwp->targets[idx];
		mutex_exit(&pwp->lock);
		if (tgt != NULL) {
			mutex_enter(&tgt->statlock);
			if (!tgt->assigned) {
				mutex_exit(&tgt->statlock);
				continue;
			}
			pphy = tgt->phy;
			er_flag = tgt->event_recovery;
			mutex_exit(&tgt->statlock);
			if (pphy != NULL && er_flag != 0) {
				pmcs_lock_phy(pphy);
				mutex_enter(&tgt->statlock);
				pmcs_prt(pwp, PMCS_PRT_DEBUG, pphy, tgt,
				    "%s: found target(0x%p)", __func__,
				    (void *) tgt);

				/* Check what cmd expects recovery */
				mutex_enter(&tgt->aqlock);
				STAILQ_FOREACH(cp, &tgt->aq, cmd_next) {
					/*
					 * Since work structure is on this
					 * target aq, and only this thread
					 * is accessing it now, we do not need
					 * to lock it
					 */
					idxpwrk = PMCS_TAG_INDEX(cp->cmd_tag);
					pwrk = &pwp->work[idxpwrk];
					if (pwrk->htag != cp->cmd_tag) {
						/*
						 * aq may contain TMF commands,
						 * so we may not find work
						 * structure with htag
						 */
						break;
					}
					if (pwrk->ssp_event != 0 &&
					    pwrk->ssp_event !=
					    PMCS_REC_EVENT) {
						pmcs_prt(pwp,
						    PMCS_PRT_DEBUG, pphy, tgt,
						    "%s: pwrk(%p) ctag(0x%x)",
						    __func__, (void *) pwrk,
						    cp->cmd_tag);
						mutex_exit(&tgt->aqlock);
						mutex_exit(&tgt->statlock);
						pmcs_tgt_event_recovery(
						    pwp, pwrk);
						/*
						 * We dropped statlock, so
						 * restart scanning from scratch
						 */
						pmcs_unlock_phy(pphy);
						goto restart;
					}
				}

				mutex_exit(&tgt->aqlock);
				tgt->event_recovery = 0;
				pmcs_prt(pwp, PMCS_PRT_DEBUG, pphy, tgt,
				    "%s: end of SSP event recovery for "
				    "target(0x%p)", __func__, (void *) tgt);
				mutex_exit(&tgt->statlock);
				pmcs_unlock_phy(pphy);
			}
		}
	}
	pmcs_prt(pwp, PMCS_PRT_DEBUG, NULL, NULL,
	    "%s: end of SSP event recovery for pwp(0x%p)", __func__,
	    (void *) pwp);
}

/*
 * Begin device state recovery for a target, if not already pending.
 * Sets recover_wait and dispatches the recovery work immediately rather
 * than waiting for the watchdog timer.  Called with the target's statlock
 * held.
 */
void
pmcs_start_dev_state_recovery(pmcs_xscsi_t *xp, pmcs_phy_t *phyp)
{
	ASSERT(mutex_owned(&xp->statlock));
	ASSERT(xp->pwp != NULL);

	if (xp->recover_wait == 0) {
		pmcs_prt(xp->pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
		    "%s: Start ds_recovery for tgt 0x%p/PHY 0x%p (%s)",
		    __func__, (void *)xp, (void *)phyp, phyp->path);
		xp->recover_wait = 1;

		/*
		 * Rather than waiting for the watchdog timer, we'll
		 * kick it right now.
		 */
		SCHEDULE_WORK(xp->pwp, PMCS_WORK_DS_ERR_RECOVERY);
		(void) ddi_taskq_dispatch(xp->pwp->tq, pmcs_worker, xp->pwp,
		    DDI_NOSLEEP);
	}
}

/*
 * Increment the phy ds error retry count.
 * If too many retries, mark phy dead and restart discovery;
 * otherwise schedule ds recovery.
858 */ 859 static void 860 pmcs_handle_ds_recovery_error(pmcs_phy_t *phyp, pmcs_xscsi_t *tgt, 861 pmcs_hw_t *pwp, const char *func_name, int line, char *reason_string) 862 { 863 ASSERT(mutex_owned(&phyp->phy_lock)); 864 ASSERT((tgt == NULL) || mutex_owned(&tgt->statlock)); 865 866 phyp->ds_recovery_retries++; 867 868 if (phyp->ds_recovery_retries > PMCS_MAX_DS_RECOVERY_RETRIES) { 869 pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, tgt, 870 "%s: retry limit reached after %s to PHY %s failed", 871 func_name, reason_string, phyp->path); 872 if (tgt != NULL) { 873 tgt->recover_wait = 0; 874 } 875 phyp->dead = 1; 876 PHY_CHANGED_AT_LOCATION(pwp, phyp, func_name, line); 877 RESTART_DISCOVERY(pwp); 878 } else if ((phyp->ds_prev_good_recoveries > 879 PMCS_MAX_DS_RECOVERY_RETRIES) && 880 (phyp->last_good_recovery + drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME) 881 < ddi_get_lbolt())) { 882 pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, tgt, "%s: max number of " 883 "successful recoveries reached, declaring PHY %s dead", 884 __func__, phyp->path); 885 if (tgt != NULL) { 886 tgt->recover_wait = 0; 887 } 888 phyp->dead = 1; 889 PHY_CHANGED_AT_LOCATION(pwp, phyp, func_name, line); 890 RESTART_DISCOVERY(pwp); 891 } else { 892 SCHEDULE_WORK(pwp, PMCS_WORK_DS_ERR_RECOVERY); 893 } 894 } 895