1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * PM8001 device state recovery routines 27 */ 28 29 #include <sys/scsi/adapters/pmcs/pmcs.h> 30 31 /* 32 * SAS Topology Configuration 33 */ 34 static void pmcs_ds_operational(pmcs_phy_t *pptr, pmcs_xscsi_t *tgt); 35 static void pmcs_handle_ds_recovery_error(pmcs_phy_t *phyp, 36 pmcs_xscsi_t *tgt, pmcs_hw_t *pwp, const char *func_name, 37 char *reason_string); 38 39 /* 40 * Get device state. Called with statlock and PHY lock held. 41 */ 42 static int 43 pmcs_get_dev_state(pmcs_hw_t *pwp, pmcs_phy_t *phyp, pmcs_xscsi_t *xp, 44 uint8_t *ds) 45 { 46 uint32_t htag, *ptr, msg[PMCS_MSG_SIZE]; 47 int result; 48 struct pmcwork *pwrk; 49 50 pmcs_prt(pwp, PMCS_PRT_DEBUG3, phyp, xp, "%s: tgt(0x%p)", __func__, 51 (void *)xp); 52 53 if (xp != NULL) { 54 ASSERT(mutex_owned(&xp->statlock)); 55 } 56 57 if (phyp == NULL) { 58 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, NULL, xp, 59 "%s: PHY is NULL", __func__); 60 return (-1); 61 } 62 ASSERT(mutex_owned(&phyp->phy_lock)); 63 64 pwrk = pmcs_gwork(pwp, PMCS_TAG_TYPE_WAIT, phyp); 65 if (pwrk == NULL) { 66 pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nowrk, __func__); 67 return (-1); 68 } 69 pwrk->arg = msg; 70 pwrk->dtype = phyp->dtype; 71 72 if (phyp->valid_device_id == 0) { 73 pmcs_pwork(pwp, pwrk); 74 pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, xp, 75 "%s: Invalid DeviceID", __func__); 76 return (-1); 77 } 78 htag = pwrk->htag; 79 msg[0] = LE_32(PMCS_HIPRI(pwp, PMCS_OQ_GENERAL, 80 PMCIN_GET_DEVICE_STATE)); 81 msg[1] = LE_32(pwrk->htag); 82 msg[2] = LE_32(phyp->device_id); 83 CLEAN_MESSAGE(msg, 3); 84 85 mutex_enter(&pwp->iqp_lock[PMCS_IQ_OTHER]); 86 ptr = GET_IQ_ENTRY(pwp, PMCS_IQ_OTHER); 87 if (ptr == NULL) { 88 mutex_exit(&pwp->iqp_lock[PMCS_IQ_OTHER]); 89 pmcs_pwork(pwp, pwrk); 90 pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nomsg, __func__); 91 return (-1); 92 } 93 COPY_MESSAGE(ptr, msg, PMCS_MSG_SIZE); 94 pwrk->state = PMCS_WORK_STATE_ONCHIP; 95 INC_IQ_ENTRY(pwp, PMCS_IQ_OTHER); 96 97 if (xp != NULL) { 98 mutex_exit(&xp->statlock); 99 } 100 pmcs_unlock_phy(phyp); 101 WAIT_FOR(pwrk, 1000, result); 102 pmcs_pwork(pwp, pwrk); 103 pmcs_lock_phy(phyp); 104 105 if (xp != NULL) { 106 mutex_enter(&xp->statlock); 107 } 108 109 if (result) { 110 pmcs_timed_out(pwp, htag, __func__); 111 pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, xp, 112 "%s: cmd timed out, returning", __func__); 113 return (-1); 114 } 115 if (LE_32(msg[2]) == 0) { 116 *ds = (uint8_t)(LE_32(msg[4])); 117 if (xp == NULL) { 118 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 119 "%s: retrieved_ds=0x%x", __func__, *ds); 120 } else if (*ds != xp->dev_state) { 121 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 122 "%s: retrieved_ds=0x%x, target_ds=0x%x", __func__, 123 *ds, xp->dev_state); 124 } 125 return (0); 126 } else { 127 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 128 "%s: cmd failed Status(0x%x), returning ", __func__, 129 LE_32(msg[2])); 130 return (-1); 131 } 132 } 133 134 /* 135 * Set device state. Called with target's statlock and PHY lock held. 136 */ 137 static int 138 pmcs_set_dev_state(pmcs_hw_t *pwp, pmcs_phy_t *phyp, pmcs_xscsi_t *xp, 139 uint8_t ds) 140 { 141 uint32_t htag, *ptr, msg[PMCS_MSG_SIZE]; 142 int result; 143 uint8_t pds, nds; 144 struct pmcwork *pwrk; 145 146 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 147 "%s: ds: 0x%x tgt: 0x%p phy: 0x%p", __func__, ds, (void *)xp, 148 (void *)phyp); 149 150 if (phyp == NULL) { 151 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, NULL, xp, 152 "%s: PHY is NULL", __func__); 153 return (-1); 154 } 155 156 pwrk = pmcs_gwork(pwp, PMCS_TAG_TYPE_WAIT, phyp); 157 if (pwrk == NULL) { 158 pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nowrk, __func__); 159 return (-1); 160 } 161 if (phyp->valid_device_id == 0) { 162 pmcs_pwork(pwp, pwrk); 163 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 164 "%s: Invalid DeviceID", __func__); 165 return (-1); 166 } 167 pwrk->arg = msg; 168 pwrk->dtype = phyp->dtype; 169 htag = pwrk->htag; 170 msg[0] = LE_32(PMCS_HIPRI(pwp, PMCS_OQ_GENERAL, 171 PMCIN_SET_DEVICE_STATE)); 172 msg[1] = LE_32(pwrk->htag); 173 msg[2] = LE_32(phyp->device_id); 174 msg[3] = LE_32(ds); 175 CLEAN_MESSAGE(msg, 4); 176 177 mutex_enter(&pwp->iqp_lock[PMCS_IQ_OTHER]); 178 ptr = GET_IQ_ENTRY(pwp, PMCS_IQ_OTHER); 179 if (ptr == NULL) { 180 mutex_exit(&pwp->iqp_lock[PMCS_IQ_OTHER]); 181 pmcs_pwork(pwp, pwrk); 182 pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nomsg, __func__); 183 return (-1); 184 } 185 COPY_MESSAGE(ptr, msg, PMCS_MSG_SIZE); 186 pwrk->state = PMCS_WORK_STATE_ONCHIP; 187 INC_IQ_ENTRY(pwp, PMCS_IQ_OTHER); 188 189 if (xp != NULL) { 190 mutex_exit(&xp->statlock); 191 } 192 pmcs_unlock_phy(phyp); 193 WAIT_FOR(pwrk, 1000, result); 194 pmcs_pwork(pwp, pwrk); 195 pmcs_lock_phy(phyp); 196 if (xp != NULL) { 197 mutex_enter(&xp->statlock); 198 } 199 200 if (result) { 201 pmcs_timed_out(pwp, htag, __func__); 202 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 203 "%s: cmd timed out, returning", __func__); 204 return (-1); 205 } 206 if (LE_32(msg[2]) == 0) { 207 pds = (uint8_t)(LE_32(msg[4]) >> 4); 208 nds = (uint8_t)(LE_32(msg[4]) & 0x0000000f); 209 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 210 "%s: previous_ds=0x%x, new_ds=0x%x", __func__, pds, nds); 211 if (xp != NULL) { 212 xp->dev_state = nds; 213 } 214 return (0); 215 } else { 216 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 217 "%s: cmd failed Status(0x%x), returning ", __func__, 218 LE_32(msg[2])); 219 return (-1); 220 } 221 } 222 223 static void 224 pmcs_ds_operational(pmcs_phy_t *pptr, pmcs_xscsi_t *tgt) 225 { 226 pmcs_hw_t *pwp; 227 228 ASSERT(pptr); 229 pwp = pptr->pwp; 230 231 if (tgt != NULL) { 232 tgt->recover_wait = 0; 233 } 234 pptr->ds_recovery_retries = 0; 235 236 if ((pptr->ds_prev_good_recoveries == 0) || 237 (ddi_get_lbolt() - pptr->last_good_recovery > 238 drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME))) { 239 pptr->last_good_recovery = ddi_get_lbolt(); 240 pptr->ds_prev_good_recoveries = 1; 241 } else if (ddi_get_lbolt() < pptr->last_good_recovery + 242 drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME)) { 243 pptr->ds_prev_good_recoveries++; 244 } else { 245 pmcs_handle_ds_recovery_error(pptr, tgt, pwp, __func__, 246 "Max recovery attempts reached. Declaring PHY dead"); 247 } 248 249 /* Don't bother to run the work queues if the PHY is dead */ 250 if (!pptr->dead) { 251 SCHEDULE_WORK(pwp, PMCS_WORK_RUN_QUEUES); 252 (void) ddi_taskq_dispatch(pwp->tq, pmcs_worker, 253 pwp, DDI_NOSLEEP); 254 } 255 } 256 257 void 258 pmcs_dev_state_recovery(pmcs_hw_t *pwp, pmcs_phy_t *phyp) 259 { 260 boolean_t reschedule = B_FALSE; 261 uint8_t ds, tgt_dev_state; 262 int rc; 263 pmcs_xscsi_t *tgt; 264 pmcs_phy_t *pptr, *pnext, *pchild; 265 266 /* 267 * First time, check to see if we're already performing recovery 268 */ 269 if (phyp == NULL) { 270 mutex_enter(&pwp->lock); 271 if (pwp->ds_err_recovering) { 272 mutex_exit(&pwp->lock); 273 SCHEDULE_WORK(pwp, PMCS_WORK_DS_ERR_RECOVERY); 274 return; 275 } 276 277 pwp->ds_err_recovering = 1; 278 pptr = pwp->root_phys; 279 mutex_exit(&pwp->lock); 280 } else { 281 pptr = phyp; 282 } 283 284 while (pptr) { 285 /* 286 * Since ds_err_recovering is set, we can be assured these 287 * PHYs won't disappear on us while we do this. 288 */ 289 pmcs_lock_phy(pptr); 290 pchild = pptr->children; 291 pnext = pptr->sibling; 292 pmcs_unlock_phy(pptr); 293 294 if (pchild) { 295 pmcs_dev_state_recovery(pwp, pchild); 296 } 297 298 tgt = NULL; 299 pmcs_lock_phy(pptr); 300 301 if (pptr->dead || !pptr->valid_device_id) { 302 goto next_phy; 303 } 304 305 if (pptr->iport && (pptr->iport->ua_state != UA_ACTIVE)) { 306 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, pptr->target, 307 "%s: No DS recovery on PHY %s, iport not active", 308 __func__, pptr->path); 309 goto next_phy; 310 } 311 312 tgt = pptr->target; 313 314 if (tgt != NULL) { 315 mutex_enter(&tgt->statlock); 316 if (tgt->recover_wait == 0) { 317 goto next_phy; 318 } 319 tgt_dev_state = tgt->dev_state; 320 } else { 321 tgt_dev_state = PMCS_DEVICE_STATE_NOT_AVAILABLE; 322 } 323 324 if (pptr->prev_recovery) { 325 if (ddi_get_lbolt() - pptr->prev_recovery < 326 drv_usectohz(PMCS_DS_RECOVERY_INTERVAL)) { 327 pmcs_prt(pwp, PMCS_PRT_DEBUG2, pptr, tgt, 328 "%s: DS recovery on PHY %s " 329 "re-invoked too soon. Skipping...", 330 __func__, pptr->path); 331 if ((tgt) && (tgt->recover_wait)) { 332 reschedule = B_TRUE; 333 } 334 goto next_phy; 335 } 336 } 337 pptr->prev_recovery = ddi_get_lbolt(); 338 339 /* 340 * Step 1: Put the device into the IN_RECOVERY state 341 */ 342 rc = pmcs_get_dev_state(pwp, pptr, tgt, &ds); 343 if (rc != 0) { 344 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 345 "%s: pmcs_get_dev_state on PHY %s " 346 "failed (rc=%d)", 347 __func__, pptr->path, rc); 348 349 pmcs_handle_ds_recovery_error(pptr, tgt, pwp, 350 __func__, "pmcs_get_dev_state"); 351 352 goto next_phy; 353 } 354 355 /* If the chip says it's operational, we're done */ 356 if (ds == PMCS_DEVICE_STATE_OPERATIONAL) { 357 pmcs_ds_operational(pptr, tgt); 358 goto next_phy; 359 } 360 361 if ((tgt_dev_state == ds) && 362 (ds == PMCS_DEVICE_STATE_IN_RECOVERY)) { 363 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt, 364 "%s: Target 0x%p already IN_RECOVERY", __func__, 365 (void *)tgt); 366 } else { 367 if (tgt != NULL) { 368 tgt->dev_state = ds; 369 } 370 tgt_dev_state = ds; 371 ds = PMCS_DEVICE_STATE_IN_RECOVERY; 372 rc = pmcs_send_err_recovery_cmd(pwp, ds, pptr, tgt); 373 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt, 374 "%s: pmcs_send_err_recovery_cmd " 375 "result(%d) tgt(0x%p) ds(0x%x) tgt->ds(0x%x)", 376 __func__, rc, (void *)tgt, ds, tgt_dev_state); 377 378 if (rc) { 379 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 380 "%s: pmcs_send_err_recovery_cmd to PHY %s " 381 "failed (rc=%d)", 382 __func__, pptr->path, rc); 383 384 pmcs_handle_ds_recovery_error(pptr, tgt, pwp, 385 __func__, "pmcs_send_err_recovery_cmd"); 386 387 goto next_phy; 388 } 389 } 390 391 /* 392 * Step 2: Perform a hard reset on the PHY. 393 */ 394 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt, 395 "%s: Issue HARD_RESET to PHY %s", __func__, 396 pptr->path); 397 /* 398 * Must release statlock here because pmcs_reset_phy 399 * will drop and reacquire the PHY lock. 400 */ 401 if (tgt != NULL) { 402 mutex_exit(&tgt->statlock); 403 } 404 rc = pmcs_reset_phy(pwp, pptr, PMCS_PHYOP_HARD_RESET); 405 if (tgt != NULL) { 406 mutex_enter(&tgt->statlock); 407 } 408 if (rc) { 409 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 410 "%s: HARD_RESET to PHY %s failed (rc=%d)", 411 __func__, pptr->path, rc); 412 413 pmcs_handle_ds_recovery_error(pptr, tgt, pwp, 414 __func__, "HARD_RESET"); 415 416 goto next_phy; 417 } 418 419 /* 420 * Step 3: Abort all I/Os to the device 421 */ 422 if (pptr->abort_all_start) { 423 while (pptr->abort_all_start) { 424 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 425 "%s: Waiting for outstanding ABORT_ALL on " 426 "PHY 0x%p", __func__, (void *)pptr); 427 cv_wait(&pptr->abort_all_cv, &pptr->phy_lock); 428 } 429 } else { 430 if (tgt != NULL) { 431 mutex_exit(&tgt->statlock); 432 } 433 rc = pmcs_abort(pwp, pptr, pptr->device_id, 1, 1); 434 if (tgt != NULL) { 435 mutex_enter(&tgt->statlock); 436 } 437 if (rc != 0) { 438 pptr->abort_pending = 1; 439 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 440 "%s: pmcs_abort to PHY %s failed (rc=%d)", 441 __func__, pptr->path, rc); 442 443 pmcs_handle_ds_recovery_error(pptr, tgt, 444 pwp, __func__, "pmcs_abort"); 445 446 goto next_phy; 447 } 448 } 449 450 /* 451 * Step 4: Set the device back to OPERATIONAL state 452 */ 453 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt, 454 "%s: Set PHY/tgt 0x%p/0x%p to OPERATIONAL state", 455 __func__, (void *)pptr, (void *)tgt); 456 rc = pmcs_set_dev_state(pwp, pptr, tgt, 457 PMCS_DEVICE_STATE_OPERATIONAL); 458 if (rc == 0) { 459 pmcs_ds_operational(pptr, tgt); 460 } else { 461 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt, 462 "%s: Failed to SET tgt 0x%p to OPERATIONAL state", 463 __func__, (void *)tgt); 464 465 pmcs_handle_ds_recovery_error(pptr, tgt, pwp, 466 __func__, "SET tgt to OPERATIONAL state"); 467 468 goto next_phy; 469 } 470 471 next_phy: 472 if (tgt) { 473 mutex_exit(&tgt->statlock); 474 } 475 pmcs_unlock_phy(pptr); 476 pptr = pnext; 477 } 478 479 /* 480 * Only clear ds_err_recovering if we're exiting for good and not 481 * just unwinding from recursion 482 */ 483 if (phyp == NULL) { 484 mutex_enter(&pwp->lock); 485 pwp->ds_err_recovering = 0; 486 mutex_exit(&pwp->lock); 487 } 488 489 if (reschedule) { 490 SCHEDULE_WORK(pwp, PMCS_WORK_DS_ERR_RECOVERY); 491 } 492 } 493 494 /* 495 * Called with target's statlock held (if target is non-NULL) and PHY lock held. 496 */ 497 int 498 pmcs_send_err_recovery_cmd(pmcs_hw_t *pwp, uint8_t dev_state, pmcs_phy_t *phyp, 499 pmcs_xscsi_t *tgt) 500 { 501 int rc = -1; 502 uint8_t tgt_dev_state = PMCS_DEVICE_STATE_NOT_AVAILABLE; 503 504 if (tgt != NULL) { 505 ASSERT(mutex_owned(&tgt->statlock)); 506 if (tgt->recovering) { 507 return (0); 508 } 509 510 tgt->recovering = 1; 511 tgt_dev_state = tgt->dev_state; 512 } 513 514 if (phyp == NULL) { 515 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, NULL, tgt, 516 "%s: PHY is NULL", __func__); 517 return (-1); 518 } 519 520 ASSERT(mutex_owned(&phyp->phy_lock)); 521 522 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 523 "%s: ds: 0x%x, tgt ds(0x%x)", __func__, dev_state, tgt_dev_state); 524 525 switch (dev_state) { 526 case PMCS_DEVICE_STATE_IN_RECOVERY: 527 if (tgt_dev_state == PMCS_DEVICE_STATE_IN_RECOVERY) { 528 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 529 "%s: Target 0x%p already IN_RECOVERY", __func__, 530 (void *)tgt); 531 rc = 0; /* This is not an error */ 532 goto no_action; 533 } 534 535 rc = pmcs_set_dev_state(pwp, phyp, tgt, 536 PMCS_DEVICE_STATE_IN_RECOVERY); 537 if (rc != 0) { 538 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 539 "%s(1): Failed to set tgt(0x%p) to IN_RECOVERY", 540 __func__, (void *)tgt); 541 } 542 543 break; 544 545 case PMCS_DEVICE_STATE_OPERATIONAL: 546 if (tgt_dev_state != PMCS_DEVICE_STATE_IN_RECOVERY) { 547 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 548 "%s: Target 0x%p not ready to go OPERATIONAL", 549 __func__, (void *)tgt); 550 goto no_action; 551 } 552 553 rc = pmcs_set_dev_state(pwp, phyp, tgt, 554 PMCS_DEVICE_STATE_OPERATIONAL); 555 if (tgt != NULL) { 556 tgt->reset_success = 1; 557 } 558 if (rc != 0) { 559 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 560 "%s(2): Failed to SET tgt(0x%p) to OPERATIONAL", 561 __func__, (void *)tgt); 562 if (tgt != NULL) { 563 tgt->reset_success = 0; 564 } 565 } 566 567 break; 568 569 case PMCS_DEVICE_STATE_NON_OPERATIONAL: 570 PHY_CHANGED(pwp, phyp); 571 RESTART_DISCOVERY(pwp); 572 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 573 "%s: Device at %s is non-operational", 574 __func__, phyp->path); 575 if (tgt != NULL) { 576 tgt->dev_state = PMCS_DEVICE_STATE_NON_OPERATIONAL; 577 } 578 rc = 0; 579 580 break; 581 582 default: 583 pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt, 584 "%s: Invalid state requested (%d)", __func__, 585 dev_state); 586 break; 587 588 } 589 590 no_action: 591 if (tgt != NULL) { 592 tgt->recovering = 0; 593 } 594 return (rc); 595 } 596 597 /* 598 * Start ssp event recovery. We have to schedule recovery operation because 599 * it involves sending multiple commands to device and we should not do it 600 * in the interrupt context. 601 * If it is failure of a recovery command, let the recovery thread deal with it. 602 * Called with pmcwork lock held. 603 */ 604 void 605 pmcs_start_ssp_event_recovery(pmcs_hw_t *pwp, pmcwork_t *pwrk, uint32_t *iomb, 606 size_t amt) 607 { 608 pmcs_xscsi_t *tgt = pwrk->xp; 609 uint32_t event = LE_32(iomb[2]); 610 pmcs_phy_t *pptr = pwrk->phy; 611 pmcs_cb_t callback; 612 uint32_t tag; 613 614 if (tgt != NULL) { 615 mutex_enter(&tgt->statlock); 616 if (!tgt->assigned) { 617 if (pptr) { 618 pmcs_dec_phy_ref_count(pptr); 619 } 620 pptr = NULL; 621 pwrk->phy = NULL; 622 } 623 mutex_exit(&tgt->statlock); 624 } 625 626 if (pptr == NULL) { 627 /* 628 * No target, need to run RE-DISCOVERY here. 629 */ 630 if (pwrk->state != PMCS_WORK_STATE_TIMED_OUT) { 631 pwrk->state = PMCS_WORK_STATE_INTR; 632 } 633 /* 634 * Although we cannot mark phy to force abort nor mark phy 635 * as changed, killing of a target would take care of aborting 636 * commands for the device. 637 */ 638 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 639 "%s: No valid target for event processing. Reconfigure.", 640 __func__); 641 pmcs_pwork(pwp, pwrk); 642 RESTART_DISCOVERY(pwp); 643 return; 644 } else { 645 pmcs_lock_phy(pptr); 646 if (tgt != NULL) { 647 mutex_enter(&tgt->statlock); 648 } 649 if (event == PMCOUT_STATUS_OPEN_CNX_ERROR_IT_NEXUS_LOSS) { 650 if ((tgt != NULL) && (tgt->dev_state != 651 PMCS_DEVICE_STATE_NON_OPERATIONAL)) { 652 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 653 "%s: Device at %s is non-operational", 654 __func__, pptr->path); 655 tgt->dev_state = 656 PMCS_DEVICE_STATE_NON_OPERATIONAL; 657 } 658 pptr->abort_pending = 1; 659 if (tgt != NULL) { 660 mutex_exit(&tgt->statlock); 661 } 662 pmcs_unlock_phy(pptr); 663 mutex_exit(&pwrk->lock); 664 SCHEDULE_WORK(pwp, PMCS_WORK_ABORT_HANDLE); 665 RESTART_DISCOVERY(pwp); 666 return; 667 } 668 669 /* 670 * If this command is run in WAIT mode, it is a failing recovery 671 * command. If so, just wake up recovery thread waiting for 672 * command completion. 673 */ 674 tag = PMCS_TAG_TYPE(pwrk->htag); 675 if (tag == PMCS_TAG_TYPE_WAIT) { 676 pwrk->htag |= PMCS_TAG_DONE; 677 if (pwrk->arg && amt) { 678 (void) memcpy(pwrk->arg, iomb, amt); 679 } 680 cv_signal(&pwrk->sleep_cv); 681 if (tgt != NULL) { 682 mutex_exit(&tgt->statlock); 683 } 684 pmcs_unlock_phy(pptr); 685 mutex_exit(&pwrk->lock); /* XXX: Is this right??? */ 686 return; 687 } 688 689 if (tgt == NULL) { 690 pmcs_prt(pwp, PMCS_PRT_DEBUG1, pptr, NULL, 691 "%s: Not scheduling SSP event recovery for NULL tgt" 692 " pwrk(%p) tag(0x%x)", __func__, (void *)pwrk, 693 pwrk->htag); 694 return; 695 } 696 697 /* 698 * If the SSP event was an OPEN_RETRY_TIMEOUT, we don't want 699 * to go through the recovery (abort/LU reset) process. 700 * Simply complete the command and return it as STATUS_BUSY. 701 * This will cause the target driver to simply retry. 702 */ 703 if (event == PMCOUT_STATUS_IO_XFER_OPEN_RETRY_TIMEOUT) { 704 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 705 "%s: Got OPEN_RETRY_TIMEOUT event (htag 0x%08x)", 706 __func__, pwrk->htag); 707 708 mutex_exit(&tgt->statlock); 709 pmcs_unlock_phy(pptr); 710 pwrk->ssp_event = event; 711 callback = (pmcs_cb_t)pwrk->ptr; 712 (*callback)(pwp, pwrk, iomb); 713 return; 714 } 715 716 /* 717 * To recover from primary failures, 718 * we need to schedule handling events recovery. 719 */ 720 tgt->event_recovery = 1; 721 mutex_exit(&tgt->statlock); 722 pmcs_unlock_phy(pptr); 723 pwrk->ssp_event = event; 724 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 725 "%s: Scheduling SSP event recovery for tgt(0x%p) " 726 "pwrk(%p) tag(0x%x)", __func__, (void *)tgt, (void *)pwrk, 727 pwrk->htag); 728 mutex_exit(&pwrk->lock); 729 SCHEDULE_WORK(pwp, PMCS_WORK_SSP_EVT_RECOVERY); 730 } 731 732 /* Work cannot be completed until event recovery is completed. */ 733 } 734 735 /* 736 * SSP target event recovery 737 * Entered with a phy lock held 738 * Pwrk lock is not needed - pwrk is on the target aq and no other thread 739 * will do anything with it until this thread starts the chain of recovery. 740 * Statlock may be acquired and released. 741 */ 742 void 743 pmcs_tgt_event_recovery(pmcs_hw_t *pwp, pmcwork_t *pwrk) 744 { 745 pmcs_phy_t *pptr = pwrk->phy; 746 pmcs_cmd_t *sp = pwrk->arg; 747 pmcs_lun_t *lun = sp->cmd_lun; 748 pmcs_xscsi_t *tgt = pwrk->xp; 749 uint32_t event; 750 uint32_t htag; 751 uint32_t status; 752 uint8_t dstate; 753 int rv; 754 755 ASSERT(pwrk->arg != NULL); 756 ASSERT(pwrk->xp != NULL); 757 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 758 "%s: event recovery for target 0x%p", __func__, (void *)pwrk->xp); 759 htag = pwrk->htag; 760 event = pwrk->ssp_event; 761 pwrk->ssp_event = 0xffffffff; 762 763 if (event == PMCOUT_STATUS_XFER_ERR_BREAK || 764 event == PMCOUT_STATUS_XFER_ERR_PHY_NOT_READY || 765 event == PMCOUT_STATUS_XFER_ERROR_CMD_ISSUE_ACK_NAK_TIMEOUT) { 766 /* Command may be still pending on device */ 767 rv = pmcs_ssp_tmf(pwp, pptr, SAS_QUERY_TASK, htag, 768 lun->lun_num, &status); 769 if (rv != 0) { 770 goto out; 771 } 772 if (status == SAS_RSP_TMF_COMPLETE) { 773 /* Command NOT pending on a device */ 774 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 775 "%s: No pending command for tgt 0x%p", 776 __func__, (void *)tgt); 777 /* Nothing more to do, just abort it on chip */ 778 htag = 0; 779 } 780 } 781 /* 782 * All other events left the command pending in the host 783 * Send abort task and abort it on the chip 784 */ 785 if (htag != 0) { 786 if (pmcs_ssp_tmf(pwp, pptr, SAS_ABORT_TASK, htag, 787 lun->lun_num, &status)) 788 goto out; 789 } 790 (void) pmcs_abort(pwp, pptr, pwrk->htag, 0, 1); 791 /* 792 * Abort either took care of work completion, or put device in 793 * a recovery state 794 */ 795 return; 796 out: 797 /* Abort failed, do full device recovery */ 798 ASSERT(tgt != NULL); 799 mutex_enter(&tgt->statlock); 800 if (!pmcs_get_dev_state(pwp, pptr, tgt, &dstate)) 801 tgt->dev_state = dstate; 802 803 if ((tgt->dev_state != PMCS_DEVICE_STATE_IN_RECOVERY) && 804 (tgt->dev_state != PMCS_DEVICE_STATE_NON_OPERATIONAL)) { 805 pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt, 806 "%s: Setting IN_RECOVERY for tgt 0x%p", 807 __func__, (void *)tgt); 808 (void) pmcs_send_err_recovery_cmd(pwp, 809 PMCS_DEVICE_STATE_IN_RECOVERY, pptr, tgt); 810 } 811 mutex_exit(&tgt->statlock); 812 } 813 814 /* 815 * SSP event recovery task. 816 */ 817 void 818 pmcs_ssp_event_recovery(pmcs_hw_t *pwp) 819 { 820 int idx; 821 pmcs_xscsi_t *tgt; 822 pmcs_cmd_t *cp; 823 pmcwork_t *pwrk; 824 pmcs_phy_t *pphy; 825 int er_flag; 826 uint32_t idxpwrk; 827 828 restart: 829 for (idx = 0; idx < pwp->max_dev; idx++) { 830 mutex_enter(&pwp->lock); 831 tgt = pwp->targets[idx]; 832 mutex_exit(&pwp->lock); 833 if (tgt == NULL) { 834 continue; 835 } 836 837 mutex_enter(&tgt->statlock); 838 if (!tgt->assigned) { 839 mutex_exit(&tgt->statlock); 840 continue; 841 } 842 pphy = tgt->phy; 843 er_flag = tgt->event_recovery; 844 mutex_exit(&tgt->statlock); 845 846 if ((pphy == NULL) || (er_flag == 0)) { 847 continue; 848 } 849 850 pmcs_lock_phy(pphy); 851 mutex_enter(&tgt->statlock); 852 pmcs_prt(pwp, PMCS_PRT_DEBUG, pphy, tgt, 853 "%s: found target(0x%p)", __func__, (void *) tgt); 854 855 /* Check what cmd expects recovery */ 856 mutex_enter(&tgt->aqlock); 857 STAILQ_FOREACH(cp, &tgt->aq, cmd_next) { 858 /* 859 * Since work structure is on this target aq, and only 860 * this thread is accessing it now, we do not need 861 * to lock it 862 */ 863 idxpwrk = PMCS_TAG_INDEX(cp->cmd_tag); 864 pwrk = &pwp->work[idxpwrk]; 865 if (pwrk->htag != cp->cmd_tag) { 866 /* 867 * aq may contain TMF commands, so we 868 * may not find work structure with htag 869 */ 870 break; 871 } 872 if ((pwrk->ssp_event != 0) && 873 (pwrk->ssp_event != PMCS_REC_EVENT)) { 874 pmcs_prt(pwp, PMCS_PRT_DEBUG, pphy, tgt, 875 "%s: pwrk(%p) htag(0x%x)", 876 __func__, (void *) pwrk, cp->cmd_tag); 877 mutex_exit(&tgt->aqlock); 878 mutex_exit(&tgt->statlock); 879 pmcs_tgt_event_recovery(pwp, pwrk); 880 /* 881 * We dropped statlock, so restart the scan 882 */ 883 pmcs_unlock_phy(pphy); 884 goto restart; 885 } 886 } 887 mutex_exit(&tgt->aqlock); 888 tgt->event_recovery = 0; 889 pmcs_prt(pwp, PMCS_PRT_DEBUG, pphy, tgt, 890 "%s: end of SSP event recovery for target(0x%p)", 891 __func__, (void *) tgt); 892 mutex_exit(&tgt->statlock); 893 pmcs_unlock_phy(pphy); 894 } 895 pmcs_prt(pwp, PMCS_PRT_DEBUG, NULL, NULL, 896 "%s: end of SSP event recovery for pwp(0x%p)", __func__, 897 (void *) pwp); 898 } 899 900 void 901 pmcs_start_dev_state_recovery(pmcs_xscsi_t *xp, pmcs_phy_t *phyp) 902 { 903 ASSERT(mutex_owned(&xp->statlock)); 904 ASSERT(xp->pwp != NULL); 905 906 if (xp->recover_wait == 0) { 907 pmcs_prt(xp->pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp, 908 "%s: Start ds_recovery for tgt 0x%p/PHY 0x%p (%s)", 909 __func__, (void *)xp, (void *)phyp, phyp->path); 910 xp->recover_wait = 1; 911 912 /* 913 * Rather than waiting for the watchdog timer, we'll 914 * kick it right now. 915 */ 916 SCHEDULE_WORK(xp->pwp, PMCS_WORK_DS_ERR_RECOVERY); 917 (void) ddi_taskq_dispatch(xp->pwp->tq, pmcs_worker, xp->pwp, 918 DDI_NOSLEEP); 919 } 920 } 921 922 /* 923 * Increment the phy ds error retry count. 924 * If too many retries, mark phy dead and restart discovery; 925 * otherwise schedule ds recovery. 926 */ 927 static void 928 pmcs_handle_ds_recovery_error(pmcs_phy_t *phyp, pmcs_xscsi_t *tgt, 929 pmcs_hw_t *pwp, const char *func_name, char *reason_string) 930 { 931 ASSERT(mutex_owned(&phyp->phy_lock)); 932 ASSERT((tgt == NULL) || mutex_owned(&tgt->statlock)); 933 934 phyp->ds_recovery_retries++; 935 936 if (phyp->ds_recovery_retries > PMCS_MAX_DS_RECOVERY_RETRIES) { 937 pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, tgt, 938 "%s: retry limit reached after %s to PHY %s failed", 939 func_name, reason_string, phyp->path); 940 if (tgt != NULL) { 941 tgt->recover_wait = 0; 942 } 943 /* 944 * Mark the PHY as dead and it and its parent as changed, 945 * then restart discovery 946 */ 947 phyp->dead = 1; 948 PHY_CHANGED(pwp, phyp); 949 if (phyp->parent) 950 PHY_CHANGED(pwp, phyp->parent); 951 RESTART_DISCOVERY(pwp); 952 } else if ((phyp->ds_prev_good_recoveries > 953 PMCS_MAX_DS_RECOVERY_RETRIES) && 954 (phyp->last_good_recovery + drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME) 955 < ddi_get_lbolt())) { 956 pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, tgt, "%s: max number of " 957 "successful recoveries reached, declaring PHY %s dead", 958 __func__, phyp->path); 959 if (tgt != NULL) { 960 tgt->recover_wait = 0; 961 } 962 /* 963 * Mark the PHY as dead and its parent as changed, 964 * then restart discovery 965 */ 966 phyp->dead = 1; 967 PHY_CHANGED(pwp, phyp); 968 if (phyp->parent) 969 PHY_CHANGED(pwp, phyp->parent); 970 RESTART_DISCOVERY(pwp); 971 } else { 972 SCHEDULE_WORK(pwp, PMCS_WORK_DS_ERR_RECOVERY); 973 } 974 } 975