1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  *
21  *
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * PM8001 device state recovery routines
28  */
29 
30 #include <sys/scsi/adapters/pmcs/pmcs.h>
31 
32 /*
33  * SAS Topology Configuration
34  */
35 static void pmcs_handle_ds_recovery_error(pmcs_phy_t *phyp,
36     pmcs_xscsi_t *tgt, pmcs_hw_t *pwp, const char *func_name, int line,
37     char *reason_string);
38 
39 /*
40  * Get device state.  Called with statlock and PHY lock held.
41  */
static int
pmcs_get_dev_state(pmcs_hw_t *pwp, pmcs_phy_t *phyp, pmcs_xscsi_t *xp,
    uint8_t *ds)
{
	uint32_t htag, *ptr, msg[PMCS_MSG_SIZE];
	int result;
	struct pmcwork *pwrk;

	pmcs_prt(pwp, PMCS_PRT_DEBUG3, phyp, xp, "%s: tgt(0x%p)", __func__,
	    (void *)xp);

	/* Caller must hold the target's statlock (if any) and the PHY lock */
	if (xp != NULL) {
		ASSERT(mutex_owned(&xp->statlock));
	}
	ASSERT(mutex_owned(&phyp->phy_lock));

	/* Allocate a waitable work structure for the command */
	pwrk = pmcs_gwork(pwp, PMCS_TAG_TYPE_WAIT, phyp);
	if (pwrk == NULL) {
		pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nowrk, __func__);
		return (-1);
	}
	pwrk->arg = msg;	/* completion copies the response IOMB here */
	pwrk->dtype = phyp->dtype;

	if (phyp->valid_device_id == 0) {
		pmcs_pwork(pwp, pwrk);
		pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, xp,
		    "%s: Invalid DeviceID", __func__);
		return (-1);
	}
	/* Build the GET_DEVICE_STATE inbound message */
	htag = pwrk->htag;
	msg[0] = LE_32(PMCS_HIPRI(pwp, PMCS_OQ_GENERAL,
	    PMCIN_GET_DEVICE_STATE));
	msg[1] = LE_32(pwrk->htag);
	msg[2] = LE_32(phyp->device_id);

	/* Queue the command on the "other" inbound queue */
	mutex_enter(&pwp->iqp_lock[PMCS_IQ_OTHER]);
	ptr = GET_IQ_ENTRY(pwp, PMCS_IQ_OTHER);
	if (ptr == NULL) {
		mutex_exit(&pwp->iqp_lock[PMCS_IQ_OTHER]);
		pmcs_pwork(pwp, pwrk);
		pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nomsg, __func__);
		return (-1);
	}
	COPY_MESSAGE(ptr, msg, PMCS_MSG_SIZE);
	pwrk->state = PMCS_WORK_STATE_ONCHIP;
	INC_IQ_ENTRY(pwp, PMCS_IQ_OTHER);

	/*
	 * Drop both locks across the wait so the completion path can run;
	 * reacquire in the same order (PHY lock, then statlock) afterwards.
	 */
	if (xp != NULL) {
		mutex_exit(&xp->statlock);
	}
	pmcs_unlock_phy(phyp);
	WAIT_FOR(pwrk, 1000, result);	/* timeout units per WAIT_FOR (likely ms) */
	pmcs_lock_phy(phyp);
	pmcs_pwork(pwp, pwrk);

	if (xp != NULL) {
		mutex_enter(&xp->statlock);
	}

	if (result) {
		/* Non-zero result indicates the command timed out */
		pmcs_timed_out(pwp, htag, __func__);
		pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, xp,
		    "%s: cmd timed out, returning", __func__);
		return (-1);
	}
	if (LE_32(msg[2]) == 0) {
		/* Status OK: device state is the low byte of response word 4 */
		*ds = (uint8_t)(LE_32(msg[4]));
		if (xp == NULL) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
			    "%s: retrieved_ds=0x%x", __func__, *ds);
		} else if (*ds !=  xp->dev_state) {
			/* Note a mismatch with our cached target state */
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
			    "%s: retrieved_ds=0x%x, target_ds=0x%x", __func__,
			    *ds, xp->dev_state);
		}
		return (0);
	} else {
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
		    "%s: cmd failed Status(0x%x), returning ", __func__,
		    LE_32(msg[2]));
		return (-1);
	}
}
126 
127 /*
128  * Set device state.  Called with target's statlock and PHY lock held.
129  */
static int
pmcs_set_dev_state(pmcs_hw_t *pwp, pmcs_phy_t *phyp, pmcs_xscsi_t *xp,
    uint8_t ds)
{
	uint32_t htag, *ptr, msg[PMCS_MSG_SIZE];
	int result;
	uint8_t pds, nds;	/* previous/new device state from the chip */
	struct pmcwork *pwrk;

	pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
	    "%s: ds: 0x%x tgt: 0x%p phy: 0x%p", __func__, ds, (void *)xp,
	    (void *)phyp);

	if (phyp == NULL) {
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, NULL, xp,
		    "%s: PHY is NULL", __func__);
		return (-1);
	}

	/* Allocate a waitable work structure for the command */
	pwrk = pmcs_gwork(pwp, PMCS_TAG_TYPE_WAIT, phyp);
	if (pwrk == NULL) {
		pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nowrk, __func__);
		return (-1);
	}
	if (phyp->valid_device_id == 0) {
		pmcs_pwork(pwp, pwrk);
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
		    "%s: Invalid DeviceID", __func__);
		return (-1);
	}
	pwrk->arg = msg;	/* completion copies the response IOMB here */
	pwrk->dtype = phyp->dtype;
	/* Build the SET_DEVICE_STATE inbound message */
	htag = pwrk->htag;
	msg[0] = LE_32(PMCS_HIPRI(pwp, PMCS_OQ_GENERAL,
	    PMCIN_SET_DEVICE_STATE));
	msg[1] = LE_32(pwrk->htag);
	msg[2] = LE_32(phyp->device_id);
	msg[3] = LE_32(ds);	/* requested new device state */

	/* Queue the command on the "other" inbound queue */
	mutex_enter(&pwp->iqp_lock[PMCS_IQ_OTHER]);
	ptr = GET_IQ_ENTRY(pwp, PMCS_IQ_OTHER);
	if (ptr == NULL) {
		mutex_exit(&pwp->iqp_lock[PMCS_IQ_OTHER]);
		pmcs_pwork(pwp, pwrk);
		pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nomsg, __func__);
		return (-1);
	}
	COPY_MESSAGE(ptr, msg, PMCS_MSG_SIZE);
	pwrk->state = PMCS_WORK_STATE_ONCHIP;
	INC_IQ_ENTRY(pwp, PMCS_IQ_OTHER);

	/*
	 * Drop both locks across the wait so the completion path can run;
	 * reacquire in the same order (PHY lock, then statlock) afterwards.
	 */
	if (xp != NULL) {
		mutex_exit(&xp->statlock);
	}
	pmcs_unlock_phy(phyp);
	WAIT_FOR(pwrk, 1000, result);	/* timeout units per WAIT_FOR (likely ms) */
	pmcs_lock_phy(phyp);
	pmcs_pwork(pwp, pwrk);
	if (xp != NULL) {
		mutex_enter(&xp->statlock);
	}

	if (result) {
		/* Non-zero result indicates the command timed out */
		pmcs_timed_out(pwp, htag, __func__);
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
		    "%s: cmd timed out, returning", __func__);
		return (-1);
	}
	if (LE_32(msg[2]) == 0) {
		/*
		 * Status OK: response word 4 packs the previous state in the
		 * high nibble and the new state in the low nibble.
		 */
		pds = (uint8_t)(LE_32(msg[4]) >> 4);
		nds = (uint8_t)(LE_32(msg[4]) & 0x0000000f);
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
		    "%s: previous_ds=0x%x, new_ds=0x%x", __func__, pds, nds);
		if (xp != NULL) {
			/* Keep the target's cached state in sync */
			xp->dev_state = nds;
		}
		return (0);
	} else {
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
		    "%s: cmd failed Status(0x%x), returning ", __func__,
		    LE_32(msg[2]));
		return (-1);
	}
}
214 
215 void
216 pmcs_dev_state_recovery(pmcs_hw_t *pwp, pmcs_phy_t *phyp)
217 {
218 	uint8_t	ds, tgt_dev_state;
219 	int rc;
220 	pmcs_xscsi_t *tgt;
221 	pmcs_phy_t *pptr, *pnext, *pchild;
222 
223 	/*
224 	 * First time, check to see if we're already performing recovery
225 	 */
226 	if (phyp == NULL) {
227 		mutex_enter(&pwp->lock);
228 		if (pwp->ds_err_recovering) {
229 			mutex_exit(&pwp->lock);
230 			SCHEDULE_WORK(pwp, PMCS_WORK_DS_ERR_RECOVERY);
231 			return;
232 		}
233 
234 		pwp->ds_err_recovering = 1;
235 		pptr = pwp->root_phys;
236 		mutex_exit(&pwp->lock);
237 	} else {
238 		pptr = phyp;
239 	}
240 
241 	while (pptr) {
242 		/*
243 		 * Since ds_err_recovering is set, we can be assured these
244 		 * PHYs won't disappear on us while we do this.
245 		 */
246 		pmcs_lock_phy(pptr);
247 		pchild = pptr->children;
248 		pnext = pptr->sibling;
249 		pmcs_unlock_phy(pptr);
250 
251 		if (pchild) {
252 			pmcs_dev_state_recovery(pwp, pchild);
253 		}
254 
255 		tgt = NULL;
256 		pmcs_lock_phy(pptr);
257 
258 		if (pptr->dead) {
259 			goto next_phy;
260 		}
261 
262 		tgt = pptr->target;
263 
264 		if (tgt != NULL) {
265 			mutex_enter(&tgt->statlock);
266 			if (tgt->recover_wait == 0) {
267 				goto next_phy;
268 			}
269 			tgt_dev_state = tgt->dev_state;
270 		} else {
271 			tgt_dev_state = PMCS_DEVICE_STATE_NOT_AVAILABLE;
272 		}
273 
274 		if (pptr->prev_recovery) {
275 			if (ddi_get_lbolt() - pptr->prev_recovery <
276 			    drv_usectohz(PMCS_DS_RECOVERY_INTERVAL)) {
277 				pmcs_prt(pwp, PMCS_PRT_DEBUG2, pptr, tgt,
278 				    "%s: DS recovery on PHY %s "
279 				    "re-invoked too soon. Skipping...",
280 				    __func__, pptr->path);
281 				goto next_phy;
282 			}
283 		}
284 		pptr->prev_recovery = ddi_get_lbolt();
285 
286 		/*
287 		 * Step 1: Put the device into the IN_RECOVERY state
288 		 */
289 		rc = pmcs_get_dev_state(pwp, pptr, tgt, &ds);
290 		if (rc != 0) {
291 			pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
292 			    "%s: pmcs_get_dev_state on PHY %s "
293 			    "failed (rc=%d)",
294 			    __func__, pptr->path, rc);
295 
296 			pmcs_handle_ds_recovery_error(pptr, tgt, pwp,
297 			    __func__, __LINE__, "pmcs_get_dev_state");
298 
299 			goto next_phy;
300 		}
301 
302 		if ((tgt_dev_state == ds) &&
303 		    (ds == PMCS_DEVICE_STATE_IN_RECOVERY)) {
304 			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt,
305 			    "%s: Target 0x%p already IN_RECOVERY", __func__,
306 			    (void *)tgt);
307 		} else {
308 			if (tgt != NULL) {
309 				tgt->dev_state = ds;
310 			}
311 			tgt_dev_state = ds;
312 			ds = PMCS_DEVICE_STATE_IN_RECOVERY;
313 			rc = pmcs_send_err_recovery_cmd(pwp, ds, pptr, tgt);
314 			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt,
315 			    "%s: pmcs_send_err_recovery_cmd "
316 			    "result(%d) tgt(0x%p) ds(0x%x) tgt->ds(0x%x)",
317 			    __func__, rc, (void *)tgt, ds, tgt_dev_state);
318 
319 			if (rc) {
320 				pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
321 				    "%s: pmcs_send_err_recovery_cmd to PHY %s "
322 				    "failed (rc=%d)",
323 				    __func__, pptr->path, rc);
324 
325 				pmcs_handle_ds_recovery_error(pptr, tgt, pwp,
326 				    __func__, __LINE__,
327 				    "pmcs_send_err_recovery_cmd");
328 
329 				goto next_phy;
330 			}
331 		}
332 
333 		/*
334 		 * Step 2: Perform a hard reset on the PHY
335 		 */
336 		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt,
337 		    "%s: Issue HARD_RESET to PHY %s", __func__, pptr->path);
338 		/*
339 		 * Must release statlock here because pmcs_reset_phy will
340 		 * drop and reacquire the PHY lock.
341 		 */
342 		if (tgt != NULL) {
343 			mutex_exit(&tgt->statlock);
344 		}
345 		rc = pmcs_reset_phy(pwp, pptr, PMCS_PHYOP_HARD_RESET);
346 		if (tgt != NULL) {
347 			mutex_enter(&tgt->statlock);
348 		}
349 		if (rc) {
350 			pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
351 			    "%s: HARD_RESET to PHY %s failed (rc=%d)",
352 			    __func__, pptr->path, rc);
353 
354 			pmcs_handle_ds_recovery_error(pptr, tgt, pwp,
355 			    __func__, __LINE__, "HARD_RESET");
356 
357 			goto next_phy;
358 		}
359 
360 		/*
361 		 * Step 3: Abort all I/Os to the device
362 		 */
363 		if (pptr->abort_all_start) {
364 			while (pptr->abort_all_start) {
365 				pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
366 				    "%s: Waiting for outstanding ABORT_ALL on "
367 				    "PHY 0x%p", __func__, (void *)pptr);
368 				cv_wait(&pptr->abort_all_cv, &pptr->phy_lock);
369 			}
370 		} else {
371 			if (tgt != NULL) {
372 				mutex_exit(&tgt->statlock);
373 			}
374 			rc = pmcs_abort(pwp, pptr, pptr->device_id, 1, 1);
375 			if (tgt != NULL) {
376 				mutex_enter(&tgt->statlock);
377 			}
378 			if (rc != 0) {
379 				pptr->abort_pending = 1;
380 				pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
381 				    "%s: pmcs_abort to PHY %s failed (rc=%d)",
382 				    __func__, pptr->path, rc);
383 
384 				pmcs_handle_ds_recovery_error(pptr, tgt,
385 				    pwp, __func__, __LINE__, "pmcs_abort");
386 
387 				goto next_phy;
388 			}
389 		}
390 
391 		/*
392 		 * Step 4: Set the device back to OPERATIONAL state
393 		 */
394 		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt,
395 		    "%s: Set PHY/tgt 0x%p/0x%p to OPERATIONAL state",
396 		    __func__, (void *)pptr, (void *)tgt);
397 		rc = pmcs_set_dev_state(pwp, pptr, tgt,
398 		    PMCS_DEVICE_STATE_OPERATIONAL);
399 		if (rc == 0) {
400 			if (tgt != NULL) {
401 				tgt->recover_wait = 0;
402 			}
403 			pptr->ds_recovery_retries = 0;
404 
405 			if ((pptr->ds_prev_good_recoveries == 0) ||
406 			    (ddi_get_lbolt() - pptr->last_good_recovery >
407 			    drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME))) {
408 				pptr->last_good_recovery = ddi_get_lbolt();
409 				pptr->ds_prev_good_recoveries = 1;
410 			} else if (ddi_get_lbolt() < pptr->last_good_recovery +
411 			    drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME)) {
412 				pptr->ds_prev_good_recoveries++;
413 			} else {
414 				pmcs_handle_ds_recovery_error(pptr, tgt, pwp,
415 				    __func__, __LINE__, "Max recovery"
416 				    "attempts reached. Declaring PHY dead");
417 			}
418 
419 			/*
420 			 * Don't bother to run the work queues if the PHY
421 			 * is dead.
422 			 */
423 			if (tgt && tgt->phy && !tgt->phy->dead) {
424 				SCHEDULE_WORK(pwp, PMCS_WORK_RUN_QUEUES);
425 				(void) ddi_taskq_dispatch(pwp->tq, pmcs_worker,
426 				    pwp, DDI_NOSLEEP);
427 			}
428 		} else {
429 			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt,
430 			    "%s: Failed to SET tgt 0x%p to OPERATIONAL state",
431 			    __func__, (void *)tgt);
432 
433 			pmcs_handle_ds_recovery_error(pptr, tgt, pwp,
434 			    __func__, __LINE__, "SET tgt to OPERATIONAL state");
435 
436 			goto next_phy;
437 		}
438 
439 next_phy:
440 		if (tgt) {
441 			mutex_exit(&tgt->statlock);
442 		}
443 		pmcs_unlock_phy(pptr);
444 		pptr = pnext;
445 	}
446 
447 	/*
448 	 * Only clear ds_err_recovering if we're exiting for good and not
449 	 * just unwinding from recursion
450 	 */
451 	if (phyp == NULL) {
452 		mutex_enter(&pwp->lock);
453 		pwp->ds_err_recovering = 0;
454 		mutex_exit(&pwp->lock);
455 	}
456 }
457 
458 /*
459  * Called with target's statlock held (if target is non-NULL) and PHY lock held.
460  */
461 int
462 pmcs_send_err_recovery_cmd(pmcs_hw_t *pwp, uint8_t dev_state, pmcs_phy_t *phyp,
463     pmcs_xscsi_t *tgt)
464 {
465 	int rc = -1;
466 	uint8_t tgt_dev_state = PMCS_DEVICE_STATE_NOT_AVAILABLE;
467 
468 	if (tgt != NULL) {
469 		ASSERT(mutex_owned(&tgt->statlock));
470 		if (tgt->recovering) {
471 			return (0);
472 		}
473 
474 		tgt->recovering = 1;
475 		tgt_dev_state = tgt->dev_state;
476 	}
477 
478 	if (phyp == NULL) {
479 		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, NULL, tgt,
480 		    "%s: PHY is NULL", __func__);
481 		return (-1);
482 	}
483 
484 	ASSERT(mutex_owned(&phyp->phy_lock));
485 
486 	pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
487 	    "%s: ds: 0x%x, tgt ds(0x%x)", __func__, dev_state, tgt_dev_state);
488 
489 	switch (dev_state) {
490 	case PMCS_DEVICE_STATE_IN_RECOVERY:
491 		if (tgt_dev_state == PMCS_DEVICE_STATE_IN_RECOVERY) {
492 			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
493 			    "%s: Target 0x%p already IN_RECOVERY", __func__,
494 			    (void *)tgt);
495 			rc = 0;	/* This is not an error */
496 			goto no_action;
497 		}
498 
499 		rc = pmcs_set_dev_state(pwp, phyp, tgt,
500 		    PMCS_DEVICE_STATE_IN_RECOVERY);
501 		if (rc != 0) {
502 			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
503 			    "%s(1): Failed to set tgt(0x%p) to IN_RECOVERY",
504 			    __func__, (void *)tgt);
505 		}
506 
507 		break;
508 
509 	case PMCS_DEVICE_STATE_OPERATIONAL:
510 		if (tgt_dev_state != PMCS_DEVICE_STATE_IN_RECOVERY) {
511 			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
512 			    "%s: Target 0x%p not ready to go OPERATIONAL",
513 			    __func__, (void *)tgt);
514 			goto no_action;
515 		}
516 
517 		rc = pmcs_set_dev_state(pwp, phyp, tgt,
518 		    PMCS_DEVICE_STATE_OPERATIONAL);
519 		if (tgt != NULL) {
520 			tgt->reset_success = 1;
521 		}
522 		if (rc != 0) {
523 			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
524 			    "%s(2): Failed to SET tgt(0x%p) to OPERATIONAL",
525 			    __func__, (void *)tgt);
526 			if (tgt != NULL) {
527 				tgt->reset_success = 0;
528 			}
529 		}
530 
531 		break;
532 
533 	case PMCS_DEVICE_STATE_NON_OPERATIONAL:
534 		PHY_CHANGED(pwp, phyp);
535 		RESTART_DISCOVERY(pwp);
536 		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
537 		    "%s: Device at %s is non-operational",
538 		    __func__, phyp->path);
539 		if (tgt != NULL) {
540 			tgt->dev_state = PMCS_DEVICE_STATE_NON_OPERATIONAL;
541 		}
542 		rc = 0;
543 
544 		break;
545 
546 	default:
547 		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
548 		    "%s: Invalid state requested (%d)", __func__,
549 		    dev_state);
550 		break;
551 
552 	}
553 
554 no_action:
555 	if (tgt != NULL) {
556 		tgt->recovering = 0;
557 	}
558 	return (rc);
559 }
560 
561 /*
562  * Start ssp event recovery. We have to schedule recovery operation because
563  * it involves sending multiple commands to device and we should not do it
564  * in the interrupt context.
565  * If it is failure of a recovery command, let the recovery thread deal with it.
566  * Called with pmcwork lock held.
567  */
568 
void
pmcs_start_ssp_event_recovery(pmcs_hw_t *pwp, pmcwork_t *pwrk, uint32_t *iomb,
    size_t amt)
{
	pmcs_xscsi_t *tgt = pwrk->xp;
	uint32_t event = LE_32(iomb[2]);	/* event code from the IOMB */
	pmcs_phy_t *pptr = pwrk->phy;
	uint32_t tag;

	/*
	 * If the target is no longer assigned, drop our PHY reference and
	 * treat this as having no valid PHY.
	 */
	if (tgt != NULL) {
		mutex_enter(&tgt->statlock);
		if (!tgt->assigned) {
			if (pptr) {
				pmcs_dec_phy_ref_count(pptr);
			}
			pptr = NULL;
			pwrk->phy = NULL;
		}
		mutex_exit(&tgt->statlock);
	}
	if (pptr == NULL) {
		/*
		 * No target, need to run RE-DISCOVERY here.
		 */
		if (pwrk->state != PMCS_WORK_STATE_TIMED_OUT) {
			pwrk->state = PMCS_WORK_STATE_INTR;
		}
		/*
		 * Although we cannot mark phy to force abort nor mark phy
		 * as changed, killing of a target would take care of aborting
		 * commands for the device.
		 */
		pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
		    "%s: No valid target for event processing. Reconfigure.",
		    __func__);
		/* pmcs_pwork presumably releases the pwrk lock — confirm */
		pmcs_pwork(pwp, pwrk);
		RESTART_DISCOVERY(pwp);
		return;
	} else {
		/*
		 * NOTE(review): this branch dereferences tgt->statlock; it
		 * assumes tgt != NULL whenever pptr != NULL — confirm.
		 */
		pmcs_lock_phy(pptr);
		mutex_enter(&tgt->statlock);
		if (event == PMCOUT_STATUS_OPEN_CNX_ERROR_IT_NEXUS_LOSS) {
			/* I_T nexus loss: mark device non-operational */
			if (tgt->dev_state !=
			    PMCS_DEVICE_STATE_NON_OPERATIONAL) {
				pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
				    "%s: Device at %s is non-operational",
				    __func__, pptr->path);
				tgt->dev_state =
				    PMCS_DEVICE_STATE_NON_OPERATIONAL;
			}
			pptr->abort_pending = 1;
			mutex_exit(&tgt->statlock);
			pmcs_unlock_phy(pptr);
			mutex_exit(&pwrk->lock);
			SCHEDULE_WORK(pwp, PMCS_WORK_ABORT_HANDLE);
			RESTART_DISCOVERY(pwp);
			return;
		}

		/*
		 * If this command is run in WAIT mode, it is a failing recovery
		 * command. If so, just wake up recovery thread waiting for
		 * command completion.
		 */
		tag = PMCS_TAG_TYPE(pwrk->htag);
		if (tag == PMCS_TAG_TYPE_WAIT) {
			pwrk->htag |= PMCS_TAG_DONE;
			if (pwrk->arg && amt) {
				/* Hand the response IOMB back to the waiter */
				(void) memcpy(pwrk->arg, iomb, amt);
			}
			cv_signal(&pwrk->sleep_cv);
			mutex_exit(&tgt->statlock);
			pmcs_unlock_phy(pptr);
			mutex_exit(&pwrk->lock);
			return;
		}

		/*
		 * To recover from primary failures,
		 * we need to schedule handling events recovery.
		 */
		tgt->event_recovery = 1;
		mutex_exit(&tgt->statlock);
		pmcs_unlock_phy(pptr);
		pwrk->ssp_event = event;	/* picked up by recovery task */
		pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
		    "%s: Scheduling SSP event recovery for tgt(0x%p) "
		    "pwrk(%p) tag(0x%x)", __func__, (void *)tgt, (void *)pwrk,
		    pwrk->htag);
		mutex_exit(&pwrk->lock);
		SCHEDULE_WORK(pwp, PMCS_WORK_SSP_EVT_RECOVERY);
	}

	/* Work cannot be completed until event recovery is completed. */
}
664 
665 /*
666  * SSP target event recovery
667  * Entered with a phy lock held
668  * Pwrk lock is not needed - pwrk is on the target aq and no other thread
669  * will do anything with it until this thread starts the chain of recovery.
670  * Statlock may be acquired and released.
671  */
672 
void
pmcs_tgt_event_recovery(pmcs_hw_t *pwp, pmcwork_t *pwrk)
{
	pmcs_phy_t *pptr = pwrk->phy;
	pmcs_cmd_t *sp = pwrk->arg;
	pmcs_lun_t *lun = sp->cmd_lun;
	pmcs_xscsi_t *tgt = pwrk->xp;
	uint32_t event;
	uint32_t htag;
	uint32_t status;
	uint8_t dstate;
	int rv;

	ASSERT(pwrk->arg != NULL);
	ASSERT(pwrk->xp != NULL);
	pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
	    "%s: event recovery for target 0x%p", __func__, (void *)pwrk->xp);
	htag = pwrk->htag;
	event = pwrk->ssp_event;
	pwrk->ssp_event = 0xffffffff;	/* mark the event as consumed */
	if (event == PMCOUT_STATUS_XFER_ERR_BREAK ||
	    event == PMCOUT_STATUS_XFER_ERR_PHY_NOT_READY ||
	    event == PMCOUT_STATUS_XFER_ERROR_CMD_ISSUE_ACK_NAK_TIMEOUT) {
		/* Command may be still pending on device */
		rv = pmcs_ssp_tmf(pwp, pptr, SAS_QUERY_TASK, htag,
		    lun->lun_num, &status);
		if (rv != 0) {
			/* QUERY TASK itself failed: fall back to full recovery */
			goto out;
		}
		if (status == SAS_RSP_TMF_COMPLETE) {
			/* Command NOT pending on a device */
			pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
			    "%s: No pending command for tgt 0x%p",
			    __func__, (void *)tgt);
			/* Nothing more to do, just abort it on chip */
			htag = 0;
		}
	}
	/*
	 * All other events left the command pending in the host
	 * Send abort task and abort it on the chip
	 */
	if (htag != 0) {
		if (pmcs_ssp_tmf(pwp, pptr, SAS_ABORT_TASK, htag,
		    lun->lun_num, &status))
			goto out;
	}
	(void) pmcs_abort(pwp, pptr, pwrk->htag, 0, 1);
	/*
	 * Abort either took care of work completion, or put device in
	 * a recovery state
	 */
	return;
out:
	/* Abort failed, do full device recovery */
	mutex_enter(&tgt->statlock);
	/* Refresh our cached device state from the chip if we can */
	if (!pmcs_get_dev_state(pwp, pptr, tgt, &dstate))
		tgt->dev_state = dstate;

	if ((tgt->dev_state != PMCS_DEVICE_STATE_IN_RECOVERY) &&
	    (tgt->dev_state != PMCS_DEVICE_STATE_NON_OPERATIONAL)) {
		pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
		    "%s: Setting IN_RECOVERY for tgt 0x%p",
		    __func__, (void *)tgt);
		(void) pmcs_send_err_recovery_cmd(pwp,
		    PMCS_DEVICE_STATE_IN_RECOVERY, pptr, tgt);
	}
	mutex_exit(&tgt->statlock);
}
742 
743 /*
744  * SSP event recovery task.
745  */
void
pmcs_ssp_event_recovery(pmcs_hw_t *pwp)
{
	int idx;
	pmcs_xscsi_t *tgt;
	pmcs_cmd_t *cp;
	pmcwork_t *pwrk;
	pmcs_phy_t *pphy;
	int er_flag;
	uint32_t idxpwrk;

restart:
	/* Scan every target slot for one flagged for event recovery */
	for (idx = 0; idx < pwp->max_dev; idx++) {
		mutex_enter(&pwp->lock);
		tgt = pwp->targets[idx];
		mutex_exit(&pwp->lock);
		if (tgt != NULL) {
			mutex_enter(&tgt->statlock);
			if (!tgt->assigned) {
				mutex_exit(&tgt->statlock);
				continue;
			}
			pphy = tgt->phy;
			er_flag = tgt->event_recovery;
			/*
			 * Drop statlock before taking the PHY lock to
			 * preserve lock ordering (PHY lock, then statlock).
			 */
			mutex_exit(&tgt->statlock);
			if (pphy != NULL && er_flag != 0) {
				pmcs_lock_phy(pphy);
				mutex_enter(&tgt->statlock);
				pmcs_prt(pwp, PMCS_PRT_DEBUG, pphy, tgt,
				    "%s: found target(0x%p)", __func__,
				    (void *) tgt);

				/* Check what cmd expects recovery */
				mutex_enter(&tgt->aqlock);
				STAILQ_FOREACH(cp, &tgt->aq, cmd_next) {
					/*
					 * Since work structure is on this
					 * target aq, and only this thread
					 * is accessing it now, we do not need
					 * to lock it
					 */
					idxpwrk = PMCS_TAG_INDEX(cp->cmd_tag);
					pwrk = &pwp->work[idxpwrk];
					if (pwrk->htag != cp->cmd_tag) {
						/*
						 * aq may contain TMF commands,
						 * so we may not find work
						 * structure with htag
						 */
						break;
					}
					if (pwrk->ssp_event != 0 &&
					    pwrk->ssp_event !=
					    PMCS_REC_EVENT) {
						pmcs_prt(pwp,
						    PMCS_PRT_DEBUG, pphy, tgt,
						    "%s: pwrk(%p) ctag(0x%x)",
						    __func__, (void *) pwrk,
						    cp->cmd_tag);
						mutex_exit(&tgt->aqlock);
						mutex_exit(&tgt->statlock);
						pmcs_tgt_event_recovery(
						    pwp, pwrk);
						/*
						 * We dropped statlock, so
						 * restart scanning from scratch
						 */
						pmcs_unlock_phy(pphy);
						goto restart;
					}
				}
				mutex_exit(&tgt->aqlock);
				/* All events for this target handled */
				tgt->event_recovery = 0;
				pmcs_prt(pwp, PMCS_PRT_DEBUG, pphy, tgt,
				    "%s: end of SSP event recovery for "
				    "target(0x%p)", __func__, (void *) tgt);
				mutex_exit(&tgt->statlock);
				pmcs_unlock_phy(pphy);
			}
		}
	}
	pmcs_prt(pwp, PMCS_PRT_DEBUG, NULL, NULL,
	    "%s: end of SSP event recovery for pwp(0x%p)", __func__,
	    (void *) pwp);
}
831 
832 void
833 pmcs_start_dev_state_recovery(pmcs_xscsi_t *xp, pmcs_phy_t *phyp)
834 {
835 	ASSERT(mutex_owned(&xp->statlock));
836 	ASSERT(xp->pwp != NULL);
837 
838 	if (xp->recover_wait == 0) {
839 		pmcs_prt(xp->pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
840 		    "%s: Start ds_recovery for tgt 0x%p/PHY 0x%p (%s)",
841 		    __func__, (void *)xp, (void *)phyp, phyp->path);
842 		xp->recover_wait = 1;
843 
844 		/*
845 		 * Rather than waiting for the watchdog timer, we'll
846 		 * kick it right now.
847 		 */
848 		SCHEDULE_WORK(xp->pwp, PMCS_WORK_DS_ERR_RECOVERY);
849 		(void) ddi_taskq_dispatch(xp->pwp->tq, pmcs_worker, xp->pwp,
850 		    DDI_NOSLEEP);
851 	}
852 }
853 
854 /*
855  * Increment the phy ds error retry count.
856  * If too many retries, mark phy dead and restart discovery;
857  * otherwise schedule ds recovery.
858  */
859 static void
860 pmcs_handle_ds_recovery_error(pmcs_phy_t *phyp, pmcs_xscsi_t *tgt,
861     pmcs_hw_t *pwp, const char *func_name, int line, char *reason_string)
862 {
863 	ASSERT(mutex_owned(&phyp->phy_lock));
864 	ASSERT((tgt == NULL) || mutex_owned(&tgt->statlock));
865 
866 	phyp->ds_recovery_retries++;
867 
868 	if (phyp->ds_recovery_retries > PMCS_MAX_DS_RECOVERY_RETRIES) {
869 		pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, tgt,
870 		    "%s: retry limit reached after %s to PHY %s failed",
871 		    func_name, reason_string, phyp->path);
872 		if (tgt != NULL) {
873 			tgt->recover_wait = 0;
874 		}
875 		phyp->dead = 1;
876 		PHY_CHANGED_AT_LOCATION(pwp, phyp, func_name, line);
877 		RESTART_DISCOVERY(pwp);
878 	} else if ((phyp->ds_prev_good_recoveries >
879 	    PMCS_MAX_DS_RECOVERY_RETRIES) &&
880 	    (phyp->last_good_recovery + drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME)
881 	    < ddi_get_lbolt())) {
882 		pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, tgt, "%s: max number of "
883 		    "successful recoveries reached, declaring PHY %s dead",
884 		    __func__, phyp->path);
885 		if (tgt != NULL) {
886 			tgt->recover_wait = 0;
887 		}
888 		phyp->dead = 1;
889 		PHY_CHANGED_AT_LOCATION(pwp, phyp, func_name, line);
890 		RESTART_DISCOVERY(pwp);
891 	} else {
892 		SCHEDULE_WORK(pwp, PMCS_WORK_DS_ERR_RECOVERY);
893 	}
894 }
895