/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/scsi/scsi.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/thread.h>
#include <sys/var.h>

#include "sd_xbuf.h"

/*
 * xbuf.c: buf(9S) extension facility.
 *
 * The buf(9S) extension facility is intended to allow block drivers to
 * allocate additional memory that is associated with a particular buf(9S)
 * struct.  It is further intended to help in addressing the usual set of
 * problems associated with such allocations, in particular those involving
 * recovery from allocation failures, especially in code paths that the
 * system relies on to free memory.
 *
 * CAVEAT: Currently this code is completely private to the sd driver and in
 * NO WAY constitutes a public or supported interface of any kind. It is
 * envisioned that this may one day migrate into the Solaris DDI, but until
 * that time this ought to be considered completely unstable and subject
 * to change without notice. This code may NOT in any way be utilized by
 * ANY code outside the sd driver.
 */
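
/*
 * A minimal usage sketch, kept in a comment so that it does not compile
 * into the driver.  The xd_* names, the limit values, and struct xd_xbuf
 * are hypothetical; only the ddi_xbuf_* calls are provided by this file.
 *
 *	In attach(9E), create the attr, sized for the per-buf extension:
 *
 *		xd->xd_xap = ddi_xbuf_attr_create(sizeof (struct xd_xbuf),
 *		    xd_xbuf_strategy, xd, 256, 16, ddi_driver_major(dip), 0);
 *
 *	In strategy(9E), queue each incoming buf on the attr:
 *
 *		return (ddi_xbuf_qstrategy(bp, xd->xd_xap));
 *
 *	The xa_strategy callback receives the buf plus its freshly
 *	allocated extension area and initiates the actual transfer:
 *
 *		static void
 *		xd_xbuf_strategy(struct buf *bp, ddi_xbuf_t xp, void *arg)
 *
 *	In detach(9E), once all IO has drained:
 *
 *		ddi_xbuf_attr_destroy(xd->xd_xap);
 */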


static int xbuf_iostart(ddi_xbuf_attr_t xap);
static void xbuf_dispatch(ddi_xbuf_attr_t xap);
static void xbuf_restart_callback(void *arg);


/*
 * Note: Should this be exposed to the caller, i.e., do we want to give
 * the caller the flexibility of specifying the parameters for the thread
 * pool?
 * Note: these values are just estimates at this time, based upon what
 * seems reasonable for the sd driver. It may be preferable to make these
 * parameters self-scaling in a real (future) implementation.
 */
#define	XBUF_TQ_MINALLOC	64
#define	XBUF_TQ_MAXALLOC	512
#define	XBUF_DISPATCH_DELAY	(drv_usectohz(50000))	/* 50 msec */

static taskq_t *xbuf_tq = NULL;
static int xbuf_attr_tq_minalloc = XBUF_TQ_MINALLOC;
static int xbuf_attr_tq_maxalloc = XBUF_TQ_MAXALLOC;

static kmutex_t	xbuf_mutex = { 0 };
static uint32_t	xbuf_refcount = 0;


/* ARGSUSED */
DDII ddi_xbuf_attr_t
ddi_xbuf_attr_create(size_t xsize,
	void (*xa_strategy)(struct buf *bp, ddi_xbuf_t xp, void *attr_arg),
	void *attr_arg, uint32_t active_limit, uint32_t reserve_limit,
	major_t major, int flags)
{
	ddi_xbuf_attr_t	xap;

	xap = kmem_zalloc(sizeof (struct __ddi_xbuf_attr), KM_SLEEP);

	mutex_init(&xap->xa_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&xap->xa_reserve_mutex, NULL, MUTEX_DRIVER, NULL);

	/*
	 * Future: Allow the caller to specify alignment requirements?
	 * Note the minimum of sizeof (void *): xbufs on the reserve list
	 * are chained through their first word.
	 */
	xap->xa_allocsize	= max(xsize, sizeof (void *));
	xap->xa_active_limit	= active_limit;
	xap->xa_active_lowater	= xap->xa_active_limit / 2;
	xap->xa_reserve_limit	= reserve_limit;
	xap->xa_strategy	= xa_strategy;
	xap->xa_attr_arg	= attr_arg;

	mutex_enter(&xbuf_mutex);
	if (xbuf_refcount == 0) {
		ASSERT(xbuf_tq == NULL);
		/*
		 * Note: Would be nice if: (1) #threads in the taskq pool (set
		 * to the value of 'ncpus' at the time the taskq is created)
		 * could adjust automatically with DR; (2) the taskq
		 * minalloc/maxalloc counts could be grown/shrunk on the fly.
		 */
		xbuf_tq = taskq_create("xbuf_taskq", ncpus,
		    (v.v_maxsyspri - 2), xbuf_attr_tq_minalloc,
		    xbuf_attr_tq_maxalloc, TASKQ_PREPOPULATE);
	}
	xbuf_refcount++;
	mutex_exit(&xbuf_mutex);

	/* In this prototype we just always use the global system pool. */
	xap->xa_tq = xbuf_tq;

	return (xap);
}


DDII void
ddi_xbuf_attr_destroy(ddi_xbuf_attr_t xap)
{
	ddi_xbuf_t	xp;

	/*
	 * By the time this is called the caller must guarantee that no
	 * other thread can reach this attr, so the locks can simply be
	 * destroyed up front.
	 */
	mutex_destroy(&xap->xa_mutex);
	mutex_destroy(&xap->xa_reserve_mutex);

	/* Free any xbufs on the reserve list */
	while (xap->xa_reserve_count != 0) {
		xp = xap->xa_reserve_headp;
		xap->xa_reserve_headp = *((void **)xp);
		xap->xa_reserve_count--;
		kmem_free(xp, xap->xa_allocsize);
	}
	ASSERT(xap->xa_reserve_headp == NULL);

	mutex_enter(&xbuf_mutex);
	ASSERT((xbuf_refcount != 0) && (xbuf_tq != NULL));
	xbuf_refcount--;
	if (xbuf_refcount == 0) {
		taskq_destroy(xbuf_tq);
		xbuf_tq = NULL;
	}
	mutex_exit(&xbuf_mutex);

	kmem_free(xap, sizeof (struct __ddi_xbuf_attr));
}


/* ARGSUSED */
DDII void
ddi_xbuf_attr_register_devinfo(ddi_xbuf_attr_t xbuf_attr, dev_info_t *dip)
{
	/* Currently a no-op in this prototype */
}


/* ARGSUSED */
DDII void
ddi_xbuf_attr_unregister_devinfo(ddi_xbuf_attr_t xbuf_attr, dev_info_t *dip)
{
	/* Currently a no-op in this prototype */
}



/*
 * Enqueue the given buf and attempt to initiate IO.
 * Called from the driver strategy(9E) routine.
 */

DDII int
ddi_xbuf_qstrategy(struct buf *bp, ddi_xbuf_attr_t xap)
{
	ASSERT(xap != NULL);
	ASSERT(!mutex_owned(&xap->xa_mutex));
	ASSERT(!mutex_owned(&xap->xa_reserve_mutex));

	mutex_enter(&xap->xa_mutex);

	/* Append the buf to the tail of the wait queue (av_forw chain) */
	if (xap->xa_headp == NULL) {
		xap->xa_headp = xap->xa_tailp = bp;
	} else {
		xap->xa_tailp->av_forw = bp;
		xap->xa_tailp = bp;
	}
	bp->av_forw = NULL;

	/* xa_pending counts pending xbuf_iostart() calls; see xbuf_dispatch() */
	xap->xa_pending++;
	mutex_exit(&xap->xa_mutex);
	return (xbuf_iostart(xap));
}


/*
 * Drivers call this immediately before calling biodone(9F), to notify the
 * framework that the indicated xbuf is no longer being used by the driver.
 * May be called from interrupt context.
 */

DDII void
ddi_xbuf_done(struct buf *bp, ddi_xbuf_attr_t xap)
{
	ddi_xbuf_t xp;

	ASSERT(bp != NULL);
	ASSERT(xap != NULL);
	ASSERT(!mutex_owned(&xap->xa_mutex));
	ASSERT(!mutex_owned(&xap->xa_reserve_mutex));

	xp = ddi_xbuf_get(bp, xap);

	mutex_enter(&xap->xa_mutex);

#ifdef	SDDEBUG
	if (xap->xa_active_limit != 0) {
		ASSERT(xap->xa_active_count > 0);
	}
#endif
	xap->xa_active_count--;

	if (xap->xa_reserve_limit != 0) {
		mutex_enter(&xap->xa_reserve_mutex);
		if (xap->xa_reserve_count < xap->xa_reserve_limit) {
			/* Put this xbuf onto the reserve list & exit */
			*((void **)xp) = xap->xa_reserve_headp;
			xap->xa_reserve_headp = xp;
			xap->xa_reserve_count++;
			mutex_exit(&xap->xa_reserve_mutex);
			goto done;
		}
		mutex_exit(&xap->xa_reserve_mutex);
	}

	kmem_free(xp, xap->xa_allocsize);	/* return it to the system */

done:
	if ((xap->xa_active_limit == 0) ||
	    (xap->xa_active_count <= xap->xa_active_lowater)) {
		xbuf_dispatch(xap);
	}

	mutex_exit(&xap->xa_mutex);
}
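
/*
 * A sketch of the matching completion path in a driver's interrupt
 * handler (hypothetical xd_* names; both calls below are real):
 *
 *	ddi_xbuf_done(bp, xd->xd_xap);
 *	biodone(bp);
 *
 * The xbuf must be released via ddi_xbuf_done() first, since the buf
 * (and thus b_private) may be reused as soon as biodone(9F) is called.
 */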
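/*
 * Exported wrapper around xbuf_dispatch(): re-checks the active count
 * against the low-water mark and restarts queued IO if appropriate.
 */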
DDII void
ddi_xbuf_dispatch(ddi_xbuf_attr_t xap)
{
	mutex_enter(&xap->xa_mutex);
	if ((xap->xa_active_limit == 0) ||
	    (xap->xa_active_count <= xap->xa_active_lowater)) {
		xbuf_dispatch(xap);
	}
	mutex_exit(&xap->xa_mutex);
}


/*
 * ISSUE: in this prototype we cannot really implement ddi_xbuf_get()
 * unless we explicitly hide the xbuf pointer somewhere in the buf
 * during allocation, and then rely on the driver never changing it.
 * We can probably get away with using b_private for this for now,
 * though it really is kinda gnarly.
 */

/* ARGSUSED */
DDII ddi_xbuf_t
ddi_xbuf_get(struct buf *bp, ddi_xbuf_attr_t xap)
{
	return (bp->b_private);
}
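
/*
 * A driver typically retrieves its extension at the top of the completion
 * path, e.g. (struct xd_xbuf and the xd_* names are hypothetical):
 *
 *	struct xd_xbuf *xp = ddi_xbuf_get(bp, xd->xd_xap);
 *
 * This only works as long as the driver leaves b_private untouched
 * between the xa_strategy callback and ddi_xbuf_done().
 */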


/*
 * Initiate IOs for bufs on the queue.  Called from kernel thread or taskq
 * thread context. May execute concurrently for the same ddi_xbuf_attr_t.
 */

static int
xbuf_iostart(ddi_xbuf_attr_t xap)
{
	struct buf *bp;
	ddi_xbuf_t xp;

	ASSERT(xap != NULL);
	ASSERT(!mutex_owned(&xap->xa_mutex));
	ASSERT(!mutex_owned(&xap->xa_reserve_mutex));

	/*
	 * For each request on the queue, attempt to allocate the specified
	 * xbuf extension area, and call the driver's iostart() routine.
	 * We process as many requests on the queue as we can, until either
	 * (1) we run out of requests; or
	 * (2) we run out of resources; or
	 * (3) we reach the maximum limit for the given ddi_xbuf_attr_t.
	 */
	for (;;) {
		mutex_enter(&xap->xa_mutex);

		if ((bp = xap->xa_headp) == NULL) {
			break;	/* queue empty */
		}

		if ((xap->xa_active_limit != 0) &&
		    (xap->xa_active_count >= xap->xa_active_limit)) {
			break;	/* allocation limit reached */
		}

		/*
		 * If the reserve_limit is non-zero then work with the
		 * reserve, else always allocate a new struct.
		 */
		if (xap->xa_reserve_limit != 0) {
			/*
			 * Don't penalize EVERY I/O by always allocating a new
			 * struct for the sake of maintaining an untouched
			 * reserve for a pathological condition that may never
			 * happen: use the reserve entries first, treating the
			 * reserve as a local pool rather than a stash that
			 * goes untouched. Re-populate it whenever it becomes
			 * fully depleted, just in case it really is needed.
			 * This is safe because under the pathological
			 * condition, when the system runs out of memory such
			 * that the allocations below fail, the reserve will
			 * still be available whether its entries are sitting
			 * unused on the queue or are in-transport somewhere.
			 * Thus progress can still continue, however slowly.
			 */
			mutex_enter(&xap->xa_reserve_mutex);
			if (xap->xa_reserve_count != 0) {
				ASSERT(xap->xa_reserve_headp != NULL);
				/* Grab an xbuf from the reserve */
				xp = xap->xa_reserve_headp;
				xap->xa_reserve_headp = *((void **)xp);
				ASSERT(xap->xa_reserve_count > 0);
				xap->xa_reserve_count--;
			} else {
				/*
				 * Either this is the first time through,
				 * or the reserve has been totally depleted.
				 * Re-populate the reserve (pool). Excess
				 * structs get released in the done path.
				 */
				while (xap->xa_reserve_count <
				    xap->xa_reserve_limit) {
					xp = kmem_alloc(xap->xa_allocsize,
					    KM_NOSLEEP);
					if (xp == NULL) {
						break;
					}
					*((void **)xp) = xap->xa_reserve_headp;
					xap->xa_reserve_headp = xp;
					xap->xa_reserve_count++;
				}
				/* And one more to use right now. */
				xp = kmem_alloc(xap->xa_allocsize, KM_NOSLEEP);
			}
			mutex_exit(&xap->xa_reserve_mutex);
		} else {
			/*
			 * Try to alloc a new xbuf struct. If this fails just
			 * exit for now. We'll get back here again either upon
			 * cmd completion or via the timer handler.
			 * Question: what if the allocation attempt for the
			 * very first cmd fails? There are no outstanding cmds
			 * so how do we get back here?
			 * Should look at un_ncmds_in_transport; if it's zero
			 * then schedule xbuf_restart_callback via the timer.
			 * Although that breaks the architecture by bringing
			 * softstate data into this code.
			 */
			xp = kmem_alloc(xap->xa_allocsize, KM_NOSLEEP);
		}
		if (xp == NULL) {
			break; /* Can't process a cmd right now. */
		}

		/*
		 * Always run the counter. It's used/needed when
		 * xa_active_limit is non-zero, which is the typical (and
		 * right now only) case.
		 */
		xap->xa_active_count++;

		/* unlink the buf from the list */
		xap->xa_headp = bp->av_forw;
		bp->av_forw = NULL;

		/*
		 * Hack needed in the prototype so ddi_xbuf_get() will work.
		 * Here we can rely on the sd code not changing the value in
		 * b_private (in fact it wants it there). See ddi_xbuf_get().
		 */
		bp->b_private = xp;

		/* call the driver's iostart routine */
		mutex_exit(&xap->xa_mutex);
		(*(xap->xa_strategy))(bp, xp, xap->xa_attr_arg);
	}

	/* Reached via one of the breaks above, with xa_mutex still held. */
	ASSERT(xap->xa_pending > 0);
	xap->xa_pending--;
	mutex_exit(&xap->xa_mutex);
	return (0);
}


/*
 * Re-start IO processing if there is anything on the queue, AND if the
 * restart function is not already running/pending for this ddi_xbuf_attr_t.
 */
static void
xbuf_dispatch(ddi_xbuf_attr_t xap)
{
	ASSERT(xap != NULL);
	ASSERT(xap->xa_tq != NULL);
	ASSERT(mutex_owned(&xap->xa_mutex));

	if ((xap->xa_headp != NULL) && (xap->xa_timeid == NULL) &&
	    (xap->xa_pending == 0)) {
		/*
		 * First try to see if we can dispatch the restart function
		 * immediately, in a taskq thread.  If this fails, then
		 * schedule a timeout(9F) callback to try again later.
		 */
		if (taskq_dispatch(xap->xa_tq,
		    (void (*)(void *))xbuf_iostart, xap, TQ_NOSLEEP) == 0) {
			/*
			 * Unable to enqueue the request for the taskq thread,
			 * try again later.  Note that this will keep re-trying
			 * until taskq_dispatch() succeeds.
			 */
			xap->xa_timeid = timeout(xbuf_restart_callback, xap,
			    XBUF_DISPATCH_DELAY);
		} else {
			/*
			 * This indicates that xbuf_iostart() will soon be
			 * run for this ddi_xbuf_attr_t, and we do not need to
			 * schedule another invocation via timeout/taskq.
			 */
			xap->xa_pending++;
		}
	}
}

/* timeout(9F) callback routine for xbuf restart mechanism. */
static void
xbuf_restart_callback(void *arg)
{
	ddi_xbuf_attr_t	xap = arg;

	ASSERT(xap != NULL);
	ASSERT(xap->xa_tq != NULL);
	ASSERT(!mutex_owned(&xap->xa_mutex));

	mutex_enter(&xap->xa_mutex);
	xap->xa_timeid = NULL;
	xbuf_dispatch(xap);
	mutex_exit(&xap->xa_mutex);
}


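/*
 * Remove bufs from the wait queue and fail them with EIO.  If 'funcp' is
 * non-NULL it is called once per queued buf; only bufs for which it
 * returns TRUE are flushed, the rest are left on the queue.  Bufs already
 * handed to the driver's xa_strategy routine are not affected.
 */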
DDII void
ddi_xbuf_flushq(ddi_xbuf_attr_t xap, int (*funcp)(struct buf *))
{
	struct buf *bp;
	struct buf *next_bp;
	struct buf *prev_bp = NULL;

	ASSERT(xap != NULL);
	ASSERT(xap->xa_tq != NULL);
	ASSERT(!mutex_owned(&xap->xa_mutex));

	mutex_enter(&xap->xa_mutex);

	for (bp = xap->xa_headp; bp != NULL; bp = next_bp) {

		next_bp = bp->av_forw;	/* Save for next iteration */

		/*
		 * If the user-supplied function is non-NULL and returns
		 * FALSE, then just leave the current bp on the queue.
		 */
		if ((funcp != NULL) && (!(*funcp)(bp))) {
			prev_bp = bp;
			continue;
		}

		/* de-queue the bp */
		if (bp == xap->xa_headp) {
			xap->xa_headp = next_bp;
			if (xap->xa_headp == NULL) {
				xap->xa_tailp = NULL;
			}
		} else {
			ASSERT(xap->xa_headp != NULL);
			ASSERT(prev_bp != NULL);
			if (bp == xap->xa_tailp) {
				ASSERT(next_bp == NULL);
				xap->xa_tailp = prev_bp;
			}
			prev_bp->av_forw = next_bp;
		}
		bp->av_forw = NULL;

		/* Add the bp to the flush queue */
		if (xap->xa_flush_headp == NULL) {
			ASSERT(xap->xa_flush_tailp == NULL);
			xap->xa_flush_headp = xap->xa_flush_tailp = bp;
		} else {
			ASSERT(xap->xa_flush_tailp != NULL);
			xap->xa_flush_tailp->av_forw = bp;
			xap->xa_flush_tailp = bp;
		}
	}

	/*
	 * Fail each buf on the flush queue with EIO, dropping xa_mutex
	 * across each biodone(9F) call.
	 */
	while ((bp = xap->xa_flush_headp) != NULL) {
		xap->xa_flush_headp = bp->av_forw;
		if (xap->xa_flush_headp == NULL) {
			xap->xa_flush_tailp = NULL;
		}
		mutex_exit(&xap->xa_mutex);
		bioerror(bp, EIO);
		bp->b_resid = bp->b_bcount;
		biodone(bp);
		mutex_enter(&xap->xa_mutex);
	}

	mutex_exit(&xap->xa_mutex);
}