1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/systm.h>
30 #include <sys/types.h>
31 #include <sys/vnode.h>
32 #include <sys/errno.h>
33 #include <sys/sysmacros.h>
34 #include <sys/debug.h>
35 #include <sys/kmem.h>
36 #include <sys/conf.h>
37 #include <sys/proc.h>
38 #include <sys/cmn_err.h>
39 #include <sys/fssnap_if.h>
40 #include <sys/fs/ufs_inode.h>
41 #include <sys/fs/ufs_filio.h>
42 #include <sys/fs/ufs_log.h>
43 #include <sys/fs/ufs_bio.h>
44 #include <sys/inttypes.h>
45 #include <sys/callb.h>
46 #include <sys/tnf_probe.h>
47 
48 /*
49  * Kernel threads for logging
50  * Currently only one for rolling the log (one per log).
51  */
52 
53 #define	LUFS_DEFAULT_NUM_ROLL_BUFS 16
54 #define	LUFS_DEFAULT_MIN_ROLL_BUFS 4
55 #define	LUFS_DEFAULT_MAX_ROLL_BUFS 64
56 
57 /*
58  * Macros
59  */
60 #define	logmap_need_roll(logmap) ((logmap)->mtm_nme > logmap_maxnme)
61 #define	ldl_empty(ul) ((ul)->un_head_lof == (ul)->un_tail_lof)
62 
63 /*
64  * Tunables
65  */
66 uint32_t lufs_num_roll_bufs = LUFS_DEFAULT_NUM_ROLL_BUFS;
67 uint32_t lufs_min_roll_bufs = LUFS_DEFAULT_MIN_ROLL_BUFS;
68 uint32_t lufs_max_roll_bufs = LUFS_DEFAULT_MAX_ROLL_BUFS;
69 long logmap_maxnme = 1536;
70 int trans_roll_tics = 0;
71 uint64_t trans_roll_new_delta = 0;
72 uint64_t lrr_wait = 0;
73 /*
74  * Key for thread specific data for the roll thread to
75  * bypass snapshot throttling
76  */
77 uint_t bypass_snapshot_throttle_key;
78 
79 /*
80  * externs
81  */
82 extern kmutex_t		ml_scan;
83 extern kcondvar_t	ml_scan_cv;
84 extern int		maxphys;
85 
86 static void
87 trans_roll_wait(mt_map_t *logmap, callb_cpr_t *cprinfop)
88 {
89 	mutex_enter(&logmap->mtm_mutex);
90 	logmap->mtm_ref = 0;
91 	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
92 		cv_broadcast(&logmap->mtm_from_roll_cv);
93 	}
94 	logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLLING);
95 	CALLB_CPR_SAFE_BEGIN(cprinfop);
96 	(void) cv_timedwait(&logmap->mtm_to_roll_cv, &logmap->mtm_mutex,
97 			lbolt + trans_roll_tics);
98 	CALLB_CPR_SAFE_END(cprinfop, &logmap->mtm_mutex);
99 	logmap->mtm_flags |= MTM_ROLLING;
100 	mutex_exit(&logmap->mtm_mutex);
101 }
102 
103 /*
104  * returns the number of 8K buffers to use for rolling the log
105  */
106 static uint32_t
107 log_roll_buffers()
108 {
109 	/*
110 	 * sanity validate the tunable lufs_num_roll_bufs
111 	 */
112 	if (lufs_num_roll_bufs < lufs_min_roll_bufs) {
113 		return (lufs_min_roll_bufs);
114 	}
115 	if (lufs_num_roll_bufs > lufs_max_roll_bufs) {
116 		return (lufs_max_roll_bufs);
117 	}
118 	return (lufs_num_roll_bufs);
119 }
120 
121 /*
122  * Find something to roll, then if we don't have cached roll buffers
123  * covering all the deltas in that MAPBLOCK then read the master
124  * and overlay the deltas.
125  * returns;
126  * 	0 if sucessful
127  *	1 on finding nothing to roll
128  *	2 on error
129  */
130 int
131 log_roll_read(ml_unit_t *ul, rollbuf_t *rbs, int nmblk, caddr_t roll_bufs,
132     int *retnbuf)
133 {
134 	offset_t	mof;
135 	buf_t		*bp;
136 	rollbuf_t	*rbp;
137 	mt_map_t	*logmap = ul->un_logmap;
138 	daddr_t		mblkno;
139 	int		i;
140 	int		error;
141 	int		nbuf;
142 
143 	/*
144 	 * Make sure there is really something to roll
145 	 */
146 	mof = 0;
147 	if (!logmap_next_roll(logmap, &mof)) {
148 		return (1);
149 	}
150 
151 	/*
152 	 * build some master blocks + deltas to roll forward
153 	 */
154 	rw_enter(&logmap->mtm_rwlock, RW_READER);
155 	nbuf = 0;
156 	do {
157 		mof = mof & (offset_t)MAPBLOCKMASK;
158 		mblkno = lbtodb(mof);
159 
160 		/*
161 		 * Check for the case of a new delta to a set up buffer
162 		 */
163 		for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
164 			if (P2ALIGN(rbp->rb_bh.b_blkno,
165 			    MAPBLOCKSIZE / DEV_BSIZE) == mblkno) {
166 				TNF_PROBE_0(trans_roll_new_delta, "lufs",
167 				    /* CSTYLED */);
168 				trans_roll_new_delta++;
169 				/* Flush out the current set of buffers */
170 				goto flush_bufs;
171 			}
172 		}
173 
174 		/*
175 		 * Work out what to roll next. If it isn't cached then read
176 		 * it asynchronously from the master.
177 		 */
178 		bp = &rbp->rb_bh;
179 		bp->b_blkno = mblkno;
180 		bp->b_flags = B_READ;
181 		bp->b_un.b_addr = roll_bufs + (nbuf << MAPBLOCKSHIFT);
182 		bp->b_bufsize = MAPBLOCKSIZE;
183 		if (top_read_roll(rbp, ul)) {
184 			/* logmap deltas were in use */
185 			if (nbuf == 0) {
186 				/*
187 				 * On first buffer wait for the logmap user
188 				 * to finish by grabbing the logmap lock
189 				 * exclusively rather than spinning
190 				 */
191 				rw_exit(&logmap->mtm_rwlock);
192 				lrr_wait++;
193 				rw_enter(&logmap->mtm_rwlock, RW_WRITER);
194 				rw_exit(&logmap->mtm_rwlock);
195 				return (1);
196 			}
197 			/* we have at least one buffer - flush it */
198 			goto flush_bufs;
199 		}
200 		if ((bp->b_flags & B_INVAL) == 0) {
201 			nbuf++;
202 		}
203 		mof += MAPBLOCKSIZE;
204 	} while ((nbuf < nmblk) && logmap_next_roll(logmap, &mof));
205 
206 	/*
207 	 * If there was nothing to roll cycle back
208 	 */
209 	if (nbuf == 0) {
210 		rw_exit(&logmap->mtm_rwlock);
211 		return (1);
212 	}
213 
214 flush_bufs:
215 	/*
216 	 * For each buffer, if it isn't cached then wait for the read to
217 	 * finish and overlay the deltas.
218 	 */
219 	for (error = 0, i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
220 		if (!rbp->rb_crb) {
221 			bp = &rbp->rb_bh;
222 			if (trans_not_wait(bp)) {
223 				ldl_seterror(ul,
224 				    "Error reading master during ufs log roll");
225 				error = 1;
226 			}
227 			/*
228 			 * sync read the data from the log
229 			 */
230 			if (ldl_read(ul, bp->b_un.b_addr,
231 			    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK,
232 			    MAPBLOCKSIZE, rbp->rb_age)) {
233 				error = 1;
234 			}
235 		}
236 
237 		/*
238 		 * reset the age bit in the age list
239 		 */
240 		logmap_list_put_roll(logmap, rbp->rb_age);
241 
242 		if (ul->un_flags & LDL_ERROR) {
243 			error = 1;
244 		}
245 	}
246 	rw_exit(&logmap->mtm_rwlock);
247 	if (error)
248 		return (2);
249 	*retnbuf = nbuf;
250 	return (0);
251 }
252 
253 /*
254  * Write out a cached roll buffer
255  */
256 void
257 log_roll_write_crb(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
258 {
259 	crb_t *crb = rbp->rb_crb;
260 	buf_t *bp = &rbp->rb_bh;
261 
262 	bp->b_blkno = lbtodb(crb->c_mof);
263 	bp->b_un.b_addr = crb->c_buf;
264 	bp->b_bcount = crb->c_nb;
265 	bp->b_bufsize = crb->c_nb;
266 	ASSERT((crb->c_nb & DEV_BMASK) == 0);
267 	bp->b_flags = B_WRITE;
268 	logstats.ls_rwrites.value.ui64++;
269 
270 	/* if snapshots are enabled, call it */
271 	if (ufsvfsp->vfs_snapshot) {
272 		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
273 	} else {
274 		(void) bdev_strategy(bp);
275 	}
276 }
277 
278 /*
279  * Write out a set of non cached roll buffers
280  */
281 void
282 log_roll_write_bufs(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
283 {
284 	buf_t		*bp = &rbp->rb_bh;
285 	buf_t		*bp2;
286 	rbsecmap_t	secmap = rbp->rb_secmap;
287 	int		j, k;
288 
289 	ASSERT(secmap);
290 	ASSERT((bp->b_flags & B_INVAL) == 0);
291 
292 	do { /* for each contiguous block of sectors */
293 		/* find start of next sector to write */
294 		for (j = 0; j < 16; ++j) {
295 			if (secmap & UINT16_C(1))
296 				break;
297 			secmap >>= 1;
298 		}
299 		bp->b_un.b_addr += (j << DEV_BSHIFT);
300 		bp->b_blkno += j;
301 
302 		/* calculate number of sectors */
303 		secmap >>= 1;
304 		j++;
305 		for (k = 1; j < 16; ++j) {
306 			if ((secmap & UINT16_C(1)) == 0)
307 				break;
308 			secmap >>= 1;
309 			k++;
310 		}
311 		bp->b_bcount = k << DEV_BSHIFT;
312 		bp->b_flags = B_WRITE;
313 		logstats.ls_rwrites.value.ui64++;
314 
315 		/* if snapshots are enabled, call it */
316 		if (ufsvfsp->vfs_snapshot)
317 			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
318 		else
319 			(void) bdev_strategy(bp);
320 		if (secmap) {
321 			/*
322 			 * Allocate another buf_t to handle
323 			 * the next write in this MAPBLOCK
324 			 * Chain them via b_list.
325 			 */
326 			bp2 = kmem_alloc(sizeof (buf_t), KM_SLEEP);
327 			bp->b_list = bp2;
328 			bioinit(bp2);
329 			bp2->b_iodone = trans_not_done;
330 			bp2->b_bufsize = MAPBLOCKSIZE;
331 			bp2->b_edev = bp->b_edev;
332 			bp2->b_un.b_addr =
333 			    bp->b_un.b_addr + bp->b_bcount;
334 			bp2->b_blkno = bp->b_blkno + k;
335 			bp = bp2;
336 		}
337 	} while (secmap);
338 }
339 
340 /*
341  * Asynchronously roll the deltas, using the sector map
342  * in each rollbuf_t.
343  */
344 int
345 log_roll_write(ml_unit_t *ul, rollbuf_t *rbs, int nbuf)
346 {
347 
348 	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
349 	rollbuf_t	*rbp;
350 	buf_t		*bp, *bp2;
351 	rollbuf_t	*head, *prev, *rbp2;
352 
353 	/*
354 	 * Order the buffers by blkno
355 	 */
356 	ASSERT(nbuf > 0);
357 #ifdef lint
358 	prev = rbs;
359 #endif
360 	for (head = rbs, rbp = rbs + 1; rbp < rbs + nbuf; rbp++) {
361 		for (rbp2 = head; rbp2; prev = rbp2, rbp2 = rbp2->rb_next) {
362 			if (rbp->rb_bh.b_blkno < rbp2->rb_bh.b_blkno) {
363 				if (rbp2 == head) {
364 					rbp->rb_next = head;
365 					head = rbp;
366 				} else {
367 					prev->rb_next = rbp;
368 					rbp->rb_next = rbp2;
369 				}
370 				break;
371 			}
372 		}
373 		if (rbp2 == NULL) {
374 			prev->rb_next = rbp;
375 			rbp->rb_next = NULL;
376 		}
377 	}
378 
379 	/*
380 	 * issue the in-order writes
381 	 */
382 	for (rbp = head; rbp; rbp = rbp2) {
383 		if (rbp->rb_crb) {
384 			log_roll_write_crb(ufsvfsp, rbp);
385 		} else {
386 			log_roll_write_bufs(ufsvfsp, rbp);
387 		}
388 		/* null out the rb_next link for next set of rolling */
389 		rbp2 = rbp->rb_next;
390 		rbp->rb_next = NULL;
391 	}
392 
393 	/*
394 	 * wait for all the writes to finish
395 	 */
396 	for (rbp = rbs; rbp < rbs + nbuf; rbp++) {
397 		bp = &rbp->rb_bh;
398 		if (trans_not_wait(bp)) {
399 			ldl_seterror(ul,
400 			    "Error writing master during ufs log roll");
401 		}
402 
403 		/*
404 		 * Now wait for all the "cloned" buffer writes (if any)
405 		 * and free those headers
406 		 */
407 		bp2 = bp->b_list;
408 		bp->b_list = NULL;
409 		while (bp2) {
410 			if (trans_not_wait(bp2)) {
411 				ldl_seterror(ul,
412 				    "Error writing master during ufs log roll");
413 			}
414 			bp = bp2;
415 			bp2 = bp2->b_list;
416 			kmem_free(bp, sizeof (buf_t));
417 		}
418 	}
419 
420 	if (ul->un_flags & LDL_ERROR)
421 		return (1);
422 	return (0);
423 }
424 
425 void
426 trans_roll(ml_unit_t *ul)
427 {
428 	callb_cpr_t	cprinfo;
429 	mt_map_t	*logmap = ul->un_logmap;
430 	rollbuf_t	*rbs;
431 	rollbuf_t	*rbp;
432 	buf_t		*bp;
433 	caddr_t		roll_bufs;
434 	uint32_t	nmblk;
435 	int		i;
436 	int		doingforceroll;
437 	int		nbuf;
438 
439 	CALLB_CPR_INIT(&cprinfo, &logmap->mtm_mutex, callb_generic_cpr,
440 	    "trans_roll");
441 
442 	/*
443 	 * We do not want the roll thread's writes to be
444 	 * throttled by the snapshot.
445 	 * If they are throttled then we can have a deadlock
446 	 * between the roll thread and the snapshot taskq thread:
447 	 * roll thread wants the throttling semaphore and
448 	 * the snapshot taskq thread cannot release the semaphore
449 	 * because it is writing to the log and the log is full.
450 	 */
451 
452 	(void) tsd_set(bypass_snapshot_throttle_key, (void*)1);
453 
454 	/*
455 	 * setup some roll parameters
456 	 */
457 	if (trans_roll_tics == 0)
458 		trans_roll_tics = 5 * hz;
459 	nmblk = log_roll_buffers();
460 
461 	/*
462 	 * allocate the buffers and buffer headers
463 	 */
464 	roll_bufs = kmem_alloc(nmblk * MAPBLOCKSIZE, KM_SLEEP);
465 	rbs = kmem_alloc(nmblk * sizeof (rollbuf_t), KM_SLEEP);
466 
467 	/*
468 	 * initialize the buffer headers
469 	 */
470 	for (i = 0, rbp = rbs; i < nmblk; ++i, ++rbp) {
471 		rbp->rb_next = NULL;
472 		bp = &rbp->rb_bh;
473 		bioinit(bp);
474 		bp->b_edev = ul->un_dev;
475 		bp->b_iodone = trans_not_done;
476 		bp->b_bufsize = MAPBLOCKSIZE;
477 	}
478 
479 	doingforceroll = 0;
480 
481 again:
482 	/*
483 	 * LOOP FOREVER
484 	 */
485 
486 	/*
487 	 * exit on demand
488 	 */
489 	mutex_enter(&logmap->mtm_mutex);
490 	if ((ul->un_flags & LDL_ERROR) || (logmap->mtm_flags & MTM_ROLL_EXIT)) {
491 		kmem_free(rbs, nmblk * sizeof (rollbuf_t));
492 		kmem_free(roll_bufs, nmblk * MAPBLOCKSIZE);
493 		logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_RUNNING |
494 		    MTM_ROLL_EXIT | MTM_ROLLING);
495 		cv_broadcast(&logmap->mtm_from_roll_cv);
496 		CALLB_CPR_EXIT(&cprinfo);
497 		thread_exit();
498 		/* NOTREACHED */
499 	}
500 
501 	/*
502 	 * MT_SCAN debug mode
503 	 *	don't roll except in FORCEROLL situations
504 	 */
505 	if (logmap->mtm_debug & MT_SCAN)
506 		if ((logmap->mtm_flags & MTM_FORCE_ROLL) == 0) {
507 			mutex_exit(&logmap->mtm_mutex);
508 			trans_roll_wait(logmap, &cprinfo);
509 			goto again;
510 		}
511 	ASSERT(logmap->mtm_trimlof == 0);
512 
513 	/*
514 	 * If we've finished a force roll cycle then wakeup any
515 	 * waiters.
516 	 */
517 	if (doingforceroll) {
518 		doingforceroll = 0;
519 		logmap->mtm_flags &= ~MTM_FORCE_ROLL;
520 		mutex_exit(&logmap->mtm_mutex);
521 		cv_broadcast(&logmap->mtm_from_roll_cv);
522 	} else {
523 		mutex_exit(&logmap->mtm_mutex);
524 	}
525 
526 	/*
527 	 * If someone wants us to roll something; then do it
528 	 */
529 	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
530 		doingforceroll = 1;
531 		goto rollsomething;
532 	}
533 
534 	/*
535 	 * Log is busy, check if logmap is getting full.
536 	 */
537 	if (logmap_need_roll(logmap)) {
538 		goto rollsomething;
539 	}
540 
541 	/*
542 	 * Check if the log is idle and is not empty
543 	 */
544 	if (!logmap->mtm_ref && !ldl_empty(ul)) {
545 		goto rollsomething;
546 	}
547 
548 	/*
549 	 * Log is busy, check if its getting full
550 	 */
551 	if (ldl_need_roll(ul)) {
552 		goto rollsomething;
553 	}
554 
555 	/*
556 	 * nothing to do; wait a bit and then start over
557 	 */
558 	trans_roll_wait(logmap, &cprinfo);
559 	goto again;
560 
561 	/*
562 	 * ROLL SOMETHING
563 	 */
564 
565 rollsomething:
566 	/*
567 	 * Use the cached roll buffers, or read the master
568 	 * and overlay the deltas
569 	 */
570 	switch (log_roll_read(ul, rbs, nmblk, roll_bufs, &nbuf)) {
571 	case 1: trans_roll_wait(logmap, &cprinfo);
572 		/* FALLTHROUGH */
573 	case 2: goto again;
574 	/* default case is success */
575 	}
576 
577 	/*
578 	 * Asynchronously write out the deltas
579 	 */
580 	if (log_roll_write(ul, rbs, nbuf))
581 		goto again;
582 
583 	/*
584 	 * free up the deltas in the logmap
585 	 */
586 	for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
587 		bp = &rbp->rb_bh;
588 		logmap_remove_roll(logmap,
589 		    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK, MAPBLOCKSIZE);
590 	}
591 
592 	/*
593 	 * free up log space; if possible
594 	 */
595 	logmap_sethead(logmap, ul);
596 
597 	/*
598 	 * LOOP
599 	 */
600 	goto again;
601 }
602