1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /* copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
28
29 /*
30 * Copyright (c) 2017 by Delphix. All rights reserved.
31 */
32
33 /*
34 * Portions of this source code were derived from Berkeley 4.3 BSD
35 * under license from the Regents of the University of California.
36 */
37
38 #include <sys/types.h>
39 #include <sys/systm.h>
40 #include <sys/errno.h>
41 #include <sys/kmem.h>
42 #include <sys/buf.h>
43 #include <sys/vnode.h>
44 #include <sys/vfs.h>
45 #include <sys/user.h>
46 #include <sys/callb.h>
47 #include <sys/cpuvar.h>
48 #include <sys/fs/ufs_inode.h>
49 #include <sys/fs/ufs_log.h>
50 #include <sys/fs/ufs_trans.h>
51 #include <sys/fs/ufs_acl.h>
52 #include <sys/fs/ufs_bio.h>
53 #include <sys/fs/ufs_fsdir.h>
54 #include <sys/debug.h>
55 #include <sys/cmn_err.h>
56 #include <sys/sysmacros.h>
57 #include <vm/pvn.h>
58
59 extern pri_t minclsyspri;
60 extern int hash2ints();
61 extern struct kmem_cache *inode_cache; /* cache of free inodes */
62 extern int ufs_idle_waiters;
63 extern struct instats ins;
64
65 static void ufs_attr_purge(struct inode *);
66
67 /*
68 * initialize a thread's queue struct
69 */
70 void
ufs_thread_init(struct ufs_q * uq,int lowat)71 ufs_thread_init(struct ufs_q *uq, int lowat)
72 {
73 bzero((caddr_t)uq, sizeof (*uq));
74 cv_init(&uq->uq_cv, NULL, CV_DEFAULT, NULL);
75 mutex_init(&uq->uq_mutex, NULL, MUTEX_DEFAULT, NULL);
76 uq->uq_lowat = lowat;
77 uq->uq_hiwat = 2 * lowat;
78 uq->uq_threadp = NULL;
79 }
80
81 /*
82 * start a thread for a queue (assumes success)
83 */
84 void
ufs_thread_start(struct ufs_q * uq,void (* func)(),struct vfs * vfsp)85 ufs_thread_start(struct ufs_q *uq, void (*func)(), struct vfs *vfsp)
86 {
87 mutex_enter(&uq->uq_mutex);
88 if (uq->uq_threadp == NULL) {
89 uq->uq_threadp = thread_create(NULL, 0, func, vfsp, 0, &p0,
90 TS_RUN, minclsyspri);
91 uq->uq_flags = 0;
92 }
93 mutex_exit(&uq->uq_mutex);
94 }
95
96 /*
97 * wait for the thread to exit
98 */
99 void
ufs_thread_exit(struct ufs_q * uq)100 ufs_thread_exit(struct ufs_q *uq)
101 {
102 kt_did_t ufs_thread_did = 0;
103
104 mutex_enter(&uq->uq_mutex);
105 uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED);
106 if (uq->uq_threadp != NULL) {
107 ufs_thread_did = uq->uq_threadp->t_did;
108 uq->uq_flags |= (UQ_EXIT|UQ_WAIT);
109 cv_broadcast(&uq->uq_cv);
110 }
111 mutex_exit(&uq->uq_mutex);
112
113 /*
114 * It's safe to call thread_join() with an already-gone
115 * t_did, but we have to obtain it before the kernel
116 * thread structure is freed. We do so above under the
117 * protection of the uq_mutex when we're sure the thread
118 * still exists and it's save to de-reference it.
119 * We also have to check if ufs_thread_did is != 0
120 * before calling thread_join() since thread 0 in the system
121 * gets a t_did of 0.
122 */
123 if (ufs_thread_did)
124 thread_join(ufs_thread_did);
125 }
126
127 /*
128 * wait for a thread to suspend itself on the caller's behalf
129 * the caller is responsible for continuing the thread
130 */
131 void
ufs_thread_suspend(struct ufs_q * uq)132 ufs_thread_suspend(struct ufs_q *uq)
133 {
134 mutex_enter(&uq->uq_mutex);
135 if (uq->uq_threadp != NULL) {
136 /*
137 * wait while another thread is suspending this thread.
138 * no need to do a cv_broadcast(), as whoever suspended
139 * the thread must continue it at some point.
140 */
141 while ((uq->uq_flags & UQ_SUSPEND) &&
142 (uq->uq_threadp != NULL)) {
143 /*
144 * We can't use cv_signal() because if our
145 * signal doesn't happen to hit the desired
146 * thread but instead some other waiter like
147 * ourselves, we'll wait forever for a
148 * response. Well, at least an indeterminate
149 * amount of time until we just happen to get
150 * lucky from whomever did get signalled doing
151 * a cv_signal() of their own. This is an
152 * unfortunate performance lossage.
153 */
154 uq->uq_flags |= UQ_WAIT;
155 cv_wait(&uq->uq_cv, &uq->uq_mutex);
156 }
157
158 uq->uq_flags |= (UQ_SUSPEND | UQ_WAIT);
159
160 /*
161 * wait for the thread to suspend itself
162 */
163 if ((uq->uq_flags & UQ_SUSPENDED) == 0 &&
164 (uq->uq_threadp != NULL)) {
165 cv_broadcast(&uq->uq_cv);
166 }
167
168 while (((uq->uq_flags & UQ_SUSPENDED) == 0) &&
169 (uq->uq_threadp != NULL)) {
170 cv_wait(&uq->uq_cv, &uq->uq_mutex);
171 }
172 }
173 mutex_exit(&uq->uq_mutex);
174 }
175
176 /*
177 * allow a thread to continue from a ufs_thread_suspend()
178 * This thread must be the same as the thread that called
179 * ufs_thread_suspend.
180 */
181 void
ufs_thread_continue(struct ufs_q * uq)182 ufs_thread_continue(struct ufs_q *uq)
183 {
184 mutex_enter(&uq->uq_mutex);
185 uq->uq_flags &= ~(UQ_SUSPEND | UQ_SUSPENDED);
186 cv_broadcast(&uq->uq_cv);
187 mutex_exit(&uq->uq_mutex);
188 }
189
190 /*
191 * some common code for managing a threads execution
192 * uq is locked at entry and return
193 * may sleep
194 * may exit
195 */
196 /*
197 * Kind of a hack passing in the callb_cpr_t * here.
198 * It should really be part of the ufs_q structure.
199 * I did not put it in there because we are already in beta
200 * and I was concerned that changing ufs_inode.h to include
201 * callb.h might break something.
202 */
203 int
ufs_thread_run(struct ufs_q * uq,callb_cpr_t * cprinfop)204 ufs_thread_run(struct ufs_q *uq, callb_cpr_t *cprinfop)
205 {
206 again:
207 ASSERT(uq->uq_ne >= 0);
208
209 if (uq->uq_flags & UQ_SUSPEND) {
210 uq->uq_flags |= UQ_SUSPENDED;
211 } else if (uq->uq_flags & UQ_EXIT) {
212 /*
213 * exiting; empty the queue (may infinite loop)
214 */
215 if (uq->uq_ne)
216 return (uq->uq_ne);
217 uq->uq_threadp = NULL;
218 if (uq->uq_flags & UQ_WAIT) {
219 cv_broadcast(&uq->uq_cv);
220 }
221 uq->uq_flags &= ~(UQ_EXIT | UQ_WAIT);
222 CALLB_CPR_EXIT(cprinfop);
223 thread_exit();
224 } else if (uq->uq_ne >= uq->uq_lowat) {
225 /*
226 * process a block of entries until below high water mark
227 */
228 return (uq->uq_ne - (uq->uq_lowat >> 1));
229 }
230 if (uq->uq_flags & UQ_WAIT) {
231 uq->uq_flags &= ~UQ_WAIT;
232 cv_broadcast(&uq->uq_cv);
233 }
234 CALLB_CPR_SAFE_BEGIN(cprinfop);
235 cv_wait(&uq->uq_cv, &uq->uq_mutex);
236 CALLB_CPR_SAFE_END(cprinfop, &uq->uq_mutex);
237 goto again;
238 }
239
240 /*
241 * DELETE INODE
242 * The following routines implement the protocol for freeing the resources
243 * held by an idle and deleted inode.
244 */
245 void
ufs_delete(struct ufsvfs * ufsvfsp,struct inode * ip,int dolockfs)246 ufs_delete(struct ufsvfs *ufsvfsp, struct inode *ip, int dolockfs)
247 {
248 ushort_t mode;
249 struct vnode *vp = ITOV(ip);
250 struct ulockfs *ulp;
251 int trans_size;
252 int dorwlock = ((ip->i_mode & IFMT) == IFREG);
253 int issync;
254 int err;
255 struct inode *dp;
256 struct ufs_q *delq = &ufsvfsp->vfs_delete;
257 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
258
259 /*
260 * Ignore if deletes are not allowed (wlock/hlock)
261 */
262 if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
263 mutex_enter(&delq->uq_mutex);
264 delq_info->delq_unreclaimed_blocks -= ip->i_blocks;
265 delq_info->delq_unreclaimed_files--;
266 mutex_exit(&delq->uq_mutex);
267 VN_RELE(vp);
268 return;
269 }
270
271 if ((vp->v_count > 1) || (ip->i_mode == 0)) {
272 mutex_enter(&delq->uq_mutex);
273 delq_info->delq_unreclaimed_blocks -= ip->i_blocks;
274 delq_info->delq_unreclaimed_files--;
275 mutex_exit(&delq->uq_mutex);
276 VN_RELE(vp);
277 return;
278 }
279 /*
280 * If we are called as part of setting a fs lock, then only
281 * do part of the lockfs protocol. In other words, don't hang.
282 */
283 if (dolockfs) {
284 if (ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_DELETE_MASK))
285 return;
286 } else {
287 /*
288 * check for recursive VOP call
289 */
290 if (curthread->t_flag & T_DONTBLOCK) {
291 ulp = NULL;
292 } else {
293 ulp = &ufsvfsp->vfs_ulockfs;
294 curthread->t_flag |= T_DONTBLOCK;
295 }
296 }
297
298 /*
299 * Hold rwlock to synchronize with (nfs) writes
300 */
301 if (dorwlock)
302 rw_enter(&ip->i_rwlock, RW_WRITER);
303
304 /*
305 * Delete the attribute directory.
306 */
307 if (ip->i_oeftflag != 0) {
308 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE,
309 trans_size = (int)TOP_REMOVE_SIZE(ip));
310 rw_enter(&ip->i_contents, RW_WRITER);
311 err = ufs_iget(ip->i_vfs, ip->i_oeftflag,
312 &dp, CRED());
313 if (err == 0) {
314 rw_enter(&dp->i_rwlock, RW_WRITER);
315 rw_enter(&dp->i_contents, RW_WRITER);
316 dp->i_flag |= IUPD|ICHG;
317 dp->i_seq++;
318 TRANS_INODE(dp->i_ufsvfs, dp);
319 dp->i_nlink -= 2;
320 ufs_setreclaim(dp);
321 /*
322 * Should get rid of any negative cache entries that
323 * might be lingering, as well as ``.'' and
324 * ``..''. If we don't, the VN_RELE() below
325 * won't actually put dp on the delete queue
326 * and it'll hang out until someone forces it
327 * (lockfs -f, umount, ...). The only reliable
328 * way of doing this at the moment is to call
329 * dnlc_purge_vp(ITOV(dp)), which is unacceptably
330 * slow, so we'll just note the problem in this
331 * comment for now.
332 */
333 dnlc_remove(ITOV(dp), ".");
334 dnlc_remove(ITOV(dp), "..");
335 ITIMES_NOLOCK(dp);
336 if (!TRANS_ISTRANS(ufsvfsp)) {
337 ufs_iupdat(dp, I_SYNC);
338 }
339 rw_exit(&dp->i_contents);
340 rw_exit(&dp->i_rwlock);
341 VN_RELE(ITOV(dp));
342 }
343 /*
344 * Clear out attribute pointer
345 */
346 ip->i_oeftflag = 0;
347 rw_exit(&ip->i_contents);
348 TRANS_END_CSYNC(ufsvfsp, err, issync,
349 TOP_REMOVE, trans_size);
350 dnlc_remove(ITOV(ip), XATTR_DIR_NAME);
351 }
352
353 if ((ip->i_mode & IFMT) == IFATTRDIR) {
354 ufs_attr_purge(ip);
355 }
356
357 (void) TRANS_ITRUNC(ip, (u_offset_t)0, I_FREE | I_ACCT, CRED());
358
359 /*
360 * the inode's space has been freed; now free the inode
361 */
362 if (ulp) {
363 trans_size = TOP_IFREE_SIZE(ip);
364 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IFREE, trans_size);
365 }
366 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
367 rw_enter(&ip->i_contents, RW_WRITER);
368 TRANS_INODE(ufsvfsp, ip);
369 mode = ip->i_mode;
370 ip->i_mode = 0;
371 ip->i_rdev = 0;
372 ip->i_ordev = 0;
373 ip->i_flag |= IMOD;
374 if (ip->i_ufs_acl) {
375 (void) ufs_si_free(ip->i_ufs_acl, vp->v_vfsp, CRED());
376 ip->i_ufs_acl = NULL;
377 ip->i_shadow = 0;
378 }
379
380 /*
381 * This inode is torn down but still retains it's identity
382 * (inode number). It could get recycled soon so it's best
383 * to clean up the vnode just in case.
384 */
385 mutex_enter(&vp->v_lock);
386 vn_recycle(vp);
387 mutex_exit(&vp->v_lock);
388
389 /*
390 * free the inode
391 */
392 ufs_ifree(ip, ip->i_number, mode);
393 /*
394 * release quota resources; can't fail
395 */
396 (void) chkiq((struct ufsvfs *)vp->v_vfsp->vfs_data,
397 /* change */ -1, ip, (uid_t)ip->i_uid, 0, CRED(),
398 (char **)NULL, (size_t *)NULL);
399 dqrele(ip->i_dquot);
400 ip->i_dquot = NULL;
401 ip->i_flag &= ~(IDEL | IDIRECTIO);
402 ip->i_cflags = 0;
403 if (!TRANS_ISTRANS(ufsvfsp)) {
404 ufs_iupdat(ip, I_SYNC);
405 } else {
406 mutex_enter(&delq->uq_mutex);
407 delq_info->delq_unreclaimed_files--;
408 mutex_exit(&delq->uq_mutex);
409 }
410 rw_exit(&ip->i_contents);
411 rw_exit(&ufsvfsp->vfs_dqrwlock);
412 if (dorwlock)
413 rw_exit(&ip->i_rwlock);
414 VN_RELE(vp);
415
416 /*
417 * End of transaction
418 */
419 if (ulp) {
420 TRANS_END_ASYNC(ufsvfsp, TOP_IFREE, trans_size);
421 if (dolockfs)
422 ufs_lockfs_end(ulp);
423 else
424 curthread->t_flag &= ~T_DONTBLOCK;
425 }
426 }
427
428 /*
429 * Create the delete thread and init the delq_info for this fs
430 */
431 void
ufs_delete_init(struct ufsvfs * ufsvfsp,int lowat)432 ufs_delete_init(struct ufsvfs *ufsvfsp, int lowat)
433 {
434 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
435
436 ufs_thread_init(&ufsvfsp->vfs_delete, lowat);
437 (void) memset((void *)delq_info, 0, sizeof (*delq_info));
438 }
439
440 /*
441 * thread that frees up deleted inodes
442 */
443 void
ufs_thread_delete(struct vfs * vfsp)444 ufs_thread_delete(struct vfs *vfsp)
445 {
446 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
447 struct ufs_q *uq = &ufsvfsp->vfs_delete;
448 struct inode *ip;
449 long ne;
450 callb_cpr_t cprinfo;
451
452 CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr,
453 "ufsdelete");
454
455 mutex_enter(&uq->uq_mutex);
456 again:
457 /*
458 * Sleep until there is work to do. Only do one entry at
459 * a time, to reduce the wait time for checking for a suspend
460 * request. The ?: is for pedantic portability.
461 */
462 ne = ufs_thread_run(uq, &cprinfo) ? 1 : 0;
463
464 /*
465 * process an entry, if there are any
466 */
467 if (ne && (ip = uq->uq_ihead)) {
468 /*
469 * process first entry on queue. Assumed conditions are:
470 * ip is held (v_count >= 1)
471 * ip is referenced (i_flag & IREF)
472 * ip is free (i_nlink <= 0)
473 */
474 if ((uq->uq_ihead = ip->i_freef) == ip)
475 uq->uq_ihead = NULL;
476 ip->i_freef->i_freeb = ip->i_freeb;
477 ip->i_freeb->i_freef = ip->i_freef;
478 ip->i_freef = ip;
479 ip->i_freeb = ip;
480 uq->uq_ne--;
481 mutex_exit(&uq->uq_mutex);
482 ufs_delete(ufsvfsp, ip, 1);
483 mutex_enter(&uq->uq_mutex);
484 }
485 goto again;
486 }
487
488 /*
489 * drain ne entries off the delete queue. As new queue entries may
490 * be added while we're working, ne is interpreted as follows:
491 *
492 * ne > 0 => remove up to ne entries
493 * ne == 0 => remove all entries currently on the queue
494 * ne == -1 => remove entries until the queue is empty
495 */
496 void
ufs_delete_drain(struct vfs * vfsp,int ne,int dolockfs)497 ufs_delete_drain(struct vfs *vfsp, int ne, int dolockfs)
498 {
499 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
500 struct ufs_q *uq;
501 struct inode *ip;
502 int drain_cnt = 0;
503 int done;
504
505 /*
506 * if forcibly unmounted; ignore
507 */
508 if (ufsvfsp == NULL)
509 return;
510
511 uq = &ufsvfsp->vfs_delete;
512 mutex_enter(&uq->uq_mutex);
513 if (ne == 0)
514 drain_cnt = uq->uq_ne;
515 else if (ne > 0)
516 drain_cnt = ne;
517
518 /*
519 * process up to ne entries
520 */
521
522 done = 0;
523 while (!done && (ip = uq->uq_ihead)) {
524 if (ne != -1)
525 drain_cnt--;
526 if (ne != -1 && drain_cnt == 0)
527 done = 1;
528 if ((uq->uq_ihead = ip->i_freef) == ip)
529 uq->uq_ihead = NULL;
530 ip->i_freef->i_freeb = ip->i_freeb;
531 ip->i_freeb->i_freef = ip->i_freef;
532 ip->i_freef = ip;
533 ip->i_freeb = ip;
534 uq->uq_ne--;
535 mutex_exit(&uq->uq_mutex);
536 ufs_delete(ufsvfsp, ip, dolockfs);
537 mutex_enter(&uq->uq_mutex);
538 }
539 mutex_exit(&uq->uq_mutex);
540 }
541
542 void
ufs_sync_with_thread(struct ufs_q * uq)543 ufs_sync_with_thread(struct ufs_q *uq)
544 {
545 mutex_enter(&uq->uq_mutex);
546
547 /*
548 * Wake up delete thread to free up space.
549 */
550 if ((uq->uq_flags & UQ_WAIT) == 0) {
551 uq->uq_flags |= UQ_WAIT;
552 cv_broadcast(&uq->uq_cv);
553 }
554
555 while ((uq->uq_threadp != NULL) && (uq->uq_flags & UQ_WAIT)) {
556 cv_wait(&uq->uq_cv, &uq->uq_mutex);
557 }
558
559 mutex_exit(&uq->uq_mutex);
560 }
561
562 /*
563 * Get rid of everything that's currently in the delete queue,
564 * plus whatever the delete thread is working on at the moment.
565 *
566 * This ability is required for providing true POSIX semantics
567 * regarding close(2), unlink(2), etc, even when logging is enabled.
568 * The standard requires that the released space be immediately
569 * observable (statvfs(2)) and allocatable (e.g., write(2)).
570 */
571 void
ufs_delete_drain_wait(struct ufsvfs * ufsvfsp,int dolockfs)572 ufs_delete_drain_wait(struct ufsvfs *ufsvfsp, int dolockfs)
573 {
574 struct ufs_q *uq = &ufsvfsp->vfs_delete;
575 int error;
576 struct ufs_q *delq = &ufsvfsp->vfs_delete;
577 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
578
579 /*
580 * If there is something on delq or delete thread
581 * working on delq.
582 */
583 mutex_enter(&delq->uq_mutex);
584 if (delq_info->delq_unreclaimed_files > 0) {
585 mutex_exit(&delq->uq_mutex);
586 (void) ufs_delete_drain(ufsvfsp->vfs_vfs, 0, dolockfs);
587 ufs_sync_with_thread(uq);
588 } else {
589 ASSERT(delq_info->delq_unreclaimed_files == 0);
590 mutex_exit(&delq->uq_mutex);
591 return;
592 }
593
594 /*
595 * Commit any outstanding transactions to make sure
596 * any canceled freed blocks are available for allocation.
597 */
598 curthread->t_flag |= T_DONTBLOCK;
599 TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE, error);
600 if (!error) {
601 TRANS_END_SYNC(ufsvfsp, error, TOP_COMMIT_UPDATE,
602 TOP_COMMIT_SIZE);
603 }
604 curthread->t_flag &= ~T_DONTBLOCK;
605 }
606
607 /*
608 * Adjust the resource usage in a struct statvfs based on
609 * what's in the delete queue.
610 *
611 * We do not consider the impact of ACLs or extended attributes
612 * that may be deleted as a side-effect of deleting a file.
613 * Those are metadata, and their sizes aren't reflected in the
614 * sizes returned by stat(), so this is not a problem.
615 */
616 void
ufs_delete_adjust_stats(struct ufsvfs * ufsvfsp,struct statvfs64 * sp)617 ufs_delete_adjust_stats(struct ufsvfs *ufsvfsp, struct statvfs64 *sp)
618 {
619 struct ufs_q *uq = &ufsvfsp->vfs_delete;
620 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
621
622 mutex_enter(&uq->uq_mutex);
623 /*
624 * The blocks accounted for in the delete queue info are
625 * counted in DEV_BSIZE chunks, but ufs_statvfs counts in
626 * filesystem fragments, so a conversion is required here.
627 */
628 sp->f_bfree += dbtofsb(ufsvfsp->vfs_fs,
629 delq_info->delq_unreclaimed_blocks);
630 sp->f_ffree += delq_info->delq_unreclaimed_files;
631 mutex_exit(&uq->uq_mutex);
632 }
633
634 /*
635 * IDLE INODE
636 * The following routines implement the protocol for maintaining an
637 * LRU list of idle inodes and for moving the idle inodes to the
638 * reuse list when the number of allocated inodes exceeds the user
639 * tunable high-water mark (ufs_ninode).
640 */
641
642 /*
643 * clean an idle inode and move it to the reuse list
644 */
645 static void
ufs_idle_free(struct inode * ip)646 ufs_idle_free(struct inode *ip)
647 {
648 int pages;
649 int hno;
650 kmutex_t *ihm;
651 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
652 struct vnode *vp = ITOV(ip);
653 int vn_has_data, vn_modified;
654
655 /*
656 * inode is held
657 */
658
659 /*
660 * remember `pages' for stats below
661 */
662 pages = (ip->i_mode && vn_has_cached_data(vp) && vp->v_type != VCHR);
663
664 /*
665 * start the dirty pages to disk and then invalidate them
666 * unless the inode is invalid (ISTALE)
667 */
668 if ((ip->i_flag & ISTALE) == 0) {
669 (void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_FREE);
670 (void) TRANS_SYNCIP(ip,
671 (TRANS_ISERROR(ufsvfsp)) ? B_INVAL | B_FORCE : B_INVAL,
672 I_ASYNC, TOP_SYNCIP_FREE);
673 }
674
675 /*
676 * wait for any current ufs_iget to finish and block future ufs_igets
677 */
678 ASSERT(ip->i_number != 0);
679 hno = INOHASH(ip->i_number);
680 ihm = &ih_lock[hno];
681 mutex_enter(ihm);
682
683 /*
684 * It must be guaranteed that v_count >= 2, otherwise
685 * something must be wrong with this vnode already.
686 * That is why we use VN_RELE_LOCKED() instead of VN_RELE().
687 * Acquire the vnode lock in case another thread is in
688 * VN_RELE().
689 */
690 mutex_enter(&vp->v_lock);
691
692 VERIFY3U(vp->v_count, >=, 2);
693
694 VN_RELE_LOCKED(vp);
695
696 vn_has_data = (vp->v_type != VCHR && vn_has_cached_data(vp));
697 vn_modified = (ip->i_flag & (IMOD|IMODACC|IACC|ICHG|IUPD|IATTCHG));
698
699 if (vp->v_count != 1 ||
700 ((vn_has_data || vn_modified) &&
701 ((ip->i_flag & ISTALE) == 0))) {
702 /*
703 * Another thread has referenced this inode while
704 * we are trying to free it. Call VN_RELE() to
705 * release our reference, if v_count > 1 data is
706 * present or one of the modified etc. flags was
707 * set, whereby ISTALE wasn't set.
708 * If we'd proceed with ISTALE set here, we might
709 * get ourselves into a deadlock situation.
710 */
711 mutex_exit(&vp->v_lock);
712 mutex_exit(ihm);
713 VN_RELE(vp);
714 } else {
715 /*
716 * The inode is currently unreferenced and can not
717 * acquire further references because it has no pages
718 * and the hash is locked. Inodes acquire references
719 * via the hash list or via their pages.
720 */
721
722 mutex_exit(&vp->v_lock);
723
724 /*
725 * remove it from the cache
726 */
727 remque(ip);
728 mutex_exit(ihm);
729 /*
730 * Stale inodes have no valid ufsvfs
731 */
732 if ((ip->i_flag & ISTALE) == 0 && ip->i_dquot) {
733 TRANS_DQRELE(ufsvfsp, ip->i_dquot);
734 ip->i_dquot = NULL;
735 }
736 if ((ip->i_flag & ISTALE) &&
737 vn_has_data) {
738 /*
739 * ISTALE inodes may have data
740 * and this data needs to be
741 * cleaned up.
742 */
743 (void) pvn_vplist_dirty(vp, (u_offset_t)0,
744 ufs_putapage, B_INVAL | B_TRUNC,
745 (struct cred *)NULL);
746 }
747 ufs_si_del(ip);
748 if (pages) {
749 CPU_STATS_ADDQ(CPU, sys, ufsipage, 1);
750 } else {
751 CPU_STATS_ADDQ(CPU, sys, ufsinopage, 1);
752 }
753 ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
754
755 /*
756 * We had better not have a vnode reference count > 1
757 * at this point, if we do then something is broken as
758 * this inode/vnode acquired a reference underneath of us.
759 */
760 ASSERT(vp->v_count == 1);
761
762 ufs_free_inode(ip);
763 }
764 }
765
766 /*
767 * this thread processes the global idle queue
768 */
769 iqhead_t *ufs_junk_iq;
770 iqhead_t *ufs_useful_iq;
771 int ufs_njunk_iq = 0;
772 int ufs_nuseful_iq = 0;
773 int ufs_niqhash;
774 int ufs_iqhashmask;
775 struct ufs_q ufs_idle_q;
776
777 void
ufs_thread_idle(void)778 ufs_thread_idle(void)
779 {
780 callb_cpr_t cprinfo;
781 int i;
782 int ne;
783
784 ufs_niqhash = (ufs_idle_q.uq_lowat >> 1) / IQHASHQLEN;
785 ufs_niqhash = 1 << highbit(ufs_niqhash); /* round up to power of 2 */
786 ufs_iqhashmask = ufs_niqhash - 1;
787 ufs_junk_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_junk_iq),
788 KM_SLEEP);
789 ufs_useful_iq = kmem_alloc(ufs_niqhash * sizeof (*ufs_useful_iq),
790 KM_SLEEP);
791
792 /* Initialize hash queue headers */
793 for (i = 0; i < ufs_niqhash; i++) {
794 ufs_junk_iq[i].i_freef = (inode_t *)&ufs_junk_iq[i];
795 ufs_junk_iq[i].i_freeb = (inode_t *)&ufs_junk_iq[i];
796 ufs_useful_iq[i].i_freef = (inode_t *)&ufs_useful_iq[i];
797 ufs_useful_iq[i].i_freeb = (inode_t *)&ufs_useful_iq[i];
798 }
799
800 CALLB_CPR_INIT(&cprinfo, &ufs_idle_q.uq_mutex, callb_generic_cpr,
801 "ufsidle");
802 again:
803 /*
804 * Whenever the idle thread is awakened, it repeatedly gives
805 * back half of the idle queue until the idle queue falls
806 * below lowat.
807 */
808 mutex_enter(&ufs_idle_q.uq_mutex);
809 if (ufs_idle_q.uq_ne < ufs_idle_q.uq_lowat) {
810 CALLB_CPR_SAFE_BEGIN(&cprinfo);
811 cv_wait(&ufs_idle_q.uq_cv, &ufs_idle_q.uq_mutex);
812 CALLB_CPR_SAFE_END(&cprinfo, &ufs_idle_q.uq_mutex);
813 }
814 mutex_exit(&ufs_idle_q.uq_mutex);
815
816 /*
817 * Give back 1/2 of the idle queue
818 */
819 ne = ufs_idle_q.uq_ne >> 1;
820 ins.in_tidles.value.ul += ne;
821 ufs_idle_some(ne);
822 goto again;
823 }
824
825 /*
826 * Reclaim callback for ufs inode cache.
827 * Invoked by the kernel memory allocator when memory gets tight.
828 */
829 /*ARGSUSED*/
830 void
ufs_inode_cache_reclaim(void * cdrarg)831 ufs_inode_cache_reclaim(void *cdrarg)
832 {
833 /*
834 * If we are low on memory and the idle queue is over its
835 * halfway mark, then free 50% of the idle q
836 *
837 * We don't free all of the idle inodes because the inodes
838 * for popular NFS files may have been kicked from the dnlc.
839 * The inodes for these files will end up on the idle queue
840 * after every NFS access.
841 *
842 * If we repeatedly push them from the idle queue then
843 * NFS users may be unhappy as an extra buf cache operation
844 * is incurred for every NFS operation to these files.
845 *
846 * It's not common, but I have seen it happen.
847 *
848 */
849 if (ufs_idle_q.uq_ne < (ufs_idle_q.uq_lowat >> 1))
850 return;
851 mutex_enter(&ufs_idle_q.uq_mutex);
852 cv_broadcast(&ufs_idle_q.uq_cv);
853 mutex_exit(&ufs_idle_q.uq_mutex);
854 }
855
856 /*
857 * Free up some idle inodes
858 */
859 void
ufs_idle_some(int ne)860 ufs_idle_some(int ne)
861 {
862 int i;
863 struct inode *ip;
864 struct vnode *vp;
865 static int junk_rotor = 0;
866 static int useful_rotor = 0;
867
868 for (i = 0; i < ne; ++i) {
869 mutex_enter(&ufs_idle_q.uq_mutex);
870
871 if (ufs_njunk_iq) {
872 while (ufs_junk_iq[junk_rotor].i_freef ==
873 (inode_t *)&ufs_junk_iq[junk_rotor]) {
874 junk_rotor = IQNEXT(junk_rotor);
875 }
876 ip = ufs_junk_iq[junk_rotor].i_freef;
877 ASSERT(ip->i_flag & IJUNKIQ);
878 } else if (ufs_nuseful_iq) {
879 while (ufs_useful_iq[useful_rotor].i_freef ==
880 (inode_t *)&ufs_useful_iq[useful_rotor]) {
881 useful_rotor = IQNEXT(useful_rotor);
882 }
883 ip = ufs_useful_iq[useful_rotor].i_freef;
884 ASSERT(!(ip->i_flag & IJUNKIQ));
885 } else {
886 mutex_exit(&ufs_idle_q.uq_mutex);
887 return;
888 }
889
890 /*
891 * emulate ufs_iget
892 */
893 vp = ITOV(ip);
894 VN_HOLD(vp);
895 mutex_exit(&ufs_idle_q.uq_mutex);
896 rw_enter(&ip->i_contents, RW_WRITER);
897 /*
898 * VN_RELE should not be called if
899 * ufs_rmidle returns true, as it will
900 * effectively be done in ufs_idle_free.
901 */
902 if (ufs_rmidle(ip)) {
903 rw_exit(&ip->i_contents);
904 ufs_idle_free(ip);
905 } else {
906 rw_exit(&ip->i_contents);
907 VN_RELE(vp);
908 }
909 }
910 }
911
912 /*
913 * drain entries for vfsp from the idle queue
914 * vfsp == NULL means drain the entire thing
915 */
916 void
ufs_idle_drain(struct vfs * vfsp)917 ufs_idle_drain(struct vfs *vfsp)
918 {
919 struct inode *ip, *nip;
920 struct inode *ianchor = NULL;
921 int i;
922
923 mutex_enter(&ufs_idle_q.uq_mutex);
924 if (ufs_njunk_iq) {
925 /* for each hash q */
926 for (i = 0; i < ufs_niqhash; i++) {
927 /* search down the hash q */
928 for (ip = ufs_junk_iq[i].i_freef;
929 ip != (inode_t *)&ufs_junk_iq[i];
930 ip = ip->i_freef) {
931 if (ip->i_vfs == vfsp || vfsp == NULL) {
932 /* found a matching entry */
933 VN_HOLD(ITOV(ip));
934 mutex_exit(&ufs_idle_q.uq_mutex);
935 rw_enter(&ip->i_contents, RW_WRITER);
936 /*
937 * See comments in ufs_idle_some()
938 * as we will call ufs_idle_free()
939 * after scanning both queues.
940 */
941 if (ufs_rmidle(ip)) {
942 rw_exit(&ip->i_contents);
943 ip->i_freef = ianchor;
944 ianchor = ip;
945 } else {
946 rw_exit(&ip->i_contents);
947 VN_RELE(ITOV(ip));
948 }
949 /* restart this hash q */
950 ip = (inode_t *)&ufs_junk_iq[i];
951 mutex_enter(&ufs_idle_q.uq_mutex);
952 }
953 }
954 }
955 }
956 if (ufs_nuseful_iq) {
957 /* for each hash q */
958 for (i = 0; i < ufs_niqhash; i++) {
959 /* search down the hash q */
960 for (ip = ufs_useful_iq[i].i_freef;
961 ip != (inode_t *)&ufs_useful_iq[i];
962 ip = ip->i_freef) {
963 if (ip->i_vfs == vfsp || vfsp == NULL) {
964 /* found a matching entry */
965 VN_HOLD(ITOV(ip));
966 mutex_exit(&ufs_idle_q.uq_mutex);
967 rw_enter(&ip->i_contents, RW_WRITER);
968 /*
969 * See comments in ufs_idle_some()
970 * as we will call ufs_idle_free()
971 * after scanning both queues.
972 */
973 if (ufs_rmidle(ip)) {
974 rw_exit(&ip->i_contents);
975 ip->i_freef = ianchor;
976 ianchor = ip;
977 } else {
978 rw_exit(&ip->i_contents);
979 VN_RELE(ITOV(ip));
980 }
981 /* restart this hash q */
982 ip = (inode_t *)&ufs_useful_iq[i];
983 mutex_enter(&ufs_idle_q.uq_mutex);
984 }
985 }
986 }
987 }
988
989 mutex_exit(&ufs_idle_q.uq_mutex);
990 /* no more matching entries, release those we have found (if any) */
991 for (ip = ianchor; ip; ip = nip) {
992 nip = ip->i_freef;
993 ip->i_freef = ip;
994 ufs_idle_free(ip);
995 }
996 }
997
998 /*
999 * RECLAIM DELETED INODES
1000 * The following thread scans the file system once looking for deleted files
1001 */
1002 void
ufs_thread_reclaim(struct vfs * vfsp)1003 ufs_thread_reclaim(struct vfs *vfsp)
1004 {
1005 struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
1006 struct ufs_q *uq = &ufsvfsp->vfs_reclaim;
1007 struct fs *fs = ufsvfsp->vfs_fs;
1008 struct buf *bp = 0;
1009 int err = 0;
1010 daddr_t bno;
1011 ino_t ino;
1012 struct dinode *dp;
1013 struct inode *ip;
1014 callb_cpr_t cprinfo;
1015
1016 CALLB_CPR_INIT(&cprinfo, &uq->uq_mutex, callb_generic_cpr,
1017 "ufsreclaim");
1018
1019 /*
1020 * mount decided that we don't need a reclaim thread
1021 */
1022 if ((fs->fs_reclaim & FS_RECLAIMING) == 0)
1023 err++;
1024
1025 /*
1026 * don't reclaim if readonly
1027 */
1028 if (fs->fs_ronly)
1029 err++;
1030
1031 for (ino = 0; ino < (fs->fs_ncg * fs->fs_ipg) && !err; ++ino) {
1032
1033 /*
1034 * Check whether we are the target of another
1035 * thread having called ufs_thread_exit() or
1036 * ufs_thread_suspend().
1037 */
1038 mutex_enter(&uq->uq_mutex);
1039 again:
1040 if (uq->uq_flags & UQ_EXIT) {
1041 err++;
1042 mutex_exit(&uq->uq_mutex);
1043 break;
1044 } else if (uq->uq_flags & UQ_SUSPEND) {
1045 uq->uq_flags |= UQ_SUSPENDED;
1046 /*
1047 * Release the buf before we cv_wait()
1048 * otherwise we may deadlock with the
1049 * thread that called ufs_thread_suspend().
1050 */
1051 if (bp) {
1052 brelse(bp);
1053 bp = 0;
1054 }
1055 if (uq->uq_flags & UQ_WAIT) {
1056 uq->uq_flags &= ~UQ_WAIT;
1057 cv_broadcast(&uq->uq_cv);
1058 }
1059 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1060 cv_wait(&uq->uq_cv, &uq->uq_mutex);
1061 CALLB_CPR_SAFE_END(&cprinfo, &uq->uq_mutex);
1062 goto again;
1063 }
1064 mutex_exit(&uq->uq_mutex);
1065
1066 /*
1067 * if we don't already have the buf; get it
1068 */
1069 bno = fsbtodb(fs, itod(fs, ino));
1070 if ((bp == 0) || (bp->b_blkno != bno)) {
1071 if (bp)
1072 brelse(bp);
1073 bp = UFS_BREAD(ufsvfsp,
1074 ufsvfsp->vfs_dev, bno, fs->fs_bsize);
1075 bp->b_flags |= B_AGE;
1076 }
1077 if (bp->b_flags & B_ERROR) {
1078 err++;
1079 continue;
1080 }
1081 /*
1082 * nlink <= 0 and mode != 0 means deleted
1083 */
1084 dp = (struct dinode *)bp->b_un.b_addr + itoo(fs, ino);
1085 if ((dp->di_nlink <= 0) && (dp->di_mode != 0)) {
1086 /*
1087 * can't hold the buf (deadlock)
1088 */
1089 brelse(bp);
1090 bp = 0;
1091 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1092 /*
1093 * iget/iput sequence will put inode on ifree
1094 * thread queue if it is idle. This is a nop
1095 * for busy (open, deleted) inodes
1096 */
1097 if (ufs_iget(vfsp, ino, &ip, CRED()))
1098 err++;
1099 else
1100 VN_RELE(ITOV(ip));
1101 rw_exit(&ufsvfsp->vfs_dqrwlock);
1102 }
1103 }
1104
1105 if (bp)
1106 brelse(bp);
1107 if (!err) {
1108 /*
1109 * reset the reclaiming-bit
1110 */
1111 mutex_enter(&ufsvfsp->vfs_lock);
1112 fs->fs_reclaim &= ~FS_RECLAIMING;
1113 mutex_exit(&ufsvfsp->vfs_lock);
1114 TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_RECLAIM);
1115 }
1116
1117 /*
1118 * exit the reclaim thread
1119 */
1120 mutex_enter(&uq->uq_mutex);
1121 uq->uq_threadp = NULL;
1122 uq->uq_flags &= ~UQ_WAIT;
1123 cv_broadcast(&uq->uq_cv);
1124 CALLB_CPR_EXIT(&cprinfo);
1125 thread_exit();
1126 }
1127 /*
1128 * HLOCK FILE SYSTEM
1129 * hlock the file system's whose logs have device errors
1130 */
1131 struct ufs_q ufs_hlock;
1132 /*ARGSUSED*/
1133 void
ufs_thread_hlock(void * ignore)1134 ufs_thread_hlock(void *ignore)
1135 {
1136 int retry;
1137 callb_cpr_t cprinfo;
1138
1139 CALLB_CPR_INIT(&cprinfo, &ufs_hlock.uq_mutex, callb_generic_cpr,
1140 "ufshlock");
1141
1142 for (;;) {
1143 /*
1144 * sleep until there is work to do
1145 */
1146 mutex_enter(&ufs_hlock.uq_mutex);
1147 (void) ufs_thread_run(&ufs_hlock, &cprinfo);
1148 ufs_hlock.uq_ne = 0;
1149 mutex_exit(&ufs_hlock.uq_mutex);
1150 /*
1151 * hlock the error'ed fs's
1152 * retry after a bit if another app is doing lockfs stuff
1153 */
1154 do {
1155 retry = ufs_trans_hlock();
1156 if (retry) {
1157 mutex_enter(&ufs_hlock.uq_mutex);
1158 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1159 (void) cv_reltimedwait(&ufs_hlock.uq_cv,
1160 &ufs_hlock.uq_mutex, hz, TR_CLOCK_TICK);
1161 CALLB_CPR_SAFE_END(&cprinfo,
1162 &ufs_hlock.uq_mutex);
1163 mutex_exit(&ufs_hlock.uq_mutex);
1164 }
1165 } while (retry);
1166 }
1167 }
1168
1169 static void
ufs_attr_purge(struct inode * dp)1170 ufs_attr_purge(struct inode *dp)
1171 {
1172 int err;
1173 int error;
1174 off_t dirsize; /* size of the directory */
1175 off_t offset; /* offset in the directory */
1176 int entryoffsetinblk; /* offset of ep in fbp's buffer */
1177 struct inode *tp;
1178 struct fbuf *fbp; /* pointer to directory block */
1179 struct direct *ep; /* directory entry */
1180 int trans_size;
1181 int issync;
1182 struct ufsvfs *ufsvfsp = dp->i_ufsvfs;
1183
1184 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1185
1186 fbp = NULL;
1187 dirsize = roundup(dp->i_size, DIRBLKSIZ);
1188 offset = 0;
1189 entryoffsetinblk = 0;
1190
1191 /*
1192 * Purge directory cache
1193 */
1194
1195 dnlc_dir_purge(&dp->i_danchor);
1196
1197 while (offset < dirsize) {
1198 /*
1199 * If offset is on a block boundary,
1200 * read the next directory block.
1201 * Release previous if it exists.
1202 */
1203 if (blkoff(dp->i_fs, offset) == 0) {
1204 if (fbp != NULL) {
1205 fbrelse(fbp, S_OTHER);
1206 }
1207
1208 err = blkatoff(dp, offset, (char **)0, &fbp);
1209 if (err) {
1210 goto out;
1211 }
1212 entryoffsetinblk = 0;
1213 }
1214 ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk);
1215 if (ep->d_ino == 0 || (ep->d_name[0] == '.' &&
1216 ep->d_name[1] == '\0') ||
1217 (ep->d_name[0] == '.' && ep->d_name[1] == '.' &&
1218 ep->d_name[2] == '\0')) {
1219
1220 entryoffsetinblk += ep->d_reclen;
1221
1222 } else {
1223
1224 if ((err = ufs_iget(dp->i_vfs, ep->d_ino,
1225 &tp, CRED())) != 0) {
1226 goto out;
1227 }
1228
1229 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE,
1230 trans_size = (int)TOP_REMOVE_SIZE(tp));
1231
1232 /*
1233 * Delete inode.
1234 */
1235
1236 dnlc_remove(ITOV(dp), ep->d_name);
1237
1238 rw_enter(&tp->i_contents, RW_WRITER);
1239 tp->i_flag |= ICHG;
1240 tp->i_seq++;
1241 TRANS_INODE(tp->i_ufsvfs, tp);
1242 tp->i_nlink--;
1243 ufs_setreclaim(tp);
1244 ITIMES_NOLOCK(tp);
1245 rw_exit(&tp->i_contents);
1246
1247 VN_RELE(ITOV(tp));
1248 entryoffsetinblk += ep->d_reclen;
1249 TRANS_END_CSYNC(ufsvfsp, error,
1250 issync, TOP_REMOVE, trans_size);
1251
1252 }
1253 offset += ep->d_reclen;
1254 }
1255
1256 if (fbp) {
1257 fbrelse(fbp, S_OTHER);
1258 }
1259
1260 out:
1261 rw_exit(&ufsvfsp->vfs_dqrwlock);
1262 }
1263