1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_btree.h"
13 #include "xfs_log_format.h"
14 #include "xfs_trans.h"
15 #include "xfs_inode.h"
16 #include "xfs_icache.h"
17 #include "xfs_alloc.h"
18 #include "xfs_alloc_btree.h"
19 #include "xfs_ialloc.h"
20 #include "xfs_ialloc_btree.h"
21 #include "xfs_refcount_btree.h"
22 #include "xfs_rmap.h"
23 #include "xfs_rmap_btree.h"
24 #include "xfs_log.h"
25 #include "xfs_trans_priv.h"
26 #include "xfs_da_format.h"
27 #include "xfs_da_btree.h"
28 #include "xfs_dir2_priv.h"
29 #include "xfs_dir2.h"
30 #include "xfs_attr.h"
31 #include "xfs_reflink.h"
32 #include "xfs_ag.h"
33 #include "xfs_error.h"
34 #include "xfs_quota.h"
35 #include "xfs_exchmaps.h"
36 #include "xfs_rtbitmap.h"
37 #include "scrub/scrub.h"
38 #include "scrub/common.h"
39 #include "scrub/trace.h"
40 #include "scrub/repair.h"
41 #include "scrub/health.h"
42
43 /* Common code for the metadata scrubbers. */
44
45 /*
46 * Handling operational errors.
47 *
48 * The *_process_error() family of functions are used to process error return
49 * codes from functions called as part of a scrub operation.
50 *
51 * If there's no error, we return true to tell the caller that it's ok
52 * to move on to the next check in its list.
53 *
54 * For non-verifier errors (e.g. ENOMEM) we return false to tell the
55 * caller that something bad happened, and we preserve *error so that
56 * the caller can return the *error up the stack to userspace.
57 *
58 * Verifier errors (EFSBADCRC/EFSCORRUPTED) are recorded by setting
59 * OFLAG_CORRUPT in sm_flags and the *error is cleared. In other words,
60 * we track verifier errors (and failed scrub checks) via OFLAG_CORRUPT,
61 * not via return codes. We return false to tell the caller that
62 * something bad happened. Since the error has been cleared, the caller
63 * will (presumably) return that zero and scrubbing will move on to
64 * whatever's next.
65 *
66 * ftrace can be used to record the precise metadata location and the
67 * approximate code location of the failed operation.
68 */
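
/*
 * As a rough illustration (a sketch, not lifted verbatim from any one
 * scrubber; the btree lookup and its arguments are placeholders), a
 * check might use these helpers like so:
 *
 *	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &stat);
 *	if (!xchk_process_error(sc, agno, bno, &error))
 *		return error;
 *
 * A verifier failure is folded into sm_flags and *error is cleared, so
 * the caller returns zero and scrubbing moves on to the next check.
 */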
69
70 /* Check for operational errors. */
71 static bool
72 __xchk_process_error(
73 struct xfs_scrub *sc,
74 xfs_agnumber_t agno,
75 xfs_agblock_t bno,
76 int *error,
77 __u32 errflag,
78 void *ret_ip)
79 {
80 switch (*error) {
81 case 0:
82 return true;
83 case -EDEADLOCK:
84 case -ECHRNG:
85 /* Used to restart an op with deadlock avoidance. */
86 trace_xchk_deadlock_retry(
87 sc->ip ? sc->ip : XFS_I(file_inode(sc->file)),
88 sc->sm, *error);
89 break;
90 case -ECANCELED:
91 /*
92 * ECANCELED here means that the caller set one of the scrub
93 * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
94 * quickly. Set error to zero and do not continue.
95 */
96 trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
97 *error = 0;
98 break;
99 case -EFSBADCRC:
100 case -EFSCORRUPTED:
101 /* Note the badness but don't abort. */
102 sc->sm->sm_flags |= errflag;
103 *error = 0;
104 fallthrough;
105 default:
106 trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
107 break;
108 }
109 return false;
110 }
111
112 bool
113 xchk_process_error(
114 struct xfs_scrub *sc,
115 xfs_agnumber_t agno,
116 xfs_agblock_t bno,
117 int *error)
118 {
119 return __xchk_process_error(sc, agno, bno, error,
120 XFS_SCRUB_OFLAG_CORRUPT, __return_address);
121 }
122
123 bool
124 xchk_xref_process_error(
125 struct xfs_scrub *sc,
126 xfs_agnumber_t agno,
127 xfs_agblock_t bno,
128 int *error)
129 {
130 return __xchk_process_error(sc, agno, bno, error,
131 XFS_SCRUB_OFLAG_XFAIL, __return_address);
132 }
133
134 /* Check for operational errors for a file offset. */
135 static bool
136 __xchk_fblock_process_error(
137 struct xfs_scrub *sc,
138 int whichfork,
139 xfs_fileoff_t offset,
140 int *error,
141 __u32 errflag,
142 void *ret_ip)
143 {
144 switch (*error) {
145 case 0:
146 return true;
147 case -EDEADLOCK:
148 case -ECHRNG:
149 /* Used to restart an op with deadlock avoidance. */
150 trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
151 break;
152 case -ECANCELED:
153 /*
154 * ECANCELED here means that the caller set one of the scrub
155 * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
156 * quickly. Set error to zero and do not continue.
157 */
158 trace_xchk_file_op_error(sc, whichfork, offset, *error,
159 ret_ip);
160 *error = 0;
161 break;
162 case -EFSBADCRC:
163 case -EFSCORRUPTED:
164 /* Note the badness but don't abort. */
165 sc->sm->sm_flags |= errflag;
166 *error = 0;
167 fallthrough;
168 default:
169 trace_xchk_file_op_error(sc, whichfork, offset, *error,
170 ret_ip);
171 break;
172 }
173 return false;
174 }
175
176 bool
177 xchk_fblock_process_error(
178 struct xfs_scrub *sc,
179 int whichfork,
180 xfs_fileoff_t offset,
181 int *error)
182 {
183 return __xchk_fblock_process_error(sc, whichfork, offset, error,
184 XFS_SCRUB_OFLAG_CORRUPT, __return_address);
185 }
186
187 bool
188 xchk_fblock_xref_process_error(
189 struct xfs_scrub *sc,
190 int whichfork,
191 xfs_fileoff_t offset,
192 int *error)
193 {
194 return __xchk_fblock_process_error(sc, whichfork, offset, error,
195 XFS_SCRUB_OFLAG_XFAIL, __return_address);
196 }
197
198 /*
199 * Handling scrub corruption/optimization/warning checks.
200 *
201 * The *_set_{corrupt,preen,warning}() family of functions are used to
202 * record the presence of metadata that is incorrect (corrupt), could be
203 * optimized somehow (preen), or should be flagged for administrative
204 * review but is not incorrect (warn).
205 *
206 * ftrace can be used to record the precise metadata location and
207 * approximate code location of the failed check.
208 */
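
/*
 * For example, a hypothetical superblock field check might record a
 * corruption like this (the comparison itself is illustrative only):
 *
 *	if (sb->sb_blocksize != mp->m_sb.sb_blocksize)
 *		xchk_block_set_corrupt(sc, bp);
 *
 * Failed checks never abort the scrub; they only set an outcome flag
 * and emit a tracepoint.
 */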
209
210 /* Record a block which could be optimized. */
211 void
212 xchk_block_set_preen(
213 struct xfs_scrub *sc,
214 struct xfs_buf *bp)
215 {
216 sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
217 trace_xchk_block_preen(sc, xfs_buf_daddr(bp), __return_address);
218 }
219
220 /*
221 * Record an inode which could be optimized. The trace data will
222 * include the inode number and the approximate code location of the
223 * check that found the problem.
224 */
225 void
226 xchk_ino_set_preen(
227 struct xfs_scrub *sc,
228 xfs_ino_t ino)
229 {
230 sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
231 trace_xchk_ino_preen(sc, ino, __return_address);
232 }
233
234 /* Record something being wrong with the filesystem primary superblock. */
235 void
236 xchk_set_corrupt(
237 struct xfs_scrub *sc)
238 {
239 sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
240 trace_xchk_fs_error(sc, 0, __return_address);
241 }
242
243 /* Record a corrupt block. */
244 void
245 xchk_block_set_corrupt(
246 struct xfs_scrub *sc,
247 struct xfs_buf *bp)
248 {
249 sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
250 trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
251 }
252
253 #ifdef CONFIG_XFS_QUOTA
254 /* Record a corrupt quota counter. */
255 void
256 xchk_qcheck_set_corrupt(
257 struct xfs_scrub *sc,
258 unsigned int dqtype,
259 xfs_dqid_t id)
260 {
261 sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
262 trace_xchk_qcheck_error(sc, dqtype, id, __return_address);
263 }
264 #endif
265
266 /* Record a corruption while cross-referencing. */
267 void
268 xchk_block_xref_set_corrupt(
269 struct xfs_scrub *sc,
270 struct xfs_buf *bp)
271 {
272 sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
273 trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
274 }
275
276 /*
277 * Record a corrupt inode. The trace data will include the inode number
278 * and the approximate code location of the check that found the
279 * problem.
280 */
281 void
282 xchk_ino_set_corrupt(
283 struct xfs_scrub *sc,
284 xfs_ino_t ino)
285 {
286 sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
287 trace_xchk_ino_error(sc, ino, __return_address);
288 }
289
290 /* Record a corruption while cross-referencing with an inode. */
291 void
292 xchk_ino_xref_set_corrupt(
293 struct xfs_scrub *sc,
294 xfs_ino_t ino)
295 {
296 sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
297 trace_xchk_ino_error(sc, ino, __return_address);
298 }
299
300 /* Record corruption in a block indexed by a file fork. */
301 void
302 xchk_fblock_set_corrupt(
303 struct xfs_scrub *sc,
304 int whichfork,
305 xfs_fileoff_t offset)
306 {
307 sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
308 trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
309 }
310
311 /* Record a corruption while cross-referencing a fork block. */
312 void
313 xchk_fblock_xref_set_corrupt(
314 struct xfs_scrub *sc,
315 int whichfork,
316 xfs_fileoff_t offset)
317 {
318 sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
319 trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
320 }
321
322 /*
323 * Warn about an inode that needs administrative review but is not
324 * incorrect.
325 */
326 void
327 xchk_ino_set_warning(
328 struct xfs_scrub *sc,
329 xfs_ino_t ino)
330 {
331 sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
332 trace_xchk_ino_warning(sc, ino, __return_address);
333 }
334
335 /* Warn about a block indexed by a file fork that needs review. */
336 void
337 xchk_fblock_set_warning(
338 struct xfs_scrub *sc,
339 int whichfork,
340 xfs_fileoff_t offset)
341 {
342 sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
343 trace_xchk_fblock_warning(sc, whichfork, offset, __return_address);
344 }
345
346 /* Signal an incomplete scrub. */
347 void
348 xchk_set_incomplete(
349 struct xfs_scrub *sc)
350 {
351 sc->sm->sm_flags |= XFS_SCRUB_OFLAG_INCOMPLETE;
352 trace_xchk_incomplete(sc, __return_address);
353 }
354
355 /*
356 * rmap scrubbing -- compute the number of blocks with a given owner,
357 * at least according to the reverse mapping data.
358 */
359
360 struct xchk_rmap_ownedby_info {
361 const struct xfs_owner_info *oinfo;
362 xfs_filblks_t *blocks;
363 };
364
365 STATIC int
366 xchk_count_rmap_ownedby_irec(
367 struct xfs_btree_cur *cur,
368 const struct xfs_rmap_irec *rec,
369 void *priv)
370 {
371 struct xchk_rmap_ownedby_info *sroi = priv;
372 bool irec_attr;
373 bool oinfo_attr;
374
375 irec_attr = rec->rm_flags & XFS_RMAP_ATTR_FORK;
376 oinfo_attr = sroi->oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK;
377
378 if (rec->rm_owner != sroi->oinfo->oi_owner)
379 return 0;
380
381 if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || irec_attr == oinfo_attr)
382 (*sroi->blocks) += rec->rm_blockcount;
383
384 return 0;
385 }
386
387 /*
388 * Calculate the number of blocks the rmap thinks are owned by something.
389 * The caller should pass us an rmapbt cursor.
390 */
391 int
392 xchk_count_rmap_ownedby_ag(
393 struct xfs_scrub *sc,
394 struct xfs_btree_cur *cur,
395 const struct xfs_owner_info *oinfo,
396 xfs_filblks_t *blocks)
397 {
398 struct xchk_rmap_ownedby_info sroi = {
399 .oinfo = oinfo,
400 .blocks = blocks,
401 };
402
403 *blocks = 0;
404 return xfs_rmap_query_all(cur, xchk_count_rmap_ownedby_irec,
405 &sroi);
406 }
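
/*
 * A usage sketch, assuming the caller already set up an rmapbt cursor in
 * sc->sa and wants the space owned by AG metadata (XFS_RMAP_OINFO_AG):
 *
 *	xfs_filblks_t	blocks;
 *
 *	error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
 *			&XFS_RMAP_OINFO_AG, &blocks);
 *	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
 *		return;
 */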
407
408 /*
409 * AG scrubbing
410 *
411 * These helpers facilitate locking an allocation group's header
412 * buffers, setting up cursors for all btrees that are present, and
413 * cleaning everything up once we're through.
414 */
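
/*
 * A typical lifecycle, sketched for illustration: grab the AG headers and
 * btree cursors, cross-reference whatever needs checking, then tear it
 * all down (teardown also happens automatically when the scrub finishes):
 *
 *	error = xchk_ag_init(sc, agno, &sc->sa);
 *	if (error)
 *		return error;
 *	... examine sc->sa.bno_cur, sc->sa.rmap_cur, and friends ...
 *	xchk_ag_free(sc, &sc->sa);
 */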
415
416 /* Decide if we want to return an AG header read failure. */
417 static inline bool
418 want_ag_read_header_failure(
419 struct xfs_scrub *sc,
420 unsigned int type)
421 {
422 /* Return all AG header read failures when scanning btrees. */
423 if (sc->sm->sm_type != XFS_SCRUB_TYPE_AGF &&
424 sc->sm->sm_type != XFS_SCRUB_TYPE_AGFL &&
425 sc->sm->sm_type != XFS_SCRUB_TYPE_AGI)
426 return true;
427 /*
428 * If we're scanning a given type of AG header, we only want to
429 * see read failures from that specific header. We'd like the
430 * other headers to cross-check them, but this isn't required.
431 */
432 if (sc->sm->sm_type == type)
433 return true;
434 return false;
435 }
436
437 /*
438 * Grab the AG header buffers for the attached perag structure.
439 *
440 * The headers should be released by xchk_ag_free, but as a fail safe we attach
441 * all the buffers we grab to the scrub transaction so they'll all be freed
442 * when we cancel it.
443 */
444 static inline int
445 xchk_perag_read_headers(
446 struct xfs_scrub *sc,
447 struct xchk_ag *sa)
448 {
449 int error;
450
451 error = xfs_ialloc_read_agi(sa->pag, sc->tp, 0, &sa->agi_bp);
452 if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
453 return error;
454
455 error = xfs_alloc_read_agf(sa->pag, sc->tp, 0, &sa->agf_bp);
456 if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
457 return error;
458
459 return 0;
460 }
461
462 /*
463 * Grab the AG headers for the attached perag structure and wait for pending
464 * intents to drain.
465 */
466 int
467 xchk_perag_drain_and_lock(
468 struct xfs_scrub *sc)
469 {
470 struct xchk_ag *sa = &sc->sa;
471 int error = 0;
472
473 ASSERT(sa->pag != NULL);
474 ASSERT(sa->agi_bp == NULL);
475 ASSERT(sa->agf_bp == NULL);
476
477 do {
478 if (xchk_should_terminate(sc, &error))
479 return error;
480
481 error = xchk_perag_read_headers(sc, sa);
482 if (error)
483 return error;
484
485 /*
486 * If we've grabbed an inode for scrubbing then we assume that
487 * holding its ILOCK will suffice to coordinate with any intent
488 * chains involving this inode.
489 */
490 if (sc->ip)
491 return 0;
492
493 /*
494 * Decide if this AG is quiet enough for all metadata to be
495 * consistent with each other. XFS allows the AG header buffer
496 * locks to cycle across transaction rolls while processing
497 * chains of deferred ops, which means that there could be
498 * other threads in the middle of processing a chain of
499 * deferred ops. For regular operations we are careful about
500 * ordering operations to prevent collisions between threads
501 * (which is why we don't need a per-AG lock), but scrub and
502 * repair have to serialize against chained operations.
503 *
504 * We just locked all the AG header buffers; now take a look
505 * to see if there are any intents in progress. If there are,
506 * drop the AG headers and wait for the intents to drain.
507 * Since we hold all the AG header locks for the duration of
508 * the scrub, this is the only time we have to sample the
509 * intents counter; any threads increasing it after this point
510 * can't possibly be in the middle of a chain of AG metadata
511 * updates.
512 *
513 * Obviously, this should be slanted against scrub and in favor
514 * of runtime threads.
515 */
516 if (!xfs_perag_intent_busy(sa->pag))
517 return 0;
518
519 if (sa->agf_bp) {
520 xfs_trans_brelse(sc->tp, sa->agf_bp);
521 sa->agf_bp = NULL;
522 }
523
524 if (sa->agi_bp) {
525 xfs_trans_brelse(sc->tp, sa->agi_bp);
526 sa->agi_bp = NULL;
527 }
528
529 if (!(sc->flags & XCHK_FSGATES_DRAIN))
530 return -ECHRNG;
531 error = xfs_perag_intent_drain(sa->pag);
532 if (error == -ERESTARTSYS)
533 error = -EINTR;
534 } while (!error);
535
536 return error;
537 }
538
539 /*
540 * Grab the per-AG structure, grab all AG header buffers, and wait until there
541 * aren't any pending intents. Returns -ENOENT if we can't grab the perag
542 * structure.
543 */
544 int
545 xchk_ag_read_headers(
546 struct xfs_scrub *sc,
547 xfs_agnumber_t agno,
548 struct xchk_ag *sa)
549 {
550 struct xfs_mount *mp = sc->mp;
551
552 ASSERT(!sa->pag);
553 sa->pag = xfs_perag_get(mp, agno);
554 if (!sa->pag)
555 return -ENOENT;
556
557 return xchk_perag_drain_and_lock(sc);
558 }
559
560 /* Release all the AG btree cursors. */
561 void
562 xchk_ag_btcur_free(
563 struct xchk_ag *sa)
564 {
565 if (sa->refc_cur)
566 xfs_btree_del_cursor(sa->refc_cur, XFS_BTREE_ERROR);
567 if (sa->rmap_cur)
568 xfs_btree_del_cursor(sa->rmap_cur, XFS_BTREE_ERROR);
569 if (sa->fino_cur)
570 xfs_btree_del_cursor(sa->fino_cur, XFS_BTREE_ERROR);
571 if (sa->ino_cur)
572 xfs_btree_del_cursor(sa->ino_cur, XFS_BTREE_ERROR);
573 if (sa->cnt_cur)
574 xfs_btree_del_cursor(sa->cnt_cur, XFS_BTREE_ERROR);
575 if (sa->bno_cur)
576 xfs_btree_del_cursor(sa->bno_cur, XFS_BTREE_ERROR);
577
578 sa->refc_cur = NULL;
579 sa->rmap_cur = NULL;
580 sa->fino_cur = NULL;
581 sa->ino_cur = NULL;
582 sa->bno_cur = NULL;
583 sa->cnt_cur = NULL;
584 }
585
586 /* Initialize all the btree cursors for an AG. */
587 void
588 xchk_ag_btcur_init(
589 struct xfs_scrub *sc,
590 struct xchk_ag *sa)
591 {
592 struct xfs_mount *mp = sc->mp;
593
594 if (sa->agf_bp) {
595 /* Set up a bnobt cursor for cross-referencing. */
596 sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp,
597 sa->pag);
598 xchk_ag_btree_del_cursor_if_sick(sc, &sa->bno_cur,
599 XFS_SCRUB_TYPE_BNOBT);
600
601 /* Set up a cntbt cursor for cross-referencing. */
602 sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp,
603 sa->pag);
604 xchk_ag_btree_del_cursor_if_sick(sc, &sa->cnt_cur,
605 XFS_SCRUB_TYPE_CNTBT);
606
607 /* Set up a rmapbt cursor for cross-referencing. */
608 if (xfs_has_rmapbt(mp)) {
609 sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp,
610 sa->agf_bp, sa->pag);
611 xchk_ag_btree_del_cursor_if_sick(sc, &sa->rmap_cur,
612 XFS_SCRUB_TYPE_RMAPBT);
613 }
614
615 /* Set up a refcountbt cursor for cross-referencing. */
616 if (xfs_has_reflink(mp)) {
617 sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
618 sa->agf_bp, sa->pag);
619 xchk_ag_btree_del_cursor_if_sick(sc, &sa->refc_cur,
620 XFS_SCRUB_TYPE_REFCNTBT);
621 }
622 }
623
624 if (sa->agi_bp) {
625 		/* Set up an inobt cursor for cross-referencing. */
626 sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp,
627 sa->agi_bp);
628 xchk_ag_btree_del_cursor_if_sick(sc, &sa->ino_cur,
629 XFS_SCRUB_TYPE_INOBT);
630
631 /* Set up a finobt cursor for cross-referencing. */
632 if (xfs_has_finobt(mp)) {
633 sa->fino_cur = xfs_finobt_init_cursor(sa->pag, sc->tp,
634 sa->agi_bp);
635 xchk_ag_btree_del_cursor_if_sick(sc, &sa->fino_cur,
636 XFS_SCRUB_TYPE_FINOBT);
637 }
638 }
639 }
640
641 /* Release the AG header context and btree cursors. */
642 void
643 xchk_ag_free(
644 struct xfs_scrub *sc,
645 struct xchk_ag *sa)
646 {
647 xchk_ag_btcur_free(sa);
648 xrep_reset_perag_resv(sc);
649 if (sa->agf_bp) {
650 xfs_trans_brelse(sc->tp, sa->agf_bp);
651 sa->agf_bp = NULL;
652 }
653 if (sa->agi_bp) {
654 xfs_trans_brelse(sc->tp, sa->agi_bp);
655 sa->agi_bp = NULL;
656 }
657 if (sa->pag) {
658 xfs_perag_put(sa->pag);
659 sa->pag = NULL;
660 }
661 }
662
663 /*
664 * For scrub, grab the perag structure, the AGI, and the AGF headers, in that
665 * order. Locking order requires us to get the AGI before the AGF. We use the
666 * transaction to avoid deadlocking on crosslinked metadata buffers; either the
667 * caller passes one in (bmap scrub) or we have to create a transaction
668 * ourselves. Returns ENOENT if the perag struct cannot be grabbed.
669 */
670 int
671 xchk_ag_init(
672 struct xfs_scrub *sc,
673 xfs_agnumber_t agno,
674 struct xchk_ag *sa)
675 {
676 int error;
677
678 error = xchk_ag_read_headers(sc, agno, sa);
679 if (error)
680 return error;
681
682 xchk_ag_btcur_init(sc, sa);
683 return 0;
684 }
685
686 /* Per-scrubber setup functions */
687
688 void
689 xchk_trans_cancel(
690 struct xfs_scrub *sc)
691 {
692 xfs_trans_cancel(sc->tp);
693 sc->tp = NULL;
694 }
695
696 int
697 xchk_trans_alloc_empty(
698 struct xfs_scrub *sc)
699 {
700 return xfs_trans_alloc_empty(sc->mp, &sc->tp);
701 }
702
703 /*
704 * Grab an empty transaction so that we can re-grab locked buffers if
705 * one of our btrees turns out to be cyclic.
706 *
707 * If we're going to repair something, we need to ask for the largest possible
708 * log reservation so that we can handle the worst case scenario for metadata
709 * updates while rebuilding a metadata item. We also need to reserve as many
710 * blocks in the head transaction as we think we're going to need to rebuild
711 * the metadata object.
712 */
713 int
714 xchk_trans_alloc(
715 struct xfs_scrub *sc,
716 uint resblks)
717 {
718 if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
719 return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
720 resblks, 0, 0, &sc->tp);
721
722 return xchk_trans_alloc_empty(sc);
723 }
724
725 /* Set us up with a transaction and an empty context. */
726 int
727 xchk_setup_fs(
728 struct xfs_scrub *sc)
729 {
730 uint resblks;
731
732 resblks = xrep_calc_ag_resblks(sc);
733 return xchk_trans_alloc(sc, resblks);
734 }
735
736 /* Set us up with AG headers and btree cursors. */
737 int
738 xchk_setup_ag_btree(
739 struct xfs_scrub *sc,
740 bool force_log)
741 {
742 struct xfs_mount *mp = sc->mp;
743 int error;
744
745 /*
746 * If the caller asks us to checkpoint the log, do so. This
747 * expensive operation should be performed infrequently and only
748 * as a last resort. Any caller that sets force_log should
749 * document why they need to do so.
750 */
751 if (force_log) {
752 error = xchk_checkpoint_log(mp);
753 if (error)
754 return error;
755 }
756
757 error = xchk_setup_fs(sc);
758 if (error)
759 return error;
760
761 return xchk_ag_init(sc, sc->sm->sm_agno, &sc->sa);
762 }
763
764 /* Push everything out of the log onto disk. */
765 int
766 xchk_checkpoint_log(
767 struct xfs_mount *mp)
768 {
769 int error;
770
771 error = xfs_log_force(mp, XFS_LOG_SYNC);
772 if (error)
773 return error;
774 xfs_ail_push_all_sync(mp->m_ail);
775 return 0;
776 }
777
778 /* Verify that an inode is allocated ondisk, then return its cached inode. */
779 int
780 xchk_iget(
781 struct xfs_scrub *sc,
782 xfs_ino_t inum,
783 struct xfs_inode **ipp)
784 {
785 ASSERT(sc->tp != NULL);
786
787 return xfs_iget(sc->mp, sc->tp, inum, XCHK_IGET_FLAGS, 0, ipp);
788 }
789
790 /*
791 * Try to grab an inode in a manner that avoids races with physical inode
792 * allocation. If we can't, return the locked AGI buffer so that the caller
793 * can single-step the loading process to see where things went wrong.
794 * Callers must have a valid scrub transaction.
795 *
796 * If the iget succeeds, return 0, a NULL AGI, and the inode.
797 *
798 * If the iget fails, return the error, the locked AGI, and a NULL inode. This
799 * can include -EINVAL and -ENOENT for invalid inode numbers or inodes that are
800 * no longer allocated; or any other corruption or runtime error.
801 *
802 * If the AGI read fails, return the error, a NULL AGI, and NULL inode.
803 *
804 * If a fatal signal is pending, return -EINTR, a NULL AGI, and a NULL inode.
805 */
806 int
807 xchk_iget_agi(
808 struct xfs_scrub *sc,
809 xfs_ino_t inum,
810 struct xfs_buf **agi_bpp,
811 struct xfs_inode **ipp)
812 {
813 struct xfs_mount *mp = sc->mp;
814 struct xfs_trans *tp = sc->tp;
815 struct xfs_perag *pag;
816 int error;
817
818 ASSERT(sc->tp != NULL);
819
820 again:
821 *agi_bpp = NULL;
822 *ipp = NULL;
823 error = 0;
824
825 if (xchk_should_terminate(sc, &error))
826 return error;
827
828 /*
829 * Attach the AGI buffer to the scrub transaction to avoid deadlocks
830 * in the iget cache miss path.
831 */
832 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
833 error = xfs_ialloc_read_agi(pag, tp, 0, agi_bpp);
834 xfs_perag_put(pag);
835 if (error)
836 return error;
837
838 error = xfs_iget(mp, tp, inum, XFS_IGET_NORETRY | XCHK_IGET_FLAGS, 0,
839 ipp);
840 if (error == -EAGAIN) {
841 /*
842 * The inode may be in core but temporarily unavailable and may
843 * require the AGI buffer before it can be returned. Drop the
844 * AGI buffer and retry the lookup.
845 *
846 * Incore lookup will fail with EAGAIN on a cache hit if the
847 * inode is queued to the inactivation list. The inactivation
848 * worker may remove the inode from the unlinked list and hence
849 * needs the AGI.
850 *
851 * Hence xchk_iget_agi() needs to drop the AGI lock on EAGAIN
852 * to allow inodegc to make progress and move the inode to
853 * IRECLAIMABLE state where xfs_iget will be able to return it
854 * again if it can lock the inode.
855 */
856 xfs_trans_brelse(tp, *agi_bpp);
857 delay(1);
858 goto again;
859 }
860 if (error)
861 return error;
862
863 /* We got the inode, so we can release the AGI. */
864 ASSERT(*ipp != NULL);
865 xfs_trans_brelse(tp, *agi_bpp);
866 *agi_bpp = NULL;
867 return 0;
868 }
869
870 #ifdef CONFIG_XFS_QUOTA
871 /*
872 * Try to attach dquots to this inode if we think we might want to repair it.
873 * Callers must not hold any ILOCKs. If the dquots are broken and cannot be
874 * attached, a quotacheck will be scheduled.
875 */
876 int
877 xchk_ino_dqattach(
878 struct xfs_scrub *sc)
879 {
880 ASSERT(sc->tp != NULL);
881 ASSERT(sc->ip != NULL);
882
883 if (!xchk_could_repair(sc))
884 return 0;
885
886 return xrep_ino_dqattach(sc);
887 }
888 #endif
889
890 /* Install an inode that we opened by handle for scrubbing. */
891 int
892 xchk_install_handle_inode(
893 struct xfs_scrub *sc,
894 struct xfs_inode *ip)
895 {
896 if (VFS_I(ip)->i_generation != sc->sm->sm_gen) {
897 xchk_irele(sc, ip);
898 return -ENOENT;
899 }
900
901 sc->ip = ip;
902 return 0;
903 }
904
905 /*
906 * Install an already-referenced inode for scrubbing. Get our own reference to
907 * the inode to make disposal simpler. The inode must not be in I_FREEING or
908 * I_WILL_FREE state!
909 */
910 int
911 xchk_install_live_inode(
912 struct xfs_scrub *sc,
913 struct xfs_inode *ip)
914 {
915 if (!igrab(VFS_I(ip))) {
916 xchk_ino_set_corrupt(sc, ip->i_ino);
917 return -EFSCORRUPTED;
918 }
919
920 sc->ip = ip;
921 return 0;
922 }
923
924 /*
925 * In preparation to scrub metadata structures that hang off of an inode,
926 * grab either the inode referenced in the scrub control structure or the
927 * inode passed in. If the inumber does not reference an allocated inode
928 * record, the function returns ENOENT to end the scrub early. The inode
929 * is not locked.
930 */
931 int
932 xchk_iget_for_scrubbing(
933 struct xfs_scrub *sc)
934 {
935 struct xfs_imap imap;
936 struct xfs_mount *mp = sc->mp;
937 struct xfs_perag *pag;
938 struct xfs_buf *agi_bp;
939 struct xfs_inode *ip_in = XFS_I(file_inode(sc->file));
940 struct xfs_inode *ip = NULL;
941 xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino);
942 int error;
943
944 ASSERT(sc->tp == NULL);
945
946 /* We want to scan the inode we already had opened. */
947 if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino)
948 return xchk_install_live_inode(sc, ip_in);
949
950 /* Reject internal metadata files and obviously bad inode numbers. */
951 if (xfs_internal_inum(mp, sc->sm->sm_ino))
952 return -ENOENT;
953 if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
954 return -ENOENT;
955
956 /* Try a safe untrusted iget. */
957 error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip);
958 if (!error)
959 return xchk_install_handle_inode(sc, ip);
960 if (error == -ENOENT)
961 return error;
962 if (error != -EINVAL)
963 goto out_error;
964
965 /*
966 * EINVAL with IGET_UNTRUSTED probably means one of several things:
967 * userspace gave us an inode number that doesn't correspond to fs
968 * space; the inode btree lacks a record for this inode; or there is a
969 * record, and it says this inode is free.
970 *
971 * We want to look up this inode in the inobt to distinguish two
972 * scenarios: (1) the inobt says the inode is free, in which case
973 * there's nothing to do; and (2) the inobt says the inode is
974 * allocated, but loading it failed due to corruption.
975 *
976 * Allocate a transaction and grab the AGI to prevent inobt activity
977 * in this AG. Retry the iget in case someone allocated a new inode
978 * after the first iget failed.
979 */
980 error = xchk_trans_alloc(sc, 0);
981 if (error)
982 goto out_error;
983
984 error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip);
985 if (error == 0) {
986 /* Actually got the inode, so install it. */
987 xchk_trans_cancel(sc);
988 return xchk_install_handle_inode(sc, ip);
989 }
990 if (error == -ENOENT)
991 goto out_gone;
992 if (error != -EINVAL)
993 goto out_cancel;
994
995 /* Ensure that we have protected against inode allocation/freeing. */
996 if (agi_bp == NULL) {
997 ASSERT(agi_bp != NULL);
998 error = -ECANCELED;
999 goto out_cancel;
1000 }
1001
1002 /*
1003 * Untrusted iget failed a second time. Let's try an inobt lookup.
1004 * If the inobt says that the inode cannot exist inside the filesystem
1005 * or is not allocated, return ENOENT to signal that the check can be
1006 * skipped.
1007 *
1008 * If the lookup returns corruption, we'll mark this inode corrupt and
1009 * exit to userspace. There's little chance of fixing anything until
1010 * the inobt is straightened out, but there's nothing we can do here.
1011 *
1012 * If the lookup encounters any other error, exit to userspace.
1013 *
1014 * If the lookup succeeds, something else must be very wrong in the fs
1015 * such that setting up the incore inode failed in some strange way.
1016 * Treat those as corruptions.
1017 */
1018 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino));
1019 if (!pag) {
1020 error = -EFSCORRUPTED;
1021 goto out_cancel;
1022 }
1023
1024 error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap,
1025 XFS_IGET_UNTRUSTED);
1026 xfs_perag_put(pag);
1027 if (error == -EINVAL || error == -ENOENT)
1028 goto out_gone;
1029 if (!error)
1030 error = -EFSCORRUPTED;
1031
1032 out_cancel:
1033 xchk_trans_cancel(sc);
1034 out_error:
1035 trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
1036 error, __return_address);
1037 return error;
1038 out_gone:
1039 /* The file is gone, so there's nothing to check. */
1040 xchk_trans_cancel(sc);
1041 return -ENOENT;
1042 }
1043
1044 /* Release an inode, possibly dropping it in the process. */
1045 void
1046 xchk_irele(
1047 struct xfs_scrub *sc,
1048 struct xfs_inode *ip)
1049 {
1050 if (sc->tp) {
1051 /*
1052 * If we are in a transaction, we /cannot/ drop the inode
1053 * ourselves, because the VFS will trigger writeback, which
1054 * can require a transaction. Clear DONTCACHE to force the
1055 * inode to the LRU, where someone else can take care of
1056 * dropping it.
1057 *
1058 * Note that when we grabbed our reference to the inode, it
1059 * could have had an active ref and DONTCACHE set if a sysadmin
1060 * is trying to coerce a change in file access mode. icache
1061 * hits do not clear DONTCACHE, so we must do it here.
1062 */
1063 spin_lock(&VFS_I(ip)->i_lock);
1064 VFS_I(ip)->i_state &= ~I_DONTCACHE;
1065 spin_unlock(&VFS_I(ip)->i_lock);
1066 }
1067
1068 xfs_irele(ip);
1069 }
1070
1071 /*
1072 * Set us up to scrub metadata mapped by a file's fork. Callers must not use
1073 * this to operate on user-accessible regular file data because the MMAPLOCK is
1074 * not taken.
1075 */
1076 int
1077 xchk_setup_inode_contents(
1078 struct xfs_scrub *sc,
1079 unsigned int resblks)
1080 {
1081 int error;
1082
1083 error = xchk_iget_for_scrubbing(sc);
1084 if (error)
1085 return error;
1086
1087 /* Lock the inode so the VFS cannot touch this file. */
1088 xchk_ilock(sc, XFS_IOLOCK_EXCL);
1089
1090 error = xchk_trans_alloc(sc, resblks);
1091 if (error)
1092 goto out;
1093
1094 error = xchk_ino_dqattach(sc);
1095 if (error)
1096 goto out;
1097
1098 xchk_ilock(sc, XFS_ILOCK_EXCL);
1099 out:
1100 /* scrub teardown will unlock and release the inode for us */
1101 return error;
1102 }
1103
1104 void
1105 xchk_ilock(
1106 struct xfs_scrub *sc,
1107 unsigned int ilock_flags)
1108 {
1109 xfs_ilock(sc->ip, ilock_flags);
1110 sc->ilock_flags |= ilock_flags;
1111 }
1112
1113 bool
1114 xchk_ilock_nowait(
1115 struct xfs_scrub *sc,
1116 unsigned int ilock_flags)
1117 {
1118 if (xfs_ilock_nowait(sc->ip, ilock_flags)) {
1119 sc->ilock_flags |= ilock_flags;
1120 return true;
1121 }
1122
1123 return false;
1124 }
1125
1126 void
1127 xchk_iunlock(
1128 struct xfs_scrub *sc,
1129 unsigned int ilock_flags)
1130 {
1131 sc->ilock_flags &= ~ilock_flags;
1132 xfs_iunlock(sc->ip, ilock_flags);
1133 }
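
/*
 * A hypothetical trylock loop built from these helpers, for callers that
 * cannot block indefinitely on the ILOCK:
 *
 *	while (!xchk_ilock_nowait(sc, XFS_ILOCK_EXCL)) {
 *		if (xchk_should_terminate(sc, &error))
 *			return error;
 *		delay(1);
 *	}
 */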
1134
1135 /*
1136 * Predicate that decides if we need to evaluate the cross-reference check.
1137 * If there was an error accessing the cross-reference btree, just delete
1138 * the cursor and skip the check.
1139 */
1140 bool
1141 xchk_should_check_xref(
1142 struct xfs_scrub *sc,
1143 int *error,
1144 struct xfs_btree_cur **curpp)
1145 {
1146 /* No point in xref if we already know we're corrupt. */
1147 if (xchk_skip_xref(sc->sm))
1148 return false;
1149
1150 if (*error == 0)
1151 return true;
1152
1153 if (curpp) {
1154 /* If we've already given up on xref, just bail out. */
1155 if (!*curpp)
1156 return false;
1157
1158 /* xref error, delete cursor and bail out. */
1159 xfs_btree_del_cursor(*curpp, XFS_BTREE_ERROR);
1160 *curpp = NULL;
1161 }
1162
1163 sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
1164 trace_xchk_xref_error(sc, *error, __return_address);
1165
1166 /*
1167 * Errors encountered during cross-referencing with another
1168 * data structure should not cause this scrubber to abort.
1169 */
1170 *error = 0;
1171 return false;
1172 }
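
/*
 * Cross-reference checks usually follow this shape (a sketch; the rmap
 * query and its arguments are placeholders):
 *
 *	error = xfs_rmap_record_exists(sc->sa.rmap_cur, agbno, len,
 *			oinfo, &has_rmap);
 *	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
 *		return;
 *	if (!has_rmap)
 *		xchk_block_xref_set_corrupt(sc, bp);
 */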
1173
1174 /* Run the structure verifiers on in-memory buffers to detect bad memory. */
1175 void
1176 xchk_buffer_recheck(
1177 struct xfs_scrub *sc,
1178 struct xfs_buf *bp)
1179 {
1180 xfs_failaddr_t fa;
1181
1182 if (bp->b_ops == NULL) {
1183 xchk_block_set_corrupt(sc, bp);
1184 return;
1185 }
1186 if (bp->b_ops->verify_struct == NULL) {
1187 xchk_set_incomplete(sc);
1188 return;
1189 }
1190 fa = bp->b_ops->verify_struct(bp);
1191 if (!fa)
1192 return;
1193 sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
1194 trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa);
1195 }
1196
1197 static inline int
1198 xchk_metadata_inode_subtype(
1199 struct xfs_scrub *sc,
1200 unsigned int scrub_type)
1201 {
1202 struct xfs_scrub_subord *sub;
1203 int error;
1204
1205 sub = xchk_scrub_create_subord(sc, scrub_type);
1206 error = sub->sc.ops->scrub(&sub->sc);
1207 xchk_scrub_free_subord(sub);
1208 return error;
1209 }
1210
1211 /*
1212 * Scrub the attr/data forks of a metadata inode. The metadata inode must be
1213 * pointed to by sc->ip and the ILOCK must be held.
1214 */
1215 int
1216 xchk_metadata_inode_forks(
1217 struct xfs_scrub *sc)
1218 {
1219 bool shared;
1220 int error;
1221
1222 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
1223 return 0;
1224
1225 /* Check the inode record. */
1226 error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
1227 if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
1228 return error;
1229
1230 /* Metadata inodes don't live on the rt device. */
1231 if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) {
1232 xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1233 return 0;
1234 }
1235
1236 /* They should never participate in reflink. */
1237 if (xfs_is_reflink_inode(sc->ip)) {
1238 xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1239 return 0;
1240 }
1241
1242 /* They also should never have extended attributes. */
1243 if (xfs_inode_hasattr(sc->ip)) {
1244 xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1245 return 0;
1246 }
1247
1248 /* Invoke the data fork scrubber. */
1249 error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
1250 if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
1251 return error;
1252
1253 /* Look for incorrect shared blocks. */
1254 if (xfs_has_reflink(sc->mp)) {
1255 error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
1256 &shared);
1257 if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0,
1258 &error))
1259 return error;
1260 if (shared)
1261 xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1262 }
1263
1264 return 0;
1265 }
1266
1267 /*
1268 * Enable filesystem hooks (i.e. runtime code patching) before starting a scrub
1269 * operation. Callers must not hold any locks that intersect with the CPU
1270 * hotplug lock (e.g. writeback locks) because code patching must halt the CPUs
1271 * to change kernel code.
1272 */
1273 void
1274 xchk_fsgates_enable(
1275 struct xfs_scrub *sc,
1276 unsigned int scrub_fsgates)
1277 {
1278 ASSERT(!(scrub_fsgates & ~XCHK_FSGATES_ALL));
1279 ASSERT(!(sc->flags & scrub_fsgates));
1280
1281 trace_xchk_fsgates_enable(sc, scrub_fsgates);
1282
1283 if (scrub_fsgates & XCHK_FSGATES_DRAIN)
1284 xfs_drain_wait_enable();
1285
1286 if (scrub_fsgates & XCHK_FSGATES_QUOTA)
1287 xfs_dqtrx_hook_enable();
1288
1289 if (scrub_fsgates & XCHK_FSGATES_DIRENTS)
1290 xfs_dir_hook_enable();
1291
1292 if (scrub_fsgates & XCHK_FSGATES_RMAP)
1293 xfs_rmap_hook_enable();
1294
1295 sc->flags |= scrub_fsgates;
1296 }
1297
1298 /*
1299 * Decide if this is a cached inode that's also allocated. The caller
1300 * must hold a reference to an AG and the AGI buffer lock to prevent inodes
1301 * from being allocated or freed.
1302 *
1303 * Look up an inode by number in the given file system. If the inode number
1304 * is invalid, return -EINVAL. If the inode is not in cache, return -ENODATA.
1305 * If the inode is being reclaimed, return -ENODATA because we know the inode
1306 * cache cannot be updating the ondisk metadata.
1307 *
1308 * Otherwise, the incore inode is the one we want, and it is either live,
1309 * somewhere in the inactivation machinery, or reclaimable. The inode is
1310 * allocated if i_mode is nonzero. In all three cases, the cached inode will
1311 * be more up to date than the ondisk inode buffer, so we must use the incore
1312 * i_mode.
1313 */
1314 int
1315 xchk_inode_is_allocated(
1316 struct xfs_scrub *sc,
1317 xfs_agino_t agino,
1318 bool *inuse)
1319 {
1320 struct xfs_mount *mp = sc->mp;
1321 struct xfs_perag *pag = sc->sa.pag;
1322 xfs_ino_t ino;
1323 struct xfs_inode *ip;
1324 int error;
1325
1326 /* caller must hold perag reference */
1327 if (pag == NULL) {
1328 ASSERT(pag != NULL);
1329 return -EINVAL;
1330 }
1331
1332 /* caller must have AGI buffer */
1333 if (sc->sa.agi_bp == NULL) {
1334 ASSERT(sc->sa.agi_bp != NULL);
1335 return -EINVAL;
1336 }
1337
1338 /* reject inode numbers outside existing AGs */
1339 ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
1340 if (!xfs_verify_ino(mp, ino))
1341 return -EINVAL;
1342
1343 error = -ENODATA;
1344 rcu_read_lock();
1345 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
1346 if (!ip) {
1347 /* cache miss */
1348 goto out_rcu;
1349 }
1350
1351 /*
1352 * If the inode number doesn't match, the incore inode got reused
1353 * during an RCU grace period and the radix tree hasn't been updated.
1354 * This isn't the inode we want.
1355 */
1356 spin_lock(&ip->i_flags_lock);
1357 if (ip->i_ino != ino)
1358 goto out_skip;
1359
1360 trace_xchk_inode_is_allocated(ip);
1361
1362 /*
1363 * We have an incore inode that matches the inode we want, and the
1364 * caller holds the perag structure and the AGI buffer. Let's check
1365 * our assumptions below:
1366 */
1367
1368 #ifdef DEBUG
1369 /*
1370 * (1) If the incore inode is live (i.e. referenced from the dcache),
1371 * it will not be INEW, nor will it be in the inactivation or reclaim
1372 * machinery. The ondisk inode had better be allocated. This is the
1373 * most trivial case.
1374 */
1375 if (!(ip->i_flags & (XFS_NEED_INACTIVE | XFS_INEW | XFS_IRECLAIMABLE |
1376 XFS_INACTIVATING))) {
1377 /* live inode */
1378 ASSERT(VFS_I(ip)->i_mode != 0);
1379 }
1380
1381 /*
1382 * If the incore inode is INEW, there are several possibilities:
1383 *
1384 * (2) For a file that is being created, note that we allocate the
1385 * ondisk inode before allocating, initializing, and adding the incore
1386 * inode to the radix tree.
1387 *
1388 * (3) If the incore inode is being recycled, the inode has to be
1389 * allocated because we don't allow freed inodes to be recycled.
1390 * Recycling doesn't touch i_mode.
1391 */
1392 if (ip->i_flags & XFS_INEW) {
1393 /* created on disk already or recycling */
1394 ASSERT(VFS_I(ip)->i_mode != 0);
1395 }
1396
1397 /*
1398 * (4) If the inode is queued for inactivation (NEED_INACTIVE) but
1399 * inactivation has not started (!INACTIVATING), it is still allocated.
1400 */
1401 if ((ip->i_flags & XFS_NEED_INACTIVE) &&
1402 !(ip->i_flags & XFS_INACTIVATING)) {
1403 /* definitely before difree */
1404 ASSERT(VFS_I(ip)->i_mode != 0);
1405 }
1406 #endif
1407
1408 /*
1409 * If the incore inode is undergoing inactivation (INACTIVATING), there
1410 * are two possibilities:
1411 *
1412 * (5) It is before the point where it would get freed ondisk, in which
1413 * case i_mode is still nonzero.
1414 *
1415 * (6) It has already been freed, in which case i_mode is zero.
1416 *
1417 * We don't take the ILOCK here, but difree and dialloc update the AGI,
1418 * and we've taken the AGI buffer lock, which prevents that from
1419 * happening.
1420 */
1421
1422 /*
1423 * (7) Inodes undergoing inactivation (INACTIVATING) or queued for
1424 * reclaim (IRECLAIMABLE) could be allocated or free. i_mode still
1425 * reflects the ondisk state.
1426 */
1427
1428 /*
1429 * (8) If the inode is in IFLUSHING, it's safe to query i_mode because
1430 * the flush code uses i_mode to format the ondisk inode.
1431 */
1432
1433 /*
1434 * (9) If the inode is in IRECLAIM and was reachable via the radix
1435 * tree, it still has the same i_mode as it did before it entered
1436 * reclaim. The inode object is still alive because we hold the RCU
1437 * read lock.
1438 */
1439
1440 *inuse = VFS_I(ip)->i_mode != 0;
1441 error = 0;
1442
1443 out_skip:
1444 spin_unlock(&ip->i_flags_lock);
1445 out_rcu:
1446 rcu_read_unlock();
1447 return error;
1448 }
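
/*
 * A minimal usage sketch (hypothetical caller that holds the AGI buffer
 * and a perag reference, as required above):
 *
 *	bool	inuse;
 *
 *	error = xchk_inode_is_allocated(sc, agino, &inuse);
 *	if (error && error != -ENODATA)
 *		return error;
 *
 * -ENODATA means the inode is not cached, so the caller must judge the
 * allocation state from the ondisk metadata instead.
 */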
1449