xref: /linux/fs/xfs/libxfs/xfs_alloc.c (revision b33874fb)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_shared.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_bit.h"
13 #include "xfs_mount.h"
14 #include "xfs_defer.h"
15 #include "xfs_btree.h"
16 #include "xfs_rmap.h"
17 #include "xfs_alloc_btree.h"
18 #include "xfs_alloc.h"
19 #include "xfs_extent_busy.h"
20 #include "xfs_errortag.h"
21 #include "xfs_error.h"
22 #include "xfs_trace.h"
23 #include "xfs_trans.h"
24 #include "xfs_buf_item.h"
25 #include "xfs_log.h"
26 #include "xfs_ag.h"
27 #include "xfs_ag_resv.h"
28 #include "xfs_bmap.h"
29 #include "xfs_health.h"
30 
31 struct kmem_cache	*xfs_extfree_item_cache;
32 
33 struct workqueue_struct *xfs_alloc_wq;
34 
35 #define XFS_ABSDIFF(a,b)	(((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
36 
37 #define	XFSA_FIXUP_BNO_OK	1
38 #define	XFSA_FIXUP_CNT_OK	2
39 
40 /*
41  * Size of the AGFL.  For CRC-enabled filesystems we steal a couple of slots in
42  * the beginning of the block for a proper header with the location information
43  * and CRC.
44  */
45 unsigned int
46 xfs_agfl_size(
47 	struct xfs_mount	*mp)
48 {
49 	unsigned int		size = mp->m_sb.sb_sectsize;
50 
51 	if (xfs_has_crc(mp))
52 		size -= sizeof(struct xfs_agfl);
53 
54 	return size / sizeof(xfs_agblock_t);
55 }
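
/*
 * A worked example of the sizing above (an illustrative sketch, assuming a V5
 * filesystem with 512-byte sectors and a 36-byte struct xfs_agfl header of
 * magic/seqno/uuid/lsn/crc):
 *
 *	size = 512 - 36 = 476 bytes left for free list slots
 *	476 / sizeof(xfs_agblock_t) = 476 / 4 = 119 AGFL entries
 *
 * Without CRCs the whole sector is slots: 512 / 4 = 128 entries.
 */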
56 
57 unsigned int
58 xfs_refc_block(
59 	struct xfs_mount	*mp)
60 {
61 	if (xfs_has_rmapbt(mp))
62 		return XFS_RMAP_BLOCK(mp) + 1;
63 	if (xfs_has_finobt(mp))
64 		return XFS_FIBT_BLOCK(mp) + 1;
65 	return XFS_IBT_BLOCK(mp) + 1;
66 }
67 
68 xfs_extlen_t
69 xfs_prealloc_blocks(
70 	struct xfs_mount	*mp)
71 {
72 	if (xfs_has_reflink(mp))
73 		return xfs_refc_block(mp) + 1;
74 	if (xfs_has_rmapbt(mp))
75 		return XFS_RMAP_BLOCK(mp) + 1;
76 	if (xfs_has_finobt(mp))
77 		return XFS_FIBT_BLOCK(mp) + 1;
78 	return XFS_IBT_BLOCK(mp) + 1;
79 }
80 
81 /*
82  * The number of blocks per AG that we withhold from xfs_dec_fdblocks to
83  * guarantee that we can refill the AGFL prior to allocating space in a nearly
84  * full AG.  Although the space described by the free space btrees, the
85  * blocks used by the freesp btrees themselves, and the blocks owned by the
86  * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk
87  * free space in the AG drop so low that the free space btrees cannot refill an
88  * empty AGFL up to the minimum level.  Rather than grind through empty AGs
89  * until the fs goes down, we subtract this many AG blocks from the incore
90  * fdblocks to ensure user allocation does not overcommit the space the
91  * filesystem needs for the AGFLs.  The rmap btree uses a per-AG reservation to
92  * withhold space from xfs_dec_fdblocks, so we do not account for that here.
93  */
94 #define XFS_ALLOCBT_AGFL_RESERVE	4
95 
96 /*
97  * Compute the number of blocks that we set aside to guarantee the ability to
98  * refill the AGFL and handle a full bmap btree split.
99  *
100  * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
101  * AGF buffer (PV 947395), we place constraints on the relationship among
102  * actual allocations for data blocks, freelist blocks, and potential file data
103  * bmap btree blocks. However, these restrictions may result in no actual space
104  * being allocated for a delayed extent: for example, a data block in a certain
105  * AG is allocated, but no additional block is available for the extra bmap
106  * btree block needed when the file's bmap btree splits. This may lead to an
107  * infinite loop when the file gets flushed to disk and all delayed extents
108  * need to be actually allocated. To get around this, we explicitly set
109  * aside a few blocks which will not be reserved in delayed allocation.
110  *
111  * For each AG, we need to reserve enough blocks to replenish a totally empty
112  * AGFL and 4 more to handle a potential split of the file's bmap btree.
113  */
114 unsigned int
115 xfs_alloc_set_aside(
116 	struct xfs_mount	*mp)
117 {
118 	return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4);
119 }
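
/*
 * A quick numeric sketch of the set-aside: on a hypothetical filesystem with
 * sb_agcount == 16, the incore free block count is reduced by
 *
 *	16 * (XFS_ALLOCBT_AGFL_RESERVE + 4) = 16 * 8 = 128 blocks
 *
 * i.e. 8 blocks per AG are never handed out to user allocations.
 */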
120 
121 /*
122  * When deciding how much space to allocate out of an AG, we limit the
123  * allocation maximum size to the size of the AG. However, we cannot use all the
124  * blocks in the AG - some are permanently used by metadata. These
125  * blocks are generally:
126  *	- the AG superblock, AGF, AGI and AGFL
127  *	- the AGF (bno and cnt) and AGI btree root blocks, and optionally
128  *	  the AGI free inode and rmap btree root blocks.
129  *	- blocks on the AGFL according to xfs_alloc_set_aside() limits
130  *	- the rmapbt root block
131  *
132  * The AG headers are sector sized, so the amount of space they take up is
133  * dependent on filesystem geometry. The others are all single blocks.
134  */
135 unsigned int
136 xfs_alloc_ag_max_usable(
137 	struct xfs_mount	*mp)
138 {
139 	unsigned int		blocks;
140 
141 	blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
142 	blocks += XFS_ALLOCBT_AGFL_RESERVE;
143 	blocks += 3;			/* AGF, AGI btree root blocks */
144 	if (xfs_has_finobt(mp))
145 		blocks++;		/* finobt root block */
146 	if (xfs_has_rmapbt(mp))
147 		blocks++;		/* rmap root block */
148 	if (xfs_has_reflink(mp))
149 		blocks++;		/* refcount root block */
150 
151 	return mp->m_sb.sb_agblocks - blocks;
152 }
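
/*
 * A hedged example of the accounting above: assume 4096-byte blocks, 512-byte
 * sectors, and a V5 filesystem with finobt, rmapbt and reflink enabled.  The
 * four sector-sized headers round up to a single block, so
 *
 *	blocks = 1 (sb/AGF/AGI/AGFL) + 4 (AGFL reserve)
 *	       + 3 (bno/cnt/ino roots) + 1 (finobt) + 1 (rmapbt)
 *	       + 1 (refcountbt) = 11
 *
 * and the maximum usable space works out to sb_agblocks - 11.
 */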
153 
154 
155 static int
156 xfs_alloc_lookup(
157 	struct xfs_btree_cur	*cur,
158 	xfs_lookup_t		dir,
159 	xfs_agblock_t		bno,
160 	xfs_extlen_t		len,
161 	int			*stat)
162 {
163 	int			error;
164 
165 	cur->bc_rec.a.ar_startblock = bno;
166 	cur->bc_rec.a.ar_blockcount = len;
167 	error = xfs_btree_lookup(cur, dir, stat);
168 	if (*stat == 1)
169 		cur->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE;
170 	else
171 		cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
172 	return error;
173 }
174 
175 /*
176  * Lookup the record equal to [bno, len] in the btree given by cur.
177  */
178 static inline int				/* error */
179 xfs_alloc_lookup_eq(
180 	struct xfs_btree_cur	*cur,	/* btree cursor */
181 	xfs_agblock_t		bno,	/* starting block of extent */
182 	xfs_extlen_t		len,	/* length of extent */
183 	int			*stat)	/* success/failure */
184 {
185 	return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, bno, len, stat);
186 }
187 
188 /*
189  * Lookup the first record greater than or equal to [bno, len]
190  * in the btree given by cur.
191  */
192 int				/* error */
193 xfs_alloc_lookup_ge(
194 	struct xfs_btree_cur	*cur,	/* btree cursor */
195 	xfs_agblock_t		bno,	/* starting block of extent */
196 	xfs_extlen_t		len,	/* length of extent */
197 	int			*stat)	/* success/failure */
198 {
199 	return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, bno, len, stat);
200 }
201 
202 /*
203  * Lookup the first record less than or equal to [bno, len]
204  * in the btree given by cur.
205  */
206 int					/* error */
207 xfs_alloc_lookup_le(
208 	struct xfs_btree_cur	*cur,	/* btree cursor */
209 	xfs_agblock_t		bno,	/* starting block of extent */
210 	xfs_extlen_t		len,	/* length of extent */
211 	int			*stat)	/* success/failure */
212 {
213 	return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, bno, len, stat);
214 }
215 
216 static inline bool
217 xfs_alloc_cur_active(
218 	struct xfs_btree_cur	*cur)
219 {
220 	return cur && (cur->bc_flags & XFS_BTREE_ALLOCBT_ACTIVE);
221 }
222 
223 /*
224  * Update the record referred to by cur to the value given
225  * by [bno, len].
226  * This either works (return 0) or gets an EFSCORRUPTED error.
227  */
228 STATIC int				/* error */
229 xfs_alloc_update(
230 	struct xfs_btree_cur	*cur,	/* btree cursor */
231 	xfs_agblock_t		bno,	/* starting block of extent */
232 	xfs_extlen_t		len)	/* length of extent */
233 {
234 	union xfs_btree_rec	rec;
235 
236 	rec.alloc.ar_startblock = cpu_to_be32(bno);
237 	rec.alloc.ar_blockcount = cpu_to_be32(len);
238 	return xfs_btree_update(cur, &rec);
239 }
240 
241 /* Convert the ondisk btree record to its incore representation. */
242 void
243 xfs_alloc_btrec_to_irec(
244 	const union xfs_btree_rec	*rec,
245 	struct xfs_alloc_rec_incore	*irec)
246 {
247 	irec->ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
248 	irec->ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
249 }
250 
251 /* Simple checks for free space records. */
252 xfs_failaddr_t
253 xfs_alloc_check_irec(
254 	struct xfs_perag			*pag,
255 	const struct xfs_alloc_rec_incore	*irec)
256 {
257 	if (irec->ar_blockcount == 0)
258 		return __this_address;
259 
260 	/* check for valid extent range, including overflow */
261 	if (!xfs_verify_agbext(pag, irec->ar_startblock, irec->ar_blockcount))
262 		return __this_address;
263 
264 	return NULL;
265 }
266 
267 static inline int
268 xfs_alloc_complain_bad_rec(
269 	struct xfs_btree_cur		*cur,
270 	xfs_failaddr_t			fa,
271 	const struct xfs_alloc_rec_incore *irec)
272 {
273 	struct xfs_mount		*mp = cur->bc_mp;
274 
275 	xfs_warn(mp,
276 		"%sbt record corruption in AG %d detected at %pS!",
277 		cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa);
278 	xfs_warn(mp,
279 		"start block 0x%x block count 0x%x", irec->ar_startblock,
280 		irec->ar_blockcount);
281 	xfs_btree_mark_sick(cur);
282 	return -EFSCORRUPTED;
283 }
284 
285 /*
286  * Get the data from the pointed-to record.
287  */
288 int					/* error */
289 xfs_alloc_get_rec(
290 	struct xfs_btree_cur	*cur,	/* btree cursor */
291 	xfs_agblock_t		*bno,	/* output: starting block of extent */
292 	xfs_extlen_t		*len,	/* output: length of extent */
293 	int			*stat)	/* output: success/failure */
294 {
295 	struct xfs_alloc_rec_incore irec;
296 	union xfs_btree_rec	*rec;
297 	xfs_failaddr_t		fa;
298 	int			error;
299 
300 	error = xfs_btree_get_rec(cur, &rec, stat);
301 	if (error || !(*stat))
302 		return error;
303 
304 	xfs_alloc_btrec_to_irec(rec, &irec);
305 	fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec);
306 	if (fa)
307 		return xfs_alloc_complain_bad_rec(cur, fa, &irec);
308 
309 	*bno = irec.ar_startblock;
310 	*len = irec.ar_blockcount;
311 	return 0;
312 }
313 
314 /*
315  * Compute aligned version of the found extent.
316  * Takes alignment and min length into account.
317  */
318 STATIC bool
319 xfs_alloc_compute_aligned(
320 	xfs_alloc_arg_t	*args,		/* allocation argument structure */
321 	xfs_agblock_t	foundbno,	/* starting block in found extent */
322 	xfs_extlen_t	foundlen,	/* length in found extent */
323 	xfs_agblock_t	*resbno,	/* result block number */
324 	xfs_extlen_t	*reslen,	/* result length */
325 	unsigned	*busy_gen)
326 {
327 	xfs_agblock_t	bno = foundbno;
328 	xfs_extlen_t	len = foundlen;
329 	xfs_extlen_t	diff;
330 	bool		busy;
331 
332 	/* Trim busy sections out of found extent */
333 	busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);
334 
335 	/*
336 	 * If we have a largish extent that happens to start before min_agbno,
337 	 * see if we can shift it into range...
338 	 */
339 	if (bno < args->min_agbno && bno + len > args->min_agbno) {
340 		diff = args->min_agbno - bno;
341 		if (len > diff) {
342 			bno += diff;
343 			len -= diff;
344 		}
345 	}
346 
347 	if (args->alignment > 1 && len >= args->minlen) {
348 		xfs_agblock_t	aligned_bno = roundup(bno, args->alignment);
349 
350 		diff = aligned_bno - bno;
351 
352 		*resbno = aligned_bno;
353 		*reslen = diff >= len ? 0 : len - diff;
354 	} else {
355 		*resbno = bno;
356 		*reslen = len;
357 	}
358 
359 	return busy;
360 }
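
/*
 * An illustrative trace of the alignment fixup above (hypothetical numbers):
 * with args->alignment == 4 and a busy-trimmed free extent [bno = 7, len = 20],
 *
 *	aligned_bno = roundup(7, 4) = 8, diff = 1
 *	*resbno = 8, *reslen = 20 - 1 = 19
 *
 * If diff were >= len, *reslen would collapse to 0 and the caller's minlen
 * check would reject the extent.
 */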
361 
362 /*
363  * Compute best start block and diff for "near" allocations.
364  * freelen >= wantlen already checked by caller.
365  */
366 STATIC xfs_extlen_t			/* difference value (absolute) */
367 xfs_alloc_compute_diff(
368 	xfs_agblock_t	wantbno,	/* target starting block */
369 	xfs_extlen_t	wantlen,	/* target length */
370 	xfs_extlen_t	alignment,	/* target alignment */
371 	int		datatype,	/* are we allocating data? */
372 	xfs_agblock_t	freebno,	/* freespace's starting block */
373 	xfs_extlen_t	freelen,	/* freespace's length */
374 	xfs_agblock_t	*newbnop)	/* result: best start block from free */
375 {
376 	xfs_agblock_t	freeend;	/* end of freespace extent */
377 	xfs_agblock_t	newbno1;	/* return block number */
378 	xfs_agblock_t	newbno2;	/* other new block number */
379 	xfs_extlen_t	newlen1=0;	/* length with newbno1 */
380 	xfs_extlen_t	newlen2=0;	/* length with newbno2 */
381 	xfs_agblock_t	wantend;	/* end of target extent */
382 	bool		userdata = datatype & XFS_ALLOC_USERDATA;
383 
384 	ASSERT(freelen >= wantlen);
385 	freeend = freebno + freelen;
386 	wantend = wantbno + wantlen;
387 	/*
388 	 * We want to allocate from the start of a free extent if it is past
389 	 * the desired block or if we are allocating user data and the free
390 	 * extent is before desired block. The second case is there to allow
391 	 * for contiguous allocation from the remaining free space if the file
392 	 * grows in the short term.
393 	 */
394 	if (freebno >= wantbno || (userdata && freeend < wantend)) {
395 		if ((newbno1 = roundup(freebno, alignment)) >= freeend)
396 			newbno1 = NULLAGBLOCK;
397 	} else if (freeend >= wantend && alignment > 1) {
398 		newbno1 = roundup(wantbno, alignment);
399 		newbno2 = newbno1 - alignment;
400 		if (newbno1 >= freeend)
401 			newbno1 = NULLAGBLOCK;
402 		else
403 			newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
404 		if (newbno2 < freebno)
405 			newbno2 = NULLAGBLOCK;
406 		else
407 			newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
408 		if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
409 			if (newlen1 < newlen2 ||
410 			    (newlen1 == newlen2 &&
411 			     XFS_ABSDIFF(newbno1, wantbno) >
412 			     XFS_ABSDIFF(newbno2, wantbno)))
413 				newbno1 = newbno2;
414 		} else if (newbno2 != NULLAGBLOCK)
415 			newbno1 = newbno2;
416 	} else if (freeend >= wantend) {
417 		newbno1 = wantbno;
418 	} else if (alignment > 1) {
419 		newbno1 = roundup(freeend - wantlen, alignment);
420 		if (newbno1 > freeend - wantlen &&
421 		    newbno1 - alignment >= freebno)
422 			newbno1 -= alignment;
423 		else if (newbno1 >= freeend)
424 			newbno1 = NULLAGBLOCK;
425 	} else
426 		newbno1 = freeend - wantlen;
427 	*newbnop = newbno1;
428 	return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
429 }
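
/*
 * Two hypothetical cases for the "near" diff computation above, with
 * wantbno = 100, wantlen = 10, alignment = 1 and a non-userdata allocation:
 *
 *	free extent [90, 40]:  freebno < wantbno and freeend (130) >= wantend
 *			       (110), so *newbnop = wantbno = 100 and diff = 0.
 *	free extent [120, 20]: freebno >= wantbno, so *newbnop = 120 and
 *			       diff = XFS_ABSDIFF(120, 100) = 20.
 */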
430 
431 /*
432  * Fix up the length, based on mod and prod.
433  * len should be k * prod + mod for some k.
434  * If len is too small it is returned unchanged.
435  * If len hits maxlen it is left alone.
436  */
437 STATIC void
438 xfs_alloc_fix_len(
439 	xfs_alloc_arg_t	*args)		/* allocation argument structure */
440 {
441 	xfs_extlen_t	k;
442 	xfs_extlen_t	rlen;
443 
444 	ASSERT(args->mod < args->prod);
445 	rlen = args->len;
446 	ASSERT(rlen >= args->minlen);
447 	ASSERT(rlen <= args->maxlen);
448 	if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
449 	    (args->mod == 0 && rlen < args->prod))
450 		return;
451 	k = rlen % args->prod;
452 	if (k == args->mod)
453 		return;
454 	if (k > args->mod)
455 		rlen = rlen - (k - args->mod);
456 	else
457 		rlen = rlen - args->prod + (args->mod - k);
458 	/* casts to (int) catch length underflows */
459 	if ((int)rlen < (int)args->minlen)
460 		return;
461 	ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
462 	ASSERT(rlen % args->prod == args->mod);
463 	ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
464 		rlen + args->minleft);
465 	args->len = rlen;
466 }
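
/*
 * A small worked example of the mod/prod trimming above (hypothetical values):
 * with args->prod = 4, args->mod = 1, args->len = 14, minlen = 4, maxlen = 32,
 *
 *	k = 14 % 4 = 2;  k > mod, so rlen = 14 - (2 - 1) = 13
 *
 * which satisfies 13 % 4 == 1 == mod.  Had the trim dropped rlen below
 * minlen, the length would have been left unchanged instead.
 */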
467 
468 /*
469  * Update the two btrees, logically removing from freespace the extent
470  * starting at rbno, rlen blocks.  The extent is contained within the
471  * actual (current) free extent fbno for flen blocks.
472  * Flags are passed in indicating whether the cursors are set to the
473  * relevant records.
474  */
475 STATIC int				/* error code */
476 xfs_alloc_fixup_trees(
477 	struct xfs_btree_cur *cnt_cur,	/* cursor for by-size btree */
478 	struct xfs_btree_cur *bno_cur,	/* cursor for by-block btree */
479 	xfs_agblock_t	fbno,		/* starting block of free extent */
480 	xfs_extlen_t	flen,		/* length of free extent */
481 	xfs_agblock_t	rbno,		/* starting block of returned extent */
482 	xfs_extlen_t	rlen,		/* length of returned extent */
483 	int		flags)		/* flags, XFSA_FIXUP_... */
484 {
485 	int		error;		/* error code */
486 	int		i;		/* operation results */
487 	xfs_agblock_t	nfbno1;		/* first new free startblock */
488 	xfs_agblock_t	nfbno2;		/* second new free startblock */
489 	xfs_extlen_t	nflen1=0;	/* first new free length */
490 	xfs_extlen_t	nflen2=0;	/* second new free length */
491 	struct xfs_mount *mp;
492 
493 	mp = cnt_cur->bc_mp;
494 
495 	/*
496 	 * Look up the record in the by-size tree if necessary.
497 	 */
498 	if (flags & XFSA_FIXUP_CNT_OK) {
499 #ifdef DEBUG
500 		if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
501 			return error;
502 		if (XFS_IS_CORRUPT(mp,
503 				   i != 1 ||
504 				   nfbno1 != fbno ||
505 				   nflen1 != flen)) {
506 			xfs_btree_mark_sick(cnt_cur);
507 			return -EFSCORRUPTED;
508 		}
509 #endif
510 	} else {
511 		if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
512 			return error;
513 		if (XFS_IS_CORRUPT(mp, i != 1)) {
514 			xfs_btree_mark_sick(cnt_cur);
515 			return -EFSCORRUPTED;
516 		}
517 	}
518 	/*
519 	 * Look up the record in the by-block tree if necessary.
520 	 */
521 	if (flags & XFSA_FIXUP_BNO_OK) {
522 #ifdef DEBUG
523 		if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
524 			return error;
525 		if (XFS_IS_CORRUPT(mp,
526 				   i != 1 ||
527 				   nfbno1 != fbno ||
528 				   nflen1 != flen)) {
529 			xfs_btree_mark_sick(bno_cur);
530 			return -EFSCORRUPTED;
531 		}
532 #endif
533 	} else {
534 		if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
535 			return error;
536 		if (XFS_IS_CORRUPT(mp, i != 1)) {
537 			xfs_btree_mark_sick(bno_cur);
538 			return -EFSCORRUPTED;
539 		}
540 	}
541 
542 #ifdef DEBUG
543 	if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
544 		struct xfs_btree_block	*bnoblock;
545 		struct xfs_btree_block	*cntblock;
546 
547 		bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp);
548 		cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp);
549 
550 		if (XFS_IS_CORRUPT(mp,
551 				   bnoblock->bb_numrecs !=
552 				   cntblock->bb_numrecs)) {
553 			xfs_btree_mark_sick(bno_cur);
554 			return -EFSCORRUPTED;
555 		}
556 	}
557 #endif
558 
559 	/*
560 	 * Deal with all four cases: the allocated record is contained
561 	 * within the freespace record, so we can have new freespace
562 	 * at either (or both) end, or no freespace remaining.
563 	 */
564 	if (rbno == fbno && rlen == flen)
565 		nfbno1 = nfbno2 = NULLAGBLOCK;
566 	else if (rbno == fbno) {
567 		nfbno1 = rbno + rlen;
568 		nflen1 = flen - rlen;
569 		nfbno2 = NULLAGBLOCK;
570 	} else if (rbno + rlen == fbno + flen) {
571 		nfbno1 = fbno;
572 		nflen1 = flen - rlen;
573 		nfbno2 = NULLAGBLOCK;
574 	} else {
575 		nfbno1 = fbno;
576 		nflen1 = rbno - fbno;
577 		nfbno2 = rbno + rlen;
578 		nflen2 = (fbno + flen) - nfbno2;
579 	}
580 	/*
581 	 * Delete the entry from the by-size btree.
582 	 */
583 	if ((error = xfs_btree_delete(cnt_cur, &i)))
584 		return error;
585 	if (XFS_IS_CORRUPT(mp, i != 1)) {
586 		xfs_btree_mark_sick(cnt_cur);
587 		return -EFSCORRUPTED;
588 	}
589 	/*
590 	 * Add new by-size btree entry(s).
591 	 */
592 	if (nfbno1 != NULLAGBLOCK) {
593 		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
594 			return error;
595 		if (XFS_IS_CORRUPT(mp, i != 0)) {
596 			xfs_btree_mark_sick(cnt_cur);
597 			return -EFSCORRUPTED;
598 		}
599 		if ((error = xfs_btree_insert(cnt_cur, &i)))
600 			return error;
601 		if (XFS_IS_CORRUPT(mp, i != 1)) {
602 			xfs_btree_mark_sick(cnt_cur);
603 			return -EFSCORRUPTED;
604 		}
605 	}
606 	if (nfbno2 != NULLAGBLOCK) {
607 		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
608 			return error;
609 		if (XFS_IS_CORRUPT(mp, i != 0)) {
610 			xfs_btree_mark_sick(cnt_cur);
611 			return -EFSCORRUPTED;
612 		}
613 		if ((error = xfs_btree_insert(cnt_cur, &i)))
614 			return error;
615 		if (XFS_IS_CORRUPT(mp, i != 1)) {
616 			xfs_btree_mark_sick(cnt_cur);
617 			return -EFSCORRUPTED;
618 		}
619 	}
620 	/*
621 	 * Fix up the by-block btree entry(s).
622 	 */
623 	if (nfbno1 == NULLAGBLOCK) {
624 		/*
625 		 * No remaining freespace, just delete the by-block tree entry.
626 		 */
627 		if ((error = xfs_btree_delete(bno_cur, &i)))
628 			return error;
629 		if (XFS_IS_CORRUPT(mp, i != 1)) {
630 			xfs_btree_mark_sick(bno_cur);
631 			return -EFSCORRUPTED;
632 		}
633 	} else {
634 		/*
635 		 * Update the by-block entry to start later|be shorter.
636 		 */
637 		if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
638 			return error;
639 	}
640 	if (nfbno2 != NULLAGBLOCK) {
641 		/*
642 		 * 2 resulting free entries, need to add one.
643 		 */
644 		if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
645 			return error;
646 		if (XFS_IS_CORRUPT(mp, i != 0)) {
647 			xfs_btree_mark_sick(bno_cur);
648 			return -EFSCORRUPTED;
649 		}
650 		if ((error = xfs_btree_insert(bno_cur, &i)))
651 			return error;
652 		if (XFS_IS_CORRUPT(mp, i != 1)) {
653 			xfs_btree_mark_sick(bno_cur);
654 			return -EFSCORRUPTED;
655 		}
656 	}
657 	return 0;
658 }
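
/*
 * A sketch of the "middle cut" case handled above (hypothetical numbers):
 * allocating [rbno = 110, rlen = 10] out of free extent [fbno = 100, flen = 50]
 * leaves two new free extents,
 *
 *	nfbno1 = 100, nflen1 = 110 - 100 = 10
 *	nfbno2 = 120, nflen2 = (100 + 50) - 120 = 30
 *
 * so the old by-size record is deleted and two new ones inserted, while the
 * by-block record is updated in place to [100, 10] and [120, 30] is inserted.
 */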
659 
660 /*
661  * We do not verify the AGFL contents against AGF-based index counters here,
662  * even though we may have access to the perag that contains shadow copies. We
663  * don't know if the AGF based counters have been checked, and if they have they
664  * still may be inconsistent because they haven't yet been reset on the first
665  * allocation after the AGF has been read in.
666  *
667  * This means we can only check that all agfl entries contain valid or null
668  * values because we can't reliably determine the active range to exclude
669  * NULLAGBNO as a valid value.
670  *
671  * However, we can't even do that for v4 format filesystems because there are
672  * old versions of mkfs out there that do not initialise the AGFL to known,
673  * verifiable values. Hence we can't tell the difference between an AGFL block
674  * allocated by mkfs and a corrupted AGFL block here on v4 filesystems.
675  *
676  * As a result, we can only fully validate AGFL block numbers when we pull them
677  * from the freelist in xfs_alloc_get_freelist().
678  */
679 static xfs_failaddr_t
680 xfs_agfl_verify(
681 	struct xfs_buf	*bp)
682 {
683 	struct xfs_mount *mp = bp->b_mount;
684 	struct xfs_agfl	*agfl = XFS_BUF_TO_AGFL(bp);
685 	__be32		*agfl_bno = xfs_buf_to_agfl_bno(bp);
686 	int		i;
687 
688 	if (!xfs_has_crc(mp))
689 		return NULL;
690 
691 	if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
692 		return __this_address;
693 	if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
694 		return __this_address;
695 	/*
696 	 * during growfs operations, the perag is not fully initialised,
697 	 * so we can't use it for any useful checking. growfs ensures we can't
698 	 * use it by using uncached buffers that don't have the perag attached
699 	 * so we can detect and avoid this problem.
700 	 */
701 	if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
702 		return __this_address;
703 
704 	for (i = 0; i < xfs_agfl_size(mp); i++) {
705 		if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
706 		    be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
707 			return __this_address;
708 	}
709 
710 	if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
711 		return __this_address;
712 	return NULL;
713 }
714 
715 static void
716 xfs_agfl_read_verify(
717 	struct xfs_buf	*bp)
718 {
719 	struct xfs_mount *mp = bp->b_mount;
720 	xfs_failaddr_t	fa;
721 
722 	/*
723 	 * There is no verification of non-crc AGFLs because mkfs does not
724 	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
725 	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
726 	 * can't verify just those entries are valid.
727 	 */
728 	if (!xfs_has_crc(mp))
729 		return;
730 
731 	if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
732 		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
733 	else {
734 		fa = xfs_agfl_verify(bp);
735 		if (fa)
736 			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
737 	}
738 }
739 
740 static void
741 xfs_agfl_write_verify(
742 	struct xfs_buf	*bp)
743 {
744 	struct xfs_mount	*mp = bp->b_mount;
745 	struct xfs_buf_log_item	*bip = bp->b_log_item;
746 	xfs_failaddr_t		fa;
747 
748 	/* no verification of non-crc AGFLs */
749 	if (!xfs_has_crc(mp))
750 		return;
751 
752 	fa = xfs_agfl_verify(bp);
753 	if (fa) {
754 		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
755 		return;
756 	}
757 
758 	if (bip)
759 		XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
760 
761 	xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
762 }
763 
764 const struct xfs_buf_ops xfs_agfl_buf_ops = {
765 	.name = "xfs_agfl",
766 	.magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
767 	.verify_read = xfs_agfl_read_verify,
768 	.verify_write = xfs_agfl_write_verify,
769 	.verify_struct = xfs_agfl_verify,
770 };
771 
772 /*
773  * Read in the allocation group free block array.
774  */
775 int
776 xfs_alloc_read_agfl(
777 	struct xfs_perag	*pag,
778 	struct xfs_trans	*tp,
779 	struct xfs_buf		**bpp)
780 {
781 	struct xfs_mount	*mp = pag->pag_mount;
782 	struct xfs_buf		*bp;
783 	int			error;
784 
785 	error = xfs_trans_read_buf(
786 			mp, tp, mp->m_ddev_targp,
787 			XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
788 			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
789 	if (xfs_metadata_is_sick(error))
790 		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL);
791 	if (error)
792 		return error;
793 	xfs_buf_set_ref(bp, XFS_AGFL_REF);
794 	*bpp = bp;
795 	return 0;
796 }
797 
798 STATIC int
799 xfs_alloc_update_counters(
800 	struct xfs_trans	*tp,
801 	struct xfs_buf		*agbp,
802 	long			len)
803 {
804 	struct xfs_agf		*agf = agbp->b_addr;
805 
806 	agbp->b_pag->pagf_freeblks += len;
807 	be32_add_cpu(&agf->agf_freeblks, len);
808 
809 	if (unlikely(be32_to_cpu(agf->agf_freeblks) >
810 		     be32_to_cpu(agf->agf_length))) {
811 		xfs_buf_mark_corrupt(agbp);
812 		xfs_ag_mark_sick(agbp->b_pag, XFS_SICK_AG_AGF);
813 		return -EFSCORRUPTED;
814 	}
815 
816 	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
817 	return 0;
818 }
819 
820 /*
821  * Block allocation algorithm and data structures.
822  */
823 struct xfs_alloc_cur {
824 	struct xfs_btree_cur		*cnt;	/* btree cursors */
825 	struct xfs_btree_cur		*bnolt;
826 	struct xfs_btree_cur		*bnogt;
827 	xfs_extlen_t			cur_len;/* current search length */
828 	xfs_agblock_t			rec_bno;/* extent startblock */
829 	xfs_extlen_t			rec_len;/* extent length */
830 	xfs_agblock_t			bno;	/* alloc bno */
831 	xfs_extlen_t			len;	/* alloc len */
832 	xfs_extlen_t			diff;	/* diff from search bno */
833 	unsigned int			busy_gen;/* busy state */
834 	bool				busy;
835 };
836 
837 /*
838  * Set up cursors, etc. in the extent allocation cursor. This function can be
839  * called multiple times to reset an initialized structure without having to
840  * reallocate cursors.
841  */
842 static int
843 xfs_alloc_cur_setup(
844 	struct xfs_alloc_arg	*args,
845 	struct xfs_alloc_cur	*acur)
846 {
847 	int			error;
848 	int			i;
849 
850 	acur->cur_len = args->maxlen;
851 	acur->rec_bno = 0;
852 	acur->rec_len = 0;
853 	acur->bno = 0;
854 	acur->len = 0;
855 	acur->diff = -1;
856 	acur->busy = false;
857 	acur->busy_gen = 0;
858 
859 	/*
860 	 * Perform an initial cntbt lookup to check for availability of maxlen
861 	 * extents. If this fails, we'll return -ENOSPC to signal the caller to
862 	 * attempt a small allocation.
863 	 */
864 	if (!acur->cnt)
865 		acur->cnt = xfs_cntbt_init_cursor(args->mp, args->tp,
866 					args->agbp, args->pag);
867 	error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
868 	if (error)
869 		return error;
870 
871 	/*
872 	 * Allocate the bnobt left and right search cursors.
873 	 */
874 	if (!acur->bnolt)
875 		acur->bnolt = xfs_bnobt_init_cursor(args->mp, args->tp,
876 					args->agbp, args->pag);
877 	if (!acur->bnogt)
878 		acur->bnogt = xfs_bnobt_init_cursor(args->mp, args->tp,
879 					args->agbp, args->pag);
880 	return i == 1 ? 0 : -ENOSPC;
881 }
882 
883 static void
884 xfs_alloc_cur_close(
885 	struct xfs_alloc_cur	*acur,
886 	bool			error)
887 {
888 	int			cur_error = XFS_BTREE_NOERROR;
889 
890 	if (error)
891 		cur_error = XFS_BTREE_ERROR;
892 
893 	if (acur->cnt)
894 		xfs_btree_del_cursor(acur->cnt, cur_error);
895 	if (acur->bnolt)
896 		xfs_btree_del_cursor(acur->bnolt, cur_error);
897 	if (acur->bnogt)
898 		xfs_btree_del_cursor(acur->bnogt, cur_error);
899 	acur->cnt = acur->bnolt = acur->bnogt = NULL;
900 }
901 
902 /*
903  * Check an extent for allocation and track the best available candidate in the
904  * allocation structure. The cursor is deactivated if it has entered an out of
905  * range state based on allocation arguments. Optionally return the extent
906  * geometry and allocation status if requested by the caller.
907  */
908 static int
909 xfs_alloc_cur_check(
910 	struct xfs_alloc_arg	*args,
911 	struct xfs_alloc_cur	*acur,
912 	struct xfs_btree_cur	*cur,
913 	int			*new)
914 {
915 	int			error, i;
916 	xfs_agblock_t		bno, bnoa, bnew;
917 	xfs_extlen_t		len, lena, diff = -1;
918 	bool			busy;
919 	unsigned		busy_gen = 0;
920 	bool			deactivate = false;
921 	bool			isbnobt = xfs_btree_is_bno(cur->bc_ops);
922 
923 	*new = 0;
924 
925 	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
926 	if (error)
927 		return error;
928 	if (XFS_IS_CORRUPT(args->mp, i != 1)) {
929 		xfs_btree_mark_sick(cur);
930 		return -EFSCORRUPTED;
931 	}
932 
933 	/*
934 	 * Check minlen and deactivate a cntbt cursor if out of acceptable size
935 	 * range (i.e., walking backwards looking for a minlen extent).
936 	 */
937 	if (len < args->minlen) {
938 		deactivate = !isbnobt;
939 		goto out;
940 	}
941 
942 	busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
943 					 &busy_gen);
944 	acur->busy |= busy;
945 	if (busy)
946 		acur->busy_gen = busy_gen;
947 	/* deactivate a bnobt cursor outside of locality range */
948 	if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
949 		deactivate = isbnobt;
950 		goto out;
951 	}
952 	if (lena < args->minlen)
953 		goto out;
954 
955 	args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
956 	xfs_alloc_fix_len(args);
957 	ASSERT(args->len >= args->minlen);
958 	if (args->len < acur->len)
959 		goto out;
960 
961 	/*
962 	 * We have an aligned record that satisfies minlen and beats or matches
963 	 * the candidate extent size. Compare locality for near allocation mode.
964 	 */
965 	diff = xfs_alloc_compute_diff(args->agbno, args->len,
966 				      args->alignment, args->datatype,
967 				      bnoa, lena, &bnew);
968 	if (bnew == NULLAGBLOCK)
969 		goto out;
970 
971 	/*
972 	 * Deactivate a bnobt cursor with worse locality than the current best.
973 	 */
974 	if (diff > acur->diff) {
975 		deactivate = isbnobt;
976 		goto out;
977 	}
978 
979 	ASSERT(args->len > acur->len ||
980 	       (args->len == acur->len && diff <= acur->diff));
981 	acur->rec_bno = bno;
982 	acur->rec_len = len;
983 	acur->bno = bnew;
984 	acur->len = args->len;
985 	acur->diff = diff;
986 	*new = 1;
987 
988 	/*
989 	 * We're done if we found a perfect allocation. This only deactivates
990 	 * the current cursor, but this is just an optimization to terminate a
991 	 * cntbt search that otherwise runs to the edge of the tree.
992 	 */
993 	if (acur->diff == 0 && acur->len == args->maxlen)
994 		deactivate = true;
995 out:
996 	if (deactivate)
997 		cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
998 	trace_xfs_alloc_cur_check(cur, bno, len, diff, *new);
999 	return 0;
1000 }
1001 
1002 /*
1003  * Complete an allocation of a candidate extent. Remove the extent from both
1004  * trees and update the args structure.
1005  */
1006 STATIC int
1007 xfs_alloc_cur_finish(
1008 	struct xfs_alloc_arg	*args,
1009 	struct xfs_alloc_cur	*acur)
1010 {
1011 	int			error;
1012 
1013 	ASSERT(acur->cnt && acur->bnolt);
1014 	ASSERT(acur->bno >= acur->rec_bno);
1015 	ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
1016 	ASSERT(xfs_verify_agbext(args->pag, acur->rec_bno, acur->rec_len));
1017 
1018 	error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
1019 				      acur->rec_len, acur->bno, acur->len, 0);
1020 	if (error)
1021 		return error;
1022 
1023 	args->agbno = acur->bno;
1024 	args->len = acur->len;
1025 	args->wasfromfl = 0;
1026 
1027 	trace_xfs_alloc_cur(args);
1028 	return 0;
1029 }
1030 
1031 /*
1032  * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
1033  * bno optimized lookup to search for extents with ideal size and locality.
1034  */
1035 STATIC int
1036 xfs_alloc_cntbt_iter(
1037 	struct xfs_alloc_arg		*args,
1038 	struct xfs_alloc_cur		*acur)
1039 {
1040 	struct xfs_btree_cur	*cur = acur->cnt;
1041 	xfs_agblock_t		bno;
1042 	xfs_extlen_t		len, cur_len;
1043 	int			error;
1044 	int			i;
1045 
1046 	if (!xfs_alloc_cur_active(cur))
1047 		return 0;
1048 
1049 	/* locality optimized lookup */
1050 	cur_len = acur->cur_len;
1051 	error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
1052 	if (error)
1053 		return error;
1054 	if (i == 0)
1055 		return 0;
1056 	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
1057 	if (error)
1058 		return error;
1059 
1060 	/* check the current record and update search length from it */
1061 	error = xfs_alloc_cur_check(args, acur, cur, &i);
1062 	if (error)
1063 		return error;
1064 	ASSERT(len >= acur->cur_len);
1065 	acur->cur_len = len;
1066 
1067 	/*
1068 	 * We looked up the first record >= [agbno, len] above. The agbno is a
1069 	 * secondary key and so the current record may lie just before or after
1070 	 * agbno. If it is past agbno, check the previous record too so long as
1071 	 * the length matches as it may be closer. Don't check a smaller record
1072 	 * because that could deactivate our cursor.
1073 	 */
1074 	if (bno > args->agbno) {
1075 		error = xfs_btree_decrement(cur, 0, &i);
1076 		if (!error && i) {
1077 			error = xfs_alloc_get_rec(cur, &bno, &len, &i);
1078 			if (!error && i && len == acur->cur_len)
1079 				error = xfs_alloc_cur_check(args, acur, cur,
1080 							    &i);
1081 		}
1082 		if (error)
1083 			return error;
1084 	}
1085 
1086 	/*
1087 	 * Increment the search key until we find at least one allocation
1088 	 * candidate or if the extent we found was larger. Otherwise, double the
1089 	 * search key to optimize the search. Efficiency is more important here
1090 	 * than absolute best locality.
1091 	 */
1092 	cur_len <<= 1;
1093 	if (!acur->len || acur->cur_len >= cur_len)
1094 		acur->cur_len++;
1095 	else
1096 		acur->cur_len = cur_len;
1097 
1098 	return error;
1099 }
1100 
1101 /*
1102  * Deal with the case where only small freespaces remain. Either return the
1103  * contents of the last freespace record, or allocate space from the freelist if
1104  * there is nothing in the tree.
1105  */
1106 STATIC int			/* error */
1107 xfs_alloc_ag_vextent_small(
1108 	struct xfs_alloc_arg	*args,	/* allocation argument structure */
1109 	struct xfs_btree_cur	*ccur,	/* optional by-size cursor */
1110 	xfs_agblock_t		*fbnop,	/* result block number */
1111 	xfs_extlen_t		*flenp,	/* result length */
1112 	int			*stat)	/* status: 0-freelist, 1-normal/none */
1113 {
1114 	struct xfs_agf		*agf = args->agbp->b_addr;
1115 	int			error = 0;
1116 	xfs_agblock_t		fbno = NULLAGBLOCK;
1117 	xfs_extlen_t		flen = 0;
1118 	int			i = 0;
1119 
1120 	/*
1121 	 * If a cntbt cursor is provided, try to allocate the largest record in
1122 	 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
1123 	 * allocation. Make sure to respect minleft even when pulling from the
1124 	 * freelist.
1125 	 */
1126 	if (ccur)
1127 		error = xfs_btree_decrement(ccur, 0, &i);
1128 	if (error)
1129 		goto error;
1130 	if (i) {
1131 		error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
1132 		if (error)
1133 			goto error;
1134 		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1135 			xfs_btree_mark_sick(ccur);
1136 			error = -EFSCORRUPTED;
1137 			goto error;
1138 		}
1139 		goto out;
1140 	}
1141 
1142 	if (args->minlen != 1 || args->alignment != 1 ||
1143 	    args->resv == XFS_AG_RESV_AGFL ||
1144 	    be32_to_cpu(agf->agf_flcount) <= args->minleft)
1145 		goto out;
1146 
1147 	error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp,
1148 			&fbno, 0);
1149 	if (error)
1150 		goto error;
1151 	if (fbno == NULLAGBLOCK)
1152 		goto out;
1153 
1154 	xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1,
1155 			      (args->datatype & XFS_ALLOC_NOBUSY));
1156 
1157 	if (args->datatype & XFS_ALLOC_USERDATA) {
1158 		struct xfs_buf	*bp;
1159 
1160 		error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
1161 				XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
1162 				args->mp->m_bsize, 0, &bp);
1163 		if (error)
1164 			goto error;
1165 		xfs_trans_binval(args->tp, bp);
1166 	}
1167 	*fbnop = args->agbno = fbno;
1168 	*flenp = args->len = 1;
1169 	if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
1170 		xfs_btree_mark_sick(ccur);
1171 		error = -EFSCORRUPTED;
1172 		goto error;
1173 	}
1174 	args->wasfromfl = 1;
1175 	trace_xfs_alloc_small_freelist(args);
1176 
1177 	/*
1178 	 * If we're feeding an AGFL block to something that doesn't live in the
1179 	 * free space, we need to clear out the OWN_AG rmap.
1180 	 */
1181 	error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1,
1182 			      &XFS_RMAP_OINFO_AG);
1183 	if (error)
1184 		goto error;
1185 
1186 	*stat = 0;
1187 	return 0;
1188 
1189 out:
1190 	/*
1191 	 * Can't do the allocation, give up.
1192 	 */
1193 	if (flen < args->minlen) {
1194 		args->agbno = NULLAGBLOCK;
1195 		trace_xfs_alloc_small_notenough(args);
1196 		flen = 0;
1197 	}
1198 	*fbnop = fbno;
1199 	*flenp = flen;
1200 	*stat = 1;
1201 	trace_xfs_alloc_small_done(args);
1202 	return 0;
1203 
1204 error:
1205 	trace_xfs_alloc_small_error(args);
1206 	return error;
1207 }
1208 
1209 /*
1210  * Allocate a variable extent at exactly agno/bno.
1211  * Extent's length (returned in *len) will be between minlen and maxlen,
1212  * and of the form k * prod + mod unless there's nothing that large.
1213  * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
1214  */
1215 STATIC int			/* error */
1216 xfs_alloc_ag_vextent_exact(
1217 	xfs_alloc_arg_t	*args)	/* allocation argument structure */
1218 {
1219 	struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */
1220 	struct xfs_btree_cur *cnt_cur;/* by count btree cursor */
1221 	int		error;
1222 	xfs_agblock_t	fbno;	/* start block of found extent */
1223 	xfs_extlen_t	flen;	/* length of found extent */
1224 	xfs_agblock_t	tbno;	/* start block of busy extent */
1225 	xfs_extlen_t	tlen;	/* length of busy extent */
1226 	xfs_agblock_t	tend;	/* end block of busy extent */
1227 	int		i;	/* success/failure of operation */
1228 	unsigned	busy_gen;
1229 
1230 	ASSERT(args->alignment == 1);
1231 
1232 	/*
1233 	 * Allocate/initialize a cursor for the by-number freespace btree.
1234 	 */
1235 	bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp,
1236 					  args->pag);
1237 
1238 	/*
1239 	 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
1240 	 * Look for the closest free block <= bno, it must contain bno
1241 	 * if any free block does.
1242 	 */
1243 	error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
1244 	if (error)
1245 		goto error0;
1246 	if (!i)
1247 		goto not_found;
1248 
1249 	/*
1250 	 * Grab the freespace record.
1251 	 */
1252 	error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
1253 	if (error)
1254 		goto error0;
1255 	if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1256 		xfs_btree_mark_sick(bno_cur);
1257 		error = -EFSCORRUPTED;
1258 		goto error0;
1259 	}
1260 	ASSERT(fbno <= args->agbno);
1261 
1262 	/*
1263 	 * Check for overlapping busy extents.
1264 	 */
1265 	tbno = fbno;
1266 	tlen = flen;
1267 	xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
1268 
1269 	/*
1270 	 * Give up if the start of the extent is busy, or the freespace isn't
1271 	 * long enough for the minimum request.
1272 	 */
1273 	if (tbno > args->agbno)
1274 		goto not_found;
1275 	if (tlen < args->minlen)
1276 		goto not_found;
1277 	tend = tbno + tlen;
1278 	if (tend < args->agbno + args->minlen)
1279 		goto not_found;
1280 
1281 	/*
1282 	 * End of extent will be smaller of the freespace end and the
1283 	 * maximal requested end.
1284 	 *
1285 	 * Fix the length according to mod and prod if given.
1286 	 */
1287 	args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
1288 						- args->agbno;
1289 	xfs_alloc_fix_len(args);
1290 	ASSERT(args->agbno + args->len <= tend);
1291 
1292 	/*
1293 	 * We are allocating agbno for args->len
1294 	 * Allocate/initialize a cursor for the by-size btree.
1295 	 */
1296 	cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp,
1297 					args->pag);
1298 	ASSERT(xfs_verify_agbext(args->pag, args->agbno, args->len));
1299 	error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
1300 				      args->len, XFSA_FIXUP_BNO_OK);
1301 	if (error) {
1302 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1303 		goto error0;
1304 	}
1305 
1306 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1307 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1308 
1309 	args->wasfromfl = 0;
1310 	trace_xfs_alloc_exact_done(args);
1311 	return 0;
1312 
1313 not_found:
1314 	/* Didn't find it, return null. */
1315 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1316 	args->agbno = NULLAGBLOCK;
1317 	trace_xfs_alloc_exact_notfound(args);
1318 	return 0;
1319 
1320 error0:
1321 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1322 	trace_xfs_alloc_exact_error(args);
1323 	return error;
1324 }
1325 
1326 /*
1327  * Search a given number of btree records in a given direction. Check each
1328  * record against the good extent we've already found.
1329  */
1330 STATIC int
1331 xfs_alloc_walk_iter(
1332 	struct xfs_alloc_arg	*args,
1333 	struct xfs_alloc_cur	*acur,
1334 	struct xfs_btree_cur	*cur,
1335 	bool			increment,
1336 	bool			find_one, /* quit on first candidate */
1337 	int			count,    /* rec count (-1 for infinite) */
1338 	int			*stat)
1339 {
1340 	int			error;
1341 	int			i;
1342 
1343 	*stat = 0;
1344 
1345 	/*
1346 	 * Search so long as the cursor is active or we find a better extent.
1347 	 * The cursor is deactivated if it extends beyond the range of the
1348 	 * current allocation candidate.
1349 	 */
1350 	while (xfs_alloc_cur_active(cur) && count) {
1351 		error = xfs_alloc_cur_check(args, acur, cur, &i);
1352 		if (error)
1353 			return error;
1354 		if (i == 1) {
1355 			*stat = 1;
1356 			if (find_one)
1357 				break;
1358 		}
1359 		if (!xfs_alloc_cur_active(cur))
1360 			break;
1361 
1362 		if (increment)
1363 			error = xfs_btree_increment(cur, 0, &i);
1364 		else
1365 			error = xfs_btree_decrement(cur, 0, &i);
1366 		if (error)
1367 			return error;
1368 		if (i == 0)
1369 			cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
1370 
1371 		if (count > 0)
1372 			count--;
1373 	}
1374 
1375 	return 0;
1376 }
1377 
1378 /*
1379  * Search the by-bno and by-size btrees in parallel in search of an extent with
1380  * ideal locality based on the NEAR mode ->agbno locality hint.
1381  */
1382 STATIC int
1383 xfs_alloc_ag_vextent_locality(
1384 	struct xfs_alloc_arg	*args,
1385 	struct xfs_alloc_cur	*acur,
1386 	int			*stat)
1387 {
1388 	struct xfs_btree_cur	*fbcur = NULL;
1389 	int			error;
1390 	int			i;
1391 	bool			fbinc;
1392 
1393 	ASSERT(acur->len == 0);
1394 
1395 	*stat = 0;
1396 
1397 	error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
1398 	if (error)
1399 		return error;
1400 	error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
1401 	if (error)
1402 		return error;
1403 	error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
1404 	if (error)
1405 		return error;
1406 
1407 	/*
1408 	 * Search the bnobt and cntbt in parallel. Search the bnobt left and
1409 	 * right and lookup the closest extent to the locality hint for each
1410 	 * extent size key in the cntbt. The entire search terminates
1411 	 * immediately on a bnobt hit because that means we've found best case
1412 	 * locality. Otherwise the search continues until the cntbt cursor runs
1413 	 * off the end of the tree. If no allocation candidate is found at this
1414 	 * point, give up on locality, walk backwards from the end of the cntbt
1415 	 * and take the first available extent.
1416 	 *
1417 	 * The parallel tree searches balance each other out to provide fairly
1418 	 * consistent performance for various situations. The bnobt search can
1419 	 * have pathological behavior in the worst case scenario of larger
1420 	 * allocation requests and fragmented free space. On the other hand, the
1421 	 * bnobt is able to satisfy most smaller allocation requests much more
1422 	 * quickly than the cntbt. The cntbt search can sift through fragmented
1423 	 * free space and sets of free extents for larger allocation requests
1424 	 * more quickly than the bnobt. Since the locality hint is just a hint
1425 	 * and we don't want to scan the entire bnobt for perfect locality, the
1426 	 * cntbt search essentially bounds the bnobt search such that we can
1427 	 * find good enough locality at reasonable performance in most cases.
1428 	 */
1429 	while (xfs_alloc_cur_active(acur->bnolt) ||
1430 	       xfs_alloc_cur_active(acur->bnogt) ||
1431 	       xfs_alloc_cur_active(acur->cnt)) {
1432 
1433 		trace_xfs_alloc_cur_lookup(args);
1434 
1435 		/*
1436 		 * Search the bnobt left and right. In the case of a hit, finish
1437 		 * the search in the opposite direction and we're done.
1438 		 */
1439 		error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
1440 					    true, 1, &i);
1441 		if (error)
1442 			return error;
1443 		if (i == 1) {
1444 			trace_xfs_alloc_cur_left(args);
1445 			fbcur = acur->bnogt;
1446 			fbinc = true;
1447 			break;
1448 		}
1449 		error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
1450 					    1, &i);
1451 		if (error)
1452 			return error;
1453 		if (i == 1) {
1454 			trace_xfs_alloc_cur_right(args);
1455 			fbcur = acur->bnolt;
1456 			fbinc = false;
1457 			break;
1458 		}
1459 
1460 		/*
1461 		 * Check the extent with best locality based on the current
1462 		 * extent size search key and keep track of the best candidate.
1463 		 */
1464 		error = xfs_alloc_cntbt_iter(args, acur);
1465 		if (error)
1466 			return error;
1467 		if (!xfs_alloc_cur_active(acur->cnt)) {
1468 			trace_xfs_alloc_cur_lookup_done(args);
1469 			break;
1470 		}
1471 	}
1472 
1473 	/*
1474 	 * If we failed to find anything due to busy extents, return empty
1475 	 * handed so the caller can flush and retry. If no busy extents were
1476 	 * found, walk backwards from the end of the cntbt as a last resort.
1477 	 */
1478 	if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
1479 		error = xfs_btree_decrement(acur->cnt, 0, &i);
1480 		if (error)
1481 			return error;
1482 		if (i) {
1483 			acur->cnt->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE;
1484 			fbcur = acur->cnt;
1485 			fbinc = false;
1486 		}
1487 	}
1488 
1489 	/*
1490 	 * Search in the opposite direction for a better entry in the case of
1491 	 * a bnobt hit or walk backwards from the end of the cntbt.
1492 	 */
1493 	if (fbcur) {
1494 		error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
1495 					    &i);
1496 		if (error)
1497 			return error;
1498 	}
1499 
1500 	if (acur->len)
1501 		*stat = 1;
1502 
1503 	return 0;
1504 }
1505 
1506 /* Check the last block of the cnt btree for allocations. */
1507 static int
1508 xfs_alloc_ag_vextent_lastblock(
1509 	struct xfs_alloc_arg	*args,
1510 	struct xfs_alloc_cur	*acur,
1511 	xfs_agblock_t		*bno,
1512 	xfs_extlen_t		*len,
1513 	bool			*allocated)
1514 {
1515 	int			error;
1516 	int			i;
1517 
1518 #ifdef DEBUG
1519 	/* Randomly don't execute the first algorithm. */
1520 	if (get_random_u32_below(2))
1521 		return 0;
1522 #endif
1523 
1524 	/*
1525 	 * Start from the entry that lookup found, sequence through all larger
1526 	 * free blocks.  If we're actually pointing at a record smaller than
1527 	 * maxlen, go to the start of this block, and skip all those smaller
1528 	 * than minlen.
1529 	 */
1530 	if (*len || args->alignment > 1) {
1531 		acur->cnt->bc_levels[0].ptr = 1;
1532 		do {
1533 			error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
1534 			if (error)
1535 				return error;
1536 			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1537 				xfs_btree_mark_sick(acur->cnt);
1538 				return -EFSCORRUPTED;
1539 			}
1540 			if (*len >= args->minlen)
1541 				break;
1542 			error = xfs_btree_increment(acur->cnt, 0, &i);
1543 			if (error)
1544 				return error;
1545 		} while (i);
1546 		ASSERT(*len >= args->minlen);
1547 		if (!i)
1548 			return 0;
1549 	}
1550 
1551 	error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
1552 	if (error)
1553 		return error;
1554 
1555 	/*
1556 	 * It didn't work.  We COULD be in a case where there's a good record
1557 	 * somewhere, so try again.
1558 	 */
1559 	if (acur->len == 0)
1560 		return 0;
1561 
1562 	trace_xfs_alloc_near_first(args);
1563 	*allocated = true;
1564 	return 0;
1565 }
1566 
1567 /*
1568  * Allocate a variable extent near bno in the allocation group agno.
1569  * Extent's length (returned in len) will be between minlen and maxlen,
1570  * and of the form k * prod + mod unless there's nothing that large.
1571  * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1572  */
1573 STATIC int
1574 xfs_alloc_ag_vextent_near(
1575 	struct xfs_alloc_arg	*args,
1576 	uint32_t		alloc_flags)
1577 {
1578 	struct xfs_alloc_cur	acur = {};
1579 	int			error;		/* error code */
1580 	int			i;		/* result code, temporary */
1581 	xfs_agblock_t		bno;
1582 	xfs_extlen_t		len;
1583 
1584 	/* handle uninitialized agbno range so caller doesn't have to */
1585 	if (!args->min_agbno && !args->max_agbno)
1586 		args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
1587 	ASSERT(args->min_agbno <= args->max_agbno);
1588 
1589 	/* clamp agbno to the range if it's outside */
1590 	if (args->agbno < args->min_agbno)
1591 		args->agbno = args->min_agbno;
1592 	if (args->agbno > args->max_agbno)
1593 		args->agbno = args->max_agbno;
1594 
1595 	/* Retry once quickly if we find busy extents before blocking. */
1596 	alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH;
1597 restart:
1598 	len = 0;
1599 
1600 	/*
1601 	 * Set up cursors and see if there are any free extents as big as
1602 	 * maxlen. If not, pick the last entry in the tree unless the tree is
1603 	 * empty.
1604 	 */
1605 	error = xfs_alloc_cur_setup(args, &acur);
1606 	if (error == -ENOSPC) {
1607 		error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
1608 				&len, &i);
1609 		if (error)
1610 			goto out;
1611 		if (i == 0 || len == 0) {
1612 			trace_xfs_alloc_near_noentry(args);
1613 			goto out;
1614 		}
1615 		ASSERT(i == 1);
1616 	} else if (error) {
1617 		goto out;
1618 	}
1619 
1620 	/*
1621 	 * First algorithm.
1622 	 * If the requested extent is large wrt the freespaces available
1623 	 * in this a.g., then the cursor will be pointing to a btree entry
1624 	 * near the right edge of the tree.  If it's in the last btree leaf
1625 	 * block, then we just examine all the entries in that block
1626 	 * that are big enough, and pick the best one.
1627 	 */
1628 	if (xfs_btree_islastblock(acur.cnt, 0)) {
1629 		bool		allocated = false;
1630 
1631 		error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
1632 				&allocated);
1633 		if (error)
1634 			goto out;
1635 		if (allocated)
1636 			goto alloc_finish;
1637 	}
1638 
1639 	/*
1640 	 * Second algorithm. Combined cntbt and bnobt search to find ideal
1641 	 * locality.
1642 	 */
1643 	error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
1644 	if (error)
1645 		goto out;
1646 
1647 	/*
1648 	 * If we couldn't get anything, give up.
1649 	 */
1650 	if (!acur.len) {
1651 		if (acur.busy) {
1652 			/*
1653 			 * Our only valid extents must have been busy. Flush and
1654 			 * retry the allocation again. If we get an -EAGAIN
1655 			 * error, we're being told that a deadlock was avoided
1656 			 * and the current transaction needs committing before
1657 			 * the allocation can be retried.
1658 			 */
1659 			trace_xfs_alloc_near_busy(args);
1660 			error = xfs_extent_busy_flush(args->tp, args->pag,
1661 					acur.busy_gen, alloc_flags);
1662 			if (error)
1663 				goto out;
1664 
1665 			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
1666 			goto restart;
1667 		}
1668 		trace_xfs_alloc_size_neither(args);
1669 		args->agbno = NULLAGBLOCK;
1670 		goto out;
1671 	}
1672 
1673 alloc_finish:
1674 	/* fix up btrees on a successful allocation */
1675 	error = xfs_alloc_cur_finish(args, &acur);
1676 
1677 out:
1678 	xfs_alloc_cur_close(&acur, error);
1679 	return error;
1680 }
1681 
1682 /*
1683  * Allocate a variable extent anywhere in the allocation group agno.
1684  * Extent's length (returned in len) will be between minlen and maxlen,
1685  * and of the form k * prod + mod unless there's nothing that large.
1686  * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1687  */
1688 static int
1689 xfs_alloc_ag_vextent_size(
1690 	struct xfs_alloc_arg	*args,
1691 	uint32_t		alloc_flags)
1692 {
1693 	struct xfs_agf		*agf = args->agbp->b_addr;
1694 	struct xfs_btree_cur	*bno_cur;
1695 	struct xfs_btree_cur	*cnt_cur;
1696 	xfs_agblock_t		fbno;		/* start of found freespace */
1697 	xfs_extlen_t		flen;		/* length of found freespace */
1698 	xfs_agblock_t		rbno;		/* returned block number */
1699 	xfs_extlen_t		rlen;		/* length of returned extent */
1700 	bool			busy;
1701 	unsigned		busy_gen;
1702 	int			error;
1703 	int			i;
1704 
1705 	/* Retry once quickly if we find busy extents before blocking. */
1706 	alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH;
1707 restart:
1708 	/*
1709 	 * Allocate and initialize a cursor for the by-size btree.
1710 	 */
1711 	cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp,
1712 					args->pag);
1713 	bno_cur = NULL;
1714 
1715 	/*
1716 	 * Look for an entry >= maxlen+alignment-1 blocks.
1717 	 */
1718 	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
1719 			args->maxlen + args->alignment - 1, &i)))
1720 		goto error0;
1721 
1722 	/*
1723 	 * If none, then we have to settle for a smaller extent. In the case that
1724 	 * there are no large extents, this will return the last entry in the
1725 	 * tree unless the tree is empty. In the case that there are only busy
1726 	 * large extents, this will return the largest small extent unless there
1727 	 * are no smaller extents available.
1728 	 */
1729 	if (!i) {
1730 		error = xfs_alloc_ag_vextent_small(args, cnt_cur,
1731 						   &fbno, &flen, &i);
1732 		if (error)
1733 			goto error0;
1734 		if (i == 0 || flen == 0) {
1735 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1736 			trace_xfs_alloc_size_noentry(args);
1737 			return 0;
1738 		}
1739 		ASSERT(i == 1);
1740 		busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
1741 				&rlen, &busy_gen);
1742 	} else {
1743 		/*
1744 		 * Search for a non-busy extent that is large enough.
1745 		 */
1746 		for (;;) {
1747 			error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
1748 			if (error)
1749 				goto error0;
1750 			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1751 				xfs_btree_mark_sick(cnt_cur);
1752 				error = -EFSCORRUPTED;
1753 				goto error0;
1754 			}
1755 
1756 			busy = xfs_alloc_compute_aligned(args, fbno, flen,
1757 					&rbno, &rlen, &busy_gen);
1758 
1759 			if (rlen >= args->maxlen)
1760 				break;
1761 
1762 			error = xfs_btree_increment(cnt_cur, 0, &i);
1763 			if (error)
1764 				goto error0;
1765 			if (i)
1766 				continue;
1767 
1768 			/*
1769 			 * Our only valid extents must have been busy. Flush and
1770 			 * retry the allocation again. If we get an -EAGAIN
1771 			 * error, we're being told that a deadlock was avoided
1772 			 * and the current transaction needs committing before
1773 			 * the allocation can be retried.
1774 			 */
1775 			trace_xfs_alloc_size_busy(args);
1776 			error = xfs_extent_busy_flush(args->tp, args->pag,
1777 					busy_gen, alloc_flags);
1778 			if (error)
1779 				goto error0;
1780 
1781 			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
1782 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1783 			goto restart;
1784 		}
1785 	}
1786 
1787 	/*
1788 	 * In the first case above, we got the last entry in the
1789 	 * by-size btree.  Now we check to see if the space hits maxlen
1790 	 * once aligned; if not, we search left for something better.
1791 	 * This can't happen in the second case above.
1792 	 */
1793 	rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1794 	if (XFS_IS_CORRUPT(args->mp,
1795 			   rlen != 0 &&
1796 			   (rlen > flen ||
1797 			    rbno + rlen > fbno + flen))) {
1798 		xfs_btree_mark_sick(cnt_cur);
1799 		error = -EFSCORRUPTED;
1800 		goto error0;
1801 	}
1802 	if (rlen < args->maxlen) {
1803 		xfs_agblock_t	bestfbno;
1804 		xfs_extlen_t	bestflen;
1805 		xfs_agblock_t	bestrbno;
1806 		xfs_extlen_t	bestrlen;
1807 
1808 		bestrlen = rlen;
1809 		bestrbno = rbno;
1810 		bestflen = flen;
1811 		bestfbno = fbno;
1812 		for (;;) {
1813 			if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
1814 				goto error0;
1815 			if (i == 0)
1816 				break;
1817 			if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
1818 					&i)))
1819 				goto error0;
1820 			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1821 				xfs_btree_mark_sick(cnt_cur);
1822 				error = -EFSCORRUPTED;
1823 				goto error0;
1824 			}
1825 			if (flen < bestrlen)
1826 				break;
1827 			busy = xfs_alloc_compute_aligned(args, fbno, flen,
1828 					&rbno, &rlen, &busy_gen);
1829 			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1830 			if (XFS_IS_CORRUPT(args->mp,
1831 					   rlen != 0 &&
1832 					   (rlen > flen ||
1833 					    rbno + rlen > fbno + flen))) {
1834 				xfs_btree_mark_sick(cnt_cur);
1835 				error = -EFSCORRUPTED;
1836 				goto error0;
1837 			}
1838 			if (rlen > bestrlen) {
1839 				bestrlen = rlen;
1840 				bestrbno = rbno;
1841 				bestflen = flen;
1842 				bestfbno = fbno;
1843 				if (rlen == args->maxlen)
1844 					break;
1845 			}
1846 		}
1847 		if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
1848 				&i)))
1849 			goto error0;
1850 		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1851 			xfs_btree_mark_sick(cnt_cur);
1852 			error = -EFSCORRUPTED;
1853 			goto error0;
1854 		}
1855 		rlen = bestrlen;
1856 		rbno = bestrbno;
1857 		flen = bestflen;
1858 		fbno = bestfbno;
1859 	}
1860 	args->wasfromfl = 0;
1861 	/*
1862 	 * Fix up the length.
1863 	 */
1864 	args->len = rlen;
1865 	if (rlen < args->minlen) {
1866 		if (busy) {
1867 			/*
1868 			 * Our only valid extents must have been busy. Flush and
1869 			 * retry the allocation again. If we get an -EAGAIN
1870 			 * error, we're being told that a deadlock was avoided
1871 			 * and the current transaction needs committing before
1872 			 * the allocation can be retried.
1873 			 */
1874 			trace_xfs_alloc_size_busy(args);
1875 			error = xfs_extent_busy_flush(args->tp, args->pag,
1876 					busy_gen, alloc_flags);
1877 			if (error)
1878 				goto error0;
1879 
1880 			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
1881 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1882 			goto restart;
1883 		}
1884 		goto out_nominleft;
1885 	}
1886 	xfs_alloc_fix_len(args);
1887 
1888 	rlen = args->len;
1889 	if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
1890 		xfs_btree_mark_sick(cnt_cur);
1891 		error = -EFSCORRUPTED;
1892 		goto error0;
1893 	}
1894 	/*
1895 	 * Allocate and initialize a cursor for the by-block tree.
1896 	 */
1897 	bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp,
1898 					args->pag);
1899 	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
1900 			rbno, rlen, XFSA_FIXUP_CNT_OK)))
1901 		goto error0;
1902 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1903 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1904 	cnt_cur = bno_cur = NULL;
1905 	args->len = rlen;
1906 	args->agbno = rbno;
1907 	if (XFS_IS_CORRUPT(args->mp,
1908 			   args->agbno + args->len >
1909 			   be32_to_cpu(agf->agf_length))) {
1910 		xfs_ag_mark_sick(args->pag, XFS_SICK_AG_BNOBT);
1911 		error = -EFSCORRUPTED;
1912 		goto error0;
1913 	}
1914 	trace_xfs_alloc_size_done(args);
1915 	return 0;
1916 
1917 error0:
1918 	trace_xfs_alloc_size_error(args);
1919 	if (cnt_cur)
1920 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1921 	if (bno_cur)
1922 		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1923 	return error;
1924 
1925 out_nominleft:
1926 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1927 	trace_xfs_alloc_size_nominleft(args);
1928 	args->agbno = NULLAGBLOCK;
1929 	return 0;
1930 }
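
/*
 * Illustrative sketch only, not used by the allocator: trim a candidate
 * length down to the largest value of the "k * prod + mod" form described
 * above.  The helper name is hypothetical; the real adjustment is performed
 * on the args structure by xfs_alloc_fix_len(), which also enforces minlen.
 */
static inline xfs_extlen_t
xfs_alloc_fix_len_example(
	xfs_extlen_t	len,	/* candidate length */
	xfs_extlen_t	prod,	/* requested multiple */
	xfs_extlen_t	mod)	/* requested remainder, mod < prod */
{
	xfs_extlen_t	rem = len % prod;

	if (rem == mod)
		return len;
	if (rem > mod)				/* e.g. len 18, prod 4, mod 1 -> 17 */
		return len - (rem - mod);
	return len - (rem + prod - mod);	/* e.g. len 16, prod 4, mod 3 -> 15 */
}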
1931 
1932 /*
1933  * Free the extent starting at agno/bno for length.
1934  */
1935 STATIC int
1936 xfs_free_ag_extent(
1937 	struct xfs_trans		*tp,
1938 	struct xfs_buf			*agbp,
1939 	xfs_agnumber_t			agno,
1940 	xfs_agblock_t			bno,
1941 	xfs_extlen_t			len,
1942 	const struct xfs_owner_info	*oinfo,
1943 	enum xfs_ag_resv_type		type)
1944 {
1945 	struct xfs_mount		*mp;
1946 	struct xfs_btree_cur		*bno_cur;
1947 	struct xfs_btree_cur		*cnt_cur;
1948 	xfs_agblock_t			gtbno; /* start of right neighbor */
1949 	xfs_extlen_t			gtlen; /* length of right neighbor */
1950 	xfs_agblock_t			ltbno; /* start of left neighbor */
1951 	xfs_extlen_t			ltlen; /* length of left neighbor */
1952 	xfs_agblock_t			nbno; /* new starting block of freesp */
1953 	xfs_extlen_t			nlen; /* new length of freespace */
1954 	int				haveleft; /* have a left neighbor */
1955 	int				haveright; /* have a right neighbor */
1956 	int				i;
1957 	int				error;
1958 	struct xfs_perag		*pag = agbp->b_pag;
1959 
1960 	bno_cur = cnt_cur = NULL;
1961 	mp = tp->t_mountp;
1962 
1963 	if (!xfs_rmap_should_skip_owner_update(oinfo)) {
1964 		error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo);
1965 		if (error)
1966 			goto error0;
1967 	}
1968 
1969 	/*
1970 	 * Allocate and initialize a cursor for the by-block btree.
1971 	 */
1972 	bno_cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
1973 	/*
1974 	 * Look for a neighboring block on the left (lower block numbers)
1975 	 * that is contiguous with this space.
1976 	 */
1977 	if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
1978 		goto error0;
1979 	if (haveleft) {
1980 		/*
1981 		 * There is a block to our left.
1982 		 */
1983 		if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
1984 			goto error0;
1985 		if (XFS_IS_CORRUPT(mp, i != 1)) {
1986 			xfs_btree_mark_sick(bno_cur);
1987 			error = -EFSCORRUPTED;
1988 			goto error0;
1989 		}
1990 		/*
1991 		 * It's not contiguous, though.
1992 		 */
1993 		if (ltbno + ltlen < bno)
1994 			haveleft = 0;
1995 		else {
1996 			/*
1997 			 * If this failure happens the request to free this
1998 			 * space was invalid, it's (partly) already free.
1999 			 * Very bad.
2000 			 */
2001 			if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
2002 				xfs_btree_mark_sick(bno_cur);
2003 				error = -EFSCORRUPTED;
2004 				goto error0;
2005 			}
2006 		}
2007 	}
2008 	/*
2009 	 * Look for a neighboring block on the right (higher block numbers)
2010 	 * that is contiguous with this space.
2011 	 */
2012 	if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
2013 		goto error0;
2014 	if (haveright) {
2015 		/*
2016 		 * There is a block to our right.
2017 		 */
2018 		if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
2019 			goto error0;
2020 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2021 			xfs_btree_mark_sick(bno_cur);
2022 			error = -EFSCORRUPTED;
2023 			goto error0;
2024 		}
2025 		/*
2026 		 * It's not contiguous, though.
2027 		 */
2028 		if (bno + len < gtbno)
2029 			haveright = 0;
2030 		else {
2031 			/*
2032 			 * If this failure happens the request to free this
2033 			 * space was invalid, it's (partly) already free.
2034 			 * Very bad.
2035 			 */
2036 			if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
2037 				xfs_btree_mark_sick(bno_cur);
2038 				error = -EFSCORRUPTED;
2039 				goto error0;
2040 			}
2041 		}
2042 	}
2043 	/*
2044 	 * Now allocate and initialize a cursor for the by-size tree.
2045 	 */
2046 	cnt_cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
2047 	/*
2048 	 * Have both left and right contiguous neighbors.
2049 	 * Merge all three into a single free block.
2050 	 */
2051 	if (haveleft && haveright) {
2052 		/*
2053 		 * Delete the old by-size entry on the left.
2054 		 */
2055 		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2056 			goto error0;
2057 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2058 			xfs_btree_mark_sick(cnt_cur);
2059 			error = -EFSCORRUPTED;
2060 			goto error0;
2061 		}
2062 		if ((error = xfs_btree_delete(cnt_cur, &i)))
2063 			goto error0;
2064 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2065 			xfs_btree_mark_sick(cnt_cur);
2066 			error = -EFSCORRUPTED;
2067 			goto error0;
2068 		}
2069 		/*
2070 		 * Delete the old by-size entry on the right.
2071 		 */
2072 		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2073 			goto error0;
2074 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2075 			xfs_btree_mark_sick(cnt_cur);
2076 			error = -EFSCORRUPTED;
2077 			goto error0;
2078 		}
2079 		if ((error = xfs_btree_delete(cnt_cur, &i)))
2080 			goto error0;
2081 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2082 			xfs_btree_mark_sick(cnt_cur);
2083 			error = -EFSCORRUPTED;
2084 			goto error0;
2085 		}
2086 		/*
2087 		 * Delete the old by-block entry for the right block.
2088 		 */
2089 		if ((error = xfs_btree_delete(bno_cur, &i)))
2090 			goto error0;
2091 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2092 			xfs_btree_mark_sick(bno_cur);
2093 			error = -EFSCORRUPTED;
2094 			goto error0;
2095 		}
2096 		/*
2097 		 * Move the by-block cursor back to the left neighbor.
2098 		 */
2099 		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2100 			goto error0;
2101 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2102 			xfs_btree_mark_sick(bno_cur);
2103 			error = -EFSCORRUPTED;
2104 			goto error0;
2105 		}
2106 #ifdef DEBUG
2107 		/*
2108 		 * Check that this is the right record: delete didn't
2109 		 * mangle the cursor.
2110 		 */
2111 		{
2112 			xfs_agblock_t	xxbno;
2113 			xfs_extlen_t	xxlen;
2114 
2115 			if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
2116 					&i)))
2117 				goto error0;
2118 			if (XFS_IS_CORRUPT(mp,
2119 					   i != 1 ||
2120 					   xxbno != ltbno ||
2121 					   xxlen != ltlen)) {
2122 				xfs_btree_mark_sick(bno_cur);
2123 				error = -EFSCORRUPTED;
2124 				goto error0;
2125 			}
2126 		}
2127 #endif
2128 		/*
2129 		 * Update remaining by-block entry to the new, joined block.
2130 		 */
2131 		nbno = ltbno;
2132 		nlen = len + ltlen + gtlen;
2133 		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2134 			goto error0;
2135 	}
2136 	/*
2137 	 * Have only a left contiguous neighbor.
2138 	 * Merge it together with the new freespace.
2139 	 */
2140 	else if (haveleft) {
2141 		/*
2142 		 * Delete the old by-size entry on the left.
2143 		 */
2144 		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2145 			goto error0;
2146 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2147 			xfs_btree_mark_sick(cnt_cur);
2148 			error = -EFSCORRUPTED;
2149 			goto error0;
2150 		}
2151 		if ((error = xfs_btree_delete(cnt_cur, &i)))
2152 			goto error0;
2153 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2154 			xfs_btree_mark_sick(cnt_cur);
2155 			error = -EFSCORRUPTED;
2156 			goto error0;
2157 		}
2158 		/*
2159 		 * Back up the by-block cursor to the left neighbor, and
2160 		 * update its length.
2161 		 */
2162 		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2163 			goto error0;
2164 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2165 			xfs_btree_mark_sick(bno_cur);
2166 			error = -EFSCORRUPTED;
2167 			goto error0;
2168 		}
2169 		nbno = ltbno;
2170 		nlen = len + ltlen;
2171 		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2172 			goto error0;
2173 	}
2174 	/*
2175 	 * Have only a right contiguous neighbor.
2176 	 * Merge it together with the new freespace.
2177 	 */
2178 	else if (haveright) {
2179 		/*
2180 		 * Delete the old by-size entry on the right.
2181 		 */
2182 		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2183 			goto error0;
2184 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2185 			xfs_btree_mark_sick(cnt_cur);
2186 			error = -EFSCORRUPTED;
2187 			goto error0;
2188 		}
2189 		if ((error = xfs_btree_delete(cnt_cur, &i)))
2190 			goto error0;
2191 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2192 			xfs_btree_mark_sick(cnt_cur);
2193 			error = -EFSCORRUPTED;
2194 			goto error0;
2195 		}
2196 		/*
2197 		 * Update the starting block and length of the right
2198 		 * neighbor in the by-block tree.
2199 		 */
2200 		nbno = bno;
2201 		nlen = len + gtlen;
2202 		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2203 			goto error0;
2204 	}
2205 	/*
2206 	 * No contiguous neighbors.
2207 	 * Insert the new freespace into the by-block tree.
2208 	 */
2209 	else {
2210 		nbno = bno;
2211 		nlen = len;
2212 		if ((error = xfs_btree_insert(bno_cur, &i)))
2213 			goto error0;
2214 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2215 			xfs_btree_mark_sick(bno_cur);
2216 			error = -EFSCORRUPTED;
2217 			goto error0;
2218 		}
2219 	}
2220 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
2221 	bno_cur = NULL;
2222 	/*
2223 	 * In all cases we need to insert the new freespace in the by-size tree.
2224 	 */
2225 	if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
2226 		goto error0;
2227 	if (XFS_IS_CORRUPT(mp, i != 0)) {
2228 		xfs_btree_mark_sick(cnt_cur);
2229 		error = -EFSCORRUPTED;
2230 		goto error0;
2231 	}
2232 	if ((error = xfs_btree_insert(cnt_cur, &i)))
2233 		goto error0;
2234 	if (XFS_IS_CORRUPT(mp, i != 1)) {
2235 		xfs_btree_mark_sick(cnt_cur);
2236 		error = -EFSCORRUPTED;
2237 		goto error0;
2238 	}
2239 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
2240 	cnt_cur = NULL;
2241 
2242 	/*
2243 	 * Update the freespace totals in the ag and superblock.
2244 	 */
2245 	error = xfs_alloc_update_counters(tp, agbp, len);
2246 	xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
2247 	if (error)
2248 		goto error0;
2249 
2250 	XFS_STATS_INC(mp, xs_freex);
2251 	XFS_STATS_ADD(mp, xs_freeb, len);
2252 
2253 	trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);
2254 
2255 	return 0;
2256 
2257  error0:
2258 	trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
2259 	if (bno_cur)
2260 		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
2261 	if (cnt_cur)
2262 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
2263 	return error;
2264 }
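
/*
 * Worked example for the merge logic above (illustrative numbers): freeing
 * [bno = 100, len = 10] with a left neighbor record [90, 10] and a right
 * neighbor record [110, 20] deletes both neighbors' by-size records and the
 * right neighbor's by-block record, rewrites the left by-block record as a
 * single [90, 40] extent, and then inserts that [90, 40] record into the
 * by-size tree.
 */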
2265 
2266 /*
2267  * Visible (exported) allocation/free functions.
2268  * Some of these are used just by xfs_alloc_btree.c and this file.
2269  */
2270 
2271 /*
2272  * Compute and fill in value of m_alloc_maxlevels.
2273  */
2274 void
2275 xfs_alloc_compute_maxlevels(
2276 	xfs_mount_t	*mp)	/* file system mount structure */
2277 {
2278 	mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
2279 			(mp->m_sb.sb_agblocks + 1) / 2);
2280 	ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk());
2281 }
2282 
2283 /*
2284  * Find the length of the longest extent in an AG.  The 'need' parameter
2285  * specifies how much space we're going to need for the AGFL and the
2286  * 'reserved' parameter tells us how many blocks in this AG are reserved for
2287  * other callers.
2288  */
2289 xfs_extlen_t
2290 xfs_alloc_longest_free_extent(
2291 	struct xfs_perag	*pag,
2292 	xfs_extlen_t		need,
2293 	xfs_extlen_t		reserved)
2294 {
2295 	xfs_extlen_t		delta = 0;
2296 
2297 	/*
2298 	 * If the AGFL needs a recharge, we'll have to subtract that from the
2299 	 * longest extent.
2300 	 */
2301 	if (need > pag->pagf_flcount)
2302 		delta = need - pag->pagf_flcount;
2303 
2304 	/*
2305 	 * If we cannot maintain others' reservations with space from the
2306 	 * not-longest freesp extents, we'll have to subtract /that/ from
2307 	 * the longest extent too.
2308 	 */
2309 	if (pag->pagf_freeblks - pag->pagf_longest < reserved)
2310 		delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
2311 
2312 	/*
2313 	 * If the longest extent is long enough to satisfy all the
2314 	 * reservations and AGFL rules in place, we can return this extent.
2315 	 */
2316 	if (pag->pagf_longest > delta)
2317 		return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable,
2318 				pag->pagf_longest - delta);
2319 
2320 	/* Otherwise, let the caller try for 1 block if there's space. */
2321 	return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
2322 }
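
/*
 * Worked example for the calculation above (illustrative numbers): with
 * need = 6, pagf_flcount = 2, reserved = 15, pagf_freeblks = 100 and
 * pagf_longest = 90, the AGFL shortfall adds 6 - 2 = 4 blocks to delta and
 * the reservation shortfall adds 15 - (100 - 90) = 5 more, so the usable
 * longest extent is 90 - 9 = 81 blocks (further capped by m_ag_max_usable).
 */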
2323 
2324 /*
2325  * Compute the minimum length of the AGFL in the given AG.  If @pag is NULL,
2326  * return the largest possible minimum length.
2327  */
2328 unsigned int
2329 xfs_alloc_min_freelist(
2330 	struct xfs_mount	*mp,
2331 	struct xfs_perag	*pag)
2332 {
2333 	/* AG btrees have at least 1 level. */
2334 	const unsigned int	bno_level = pag ? pag->pagf_bno_level : 1;
2335 	const unsigned int	cnt_level = pag ? pag->pagf_cnt_level : 1;
2336 	const unsigned int	rmap_level = pag ? pag->pagf_rmap_level : 1;
2337 	unsigned int		min_free;
2338 
2339 	ASSERT(mp->m_alloc_maxlevels > 0);
2340 
2341 	/*
2342 	 * For a btree shorter than the maximum height, the worst case is that
2343 	 * every level gets split and a new level is added, then while inserting
2344 	 * another entry to refill the AGFL, every level under the old root gets
2345 	 * split again. This is:
2346 	 *
2347 	 *   (full height split reservation) + (AGFL refill split height)
2348 	 * = (current height + 1) + (current height - 1)
2349 	 * = (new height) + (new height - 2)
2350 	 * = 2 * new height - 2
2351 	 *
2352 	 * For a btree of maximum height, the worst case is that every level
2353 	 * under the root gets split, then while inserting another entry to
2354 	 * refill the AGFL, every level under the root gets split again. This is
2355 	 * also:
2356 	 *
2357 	 *   2 * (current height - 1)
2358 	 * = 2 * (new height - 1)
2359 	 * = 2 * new height - 2
2360 	 */
2361 
2362 	/* space needed by-bno freespace btree */
2363 	min_free = min(bno_level + 1, mp->m_alloc_maxlevels) * 2 - 2;
2364 	/* space needed by-size freespace btree */
2365 	min_free += min(cnt_level + 1, mp->m_alloc_maxlevels) * 2 - 2;
2366 	/* space needed reverse mapping used space btree */
2367 	if (xfs_has_rmapbt(mp))
2368 		min_free += min(rmap_level + 1, mp->m_rmap_maxlevels) * 2 - 2;
2369 	return min_free;
2370 }
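
/*
 * Worked example for the formula above (illustrative numbers): with
 * pagf_bno_level = 2, pagf_cnt_level = 3 and m_alloc_maxlevels = 5, the
 * by-bno btree needs min(2 + 1, 5) * 2 - 2 = 4 blocks and the by-cnt btree
 * needs min(3 + 1, 5) * 2 - 2 = 6 blocks, so the minimum AGFL length is 10
 * blocks on a filesystem without the rmap btree.
 */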
2371 
2372 /*
2373  * Check if the operation we are fixing up the freelist for should go ahead or
2374  * not. If we are freeing blocks, we always allow it, otherwise the allocation
2375  * is dependent on whether the size and shape of free space available will
2376  * permit the requested allocation to take place.
2377  */
2378 static bool
2379 xfs_alloc_space_available(
2380 	struct xfs_alloc_arg	*args,
2381 	xfs_extlen_t		min_free,
2382 	int			flags)
2383 {
2384 	struct xfs_perag	*pag = args->pag;
2385 	xfs_extlen_t		alloc_len, longest;
2386 	xfs_extlen_t		reservation; /* blocks that are still reserved */
2387 	int			available;
2388 	xfs_extlen_t		agflcount;
2389 
2390 	if (flags & XFS_ALLOC_FLAG_FREEING)
2391 		return true;
2392 
2393 	reservation = xfs_ag_resv_needed(pag, args->resv);
2394 
2395 	/* do we have enough contiguous free space for the allocation? */
2396 	alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
2397 	longest = xfs_alloc_longest_free_extent(pag, min_free, reservation);
2398 	if (longest < alloc_len)
2399 		return false;
2400 
2401 	/*
2402 	 * Do we have enough free space remaining for the allocation? Don't
2403 	 * account extra agfl blocks because we are about to defer freeing them,
2404 	 * making them unavailable until the current transaction commits.
2405 	 */
2406 	agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free);
2407 	available = (int)(pag->pagf_freeblks + agflcount -
2408 			  reservation - min_free - args->minleft);
2409 	if (available < (int)max(args->total, alloc_len))
2410 		return false;
2411 
2412 	/*
2413 	 * Clamp maxlen to the amount of free space available for the actual
2414 	 * extent allocation.
2415 	 */
2416 	if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
2417 		args->maxlen = available;
2418 		ASSERT(args->maxlen > 0);
2419 		ASSERT(args->maxlen >= args->minlen);
2420 	}
2421 
2422 	return true;
2423 }
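
/*
 * Worked example for the checks above (illustrative numbers): a request with
 * minlen = 8, alignment = 4, minalignslop = 0, minleft = 0 and total = 20
 * needs a contiguous run of at least 8 + 3 = 11 blocks.  With
 * pagf_freeblks = 40, pagf_flcount = 3, min_free = 6 and no reservation,
 * agflcount = min(3, 6) = 3 and available = 40 + 3 - 6 = 37, which covers
 * max(20, 11), so the allocation may proceed in this AG.
 */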
2424 
2425 int
2426 xfs_free_agfl_block(
2427 	struct xfs_trans	*tp,
2428 	xfs_agnumber_t		agno,
2429 	xfs_agblock_t		agbno,
2430 	struct xfs_buf		*agbp,
2431 	struct xfs_owner_info	*oinfo)
2432 {
2433 	int			error;
2434 	struct xfs_buf		*bp;
2435 
2436 	error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
2437 				   XFS_AG_RESV_AGFL);
2438 	if (error)
2439 		return error;
2440 
2441 	error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp,
2442 			XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno),
2443 			tp->t_mountp->m_bsize, 0, &bp);
2444 	if (error)
2445 		return error;
2446 	xfs_trans_binval(tp, bp);
2447 
2448 	return 0;
2449 }
2450 
2451 /*
2452  * Check the agfl fields of the agf for inconsistency or corruption.
2453  *
2454  * The original purpose was to detect an agfl header padding mismatch between
2455  * current and early v5 kernels. This problem manifests as a 1-slot size
2456  * difference between the on-disk flcount and the active [first, last] range of
2457  * a wrapped agfl.
2458  *
2459  * However, we need to use these same checks to catch agfl count corruptions
2460  * unrelated to padding. This could occur on any v4 or v5 filesystem, so either
2461  * way, we need to reset the agfl and warn the user.
2462  *
2463  * Return true if a reset is required before the agfl can be used, false
2464  * otherwise.
2465  */
2466 static bool
2467 xfs_agfl_needs_reset(
2468 	struct xfs_mount	*mp,
2469 	struct xfs_agf		*agf)
2470 {
2471 	uint32_t		f = be32_to_cpu(agf->agf_flfirst);
2472 	uint32_t		l = be32_to_cpu(agf->agf_fllast);
2473 	uint32_t		c = be32_to_cpu(agf->agf_flcount);
2474 	int			agfl_size = xfs_agfl_size(mp);
2475 	int			active;
2476 
2477 	/*
2478 	 * The agf read verifier catches severe corruption of these fields.
2479 	 * Repeat some sanity checks to cover a packed -> unpacked mismatch if
2480 	 * the verifier allows it.
2481 	 */
2482 	if (f >= agfl_size || l >= agfl_size)
2483 		return true;
2484 	if (c > agfl_size)
2485 		return true;
2486 
2487 	/*
2488 	 * Check consistency between the on-disk count and the active range. An
2489 	 * agfl padding mismatch manifests as an inconsistent flcount.
2490 	 */
2491 	if (c && l >= f)
2492 		active = l - f + 1;
2493 	else if (c)
2494 		active = agfl_size - f + l + 1;
2495 	else
2496 		active = 0;
2497 
2498 	return active != c;
2499 }
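
/*
 * Worked example for the wrap handling above (illustrative numbers): with
 * xfs_agfl_size() = 118, agf_flfirst = 116, agf_fllast = 1 and
 * agf_flcount = 4, the active range wraps, so active = 118 - 116 + 1 + 1 = 4
 * and no reset is needed.  If a padding mismatch had shifted fllast to 2,
 * active would be 5, disagreeing with flcount, and the AGFL would be reset.
 */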
2500 
2501 /*
2502  * Reset the agfl to an empty state. Ignore/drop any existing blocks since the
2503  * agfl content cannot be trusted. Warn the user that a repair is required to
2504  * recover leaked blocks.
2505  *
2506  * The purpose of this mechanism is to handle filesystems affected by the agfl
2507  * header padding mismatch problem. A reset keeps the filesystem online with a
2508  * relatively minor free space accounting inconsistency rather than suffer the
2509  * inevitable crash from use of an invalid agfl block.
2510  */
2511 static void
2512 xfs_agfl_reset(
2513 	struct xfs_trans	*tp,
2514 	struct xfs_buf		*agbp,
2515 	struct xfs_perag	*pag)
2516 {
2517 	struct xfs_mount	*mp = tp->t_mountp;
2518 	struct xfs_agf		*agf = agbp->b_addr;
2519 
2520 	ASSERT(xfs_perag_agfl_needs_reset(pag));
2521 	trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
2522 
2523 	xfs_warn(mp,
2524 	       "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
2525 	       "Please unmount and run xfs_repair.",
2526 	         pag->pag_agno, pag->pagf_flcount);
2527 
2528 	agf->agf_flfirst = 0;
2529 	agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
2530 	agf->agf_flcount = 0;
2531 	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
2532 				    XFS_AGF_FLCOUNT);
2533 
2534 	pag->pagf_flcount = 0;
2535 	clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
2536 }
2537 
2538 /*
2539  * Defer an AGFL block free. This is effectively equivalent to
2540  * xfs_free_extent_later() with some special handling particular to AGFL blocks.
2541  *
2542  * Deferring AGFL frees helps prevent log reservation overruns due to too many
2543  * allocation operations in a transaction. AGFL frees are prone to this problem
2544  * because, for one, they are always freed one at a time. Further, an immediate
2545  * AGFL block free can cause a btree join and require another block free before
2546  * the real allocation can proceed. Deferring the free disconnects freeing up
2547  * the AGFL slot from freeing the block.
2548  */
2549 static int
2550 xfs_defer_agfl_block(
2551 	struct xfs_trans		*tp,
2552 	xfs_agnumber_t			agno,
2553 	xfs_agblock_t			agbno,
2554 	struct xfs_owner_info		*oinfo)
2555 {
2556 	struct xfs_mount		*mp = tp->t_mountp;
2557 	struct xfs_extent_free_item	*xefi;
2558 	xfs_fsblock_t			fsbno = XFS_AGB_TO_FSB(mp, agno, agbno);
2559 
2560 	ASSERT(xfs_extfree_item_cache != NULL);
2561 	ASSERT(oinfo != NULL);
2562 
2563 	if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, fsbno)))
2564 		return -EFSCORRUPTED;
2565 
2566 	xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
2567 			       GFP_KERNEL | __GFP_NOFAIL);
2568 	xefi->xefi_startblock = fsbno;
2569 	xefi->xefi_blockcount = 1;
2570 	xefi->xefi_owner = oinfo->oi_owner;
2571 	xefi->xefi_agresv = XFS_AG_RESV_AGFL;
2572 
2573 	trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
2574 
2575 	xfs_extent_free_get_group(mp, xefi);
2576 	xfs_defer_add(tp, &xefi->xefi_list, &xfs_agfl_free_defer_type);
2577 	return 0;
2578 }
2579 
2580 /*
2581  * Add the extent to the list of extents to be free at transaction end.
2582  * The list is maintained sorted (by block number).
2583  */
2584 static int
2585 xfs_defer_extent_free(
2586 	struct xfs_trans		*tp,
2587 	xfs_fsblock_t			bno,
2588 	xfs_filblks_t			len,
2589 	const struct xfs_owner_info	*oinfo,
2590 	enum xfs_ag_resv_type		type,
2591 	bool				skip_discard,
2592 	struct xfs_defer_pending	**dfpp)
2593 {
2594 	struct xfs_extent_free_item	*xefi;
2595 	struct xfs_mount		*mp = tp->t_mountp;
2596 #ifdef DEBUG
2597 	xfs_agnumber_t			agno;
2598 	xfs_agblock_t			agbno;
2599 
2600 	ASSERT(bno != NULLFSBLOCK);
2601 	ASSERT(len > 0);
2602 	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
2603 	ASSERT(!isnullstartblock(bno));
2604 	agno = XFS_FSB_TO_AGNO(mp, bno);
2605 	agbno = XFS_FSB_TO_AGBNO(mp, bno);
2606 	ASSERT(agno < mp->m_sb.sb_agcount);
2607 	ASSERT(agbno < mp->m_sb.sb_agblocks);
2608 	ASSERT(len < mp->m_sb.sb_agblocks);
2609 	ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
2610 #endif
2611 	ASSERT(xfs_extfree_item_cache != NULL);
2612 	ASSERT(type != XFS_AG_RESV_AGFL);
2613 
2614 	if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
2615 		return -EFSCORRUPTED;
2616 
2617 	xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
2618 			       GFP_KERNEL | __GFP_NOFAIL);
2619 	xefi->xefi_startblock = bno;
2620 	xefi->xefi_blockcount = (xfs_extlen_t)len;
2621 	xefi->xefi_agresv = type;
2622 	if (skip_discard)
2623 		xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD;
2624 	if (oinfo) {
2625 		ASSERT(oinfo->oi_offset == 0);
2626 
2627 		if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
2628 			xefi->xefi_flags |= XFS_EFI_ATTR_FORK;
2629 		if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
2630 			xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK;
2631 		xefi->xefi_owner = oinfo->oi_owner;
2632 	} else {
2633 		xefi->xefi_owner = XFS_RMAP_OWN_NULL;
2634 	}
2635 	trace_xfs_bmap_free_defer(mp,
2636 			XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
2637 			XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
2638 
2639 	xfs_extent_free_get_group(mp, xefi);
2640 	*dfpp = xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type);
2641 	return 0;
2642 }
2643 
2644 int
2645 xfs_free_extent_later(
2646 	struct xfs_trans		*tp,
2647 	xfs_fsblock_t			bno,
2648 	xfs_filblks_t			len,
2649 	const struct xfs_owner_info	*oinfo,
2650 	enum xfs_ag_resv_type		type,
2651 	bool				skip_discard)
2652 {
2653 	struct xfs_defer_pending	*dontcare = NULL;
2654 
2655 	return xfs_defer_extent_free(tp, bno, len, oinfo, type, skip_discard,
2656 			&dontcare);
2657 }
2658 
2659 /*
2660  * Set up automatic freeing of unwritten space in the filesystem.
2661  *
2662  * This function attaches a paused deferred extent free item to the
2663  * transaction.  Pausing means that the EFI will be logged in the next
2664  * transaction commit, but the pending EFI will not be finished until the
2665  * pending item is unpaused.
2666  *
2667  * If the system goes down after the EFI has been persisted to the log but
2668  * before the pending item is unpaused, log recovery will find the EFI, fail to
2669  * find the EFD, and free the space.
2670  *
2671  * If the pending item is unpaused, the next transaction commit will log an EFD
2672  * without freeing the space.
2673  *
2674  * Caller must ensure that the tp, fsbno, len, oinfo, and resv flags of the
2675  * @args structure are set to the relevant values.
2676  */
2677 int
2678 xfs_alloc_schedule_autoreap(
2679 	const struct xfs_alloc_arg	*args,
2680 	bool				skip_discard,
2681 	struct xfs_alloc_autoreap	*aarp)
2682 {
2683 	int				error;
2684 
2685 	error = xfs_defer_extent_free(args->tp, args->fsbno, args->len,
2686 			&args->oinfo, args->resv, skip_discard, &aarp->dfp);
2687 	if (error)
2688 		return error;
2689 
2690 	xfs_defer_item_pause(args->tp, aarp->dfp);
2691 	return 0;
2692 }
2693 
2694 /*
2695  * Cancel automatic freeing of unwritten space in the filesystem.
2696  *
2697  * Earlier, we created a paused deferred extent free item and attached it to
2698  * this transaction so that we could automatically roll back a new space
2699  * allocation if the system went down.  Now we want to cancel the paused work
2700  * item by marking the EFI stale so we don't actually free the space, unpausing
2701  * the pending item and logging an EFD.
2702  *
2703  * The caller generally should have already mapped the space into the ondisk
2704  * filesystem.  If the reserved space was partially used, the caller must call
2705  * xfs_free_extent_later to create a new EFI to free the unused space.
2706  */
2707 void
2708 xfs_alloc_cancel_autoreap(
2709 	struct xfs_trans		*tp,
2710 	struct xfs_alloc_autoreap	*aarp)
2711 {
2712 	struct xfs_defer_pending	*dfp = aarp->dfp;
2713 	struct xfs_extent_free_item	*xefi;
2714 
2715 	if (!dfp)
2716 		return;
2717 
2718 	list_for_each_entry(xefi, &dfp->dfp_work, xefi_list)
2719 		xefi->xefi_flags |= XFS_EFI_CANCELLED;
2720 
2721 	xfs_defer_item_unpause(tp, dfp);
2722 }
2723 
2724 /*
2725  * Commit automatic freeing of unwritten space in the filesystem.
2726  *
2727  * This unpauses an earlier _schedule_autoreap and commits to freeing the
2728  * allocated space.  Call this if none of the reserved space was used.
2729  */
2730 void
2731 xfs_alloc_commit_autoreap(
2732 	struct xfs_trans		*tp,
2733 	struct xfs_alloc_autoreap	*aarp)
2734 {
2735 	if (aarp->dfp)
2736 		xfs_defer_item_unpause(tp, aarp->dfp);
2737 }
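
/*
 * Illustrative sketch of the autoreap lifecycle (hypothetical caller, error
 * handling elided; map_new_space() stands in for whatever maps the new
 * allocation into the ondisk filesystem):
 */
#if 0
	struct xfs_alloc_autoreap	aarp = { };

	error = xfs_alloc_schedule_autoreap(&args, false, &aarp);
	if (error)
		return error;

	if (map_new_space(tp, &args) == 0)
		xfs_alloc_cancel_autoreap(tp, &aarp);	/* space in use: keep it */
	else
		xfs_alloc_commit_autoreap(tp, &aarp);	/* unused: let the EFI free it */
#endif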
2738 
2739 #ifdef DEBUG
2740 /*
2741  * Check if an AGF has a free extent record whose length is equal to
2742  * args->minlen.
2743  */
2744 STATIC int
2745 xfs_exact_minlen_extent_available(
2746 	struct xfs_alloc_arg	*args,
2747 	struct xfs_buf		*agbp,
2748 	int			*stat)
2749 {
2750 	struct xfs_btree_cur	*cnt_cur;
2751 	xfs_agblock_t		fbno;
2752 	xfs_extlen_t		flen;
2753 	int			error = 0;
2754 
2755 	cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, agbp,
2756 					args->pag);
2757 	error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
2758 	if (error)
2759 		goto out;
2760 
2761 	if (*stat == 0) {
2762 		xfs_btree_mark_sick(cnt_cur);
2763 		error = -EFSCORRUPTED;
2764 		goto out;
2765 	}
2766 
2767 	error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat);
2768 	if (error)
2769 		goto out;
2770 
2771 	if (*stat == 1 && flen != args->minlen)
2772 		*stat = 0;
2773 
2774 out:
2775 	xfs_btree_del_cursor(cnt_cur, error);
2776 
2777 	return error;
2778 }
2779 #endif
2780 
2781 /*
2782  * Decide whether to use this allocation group for this allocation.
2783  * If so, fix up the btree freelist's size.
2784  */
2785 int			/* error */
2786 xfs_alloc_fix_freelist(
2787 	struct xfs_alloc_arg	*args,	/* allocation argument structure */
2788 	uint32_t		alloc_flags)
2789 {
2790 	struct xfs_mount	*mp = args->mp;
2791 	struct xfs_perag	*pag = args->pag;
2792 	struct xfs_trans	*tp = args->tp;
2793 	struct xfs_buf		*agbp = NULL;
2794 	struct xfs_buf		*agflbp = NULL;
2795 	struct xfs_alloc_arg	targs;	/* local allocation arguments */
2796 	xfs_agblock_t		bno;	/* freelist block */
2797 	xfs_extlen_t		need;	/* total blocks needed in freelist */
2798 	int			error = 0;
2799 
2800 	/* deferred ops (AGFL block frees) require permanent transactions */
2801 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
2802 
2803 	if (!xfs_perag_initialised_agf(pag)) {
2804 		error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp);
2805 		if (error) {
2806 			/* Couldn't lock the AGF so skip this AG. */
2807 			if (error == -EAGAIN)
2808 				error = 0;
2809 			goto out_no_agbp;
2810 		}
2811 	}
2812 
2813 	/*
2814 	 * If this is a metadata-preferred pag and we are allocating user data,
2815 	 * then try somewhere else unless we are being asked to try harder at
2816 	 * this point.
2817 	 */
2818 	if (xfs_perag_prefers_metadata(pag) &&
2819 	    (args->datatype & XFS_ALLOC_USERDATA) &&
2820 	    (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)) {
2821 		ASSERT(!(alloc_flags & XFS_ALLOC_FLAG_FREEING));
2822 		goto out_agbp_relse;
2823 	}
2824 
2825 	need = xfs_alloc_min_freelist(mp, pag);
2826 	if (!xfs_alloc_space_available(args, need, alloc_flags |
2827 			XFS_ALLOC_FLAG_CHECK))
2828 		goto out_agbp_relse;
2829 
2830 	/*
2831 	 * Get the a.g. freespace buffer.
2832 	 * Can fail if we're not blocking on locks, and it's held.
2833 	 */
2834 	if (!agbp) {
2835 		error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp);
2836 		if (error) {
2837 			/* Couldn't lock the AGF so skip this AG. */
2838 			if (error == -EAGAIN)
2839 				error = 0;
2840 			goto out_no_agbp;
2841 		}
2842 	}
2843 
2844 	/* reset a padding mismatched agfl before final free space check */
2845 	if (xfs_perag_agfl_needs_reset(pag))
2846 		xfs_agfl_reset(tp, agbp, pag);
2847 
2848 	/* If there isn't enough total space or a large enough single extent, reject it. */
2849 	need = xfs_alloc_min_freelist(mp, pag);
2850 	if (!xfs_alloc_space_available(args, need, alloc_flags))
2851 		goto out_agbp_relse;
2852 
2853 #ifdef DEBUG
2854 	if (args->alloc_minlen_only) {
2855 		int stat;
2856 
2857 		error = xfs_exact_minlen_extent_available(args, agbp, &stat);
2858 		if (error || !stat)
2859 			goto out_agbp_relse;
2860 	}
2861 #endif
2862 	/*
2863 	 * Make the freelist shorter if it's too long.
2864 	 *
2865 	 * Note that from this point onwards, we will always release the agf and
2866 	 * agfl buffers on error. This handles the case where we error out and
2867 	 * the buffers are clean or may not have been joined to the transaction
2868 	 * and hence need to be released manually. If they have been joined to
2869 	 * the transaction, then xfs_trans_brelse() will handle them
2870 	 * appropriately based on the recursion count and dirty state of the
2871 	 * buffer.
2872 	 *
2873 	 * XXX (dgc): When we have lots of free space, does this buy us
2874 	 * anything other than extra overhead when we need to put more blocks
2875 	 * back on the free list? Maybe we should only do this when space is
2876 	 * getting low or the AGFL is more than half full?
2877 	 *
2878 	 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
2879 	 * big; the NORMAP flag prevents AGFL expand/shrink operations from
2880 	 * updating the rmapbt.  Both flags are used in xfs_repair while we're
2881 	 * rebuilding the rmapbt, and neither are used by the kernel.  They're
2882 	 * both required to ensure that rmaps are correctly recorded for the
2883 	 * regenerated AGFL, bnobt, and cntbt.  See repair/phase5.c and
2884 	 * repair/rmap.c in xfsprogs for details.
2885 	 */
2886 	memset(&targs, 0, sizeof(targs));
2887 	/* struct copy below */
2888 	if (alloc_flags & XFS_ALLOC_FLAG_NORMAP)
2889 		targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
2890 	else
2891 		targs.oinfo = XFS_RMAP_OINFO_AG;
2892 	while (!(alloc_flags & XFS_ALLOC_FLAG_NOSHRINK) &&
2893 			pag->pagf_flcount > need) {
2894 		error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0);
2895 		if (error)
2896 			goto out_agbp_relse;
2897 
2898 		/* defer agfl frees */
2899 		error = xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
2900 		if (error)
2901 			goto out_agbp_relse;
2902 	}
2903 
2904 	targs.tp = tp;
2905 	targs.mp = mp;
2906 	targs.agbp = agbp;
2907 	targs.agno = args->agno;
2908 	targs.alignment = targs.minlen = targs.prod = 1;
2909 	targs.pag = pag;
2910 	error = xfs_alloc_read_agfl(pag, tp, &agflbp);
2911 	if (error)
2912 		goto out_agbp_relse;
2913 
2914 	/* Make the freelist longer if it's too short. */
2915 	while (pag->pagf_flcount < need) {
2916 		targs.agbno = 0;
2917 		targs.maxlen = need - pag->pagf_flcount;
2918 		targs.resv = XFS_AG_RESV_AGFL;
2919 
2920 		/* Allocate as many blocks as possible at once. */
2921 		error = xfs_alloc_ag_vextent_size(&targs, alloc_flags);
2922 		if (error)
2923 			goto out_agflbp_relse;
2924 
2925 		/*
2926 		 * Stop if we run out.  Won't happen if callers are obeying
2927 		 * the restrictions correctly.  Can happen for free calls
2928 		 * on a completely full ag.
2929 		 */
2930 		if (targs.agbno == NULLAGBLOCK) {
2931 			if (alloc_flags & XFS_ALLOC_FLAG_FREEING)
2932 				break;
2933 			goto out_agflbp_relse;
2934 		}
2935 
2936 		if (!xfs_rmap_should_skip_owner_update(&targs.oinfo)) {
2937 			error = xfs_rmap_alloc(tp, agbp, pag,
2938 				       targs.agbno, targs.len, &targs.oinfo);
2939 			if (error)
2940 				goto out_agflbp_relse;
2941 		}
2942 		error = xfs_alloc_update_counters(tp, agbp,
2943 						  -((long)(targs.len)));
2944 		if (error)
2945 			goto out_agflbp_relse;
2946 
2947 		/*
2948 		 * Put each allocated block on the list.
2949 		 */
2950 		for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
2951 			error = xfs_alloc_put_freelist(pag, tp, agbp,
2952 							agflbp, bno, 0);
2953 			if (error)
2954 				goto out_agflbp_relse;
2955 		}
2956 	}
2957 	xfs_trans_brelse(tp, agflbp);
2958 	args->agbp = agbp;
2959 	return 0;
2960 
2961 out_agflbp_relse:
2962 	xfs_trans_brelse(tp, agflbp);
2963 out_agbp_relse:
2964 	if (agbp)
2965 		xfs_trans_brelse(tp, agbp);
2966 out_no_agbp:
2967 	args->agbp = NULL;
2968 	return error;
2969 }
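
/*
 * Worked example for the freelist resizing above (illustrative numbers): if
 * xfs_alloc_min_freelist() reports that this AG needs 10 AGFL blocks and
 * pagf_flcount is currently 14, four blocks are pulled off the list and
 * their frees deferred; if pagf_flcount were 6 instead, the refill loop
 * would allocate up to 4 more blocks (possibly with a single by-size
 * allocation) and push each one onto the free list.
 */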
2970 
2971 /*
2972  * Get a block from the freelist.
2973  * Returns with the buffer for the block gotten.
2974  */
2975 int
2976 xfs_alloc_get_freelist(
2977 	struct xfs_perag	*pag,
2978 	struct xfs_trans	*tp,
2979 	struct xfs_buf		*agbp,
2980 	xfs_agblock_t		*bnop,
2981 	int			btreeblk)
2982 {
2983 	struct xfs_agf		*agf = agbp->b_addr;
2984 	struct xfs_buf		*agflbp;
2985 	xfs_agblock_t		bno;
2986 	__be32			*agfl_bno;
2987 	int			error;
2988 	uint32_t		logflags;
2989 	struct xfs_mount	*mp = tp->t_mountp;
2990 
2991 	/*
2992 	 * Freelist is empty, give up.
2993 	 */
2994 	if (!agf->agf_flcount) {
2995 		*bnop = NULLAGBLOCK;
2996 		return 0;
2997 	}
2998 	/*
2999 	 * Read the array of free blocks.
3000 	 */
3001 	error = xfs_alloc_read_agfl(pag, tp, &agflbp);
3002 	if (error)
3003 		return error;
3004 
3005 
3006 	/*
3007 	 * Get the block number and update the data structures.
3008 	 */
3009 	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
3010 	bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
3011 	if (XFS_IS_CORRUPT(tp->t_mountp, !xfs_verify_agbno(pag, bno)))
3012 		return -EFSCORRUPTED;
3013 
3014 	be32_add_cpu(&agf->agf_flfirst, 1);
3015 	xfs_trans_brelse(tp, agflbp);
3016 	if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
3017 		agf->agf_flfirst = 0;
3018 
3019 	ASSERT(!xfs_perag_agfl_needs_reset(pag));
3020 	be32_add_cpu(&agf->agf_flcount, -1);
3021 	pag->pagf_flcount--;
3022 
3023 	logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
3024 	if (btreeblk) {
3025 		be32_add_cpu(&agf->agf_btreeblks, 1);
3026 		pag->pagf_btreeblks++;
3027 		logflags |= XFS_AGF_BTREEBLKS;
3028 	}
3029 
3030 	xfs_alloc_log_agf(tp, agbp, logflags);
3031 	*bnop = bno;
3032 
3033 	return 0;
3034 }
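
/*
 * Worked example for the circular indexing above (illustrative numbers):
 * with xfs_agfl_size() = 119 and agf_flfirst = 118, consuming the head slot
 * advances flfirst to 119, which equals the AGFL size and therefore wraps
 * back to slot 0; agf_flcount and pagf_flcount each drop by one regardless.
 */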
3035 
3036 /*
3037  * Log the given fields from the agf structure.
3038  */
3039 void
3040 xfs_alloc_log_agf(
3041 	struct xfs_trans	*tp,
3042 	struct xfs_buf		*bp,
3043 	uint32_t		fields)
3044 {
3045 	int	first;		/* first byte offset */
3046 	int	last;		/* last byte offset */
3047 	static const short	offsets[] = {
3048 		offsetof(xfs_agf_t, agf_magicnum),
3049 		offsetof(xfs_agf_t, agf_versionnum),
3050 		offsetof(xfs_agf_t, agf_seqno),
3051 		offsetof(xfs_agf_t, agf_length),
3052 		offsetof(xfs_agf_t, agf_bno_root),   /* also cnt/rmap root */
3053 		offsetof(xfs_agf_t, agf_bno_level),  /* also cnt/rmap levels */
3054 		offsetof(xfs_agf_t, agf_flfirst),
3055 		offsetof(xfs_agf_t, agf_fllast),
3056 		offsetof(xfs_agf_t, agf_flcount),
3057 		offsetof(xfs_agf_t, agf_freeblks),
3058 		offsetof(xfs_agf_t, agf_longest),
3059 		offsetof(xfs_agf_t, agf_btreeblks),
3060 		offsetof(xfs_agf_t, agf_uuid),
3061 		offsetof(xfs_agf_t, agf_rmap_blocks),
3062 		offsetof(xfs_agf_t, agf_refcount_blocks),
3063 		offsetof(xfs_agf_t, agf_refcount_root),
3064 		offsetof(xfs_agf_t, agf_refcount_level),
3065 		/* needed so that we don't log the whole rest of the structure: */
3066 		offsetof(xfs_agf_t, agf_spare64),
3067 		sizeof(xfs_agf_t)
3068 	};
3069 
3070 	trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_);
3071 
3072 	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
3073 
3074 	xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
3075 	xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
3076 }
3077 
3078 /*
3079  * Put the block on the freelist for the allocation group.
3080  */
3081 int
3082 xfs_alloc_put_freelist(
3083 	struct xfs_perag	*pag,
3084 	struct xfs_trans	*tp,
3085 	struct xfs_buf		*agbp,
3086 	struct xfs_buf		*agflbp,
3087 	xfs_agblock_t		bno,
3088 	int			btreeblk)
3089 {
3090 	struct xfs_mount	*mp = tp->t_mountp;
3091 	struct xfs_agf		*agf = agbp->b_addr;
3092 	__be32			*blockp;
3093 	int			error;
3094 	uint32_t		logflags;
3095 	__be32			*agfl_bno;
3096 	int			startoff;
3097 
3098 	if (!agflbp) {
3099 		error = xfs_alloc_read_agfl(pag, tp, &agflbp);
3100 		if (error)
3101 			return error;
3102 	}
3103 
3104 	be32_add_cpu(&agf->agf_fllast, 1);
3105 	if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
3106 		agf->agf_fllast = 0;
3107 
3108 	ASSERT(!xfs_perag_agfl_needs_reset(pag));
3109 	be32_add_cpu(&agf->agf_flcount, 1);
3110 	pag->pagf_flcount++;
3111 
3112 	logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
3113 	if (btreeblk) {
3114 		be32_add_cpu(&agf->agf_btreeblks, -1);
3115 		pag->pagf_btreeblks--;
3116 		logflags |= XFS_AGF_BTREEBLKS;
3117 	}
3118 
3119 	xfs_alloc_log_agf(tp, agbp, logflags);
3120 
3121 	ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
3122 
3123 	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
3124 	blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
3125 	*blockp = cpu_to_be32(bno);
3126 	startoff = (char *)blockp - (char *)agflbp->b_addr;
3127 
3128 	xfs_alloc_log_agf(tp, agbp, logflags);
3129 
3130 	xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
3131 	xfs_trans_log_buf(tp, agflbp, startoff,
3132 			  startoff + sizeof(xfs_agblock_t) - 1);
3133 	return 0;
3134 }
3135 
3136 /*
3137  * Check that this AGF/AGI header's sequence number and length matches the AG
3138  * number and size in fsblocks.
3139  */
3140 xfs_failaddr_t
3141 xfs_validate_ag_length(
3142 	struct xfs_buf		*bp,
3143 	uint32_t		seqno,
3144 	uint32_t		length)
3145 {
3146 	struct xfs_mount	*mp = bp->b_mount;
3147 	/*
3148 	 * During growfs operations, the perag is not fully initialised,
3149 	 * so we can't use it for any useful checking. growfs ensures we can't
3150 	 * use it by using uncached buffers that don't have the perag attached
3151 	 * so we can detect and avoid this problem.
3152 	 */
3153 	if (bp->b_pag && seqno != bp->b_pag->pag_agno)
3154 		return __this_address;
3155 
3156 	/*
3157 	 * Only the last AG in the filesystem is allowed to be shorter
3158 	 * than the AG size recorded in the superblock.
3159 	 */
3160 	if (length != mp->m_sb.sb_agblocks) {
3161 		/*
3162 		 * During growfs, the new last AG can get here before we
3163 		 * have updated the superblock. Give it a pass on the seqno
3164 		 * check.
3165 		 */
3166 		if (bp->b_pag && seqno != mp->m_sb.sb_agcount - 1)
3167 			return __this_address;
3168 		if (length < XFS_MIN_AG_BLOCKS)
3169 			return __this_address;
3170 		if (length > mp->m_sb.sb_agblocks)
3171 			return __this_address;
3172 	}
3173 
3174 	return NULL;
3175 }
3176 
3177 /*
3178  * Verify the AGF is consistent.
3179  *
3180  * We do not verify the AGFL indexes in the AGF are fully consistent here
3181  * because of issues with variable on-disk structure sizes. Instead, we check
3182  * the agfl indexes for consistency when we initialise the perag from the AGF
3183  * information after a read completes.
3184  *
3185  * If the index is inconsistent, then we mark the perag as needing an AGFL
3186  * reset. The first AGFL update performed then resets the AGFL indexes and
3187  * refills the AGFL with known good free blocks, allowing the filesystem to
3188  * continue operating normally at the cost of a few leaked free space blocks.
3189  */
3190 static xfs_failaddr_t
3191 xfs_agf_verify(
3192 	struct xfs_buf		*bp)
3193 {
3194 	struct xfs_mount	*mp = bp->b_mount;
3195 	struct xfs_agf		*agf = bp->b_addr;
3196 	xfs_failaddr_t		fa;
3197 	uint32_t		agf_seqno = be32_to_cpu(agf->agf_seqno);
3198 	uint32_t		agf_length = be32_to_cpu(agf->agf_length);
3199 
3200 	if (xfs_has_crc(mp)) {
3201 		if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
3202 			return __this_address;
3203 		if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
3204 			return __this_address;
3205 	}
3206 
3207 	if (!xfs_verify_magic(bp, agf->agf_magicnum))
3208 		return __this_address;
3209 
3210 	if (!XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)))
3211 		return __this_address;
3212 
3213 	/*
3214 	 * Both agf_seqno and agf_length need to be validated before anything
3215 	 * else that is block number related in the AGF or AGFL can be checked.
3216 	 */
3217 	fa = xfs_validate_ag_length(bp, agf_seqno, agf_length);
3218 	if (fa)
3219 		return fa;
3220 
3221 	if (be32_to_cpu(agf->agf_flfirst) >= xfs_agfl_size(mp))
3222 		return __this_address;
3223 	if (be32_to_cpu(agf->agf_fllast) >= xfs_agfl_size(mp))
3224 		return __this_address;
3225 	if (be32_to_cpu(agf->agf_flcount) > xfs_agfl_size(mp))
3226 		return __this_address;
3227 
3228 	if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
3229 	    be32_to_cpu(agf->agf_freeblks) > agf_length)
3230 		return __this_address;
3231 
3232 	if (be32_to_cpu(agf->agf_bno_level) < 1 ||
3233 	    be32_to_cpu(agf->agf_cnt_level) < 1 ||
3234 	    be32_to_cpu(agf->agf_bno_level) > mp->m_alloc_maxlevels ||
3235 	    be32_to_cpu(agf->agf_cnt_level) > mp->m_alloc_maxlevels)
3236 		return __this_address;
3237 
3238 	if (xfs_has_lazysbcount(mp) &&
3239 	    be32_to_cpu(agf->agf_btreeblks) > agf_length)
3240 		return __this_address;
3241 
3242 	if (xfs_has_rmapbt(mp)) {
3243 		if (be32_to_cpu(agf->agf_rmap_blocks) > agf_length)
3244 			return __this_address;
3245 
3246 		if (be32_to_cpu(agf->agf_rmap_level) < 1 ||
3247 		    be32_to_cpu(agf->agf_rmap_level) > mp->m_rmap_maxlevels)
3248 			return __this_address;
3249 	}
3250 
3251 	if (xfs_has_reflink(mp)) {
3252 		if (be32_to_cpu(agf->agf_refcount_blocks) > agf_length)
3253 			return __this_address;
3254 
3255 		if (be32_to_cpu(agf->agf_refcount_level) < 1 ||
3256 		    be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels)
3257 			return __this_address;
3258 	}
3259 
3260 	return NULL;
3261 }
3262 
3263 static void
3264 xfs_agf_read_verify(
3265 	struct xfs_buf	*bp)
3266 {
3267 	struct xfs_mount *mp = bp->b_mount;
3268 	xfs_failaddr_t	fa;
3269 
3270 	if (xfs_has_crc(mp) &&
3271 	    !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
3272 		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
3273 	else {
3274 		fa = xfs_agf_verify(bp);
3275 		if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
3276 			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
3277 	}
3278 }
3279 
3280 static void
3281 xfs_agf_write_verify(
3282 	struct xfs_buf	*bp)
3283 {
3284 	struct xfs_mount	*mp = bp->b_mount;
3285 	struct xfs_buf_log_item	*bip = bp->b_log_item;
3286 	struct xfs_agf		*agf = bp->b_addr;
3287 	xfs_failaddr_t		fa;
3288 
3289 	fa = xfs_agf_verify(bp);
3290 	if (fa) {
3291 		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
3292 		return;
3293 	}
3294 
3295 	if (!xfs_has_crc(mp))
3296 		return;
3297 
3298 	if (bip)
3299 		agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
3300 
3301 	xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
3302 }
3303 
3304 const struct xfs_buf_ops xfs_agf_buf_ops = {
3305 	.name = "xfs_agf",
3306 	.magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
3307 	.verify_read = xfs_agf_read_verify,
3308 	.verify_write = xfs_agf_write_verify,
3309 	.verify_struct = xfs_agf_verify,
3310 };
3311 
3312 /*
3313  * Read in the allocation group header (free/alloc section).
3314  */
3315 int
3316 xfs_read_agf(
3317 	struct xfs_perag	*pag,
3318 	struct xfs_trans	*tp,
3319 	int			flags,
3320 	struct xfs_buf		**agfbpp)
3321 {
3322 	struct xfs_mount	*mp = pag->pag_mount;
3323 	int			error;
3324 
3325 	trace_xfs_read_agf(pag->pag_mount, pag->pag_agno);
3326 
3327 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
3328 			XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)),
3329 			XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
3330 	if (xfs_metadata_is_sick(error))
3331 		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
3332 	if (error)
3333 		return error;
3334 
3335 	xfs_buf_set_ref(*agfbpp, XFS_AGF_REF);
3336 	return 0;
3337 }
3338 
3339 /*
3340  * Read in the allocation group header (free/alloc section) and initialise the
3341  * perag structure if necessary. If the caller provides @agfbpp, then return the
3342  * locked buffer to the caller, otherwise free it.
3343  */
3344 int
3345 xfs_alloc_read_agf(
3346 	struct xfs_perag	*pag,
3347 	struct xfs_trans	*tp,
3348 	int			flags,
3349 	struct xfs_buf		**agfbpp)
3350 {
3351 	struct xfs_buf		*agfbp;
3352 	struct xfs_agf		*agf;
3353 	int			error;
3354 	int			allocbt_blks;
3355 
3356 	trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno);
3357 
3358 	/* We don't support trylock when freeing. */
3359 	ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
3360 			(XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
3361 	error = xfs_read_agf(pag, tp,
3362 			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
3363 			&agfbp);
3364 	if (error)
3365 		return error;
3366 
3367 	agf = agfbp->b_addr;
3368 	if (!xfs_perag_initialised_agf(pag)) {
3369 		pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
3370 		pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
3371 		pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
3372 		pag->pagf_longest = be32_to_cpu(agf->agf_longest);
3373 		pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level);
3374 		pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level);
3375 		pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level);
3376 		pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
3377 		if (xfs_agfl_needs_reset(pag->pag_mount, agf))
3378 			set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
3379 		else
3380 			clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
3381 
3382 		/*
3383 		 * Update the in-core allocbt counter. Filter out the rmapbt
3384 		 * subset of the btreeblks counter because the rmapbt is managed
3385 		 * by perag reservation. Subtract one for the rmapbt root block
3386 		 * because the rmap counter includes it while the btreeblks
3387 		 * counter only tracks non-root blocks.
3388 		 */
3389 		allocbt_blks = pag->pagf_btreeblks;
3390 		if (xfs_has_rmapbt(pag->pag_mount))
3391 			allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
3392 		if (allocbt_blks > 0)
3393 			atomic64_add(allocbt_blks,
3394 					&pag->pag_mount->m_allocbt_blks);
3395 
3396 		set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
3397 	}
3398 #ifdef DEBUG
3399 	else if (!xfs_is_shutdown(pag->pag_mount)) {
3400 		ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
3401 		ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
3402 		ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
3403 		ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
3404 		ASSERT(pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level));
3405 		ASSERT(pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level));
3406 	}
3407 #endif
3408 	if (agfbpp)
3409 		*agfbpp = agfbp;
3410 	else
3411 		xfs_trans_brelse(tp, agfbp);
3412 	return 0;
3413 }
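
/*
 * Illustrative sketch only (not part of this file): @agfbpp controls
 * buffer ownership.  A caller that only wants the per-AG counters
 * initialised can pass NULL and let xfs_alloc_read_agf() release the AGF
 * buffer itself; a caller that needs the locked AGF keeps the returned
 * buffer and releases it when finished, e.g.:
 *
 *	struct xfs_buf	*agfbp;
 *
 *	error = xfs_alloc_read_agf(pag, tp, 0, NULL);	// init perag only
 *	...
 *	error = xfs_alloc_read_agf(pag, tp, 0, &agfbp);	// keep locked AGF
 *	if (!error) {
 *		// ... examine agfbp->b_addr ...
 *		xfs_trans_brelse(tp, agfbp);
 *	}
 */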
3414 
3415 /*
3416  * Pre-process allocation arguments to set initial state that we don't require
3417  * callers to set up correctly, as well as bounds-check the allocation args
3418  * that are set up.
3419  */
3420 static int
3421 xfs_alloc_vextent_check_args(
3422 	struct xfs_alloc_arg	*args,
3423 	xfs_fsblock_t		target,
3424 	xfs_agnumber_t		*minimum_agno)
3425 {
3426 	struct xfs_mount	*mp = args->mp;
3427 	xfs_agblock_t		agsize;
3428 
3429 	args->fsbno = NULLFSBLOCK;
3430 
3431 	*minimum_agno = 0;
3432 	if (args->tp->t_highest_agno != NULLAGNUMBER)
3433 		*minimum_agno = args->tp->t_highest_agno;
3434 
3435 	/*
3436 	 * Just fix this up, for the case where the last a.g. is shorter
3437 	 * (or there's only one a.g.) and the caller couldn't easily figure
3438 	 * that out (xfs_bmap_alloc).
3439 	 */
3440 	agsize = mp->m_sb.sb_agblocks;
3441 	if (args->maxlen > agsize)
3442 		args->maxlen = agsize;
3443 	if (args->alignment == 0)
3444 		args->alignment = 1;
3445 
3446 	ASSERT(args->minlen > 0);
3447 	ASSERT(args->maxlen > 0);
3448 	ASSERT(args->alignment > 0);
3449 	ASSERT(args->resv != XFS_AG_RESV_AGFL);
3450 
3451 	ASSERT(XFS_FSB_TO_AGNO(mp, target) < mp->m_sb.sb_agcount);
3452 	ASSERT(XFS_FSB_TO_AGBNO(mp, target) < agsize);
3453 	ASSERT(args->minlen <= args->maxlen);
3454 	ASSERT(args->minlen <= agsize);
3455 	ASSERT(args->mod < args->prod);
3456 
3457 	if (XFS_FSB_TO_AGNO(mp, target) >= mp->m_sb.sb_agcount ||
3458 	    XFS_FSB_TO_AGBNO(mp, target) >= agsize ||
3459 	    args->minlen > args->maxlen || args->minlen > agsize ||
3460 	    args->mod >= args->prod) {
3461 		trace_xfs_alloc_vextent_badargs(args);
3462 		return -ENOSPC;
3463 	}
3464 
3465 	if (args->agno != NULLAGNUMBER && *minimum_agno > args->agno) {
3466 		trace_xfs_alloc_vextent_skip_deadlock(args);
3467 		return -ENOSPC;
3468 	}
3469 	return 0;
3470 
3471 }
3472 
3473 /*
3474  * Prepare an AG for allocation. If the AG is not prepared to accept the
3475  * allocation, return failure.
3476  *
3477  * XXX(dgc): The complexity of "need_pag" will go away as all caller paths are
3478  * modified to hold their own perag references.
3479  */
3480 static int
3481 xfs_alloc_vextent_prepare_ag(
3482 	struct xfs_alloc_arg	*args,
3483 	uint32_t		alloc_flags)
3484 {
3485 	bool			need_pag = !args->pag;
3486 	int			error;
3487 
3488 	if (need_pag)
3489 		args->pag = xfs_perag_get(args->mp, args->agno);
3490 
3491 	args->agbp = NULL;
3492 	error = xfs_alloc_fix_freelist(args, alloc_flags);
3493 	if (error) {
3494 		trace_xfs_alloc_vextent_nofix(args);
3495 		if (need_pag)
3496 			xfs_perag_put(args->pag);
3497 		args->agbno = NULLAGBLOCK;
3498 		return error;
3499 	}
3500 	if (!args->agbp) {
3501 		/* cannot allocate in this AG at all */
3502 		trace_xfs_alloc_vextent_noagbp(args);
3503 		args->agbno = NULLAGBLOCK;
3504 		return 0;
3505 	}
3506 	args->wasfromfl = 0;
3507 	return 0;
3508 }
3509 
3510 /*
3511  * Post-process allocation results to account for the allocation if it succeeded
3512  * and set the allocated block number correctly for the caller.
3513  *
3514  * XXX: we should really be returning ENOSPC for ENOSPC, not
3515  * hiding it behind a "successful" NULLFSBLOCK allocation.
3516  */
3517 static int
3518 xfs_alloc_vextent_finish(
3519 	struct xfs_alloc_arg	*args,
3520 	xfs_agnumber_t		minimum_agno,
3521 	int			alloc_error,
3522 	bool			drop_perag)
3523 {
3524 	struct xfs_mount	*mp = args->mp;
3525 	int			error = 0;
3526 
3527 	/*
3528 	 * We can end up here with a locked AGF. If we failed, the caller is
3529 	 * likely going to try to allocate again with different parameters, and
3530 	 * that can widen the AGs that are searched for free space. If we have
3531 	 * to do BMBT block allocation, we have to do a new allocation.
3532 	 *
3533 	 * Hence leaving this function with the AGF locked opens up potential
3534 	 * ABBA AGF deadlocks because a future allocation attempt in this
3535 	 * transaction may attempt to lock a lower number AGF.
3536 	 *
3537 	 * We can't release the AGF until the transaction is committed, so at
3538 	 * this point we must update the "first allocation" tracker to point at
3539 	 * this AG if the tracker is empty or points to a lower AG. This allows
3540 	 * the next allocation attempt to be modified appropriately to avoid
3541 	 * deadlocks.
3542 	 */
3543 	if (args->agbp &&
3544 	    (args->tp->t_highest_agno == NULLAGNUMBER ||
3545 	     args->agno > minimum_agno))
3546 		args->tp->t_highest_agno = args->agno;
3547 
3548 	/*
3549 	 * If the allocation failed with an error or we had an ENOSPC result,
3550 	 * preserve the returned error whilst also marking the allocation result
3551 	 * as "no extent allocated". This ensures that callers that fail to
3552 	 * capture the error will still treat it as a failed allocation.
3553 	 */
3554 	if (alloc_error || args->agbno == NULLAGBLOCK) {
3555 		args->fsbno = NULLFSBLOCK;
3556 		error = alloc_error;
3557 		goto out_drop_perag;
3558 	}
3559 
3560 	args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
3561 
3562 	ASSERT(args->len >= args->minlen);
3563 	ASSERT(args->len <= args->maxlen);
3564 	ASSERT(args->agbno % args->alignment == 0);
3565 	XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), args->len);
3566 
3567 	/* if not file data, insert new block into the reverse map btree */
3568 	if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
3569 		error = xfs_rmap_alloc(args->tp, args->agbp, args->pag,
3570 				       args->agbno, args->len, &args->oinfo);
3571 		if (error)
3572 			goto out_drop_perag;
3573 	}
3574 
3575 	if (!args->wasfromfl) {
3576 		error = xfs_alloc_update_counters(args->tp, args->agbp,
3577 						  -((long)(args->len)));
3578 		if (error)
3579 			goto out_drop_perag;
3580 
3581 		ASSERT(!xfs_extent_busy_search(mp, args->pag, args->agbno,
3582 				args->len));
3583 	}
3584 
3585 	xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
3586 
3587 	XFS_STATS_INC(mp, xs_allocx);
3588 	XFS_STATS_ADD(mp, xs_allocb, args->len);
3589 
3590 	trace_xfs_alloc_vextent_finish(args);
3591 
3592 out_drop_perag:
3593 	if (drop_perag && args->pag) {
3594 		xfs_perag_rele(args->pag);
3595 		args->pag = NULL;
3596 	}
3597 	return error;
3598 }
3599 
3600 /*
3601  * Allocate within a single AG only. This uses a best-fit length algorithm so if
3602  * you need an exact-sized allocation without locality constraints, this is the
3603  * fastest way to do it.
3604  *
3605  * Caller is expected to hold a perag reference in args->pag.
3606  */
3607 int
3608 xfs_alloc_vextent_this_ag(
3609 	struct xfs_alloc_arg	*args,
3610 	xfs_agnumber_t		agno)
3611 {
3612 	struct xfs_mount	*mp = args->mp;
3613 	xfs_agnumber_t		minimum_agno;
3614 	uint32_t		alloc_flags = 0;
3615 	int			error;
3616 
3617 	ASSERT(args->pag != NULL);
3618 	ASSERT(args->pag->pag_agno == agno);
3619 
3620 	args->agno = agno;
3621 	args->agbno = 0;
3622 
3623 	trace_xfs_alloc_vextent_this_ag(args);
3624 
3625 	error = xfs_alloc_vextent_check_args(args, XFS_AGB_TO_FSB(mp, agno, 0),
3626 			&minimum_agno);
3627 	if (error) {
3628 		if (error == -ENOSPC)
3629 			return 0;
3630 		return error;
3631 	}
3632 
3633 	error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
3634 	if (!error && args->agbp)
3635 		error = xfs_alloc_ag_vextent_size(args, alloc_flags);
3636 
3637 	return xfs_alloc_vextent_finish(args, minimum_agno, error, false);
3638 }
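
/*
 * Usage sketch (illustrative only; the field values shown are
 * hypothetical): a caller that already holds a perag reference and wants a
 * fixed-size extent anywhere in that AG might fill the argument structure
 * like this, setting owner info, datatype, etc. as it requires:
 *
 *	struct xfs_alloc_arg	args = {
 *		.tp		= tp,
 *		.mp		= mp,
 *		.pag		= pag,
 *		.minlen		= len,
 *		.maxlen		= len,
 *		.prod		= 1,
 *		.alignment	= 1,
 *		.resv		= XFS_AG_RESV_NONE,
 *	};
 *
 *	error = xfs_alloc_vextent_this_ag(&args, pag->pag_agno);
 *	if (!error && args.fsbno != NULLFSBLOCK)
 *		// ... args.len blocks were allocated starting at args.fsbno ...
 */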
3639 
3640 /*
3641  * Iterate all AGs trying to allocate an extent starting from @start_ag.
3642  *
3643  * If the incoming allocation type is XFS_ALLOCTYPE_NEAR_BNO, it means the
3644  * allocation attempts in @start_agno have locality information. If we fail to
3645  * allocate in that AG, then we revert to anywhere-in-AG for all the other AGs
3646  * we attempt to allocation in as there is no locality optimisation possible for
3647  * those allocations.
3648  *
3649  * On return, args->pag may be left referenced if we finish before the "all
3650  * failed" return point. The allocation finish still needs the perag, and
3651  * so the caller will release it once they've finished the allocation.
3652  *
3653  * When we wrap the AG iteration at the end of the filesystem, we have to be
3654  * careful not to wrap into AGs below ones we already have locked in the
3655  * transaction if we are doing a blocking iteration. This will result in an
3656  * out-of-order locking of AGFs and hence can cause deadlocks.
3657  */
3658 static int
3659 xfs_alloc_vextent_iterate_ags(
3660 	struct xfs_alloc_arg	*args,
3661 	xfs_agnumber_t		minimum_agno,
3662 	xfs_agnumber_t		start_agno,
3663 	xfs_agblock_t		target_agbno,
3664 	uint32_t		alloc_flags)
3665 {
3666 	struct xfs_mount	*mp = args->mp;
3667 	xfs_agnumber_t		restart_agno = minimum_agno;
3668 	xfs_agnumber_t		agno;
3669 	int			error = 0;
3670 
3671 	if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)
3672 		restart_agno = 0;
3673 restart:
3674 	for_each_perag_wrap_range(mp, start_agno, restart_agno,
3675 			mp->m_sb.sb_agcount, agno, args->pag) {
3676 		args->agno = agno;
3677 		error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
3678 		if (error)
3679 			break;
3680 		if (!args->agbp) {
3681 			trace_xfs_alloc_vextent_loopfailed(args);
3682 			continue;
3683 		}
3684 
3685 		/*
3686 		 * Allocation is supposed to succeed now, so break out of the
3687 		 * loop regardless of whether we succeed or not.
3688 		 */
3689 		if (args->agno == start_agno && target_agbno) {
3690 			args->agbno = target_agbno;
3691 			error = xfs_alloc_ag_vextent_near(args, alloc_flags);
3692 		} else {
3693 			args->agbno = 0;
3694 			error = xfs_alloc_ag_vextent_size(args, alloc_flags);
3695 		}
3696 		break;
3697 	}
3698 	if (error) {
3699 		xfs_perag_rele(args->pag);
3700 		args->pag = NULL;
3701 		return error;
3702 	}
3703 	if (args->agbp)
3704 		return 0;
3705 
3706 	/*
3707 	 * We didn't find an AG we can allocate from. If we were given
3708 	 * constraining flags by the caller, drop them and retry the allocation
3709 	 * without any constraints being set.
3710 	 */
3711 	if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK) {
3712 		alloc_flags &= ~XFS_ALLOC_FLAG_TRYLOCK;
3713 		restart_agno = minimum_agno;
3714 		goto restart;
3715 	}
3716 
3717 	ASSERT(args->pag == NULL);
3718 	trace_xfs_alloc_vextent_allfailed(args);
3719 	return 0;
3720 }
3721 
3722 /*
3723  * Iterate the AGs from the start AG to the end of the filesystem, trying
3724  * to allocate blocks. It starts with a near allocation attempt in the initial
3725  * AG, then falls back to anywhere-in-ag after the first AG fails. It will wrap
3726  * back to zero if allowed by previous allocations in this transaction,
3727  * otherwise will wrap back to the start AG and run a second blocking pass to
3728  * the end of the filesystem.
3729  */
3730 int
3731 xfs_alloc_vextent_start_ag(
3732 	struct xfs_alloc_arg	*args,
3733 	xfs_fsblock_t		target)
3734 {
3735 	struct xfs_mount	*mp = args->mp;
3736 	xfs_agnumber_t		minimum_agno;
3737 	xfs_agnumber_t		start_agno;
3738 	xfs_agnumber_t		rotorstep = xfs_rotorstep;
3739 	bool			bump_rotor = false;
3740 	uint32_t		alloc_flags = XFS_ALLOC_FLAG_TRYLOCK;
3741 	int			error;
3742 
3743 	ASSERT(args->pag == NULL);
3744 
3745 	args->agno = NULLAGNUMBER;
3746 	args->agbno = NULLAGBLOCK;
3747 
3748 	trace_xfs_alloc_vextent_start_ag(args);
3749 
3750 	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
3751 	if (error) {
3752 		if (error == -ENOSPC)
3753 			return 0;
3754 		return error;
3755 	}
3756 
3757 	if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
3758 	    xfs_is_inode32(mp)) {
3759 		target = XFS_AGB_TO_FSB(mp,
3760 				((mp->m_agfrotor / rotorstep) %
3761 				mp->m_sb.sb_agcount), 0);
3762 		bump_rotor = 1;
3763 	}
3764 
3765 	start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target));
3766 	error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
3767 			XFS_FSB_TO_AGBNO(mp, target), alloc_flags);
3768 
3769 	if (bump_rotor) {
3770 		if (args->agno == start_agno)
3771 			mp->m_agfrotor = (mp->m_agfrotor + 1) %
3772 				(mp->m_sb.sb_agcount * rotorstep);
3773 		else
3774 			mp->m_agfrotor = (args->agno * rotorstep + 1) %
3775 				(mp->m_sb.sb_agcount * rotorstep);
3776 	}
3777 
3778 	return xfs_alloc_vextent_finish(args, minimum_agno, error, true);
3779 }
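
/*
 * Illustrative sketch: unlike xfs_alloc_vextent_this_ag(), no perag
 * reference is held on entry here; @target is a filesystem block number
 * that only seeds the AG iteration.  The AG number below is hypothetical:
 *
 *	args.pag = NULL;
 *	error = xfs_alloc_vextent_start_ag(&args,
 *			XFS_AGB_TO_FSB(mp, start_agno, 0));
 */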
3780 
3781 /*
3782  * Iterate from the agno indicated via @target through to the end of the
3783  * filesystem attempting blocking allocation. This does not wrap or try a second
3784  * pass, so will not recurse into AGs lower than indicated by the target.
3785  */
3786 int
3787 xfs_alloc_vextent_first_ag(
3788 	struct xfs_alloc_arg	*args,
3789 	xfs_fsblock_t		target)
3790  {
3791 	struct xfs_mount	*mp = args->mp;
3792 	xfs_agnumber_t		minimum_agno;
3793 	xfs_agnumber_t		start_agno;
3794 	uint32_t		alloc_flags = XFS_ALLOC_FLAG_TRYLOCK;
3795 	int			error;
3796 
3797 	ASSERT(args->pag == NULL);
3798 
3799 	args->agno = NULLAGNUMBER;
3800 	args->agbno = NULLAGBLOCK;
3801 
3802 	trace_xfs_alloc_vextent_first_ag(args);
3803 
3804 	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
3805 	if (error) {
3806 		if (error == -ENOSPC)
3807 			return 0;
3808 		return error;
3809 	}
3810 
3811 	start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target));
3812 	error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
3813 			XFS_FSB_TO_AGBNO(mp, target), alloc_flags);
3814 	return xfs_alloc_vextent_finish(args, minimum_agno, error, true);
3815 }
3816 
3817 /*
3818  * Allocate at the exact block target or fail. Caller is expected to hold a
3819  * perag reference in args->pag.
3820  */
3821 int
3822 xfs_alloc_vextent_exact_bno(
3823 	struct xfs_alloc_arg	*args,
3824 	xfs_fsblock_t		target)
3825 {
3826 	struct xfs_mount	*mp = args->mp;
3827 	xfs_agnumber_t		minimum_agno;
3828 	int			error;
3829 
3830 	ASSERT(args->pag != NULL);
3831 	ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target));
3832 
3833 	args->agno = XFS_FSB_TO_AGNO(mp, target);
3834 	args->agbno = XFS_FSB_TO_AGBNO(mp, target);
3835 
3836 	trace_xfs_alloc_vextent_exact_bno(args);
3837 
3838 	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
3839 	if (error) {
3840 		if (error == -ENOSPC)
3841 			return 0;
3842 		return error;
3843 	}
3844 
3845 	error = xfs_alloc_vextent_prepare_ag(args, 0);
3846 	if (!error && args->agbp)
3847 		error = xfs_alloc_ag_vextent_exact(args);
3848 
3849 	return xfs_alloc_vextent_finish(args, minimum_agno, error, false);
3850 }
3851 
3852 /*
3853  * Allocate an extent as close to the target as possible. If there are no
3854  * viable candidates in the AG, then fail the allocation.
3855  *
3856  * Caller may or may not have a per-ag reference in args->pag.
3857  */
3858 int
3859 xfs_alloc_vextent_near_bno(
3860 	struct xfs_alloc_arg	*args,
3861 	xfs_fsblock_t		target)
3862 {
3863 	struct xfs_mount	*mp = args->mp;
3864 	xfs_agnumber_t		minimum_agno;
3865 	bool			needs_perag = args->pag == NULL;
3866 	uint32_t		alloc_flags = 0;
3867 	int			error;
3868 
3869 	if (!needs_perag)
3870 		ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target));
3871 
3872 	args->agno = XFS_FSB_TO_AGNO(mp, target);
3873 	args->agbno = XFS_FSB_TO_AGBNO(mp, target);
3874 
3875 	trace_xfs_alloc_vextent_near_bno(args);
3876 
3877 	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
3878 	if (error) {
3879 		if (error == -ENOSPC)
3880 			return 0;
3881 		return error;
3882 	}
3883 
3884 	if (needs_perag)
3885 		args->pag = xfs_perag_grab(mp, args->agno);
3886 
3887 	error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
3888 	if (!error && args->agbp)
3889 		error = xfs_alloc_ag_vextent_near(args, alloc_flags);
3890 
3891 	return xfs_alloc_vextent_finish(args, minimum_agno, error, needs_perag);
3892 }
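
/*
 * Note (descriptive only): if args->pag is NULL on entry,
 * xfs_alloc_vextent_near_bno() grabs its own perag reference for the AG
 * containing @target and xfs_alloc_vextent_finish() drops it again; a
 * caller that already holds a reference retains ownership of it.
 */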
3893 
3894 /* Ensure that the freelist is at full capacity. */
3895 int
3896 xfs_free_extent_fix_freelist(
3897 	struct xfs_trans	*tp,
3898 	struct xfs_perag	*pag,
3899 	struct xfs_buf		**agbp)
3900 {
3901 	struct xfs_alloc_arg	args;
3902 	int			error;
3903 
3904 	memset(&args, 0, sizeof(struct xfs_alloc_arg));
3905 	args.tp = tp;
3906 	args.mp = tp->t_mountp;
3907 	args.agno = pag->pag_agno;
3908 	args.pag = pag;
3909 
3910 	/*
3911 	 * Validate that the block number is legal - this enables us to detect
3912 	 * and handle silent filesystem corruption rather than crashing.
3913 	 */
3914 	if (args.agno >= args.mp->m_sb.sb_agcount)
3915 		return -EFSCORRUPTED;
3916 
3917 	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
3918 	if (error)
3919 		return error;
3920 
3921 	*agbp = args.agbp;
3922 	return 0;
3923 }
3924 
3925 /*
3926  * Free an extent.
3927  * Just break up the extent address and hand off to xfs_free_ag_extent
3928  * after fixing up the freelist.
3929  */
3930 int
3931 __xfs_free_extent(
3932 	struct xfs_trans		*tp,
3933 	struct xfs_perag		*pag,
3934 	xfs_agblock_t			agbno,
3935 	xfs_extlen_t			len,
3936 	const struct xfs_owner_info	*oinfo,
3937 	enum xfs_ag_resv_type		type,
3938 	bool				skip_discard)
3939 {
3940 	struct xfs_mount		*mp = tp->t_mountp;
3941 	struct xfs_buf			*agbp;
3942 	struct xfs_agf			*agf;
3943 	int				error;
3944 	unsigned int			busy_flags = 0;
3945 
3946 	ASSERT(len != 0);
3947 	ASSERT(type != XFS_AG_RESV_AGFL);
3948 
3949 	if (XFS_TEST_ERROR(false, mp,
3950 			XFS_ERRTAG_FREE_EXTENT))
3951 		return -EIO;
3952 
3953 	error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
3954 	if (error) {
3955 		if (xfs_metadata_is_sick(error))
3956 			xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
3957 		return error;
3958 	}
3959 
3960 	agf = agbp->b_addr;
3961 
3962 	if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
3963 		xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
3964 		error = -EFSCORRUPTED;
3965 		goto err_release;
3966 	}
3967 
3968 	/* validate the extent size is legal now we have the agf locked */
3969 	if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
3970 		xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
3971 		error = -EFSCORRUPTED;
3972 		goto err_release;
3973 	}
3974 
3975 	error = xfs_free_ag_extent(tp, agbp, pag->pag_agno, agbno, len, oinfo,
3976 			type);
3977 	if (error)
3978 		goto err_release;
3979 
3980 	if (skip_discard)
3981 		busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
3982 	xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags);
3983 	return 0;
3984 
3985 err_release:
3986 	xfs_trans_brelse(tp, agbp);
3987 	return error;
3988 }
3989 
3990 struct xfs_alloc_query_range_info {
3991 	xfs_alloc_query_range_fn	fn;
3992 	void				*priv;
3993 };
3994 
3995 /* Format btree record and pass to our callback. */
3996 STATIC int
3997 xfs_alloc_query_range_helper(
3998 	struct xfs_btree_cur		*cur,
3999 	const union xfs_btree_rec	*rec,
4000 	void				*priv)
4001 {
4002 	struct xfs_alloc_query_range_info	*query = priv;
4003 	struct xfs_alloc_rec_incore		irec;
4004 	xfs_failaddr_t				fa;
4005 
4006 	xfs_alloc_btrec_to_irec(rec, &irec);
4007 	fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec);
4008 	if (fa)
4009 		return xfs_alloc_complain_bad_rec(cur, fa, &irec);
4010 
4011 	return query->fn(cur, &irec, query->priv);
4012 }
4013 
4014 /* Find all free space within a given range of blocks. */
4015 int
4016 xfs_alloc_query_range(
4017 	struct xfs_btree_cur			*cur,
4018 	const struct xfs_alloc_rec_incore	*low_rec,
4019 	const struct xfs_alloc_rec_incore	*high_rec,
4020 	xfs_alloc_query_range_fn		fn,
4021 	void					*priv)
4022 {
4023 	union xfs_btree_irec			low_brec = { .a = *low_rec };
4024 	union xfs_btree_irec			high_brec = { .a = *high_rec };
4025 	struct xfs_alloc_query_range_info	query = { .priv = priv, .fn = fn };
4026 
4027 	ASSERT(xfs_btree_is_bno(cur->bc_ops));
4028 	return xfs_btree_query_range(cur, &low_brec, &high_brec,
4029 			xfs_alloc_query_range_helper, &query);
4030 }
4031 
4032 /* Find all free space records. */
4033 int
4034 xfs_alloc_query_all(
4035 	struct xfs_btree_cur			*cur,
4036 	xfs_alloc_query_range_fn		fn,
4037 	void					*priv)
4038 {
4039 	struct xfs_alloc_query_range_info	query;
4040 
4041 	ASSERT(xfs_btree_is_bno(cur->bc_ops));
4042 	query.priv = priv;
4043 	query.fn = fn;
4044 	return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
4045 }
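
/*
 * Illustrative callback sketch (hypothetical function, not part of this
 * file): the walker passed to xfs_alloc_query_range() or
 * xfs_alloc_query_all() sees one incore free space record per invocation
 * and can stop the iteration early by returning a nonzero value:
 *
 *	STATIC int
 *	xfs_alloc_count_free_helper(
 *		struct xfs_btree_cur			*cur,
 *		const struct xfs_alloc_rec_incore	*rec,
 *		void					*priv)
 *	{
 *		xfs_extlen_t	*freeblks = priv;
 *
 *		*freeblks += rec->ar_blockcount;
 *		return 0;
 *	}
 */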
4046 
4047 /*
4048  * Scan part of the keyspace of the free space and tell us if the area has no
4049  * records, is fully mapped by records, or is partially filled.
4050  */
4051 int
4052 xfs_alloc_has_records(
4053 	struct xfs_btree_cur	*cur,
4054 	xfs_agblock_t		bno,
4055 	xfs_extlen_t		len,
4056 	enum xbtree_recpacking	*outcome)
4057 {
4058 	union xfs_btree_irec	low;
4059 	union xfs_btree_irec	high;
4060 
4061 	memset(&low, 0, sizeof(low));
4062 	low.a.ar_startblock = bno;
4063 	memset(&high, 0xFF, sizeof(high));
4064 	high.a.ar_startblock = bno + len - 1;
4065 
4066 	return xfs_btree_has_records(cur, &low, &high, NULL, outcome);
4067 }
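
/*
 * Illustrative only, assuming the usual xbtree_recpacking values: a caller
 * that wants to know whether a candidate region is entirely free space
 * might do:
 *
 *	enum xbtree_recpacking	outcome;
 *
 *	error = xfs_alloc_has_records(cur, bno, len, &outcome);
 *	if (!error && outcome != XBTREE_RECPACKING_FULL)
 *		// ... region is not fully covered by free space records ...
 */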
4068 
4069 /*
4070  * Walk all the blocks in the AGFL.  The @walk_fn can return any negative
4071  * error code or XFS_ITER_*.
4072  */
4073 int
4074 xfs_agfl_walk(
4075 	struct xfs_mount	*mp,
4076 	struct xfs_agf		*agf,
4077 	struct xfs_buf		*agflbp,
4078 	xfs_agfl_walk_fn	walk_fn,
4079 	void			*priv)
4080 {
4081 	__be32			*agfl_bno;
4082 	unsigned int		i;
4083 	int			error;
4084 
4085 	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
4086 	i = be32_to_cpu(agf->agf_flfirst);
4087 
4088 	/* Nothing to walk in an empty AGFL. */
4089 	if (agf->agf_flcount == cpu_to_be32(0))
4090 		return 0;
4091 
4092 	/* Otherwise, walk from first to last, wrapping as needed. */
4093 	for (;;) {
4094 		error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
4095 		if (error)
4096 			return error;
4097 		if (i == be32_to_cpu(agf->agf_fllast))
4098 			break;
4099 		if (++i == xfs_agfl_size(mp))
4100 			i = 0;
4101 	}
4102 
4103 	return 0;
4104 }
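
/*
 * Illustrative walk_fn sketch (hypothetical function, not part of this
 * file): the callback receives each AGFL block number in list order;
 * returning a negative errno or a nonzero XFS_ITER_* value terminates the
 * walk early:
 *
 *	STATIC int
 *	xfs_agfl_count_block(
 *		struct xfs_mount	*mp,
 *		xfs_agblock_t		agbno,
 *		void			*priv)
 *	{
 *		unsigned int	*count = priv;
 *
 *		(*count)++;
 *		return 0;
 *	}
 */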
4105 
4106 int __init
4107 xfs_extfree_intent_init_cache(void)
4108 {
4109 	xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent",
4110 			sizeof(struct xfs_extent_free_item),
4111 			0, 0, NULL);
4112 
4113 	return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM;
4114 }
4115 
4116 void
4117 xfs_extfree_intent_destroy_cache(void)
4118 {
4119 	kmem_cache_destroy(xfs_extfree_item_cache);
4120 	xfs_extfree_item_cache = NULL;
4121 }
4122