xref: /linux/fs/gfs2/rgrp.c (revision 44f57d78)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
4  * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
5  */
6 
7 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8 
9 #include <linux/slab.h>
10 #include <linux/spinlock.h>
11 #include <linux/completion.h>
12 #include <linux/buffer_head.h>
13 #include <linux/fs.h>
14 #include <linux/gfs2_ondisk.h>
15 #include <linux/prefetch.h>
16 #include <linux/blkdev.h>
17 #include <linux/rbtree.h>
18 #include <linux/random.h>
19 
20 #include "gfs2.h"
21 #include "incore.h"
22 #include "glock.h"
23 #include "glops.h"
24 #include "lops.h"
25 #include "meta_io.h"
26 #include "quota.h"
27 #include "rgrp.h"
28 #include "super.h"
29 #include "trans.h"
30 #include "util.h"
31 #include "log.h"
32 #include "inode.h"
33 #include "trace_gfs2.h"
34 #include "dir.h"
35 
36 #define BFITNOENT ((u32)~0)
37 #define NO_BLOCK ((u64)~0)
38 
39 #if BITS_PER_LONG == 32
40 #define LBITMASK   (0x55555555UL)
41 #define LBITSKIP55 (0x55555555UL)
42 #define LBITSKIP00 (0x00000000UL)
43 #else
44 #define LBITMASK   (0x5555555555555555UL)
45 #define LBITSKIP55 (0x5555555555555555UL)
46 #define LBITSKIP00 (0x0000000000000000UL)
47 #endif
48 
49 /*
50  * These routines are used by the resource group routines (rgrp.c)
51  * to keep track of block allocation.  Each block is represented by two
52  * bits.  So, each byte represents GFS2_NBBY (i.e. 4) blocks.
53  *
54  * 0 = Free
55  * 1 = Used (not metadata)
56  * 2 = Unlinked (still in use) inode
57  * 3 = Used (metadata)
58  */
59 
60 struct gfs2_extent {
61 	struct gfs2_rbm rbm;
62 	u32 len;
63 };
64 
65 static const char valid_change[16] = {
66 	        /* current */
67 	/* n */ 0, 1, 1, 1,
68 	/* e */ 1, 0, 0, 0,
69 	/* w */ 0, 0, 0, 1,
70 	        1, 0, 0, 0
71 };
72 
73 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
74 			 const struct gfs2_inode *ip, bool nowrap);
75 
76 
77 /**
78  * gfs2_setbit - Set a bit in the bitmaps
79  * @rbm: The position of the bit to set
80  * @do_clone: Also set the clone bitmap, if it exists
81  * @new_state: the new state of the block
82  *
83  */
84 
85 static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
86 			       unsigned char new_state)
87 {
88 	unsigned char *byte1, *byte2, *end, cur_state;
89 	struct gfs2_bitmap *bi = rbm_bi(rbm);
90 	unsigned int buflen = bi->bi_bytes;
91 	const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
92 
93 	byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
94 	end = bi->bi_bh->b_data + bi->bi_offset + buflen;
95 
96 	BUG_ON(byte1 >= end);
97 
98 	cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
99 
100 	if (unlikely(!valid_change[new_state * 4 + cur_state])) {
101 		struct gfs2_sbd *sdp = rbm->rgd->rd_sbd;
102 
103 		fs_warn(sdp, "buf_blk = 0x%x old_state=%d, new_state=%d\n",
104 			rbm->offset, cur_state, new_state);
105 		fs_warn(sdp, "rgrp=0x%llx bi_start=0x%x biblk: 0x%llx\n",
106 			(unsigned long long)rbm->rgd->rd_addr, bi->bi_start,
107 			(unsigned long long)bi->bi_bh->b_blocknr);
108 		fs_warn(sdp, "bi_offset=0x%x bi_bytes=0x%x block=0x%llx\n",
109 			bi->bi_offset, bi->bi_bytes,
110 			(unsigned long long)gfs2_rbm_to_block(rbm));
111 		dump_stack();
112 		gfs2_consist_rgrpd(rbm->rgd);
113 		return;
114 	}
115 	*byte1 ^= (cur_state ^ new_state) << bit;
116 
117 	if (do_clone && bi->bi_clone) {
118 		byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY);
119 		cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
120 		*byte2 ^= (cur_state ^ new_state) << bit;
121 	}
122 }
123 
124 /**
125  * gfs2_testbit - test a bit in the bitmaps
126  * @rbm: The bit to test
127  * @use_clone: If true, test the clone bitmap, not the official bitmap.
128  *
129  * Some callers like gfs2_unaligned_extlen need to test the clone bitmaps,
130  * not the "real" bitmaps, to avoid allocating recently freed blocks.
131  *
132  * Returns: The two bit block state of the requested bit
133  */
134 
135 static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone)
136 {
137 	struct gfs2_bitmap *bi = rbm_bi(rbm);
138 	const u8 *buffer;
139 	const u8 *byte;
140 	unsigned int bit;
141 
142 	if (use_clone && bi->bi_clone)
143 		buffer = bi->bi_clone;
144 	else
145 		buffer = bi->bi_bh->b_data;
146 	buffer += bi->bi_offset;
147 	byte = buffer + (rbm->offset / GFS2_NBBY);
148 	bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
149 
150 	return (*byte >> bit) & GFS2_BIT_MASK;
151 }
152 
153 /**
154  * gfs2_bit_search
155  * @ptr: Pointer to bitmap data
156  * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
157  * @state: The state we are searching for
158  *
159  * We xor the bitmap data with a patter which is the bitwise opposite
160  * of what we are looking for, this gives rise to a pattern of ones
161  * wherever there is a match. Since we have two bits per entry, we
162  * take this pattern, shift it down by one place and then and it with
163  * the original. All the even bit positions (0,2,4, etc) then represent
164  * successful matches, so we mask with 0x55555..... to remove the unwanted
165  * odd bit positions.
166  *
167  * This allows searching of a whole u64 at once (32 blocks) with a
168  * single test (on 64 bit arches).
169  */
170 
171 static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
172 {
173 	u64 tmp;
174 	static const u64 search[] = {
175 		[0] = 0xffffffffffffffffULL,
176 		[1] = 0xaaaaaaaaaaaaaaaaULL,
177 		[2] = 0x5555555555555555ULL,
178 		[3] = 0x0000000000000000ULL,
179 	};
180 	tmp = le64_to_cpu(*ptr) ^ search[state];
181 	tmp &= (tmp >> 1);
182 	tmp &= mask;
183 	return tmp;
184 }
185 
186 /**
187  * rs_cmp - multi-block reservation range compare
188  * @blk: absolute file system block number of the new reservation
189  * @len: number of blocks in the new reservation
190  * @rs: existing reservation to compare against
191  *
192  * returns: 1 if the block range is beyond the reach of the reservation
193  *         -1 if the block range is before the start of the reservation
194  *          0 if the block range overlaps with the reservation
195  */
196 static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
197 {
198 	u64 startblk = gfs2_rbm_to_block(&rs->rs_rbm);
199 
200 	if (blk >= startblk + rs->rs_free)
201 		return 1;
202 	if (blk + len - 1 < startblk)
203 		return -1;
204 	return 0;
205 }
206 
207 /**
208  * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
209  *       a block in a given allocation state.
210  * @buf: the buffer that holds the bitmaps
211  * @len: the length (in bytes) of the buffer
212  * @goal: start search at this block's bit-pair (within @buffer)
213  * @state: GFS2_BLKST_XXX the state of the block we're looking for.
214  *
215  * Scope of @goal and returned block number is only within this bitmap buffer,
216  * not entire rgrp or filesystem.  @buffer will be offset from the actual
217  * beginning of a bitmap block buffer, skipping any header structures, but
218  * headers are always a multiple of 64 bits long so that the buffer is
219  * always aligned to a 64 bit boundary.
220  *
221  * The size of the buffer is in bytes, but is it assumed that it is
222  * always ok to read a complete multiple of 64 bits at the end
223  * of the block in case the end is no aligned to a natural boundary.
224  *
225  * Return: the block number (bitmap buffer scope) that was found
226  */
227 
228 static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
229 		       u32 goal, u8 state)
230 {
231 	u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
232 	const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
233 	const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
234 	u64 tmp;
235 	u64 mask = 0x5555555555555555ULL;
236 	u32 bit;
237 
238 	/* Mask off bits we don't care about at the start of the search */
239 	mask <<= spoint;
240 	tmp = gfs2_bit_search(ptr, mask, state);
241 	ptr++;
242 	while(tmp == 0 && ptr < end) {
243 		tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
244 		ptr++;
245 	}
246 	/* Mask off any bits which are more than len bytes from the start */
247 	if (ptr == end && (len & (sizeof(u64) - 1)))
248 		tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
249 	/* Didn't find anything, so return */
250 	if (tmp == 0)
251 		return BFITNOENT;
252 	ptr--;
253 	bit = __ffs64(tmp);
254 	bit /= 2;	/* two bits per entry in the bitmap */
255 	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
256 }
257 
258 /**
259  * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
260  * @rbm: The rbm with rgd already set correctly
261  * @block: The block number (filesystem relative)
262  *
263  * This sets the bi and offset members of an rbm based on a
264  * resource group and a filesystem relative block number. The
265  * resource group must be set in the rbm on entry, the bi and
266  * offset members will be set by this function.
267  *
268  * Returns: 0 on success, or an error code
269  */
270 
271 static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
272 {
273 	if (!rgrp_contains_block(rbm->rgd, block))
274 		return -E2BIG;
275 	rbm->bii = 0;
276 	rbm->offset = block - rbm->rgd->rd_data0;
277 	/* Check if the block is within the first block */
278 	if (rbm->offset < rbm_bi(rbm)->bi_blocks)
279 		return 0;
280 
281 	/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
282 	rbm->offset += (sizeof(struct gfs2_rgrp) -
283 			sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
284 	rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
285 	rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
286 	return 0;
287 }
288 
289 /**
290  * gfs2_rbm_incr - increment an rbm structure
291  * @rbm: The rbm with rgd already set correctly
292  *
293  * This function takes an existing rbm structure and increments it to the next
294  * viable block offset.
295  *
296  * Returns: If incrementing the offset would cause the rbm to go past the
297  *          end of the rgrp, true is returned, otherwise false.
298  *
299  */
300 
301 static bool gfs2_rbm_incr(struct gfs2_rbm *rbm)
302 {
303 	if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */
304 		rbm->offset++;
305 		return false;
306 	}
307 	if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */
308 		return true;
309 
310 	rbm->offset = 0;
311 	rbm->bii++;
312 	return false;
313 }
314 
315 /**
316  * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
317  * @rbm: Position to search (value/result)
318  * @n_unaligned: Number of unaligned blocks to check
319  * @len: Decremented for each block found (terminate on zero)
320  *
321  * Returns: true if a non-free block is encountered
322  */
323 
324 static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
325 {
326 	u32 n;
327 	u8 res;
328 
329 	for (n = 0; n < n_unaligned; n++) {
330 		res = gfs2_testbit(rbm, true);
331 		if (res != GFS2_BLKST_FREE)
332 			return true;
333 		(*len)--;
334 		if (*len == 0)
335 			return true;
336 		if (gfs2_rbm_incr(rbm))
337 			return true;
338 	}
339 
340 	return false;
341 }
342 
343 /**
344  * gfs2_free_extlen - Return extent length of free blocks
345  * @rrbm: Starting position
346  * @len: Max length to check
347  *
348  * Starting at the block specified by the rbm, see how many free blocks
349  * there are, not reading more than len blocks ahead. This can be done
350  * using memchr_inv when the blocks are byte aligned, but has to be done
351  * on a block by block basis in case of unaligned blocks. Also this
352  * function can cope with bitmap boundaries (although it must stop on
353  * a resource group boundary)
354  *
355  * Returns: Number of free blocks in the extent
356  */
357 
358 static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
359 {
360 	struct gfs2_rbm rbm = *rrbm;
361 	u32 n_unaligned = rbm.offset & 3;
362 	u32 size = len;
363 	u32 bytes;
364 	u32 chunk_size;
365 	u8 *ptr, *start, *end;
366 	u64 block;
367 	struct gfs2_bitmap *bi;
368 
369 	if (n_unaligned &&
370 	    gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
371 		goto out;
372 
373 	n_unaligned = len & 3;
374 	/* Start is now byte aligned */
375 	while (len > 3) {
376 		bi = rbm_bi(&rbm);
377 		start = bi->bi_bh->b_data;
378 		if (bi->bi_clone)
379 			start = bi->bi_clone;
380 		start += bi->bi_offset;
381 		end = start + bi->bi_bytes;
382 		BUG_ON(rbm.offset & 3);
383 		start += (rbm.offset / GFS2_NBBY);
384 		bytes = min_t(u32, len / GFS2_NBBY, (end - start));
385 		ptr = memchr_inv(start, 0, bytes);
386 		chunk_size = ((ptr == NULL) ? bytes : (ptr - start));
387 		chunk_size *= GFS2_NBBY;
388 		BUG_ON(len < chunk_size);
389 		len -= chunk_size;
390 		block = gfs2_rbm_to_block(&rbm);
391 		if (gfs2_rbm_from_block(&rbm, block + chunk_size)) {
392 			n_unaligned = 0;
393 			break;
394 		}
395 		if (ptr) {
396 			n_unaligned = 3;
397 			break;
398 		}
399 		n_unaligned = len & 3;
400 	}
401 
402 	/* Deal with any bits left over at the end */
403 	if (n_unaligned)
404 		gfs2_unaligned_extlen(&rbm, n_unaligned, &len);
405 out:
406 	return size - len;
407 }
408 
409 /**
410  * gfs2_bitcount - count the number of bits in a certain state
411  * @rgd: the resource group descriptor
412  * @buffer: the buffer that holds the bitmaps
413  * @buflen: the length (in bytes) of the buffer
414  * @state: the state of the block we're looking for
415  *
416  * Returns: The number of bits
417  */
418 
419 static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
420 			 unsigned int buflen, u8 state)
421 {
422 	const u8 *byte = buffer;
423 	const u8 *end = buffer + buflen;
424 	const u8 state1 = state << 2;
425 	const u8 state2 = state << 4;
426 	const u8 state3 = state << 6;
427 	u32 count = 0;
428 
429 	for (; byte < end; byte++) {
430 		if (((*byte) & 0x03) == state)
431 			count++;
432 		if (((*byte) & 0x0C) == state1)
433 			count++;
434 		if (((*byte) & 0x30) == state2)
435 			count++;
436 		if (((*byte) & 0xC0) == state3)
437 			count++;
438 	}
439 
440 	return count;
441 }
442 
443 /**
444  * gfs2_rgrp_verify - Verify that a resource group is consistent
445  * @rgd: the rgrp
446  *
447  */
448 
449 void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
450 {
451 	struct gfs2_sbd *sdp = rgd->rd_sbd;
452 	struct gfs2_bitmap *bi = NULL;
453 	u32 length = rgd->rd_length;
454 	u32 count[4], tmp;
455 	int buf, x;
456 
457 	memset(count, 0, 4 * sizeof(u32));
458 
459 	/* Count # blocks in each of 4 possible allocation states */
460 	for (buf = 0; buf < length; buf++) {
461 		bi = rgd->rd_bits + buf;
462 		for (x = 0; x < 4; x++)
463 			count[x] += gfs2_bitcount(rgd,
464 						  bi->bi_bh->b_data +
465 						  bi->bi_offset,
466 						  bi->bi_bytes, x);
467 	}
468 
469 	if (count[0] != rgd->rd_free) {
470 		if (gfs2_consist_rgrpd(rgd))
471 			fs_err(sdp, "free data mismatch:  %u != %u\n",
472 			       count[0], rgd->rd_free);
473 		return;
474 	}
475 
476 	tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
477 	if (count[1] != tmp) {
478 		if (gfs2_consist_rgrpd(rgd))
479 			fs_err(sdp, "used data mismatch:  %u != %u\n",
480 			       count[1], tmp);
481 		return;
482 	}
483 
484 	if (count[2] + count[3] != rgd->rd_dinodes) {
485 		if (gfs2_consist_rgrpd(rgd))
486 			fs_err(sdp, "used metadata mismatch:  %u != %u\n",
487 			       count[2] + count[3], rgd->rd_dinodes);
488 		return;
489 	}
490 }
491 
492 /**
493  * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
494  * @sdp: The GFS2 superblock
495  * @blk: The data block number
496  * @exact: True if this needs to be an exact match
497  *
498  * The @exact argument should be set to true by most callers. The exception
499  * is when we need to match blocks which are not represented by the rgrp
500  * bitmap, but which are part of the rgrp (i.e. padding blocks) which are
501  * there for alignment purposes. Another way of looking at it is that @exact
502  * matches only valid data/metadata blocks, but with @exact false, it will
503  * match any block within the extent of the rgrp.
504  *
505  * Returns: The resource group, or NULL if not found
506  */
507 
508 struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact)
509 {
510 	struct rb_node *n, *next;
511 	struct gfs2_rgrpd *cur;
512 
513 	spin_lock(&sdp->sd_rindex_spin);
514 	n = sdp->sd_rindex_tree.rb_node;
515 	while (n) {
516 		cur = rb_entry(n, struct gfs2_rgrpd, rd_node);
517 		next = NULL;
518 		if (blk < cur->rd_addr)
519 			next = n->rb_left;
520 		else if (blk >= cur->rd_data0 + cur->rd_data)
521 			next = n->rb_right;
522 		if (next == NULL) {
523 			spin_unlock(&sdp->sd_rindex_spin);
524 			if (exact) {
525 				if (blk < cur->rd_addr)
526 					return NULL;
527 				if (blk >= cur->rd_data0 + cur->rd_data)
528 					return NULL;
529 			}
530 			return cur;
531 		}
532 		n = next;
533 	}
534 	spin_unlock(&sdp->sd_rindex_spin);
535 
536 	return NULL;
537 }
538 
539 /**
540  * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
541  * @sdp: The GFS2 superblock
542  *
543  * Returns: The first rgrp in the filesystem
544  */
545 
546 struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
547 {
548 	const struct rb_node *n;
549 	struct gfs2_rgrpd *rgd;
550 
551 	spin_lock(&sdp->sd_rindex_spin);
552 	n = rb_first(&sdp->sd_rindex_tree);
553 	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
554 	spin_unlock(&sdp->sd_rindex_spin);
555 
556 	return rgd;
557 }
558 
559 /**
560  * gfs2_rgrpd_get_next - get the next RG
561  * @rgd: the resource group descriptor
562  *
563  * Returns: The next rgrp
564  */
565 
566 struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
567 {
568 	struct gfs2_sbd *sdp = rgd->rd_sbd;
569 	const struct rb_node *n;
570 
571 	spin_lock(&sdp->sd_rindex_spin);
572 	n = rb_next(&rgd->rd_node);
573 	if (n == NULL)
574 		n = rb_first(&sdp->sd_rindex_tree);
575 
576 	if (unlikely(&rgd->rd_node == n)) {
577 		spin_unlock(&sdp->sd_rindex_spin);
578 		return NULL;
579 	}
580 	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
581 	spin_unlock(&sdp->sd_rindex_spin);
582 	return rgd;
583 }
584 
585 void check_and_update_goal(struct gfs2_inode *ip)
586 {
587 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
588 	if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
589 		ip->i_goal = ip->i_no_addr;
590 }
591 
592 void gfs2_free_clones(struct gfs2_rgrpd *rgd)
593 {
594 	int x;
595 
596 	for (x = 0; x < rgd->rd_length; x++) {
597 		struct gfs2_bitmap *bi = rgd->rd_bits + x;
598 		kfree(bi->bi_clone);
599 		bi->bi_clone = NULL;
600 	}
601 }
602 
603 /**
604  * gfs2_rsqa_alloc - make sure we have a reservation assigned to the inode
605  *                 plus a quota allocations data structure, if necessary
606  * @ip: the inode for this reservation
607  */
608 int gfs2_rsqa_alloc(struct gfs2_inode *ip)
609 {
610 	return gfs2_qa_alloc(ip);
611 }
612 
613 static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs)
614 {
615 	struct gfs2_inode *ip = container_of(rs, struct gfs2_inode, i_res);
616 
617 	gfs2_print_dbg(seq, "  B: n:%llu s:%llu b:%u f:%u\n",
618 		       (unsigned long long)ip->i_no_addr,
619 		       (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm),
620 		       rs->rs_rbm.offset, rs->rs_free);
621 }
622 
623 /**
624  * __rs_deltree - remove a multi-block reservation from the rgd tree
625  * @rs: The reservation to remove
626  *
627  */
628 static void __rs_deltree(struct gfs2_blkreserv *rs)
629 {
630 	struct gfs2_rgrpd *rgd;
631 
632 	if (!gfs2_rs_active(rs))
633 		return;
634 
635 	rgd = rs->rs_rbm.rgd;
636 	trace_gfs2_rs(rs, TRACE_RS_TREEDEL);
637 	rb_erase(&rs->rs_node, &rgd->rd_rstree);
638 	RB_CLEAR_NODE(&rs->rs_node);
639 
640 	if (rs->rs_free) {
641 		u64 last_block = gfs2_rbm_to_block(&rs->rs_rbm) +
642 				 rs->rs_free - 1;
643 		struct gfs2_rbm last_rbm = { .rgd = rs->rs_rbm.rgd, };
644 		struct gfs2_bitmap *start, *last;
645 
646 		/* return reserved blocks to the rgrp */
647 		BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
648 		rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
649 		/* The rgrp extent failure point is likely not to increase;
650 		   it will only do so if the freed blocks are somehow
651 		   contiguous with a span of free blocks that follows. Still,
652 		   it will force the number to be recalculated later. */
653 		rgd->rd_extfail_pt += rs->rs_free;
654 		rs->rs_free = 0;
655 		if (gfs2_rbm_from_block(&last_rbm, last_block))
656 			return;
657 		start = rbm_bi(&rs->rs_rbm);
658 		last = rbm_bi(&last_rbm);
659 		do
660 			clear_bit(GBF_FULL, &start->bi_flags);
661 		while (start++ != last);
662 	}
663 }
664 
665 /**
666  * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
667  * @rs: The reservation to remove
668  *
669  */
670 void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
671 {
672 	struct gfs2_rgrpd *rgd;
673 
674 	rgd = rs->rs_rbm.rgd;
675 	if (rgd) {
676 		spin_lock(&rgd->rd_rsspin);
677 		__rs_deltree(rs);
678 		BUG_ON(rs->rs_free);
679 		spin_unlock(&rgd->rd_rsspin);
680 	}
681 }
682 
683 /**
684  * gfs2_rsqa_delete - delete a multi-block reservation and quota allocation
685  * @ip: The inode for this reservation
686  * @wcount: The inode's write count, or NULL
687  *
688  */
689 void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount)
690 {
691 	down_write(&ip->i_rw_mutex);
692 	if ((wcount == NULL) || (atomic_read(wcount) <= 1))
693 		gfs2_rs_deltree(&ip->i_res);
694 	up_write(&ip->i_rw_mutex);
695 	gfs2_qa_delete(ip, wcount);
696 }
697 
698 /**
699  * return_all_reservations - return all reserved blocks back to the rgrp.
700  * @rgd: the rgrp that needs its space back
701  *
702  * We previously reserved a bunch of blocks for allocation. Now we need to
703  * give them back. This leave the reservation structures in tact, but removes
704  * all of their corresponding "no-fly zones".
705  */
706 static void return_all_reservations(struct gfs2_rgrpd *rgd)
707 {
708 	struct rb_node *n;
709 	struct gfs2_blkreserv *rs;
710 
711 	spin_lock(&rgd->rd_rsspin);
712 	while ((n = rb_first(&rgd->rd_rstree))) {
713 		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
714 		__rs_deltree(rs);
715 	}
716 	spin_unlock(&rgd->rd_rsspin);
717 }
718 
719 void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
720 {
721 	struct rb_node *n;
722 	struct gfs2_rgrpd *rgd;
723 	struct gfs2_glock *gl;
724 
725 	while ((n = rb_first(&sdp->sd_rindex_tree))) {
726 		rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
727 		gl = rgd->rd_gl;
728 
729 		rb_erase(n, &sdp->sd_rindex_tree);
730 
731 		if (gl) {
732 			glock_clear_object(gl, rgd);
733 			gfs2_rgrp_brelse(rgd);
734 			gfs2_glock_put(gl);
735 		}
736 
737 		gfs2_free_clones(rgd);
738 		kfree(rgd->rd_bits);
739 		rgd->rd_bits = NULL;
740 		return_all_reservations(rgd);
741 		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
742 	}
743 }
744 
745 static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd)
746 {
747 	struct gfs2_sbd *sdp = rgd->rd_sbd;
748 
749 	fs_info(sdp, "ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
750 	fs_info(sdp, "ri_length = %u\n", rgd->rd_length);
751 	fs_info(sdp, "ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0);
752 	fs_info(sdp, "ri_data = %u\n", rgd->rd_data);
753 	fs_info(sdp, "ri_bitbytes = %u\n", rgd->rd_bitbytes);
754 }
755 
756 /**
757  * gfs2_compute_bitstructs - Compute the bitmap sizes
758  * @rgd: The resource group descriptor
759  *
760  * Calculates bitmap descriptors, one for each block that contains bitmap data
761  *
762  * Returns: errno
763  */
764 
765 static int compute_bitstructs(struct gfs2_rgrpd *rgd)
766 {
767 	struct gfs2_sbd *sdp = rgd->rd_sbd;
768 	struct gfs2_bitmap *bi;
769 	u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */
770 	u32 bytes_left, bytes;
771 	int x;
772 
773 	if (!length)
774 		return -EINVAL;
775 
776 	rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
777 	if (!rgd->rd_bits)
778 		return -ENOMEM;
779 
780 	bytes_left = rgd->rd_bitbytes;
781 
782 	for (x = 0; x < length; x++) {
783 		bi = rgd->rd_bits + x;
784 
785 		bi->bi_flags = 0;
786 		/* small rgrp; bitmap stored completely in header block */
787 		if (length == 1) {
788 			bytes = bytes_left;
789 			bi->bi_offset = sizeof(struct gfs2_rgrp);
790 			bi->bi_start = 0;
791 			bi->bi_bytes = bytes;
792 			bi->bi_blocks = bytes * GFS2_NBBY;
793 		/* header block */
794 		} else if (x == 0) {
795 			bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
796 			bi->bi_offset = sizeof(struct gfs2_rgrp);
797 			bi->bi_start = 0;
798 			bi->bi_bytes = bytes;
799 			bi->bi_blocks = bytes * GFS2_NBBY;
800 		/* last block */
801 		} else if (x + 1 == length) {
802 			bytes = bytes_left;
803 			bi->bi_offset = sizeof(struct gfs2_meta_header);
804 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
805 			bi->bi_bytes = bytes;
806 			bi->bi_blocks = bytes * GFS2_NBBY;
807 		/* other blocks */
808 		} else {
809 			bytes = sdp->sd_sb.sb_bsize -
810 				sizeof(struct gfs2_meta_header);
811 			bi->bi_offset = sizeof(struct gfs2_meta_header);
812 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
813 			bi->bi_bytes = bytes;
814 			bi->bi_blocks = bytes * GFS2_NBBY;
815 		}
816 
817 		bytes_left -= bytes;
818 	}
819 
820 	if (bytes_left) {
821 		gfs2_consist_rgrpd(rgd);
822 		return -EIO;
823 	}
824 	bi = rgd->rd_bits + (length - 1);
825 	if ((bi->bi_start + bi->bi_bytes) * GFS2_NBBY != rgd->rd_data) {
826 		if (gfs2_consist_rgrpd(rgd)) {
827 			gfs2_rindex_print(rgd);
828 			fs_err(sdp, "start=%u len=%u offset=%u\n",
829 			       bi->bi_start, bi->bi_bytes, bi->bi_offset);
830 		}
831 		return -EIO;
832 	}
833 
834 	return 0;
835 }
836 
837 /**
838  * gfs2_ri_total - Total up the file system space, according to the rindex.
839  * @sdp: the filesystem
840  *
841  */
842 u64 gfs2_ri_total(struct gfs2_sbd *sdp)
843 {
844 	u64 total_data = 0;
845 	struct inode *inode = sdp->sd_rindex;
846 	struct gfs2_inode *ip = GFS2_I(inode);
847 	char buf[sizeof(struct gfs2_rindex)];
848 	int error, rgrps;
849 
850 	for (rgrps = 0;; rgrps++) {
851 		loff_t pos = rgrps * sizeof(struct gfs2_rindex);
852 
853 		if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
854 			break;
855 		error = gfs2_internal_read(ip, buf, &pos,
856 					   sizeof(struct gfs2_rindex));
857 		if (error != sizeof(struct gfs2_rindex))
858 			break;
859 		total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data);
860 	}
861 	return total_data;
862 }
863 
864 static int rgd_insert(struct gfs2_rgrpd *rgd)
865 {
866 	struct gfs2_sbd *sdp = rgd->rd_sbd;
867 	struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;
868 
869 	/* Figure out where to put new node */
870 	while (*newn) {
871 		struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd,
872 						  rd_node);
873 
874 		parent = *newn;
875 		if (rgd->rd_addr < cur->rd_addr)
876 			newn = &((*newn)->rb_left);
877 		else if (rgd->rd_addr > cur->rd_addr)
878 			newn = &((*newn)->rb_right);
879 		else
880 			return -EEXIST;
881 	}
882 
883 	rb_link_node(&rgd->rd_node, parent, newn);
884 	rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
885 	sdp->sd_rgrps++;
886 	return 0;
887 }
888 
889 /**
890  * read_rindex_entry - Pull in a new resource index entry from the disk
891  * @ip: Pointer to the rindex inode
892  *
893  * Returns: 0 on success, > 0 on EOF, error code otherwise
894  */
895 
896 static int read_rindex_entry(struct gfs2_inode *ip)
897 {
898 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
899 	const unsigned bsize = sdp->sd_sb.sb_bsize;
900 	loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
901 	struct gfs2_rindex buf;
902 	int error;
903 	struct gfs2_rgrpd *rgd;
904 
905 	if (pos >= i_size_read(&ip->i_inode))
906 		return 1;
907 
908 	error = gfs2_internal_read(ip, (char *)&buf, &pos,
909 				   sizeof(struct gfs2_rindex));
910 
911 	if (error != sizeof(struct gfs2_rindex))
912 		return (error == 0) ? 1 : error;
913 
914 	rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
915 	error = -ENOMEM;
916 	if (!rgd)
917 		return error;
918 
919 	rgd->rd_sbd = sdp;
920 	rgd->rd_addr = be64_to_cpu(buf.ri_addr);
921 	rgd->rd_length = be32_to_cpu(buf.ri_length);
922 	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
923 	rgd->rd_data = be32_to_cpu(buf.ri_data);
924 	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
925 	spin_lock_init(&rgd->rd_rsspin);
926 
927 	error = compute_bitstructs(rgd);
928 	if (error)
929 		goto fail;
930 
931 	error = gfs2_glock_get(sdp, rgd->rd_addr,
932 			       &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
933 	if (error)
934 		goto fail;
935 
936 	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
937 	rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
938 	if (rgd->rd_data > sdp->sd_max_rg_data)
939 		sdp->sd_max_rg_data = rgd->rd_data;
940 	spin_lock(&sdp->sd_rindex_spin);
941 	error = rgd_insert(rgd);
942 	spin_unlock(&sdp->sd_rindex_spin);
943 	if (!error) {
944 		glock_set_object(rgd->rd_gl, rgd);
945 		rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK;
946 		rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr +
947 						    rgd->rd_length) * bsize) - 1;
948 		return 0;
949 	}
950 
951 	error = 0; /* someone else read in the rgrp; free it and ignore it */
952 	gfs2_glock_put(rgd->rd_gl);
953 
954 fail:
955 	kfree(rgd->rd_bits);
956 	rgd->rd_bits = NULL;
957 	kmem_cache_free(gfs2_rgrpd_cachep, rgd);
958 	return error;
959 }
960 
961 /**
962  * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use
963  * @sdp: the GFS2 superblock
964  *
965  * The purpose of this function is to select a subset of the resource groups
966  * and mark them as PREFERRED. We do it in such a way that each node prefers
967  * to use a unique set of rgrps to minimize glock contention.
968  */
969 static void set_rgrp_preferences(struct gfs2_sbd *sdp)
970 {
971 	struct gfs2_rgrpd *rgd, *first;
972 	int i;
973 
974 	/* Skip an initial number of rgrps, based on this node's journal ID.
975 	   That should start each node out on its own set. */
976 	rgd = gfs2_rgrpd_get_first(sdp);
977 	for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++)
978 		rgd = gfs2_rgrpd_get_next(rgd);
979 	first = rgd;
980 
981 	do {
982 		rgd->rd_flags |= GFS2_RDF_PREFERRED;
983 		for (i = 0; i < sdp->sd_journals; i++) {
984 			rgd = gfs2_rgrpd_get_next(rgd);
985 			if (!rgd || rgd == first)
986 				break;
987 		}
988 	} while (rgd && rgd != first);
989 }
990 
991 /**
992  * gfs2_ri_update - Pull in a new resource index from the disk
993  * @ip: pointer to the rindex inode
994  *
995  * Returns: 0 on successful update, error code otherwise
996  */
997 
998 static int gfs2_ri_update(struct gfs2_inode *ip)
999 {
1000 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1001 	int error;
1002 
1003 	do {
1004 		error = read_rindex_entry(ip);
1005 	} while (error == 0);
1006 
1007 	if (error < 0)
1008 		return error;
1009 
1010 	set_rgrp_preferences(sdp);
1011 
1012 	sdp->sd_rindex_uptodate = 1;
1013 	return 0;
1014 }
1015 
1016 /**
1017  * gfs2_rindex_update - Update the rindex if required
1018  * @sdp: The GFS2 superblock
1019  *
1020  * We grab a lock on the rindex inode to make sure that it doesn't
1021  * change whilst we are performing an operation. We keep this lock
1022  * for quite long periods of time compared to other locks. This
1023  * doesn't matter, since it is shared and it is very, very rarely
1024  * accessed in the exclusive mode (i.e. only when expanding the filesystem).
1025  *
1026  * This makes sure that we're using the latest copy of the resource index
1027  * special file, which might have been updated if someone expanded the
1028  * filesystem (via gfs2_grow utility), which adds new resource groups.
1029  *
1030  * Returns: 0 on succeess, error code otherwise
1031  */
1032 
1033 int gfs2_rindex_update(struct gfs2_sbd *sdp)
1034 {
1035 	struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
1036 	struct gfs2_glock *gl = ip->i_gl;
1037 	struct gfs2_holder ri_gh;
1038 	int error = 0;
1039 	int unlock_required = 0;
1040 
1041 	/* Read new copy from disk if we don't have the latest */
1042 	if (!sdp->sd_rindex_uptodate) {
1043 		if (!gfs2_glock_is_locked_by_me(gl)) {
1044 			error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
1045 			if (error)
1046 				return error;
1047 			unlock_required = 1;
1048 		}
1049 		if (!sdp->sd_rindex_uptodate)
1050 			error = gfs2_ri_update(ip);
1051 		if (unlock_required)
1052 			gfs2_glock_dq_uninit(&ri_gh);
1053 	}
1054 
1055 	return error;
1056 }
1057 
1058 static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
1059 {
1060 	const struct gfs2_rgrp *str = buf;
1061 	u32 rg_flags;
1062 
1063 	rg_flags = be32_to_cpu(str->rg_flags);
1064 	rg_flags &= ~GFS2_RDF_MASK;
1065 	rgd->rd_flags &= GFS2_RDF_MASK;
1066 	rgd->rd_flags |= rg_flags;
1067 	rgd->rd_free = be32_to_cpu(str->rg_free);
1068 	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
1069 	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
1070 	/* rd_data0, rd_data and rd_bitbytes already set from rindex */
1071 }
1072 
1073 static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf)
1074 {
1075 	const struct gfs2_rgrp *str = buf;
1076 
1077 	rgl->rl_magic = cpu_to_be32(GFS2_MAGIC);
1078 	rgl->rl_flags = str->rg_flags;
1079 	rgl->rl_free = str->rg_free;
1080 	rgl->rl_dinodes = str->rg_dinodes;
1081 	rgl->rl_igeneration = str->rg_igeneration;
1082 	rgl->__pad = 0UL;
1083 }
1084 
1085 static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
1086 {
1087 	struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd);
1088 	struct gfs2_rgrp *str = buf;
1089 	u32 crc;
1090 
1091 	str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
1092 	str->rg_free = cpu_to_be32(rgd->rd_free);
1093 	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
1094 	if (next == NULL)
1095 		str->rg_skip = 0;
1096 	else if (next->rd_addr > rgd->rd_addr)
1097 		str->rg_skip = cpu_to_be32(next->rd_addr - rgd->rd_addr);
1098 	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
1099 	str->rg_data0 = cpu_to_be64(rgd->rd_data0);
1100 	str->rg_data = cpu_to_be32(rgd->rd_data);
1101 	str->rg_bitbytes = cpu_to_be32(rgd->rd_bitbytes);
1102 	str->rg_crc = 0;
1103 	crc = gfs2_disk_hash(buf, sizeof(struct gfs2_rgrp));
1104 	str->rg_crc = cpu_to_be32(crc);
1105 
1106 	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
1107 	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, buf);
1108 }
1109 
1110 static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
1111 {
1112 	struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
1113 	struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data;
1114 	int valid = 1;
1115 
1116 	if (rgl->rl_flags != str->rg_flags) {
1117 		printk(KERN_WARNING "GFS2: rgd: %llu lvb flag mismatch %u/%u",
1118 		       (unsigned long long)rgd->rd_addr,
1119 		       be32_to_cpu(rgl->rl_flags), be32_to_cpu(str->rg_flags));
1120 		valid = 0;
1121 	}
1122 	if (rgl->rl_free != str->rg_free) {
1123 		printk(KERN_WARNING "GFS2: rgd: %llu lvb free mismatch %u/%u",
1124 		       (unsigned long long)rgd->rd_addr,
1125 		       be32_to_cpu(rgl->rl_free), be32_to_cpu(str->rg_free));
1126 		valid = 0;
1127 	}
1128 	if (rgl->rl_dinodes != str->rg_dinodes) {
1129 		printk(KERN_WARNING "GFS2: rgd: %llu lvb dinode mismatch %u/%u",
1130 		       (unsigned long long)rgd->rd_addr,
1131 		       be32_to_cpu(rgl->rl_dinodes),
1132 		       be32_to_cpu(str->rg_dinodes));
1133 		valid = 0;
1134 	}
1135 	if (rgl->rl_igeneration != str->rg_igeneration) {
1136 		printk(KERN_WARNING "GFS2: rgd: %llu lvb igen mismatch "
1137 		       "%llu/%llu", (unsigned long long)rgd->rd_addr,
1138 		       (unsigned long long)be64_to_cpu(rgl->rl_igeneration),
1139 		       (unsigned long long)be64_to_cpu(str->rg_igeneration));
1140 		valid = 0;
1141 	}
1142 	return valid;
1143 }
1144 
1145 static u32 count_unlinked(struct gfs2_rgrpd *rgd)
1146 {
1147 	struct gfs2_bitmap *bi;
1148 	const u32 length = rgd->rd_length;
1149 	const u8 *buffer = NULL;
1150 	u32 i, goal, count = 0;
1151 
1152 	for (i = 0, bi = rgd->rd_bits; i < length; i++, bi++) {
1153 		goal = 0;
1154 		buffer = bi->bi_bh->b_data + bi->bi_offset;
1155 		WARN_ON(!buffer_uptodate(bi->bi_bh));
1156 		while (goal < bi->bi_blocks) {
1157 			goal = gfs2_bitfit(buffer, bi->bi_bytes, goal,
1158 					   GFS2_BLKST_UNLINKED);
1159 			if (goal == BFITNOENT)
1160 				break;
1161 			count++;
1162 			goal++;
1163 		}
1164 	}
1165 
1166 	return count;
1167 }
1168 
1169 
1170 /**
1171  * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
1172  * @rgd: the struct gfs2_rgrpd describing the RG to read in
1173  *
1174  * Read in all of a Resource Group's header and bitmap blocks.
1175  * Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps.
1176  *
1177  * Returns: errno
1178  */
1179 
1180 static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
1181 {
1182 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1183 	struct gfs2_glock *gl = rgd->rd_gl;
1184 	unsigned int length = rgd->rd_length;
1185 	struct gfs2_bitmap *bi;
1186 	unsigned int x, y;
1187 	int error;
1188 
1189 	if (rgd->rd_bits[0].bi_bh != NULL)
1190 		return 0;
1191 
1192 	for (x = 0; x < length; x++) {
1193 		bi = rgd->rd_bits + x;
1194 		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
1195 		if (error)
1196 			goto fail;
1197 	}
1198 
1199 	for (y = length; y--;) {
1200 		bi = rgd->rd_bits + y;
1201 		error = gfs2_meta_wait(sdp, bi->bi_bh);
1202 		if (error)
1203 			goto fail;
1204 		if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
1205 					      GFS2_METATYPE_RG)) {
1206 			error = -EIO;
1207 			goto fail;
1208 		}
1209 	}
1210 
1211 	if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
1212 		for (x = 0; x < length; x++)
1213 			clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
1214 		gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
1215 		rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
1216 		rgd->rd_free_clone = rgd->rd_free;
1217 		/* max out the rgrp allocation failure point */
1218 		rgd->rd_extfail_pt = rgd->rd_free;
1219 	}
1220 	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
1221 		rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
1222 		gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
1223 				     rgd->rd_bits[0].bi_bh->b_data);
1224 	}
1225 	else if (sdp->sd_args.ar_rgrplvb) {
1226 		if (!gfs2_rgrp_lvb_valid(rgd)){
1227 			gfs2_consist_rgrpd(rgd);
1228 			error = -EIO;
1229 			goto fail;
1230 		}
1231 		if (rgd->rd_rgl->rl_unlinked == 0)
1232 			rgd->rd_flags &= ~GFS2_RDF_CHECK;
1233 	}
1234 	return 0;
1235 
1236 fail:
1237 	while (x--) {
1238 		bi = rgd->rd_bits + x;
1239 		brelse(bi->bi_bh);
1240 		bi->bi_bh = NULL;
1241 		gfs2_assert_warn(sdp, !bi->bi_clone);
1242 	}
1243 
1244 	return error;
1245 }
1246 
1247 static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
1248 {
1249 	u32 rl_flags;
1250 
1251 	if (rgd->rd_flags & GFS2_RDF_UPTODATE)
1252 		return 0;
1253 
1254 	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
1255 		return gfs2_rgrp_bh_get(rgd);
1256 
1257 	rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
1258 	rl_flags &= ~GFS2_RDF_MASK;
1259 	rgd->rd_flags &= GFS2_RDF_MASK;
1260 	rgd->rd_flags |= (rl_flags | GFS2_RDF_CHECK);
1261 	if (rgd->rd_rgl->rl_unlinked == 0)
1262 		rgd->rd_flags &= ~GFS2_RDF_CHECK;
1263 	rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
1264 	rgd->rd_free_clone = rgd->rd_free;
1265 	rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes);
1266 	rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration);
1267 	return 0;
1268 }
1269 
1270 int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
1271 {
1272 	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
1273 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1274 
1275 	if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
1276 		return 0;
1277 	return gfs2_rgrp_bh_get(rgd);
1278 }
1279 
1280 /**
1281  * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get()
1282  * @rgd: The resource group
1283  *
1284  */
1285 
1286 void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
1287 {
1288 	int x, length = rgd->rd_length;
1289 
1290 	for (x = 0; x < length; x++) {
1291 		struct gfs2_bitmap *bi = rgd->rd_bits + x;
1292 		if (bi->bi_bh) {
1293 			brelse(bi->bi_bh);
1294 			bi->bi_bh = NULL;
1295 		}
1296 	}
1297 
1298 }
1299 
1300 /**
1301  * gfs2_rgrp_go_unlock - Unlock a rgrp glock
1302  * @gh: The glock holder for the resource group
1303  *
1304  */
1305 
1306 void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
1307 {
1308 	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
1309 	int demote_requested = test_bit(GLF_DEMOTE, &gh->gh_gl->gl_flags) |
1310 		test_bit(GLF_PENDING_DEMOTE, &gh->gh_gl->gl_flags);
1311 
1312 	if (rgd && demote_requested)
1313 		gfs2_rgrp_brelse(rgd);
1314 }
1315 
1316 int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
1317 			     struct buffer_head *bh,
1318 			     const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
1319 {
1320 	struct super_block *sb = sdp->sd_vfs;
1321 	u64 blk;
1322 	sector_t start = 0;
1323 	sector_t nr_blks = 0;
1324 	int rv;
1325 	unsigned int x;
1326 	u32 trimmed = 0;
1327 	u8 diff;
1328 
1329 	for (x = 0; x < bi->bi_bytes; x++) {
1330 		const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data;
1331 		clone += bi->bi_offset;
1332 		clone += x;
1333 		if (bh) {
1334 			const u8 *orig = bh->b_data + bi->bi_offset + x;
1335 			diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
1336 		} else {
1337 			diff = ~(*clone | (*clone >> 1));
1338 		}
1339 		diff &= 0x55;
1340 		if (diff == 0)
1341 			continue;
1342 		blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
1343 		while(diff) {
1344 			if (diff & 1) {
1345 				if (nr_blks == 0)
1346 					goto start_new_extent;
1347 				if ((start + nr_blks) != blk) {
1348 					if (nr_blks >= minlen) {
1349 						rv = sb_issue_discard(sb,
1350 							start, nr_blks,
1351 							GFP_NOFS, 0);
1352 						if (rv)
1353 							goto fail;
1354 						trimmed += nr_blks;
1355 					}
1356 					nr_blks = 0;
1357 start_new_extent:
1358 					start = blk;
1359 				}
1360 				nr_blks++;
1361 			}
1362 			diff >>= 2;
1363 			blk++;
1364 		}
1365 	}
1366 	if (nr_blks >= minlen) {
1367 		rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0);
1368 		if (rv)
1369 			goto fail;
1370 		trimmed += nr_blks;
1371 	}
1372 	if (ptrimmed)
1373 		*ptrimmed = trimmed;
1374 	return 0;
1375 
1376 fail:
1377 	if (sdp->sd_args.ar_discard)
1378 		fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem\n", rv);
1379 	sdp->sd_args.ar_discard = 0;
1380 	return -EIO;
1381 }
1382 
1383 /**
1384  * gfs2_fitrim - Generate discard requests for unused bits of the filesystem
1385  * @filp: Any file on the filesystem
1386  * @argp: Pointer to the arguments (also used to pass result)
1387  *
1388  * Returns: 0 on success, otherwise error code
1389  */
1390 
1391 int gfs2_fitrim(struct file *filp, void __user *argp)
1392 {
1393 	struct inode *inode = file_inode(filp);
1394 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1395 	struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
1396 	struct buffer_head *bh;
1397 	struct gfs2_rgrpd *rgd;
1398 	struct gfs2_rgrpd *rgd_end;
1399 	struct gfs2_holder gh;
1400 	struct fstrim_range r;
1401 	int ret = 0;
1402 	u64 amt;
1403 	u64 trimmed = 0;
1404 	u64 start, end, minlen;
1405 	unsigned int x;
1406 	unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;
1407 
1408 	if (!capable(CAP_SYS_ADMIN))
1409 		return -EPERM;
1410 
1411 	if (!blk_queue_discard(q))
1412 		return -EOPNOTSUPP;
1413 
1414 	if (copy_from_user(&r, argp, sizeof(r)))
1415 		return -EFAULT;
1416 
1417 	ret = gfs2_rindex_update(sdp);
1418 	if (ret)
1419 		return ret;
1420 
1421 	start = r.start >> bs_shift;
1422 	end = start + (r.len >> bs_shift);
1423 	minlen = max_t(u64, r.minlen,
1424 		       q->limits.discard_granularity) >> bs_shift;
1425 
1426 	if (end <= start || minlen > sdp->sd_max_rg_data)
1427 		return -EINVAL;
1428 
1429 	rgd = gfs2_blk2rgrpd(sdp, start, 0);
1430 	rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
1431 
1432 	if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
1433 	    && (start > rgd_end->rd_data0 + rgd_end->rd_data))
1434 		return -EINVAL; /* start is beyond the end of the fs */
1435 
1436 	while (1) {
1437 
1438 		ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
1439 		if (ret)
1440 			goto out;
1441 
1442 		if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) {
1443 			/* Trim each bitmap in the rgrp */
1444 			for (x = 0; x < rgd->rd_length; x++) {
1445 				struct gfs2_bitmap *bi = rgd->rd_bits + x;
1446 				ret = gfs2_rgrp_send_discards(sdp,
1447 						rgd->rd_data0, NULL, bi, minlen,
1448 						&amt);
1449 				if (ret) {
1450 					gfs2_glock_dq_uninit(&gh);
1451 					goto out;
1452 				}
1453 				trimmed += amt;
1454 			}
1455 
1456 			/* Mark rgrp as having been trimmed */
1457 			ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0);
1458 			if (ret == 0) {
1459 				bh = rgd->rd_bits[0].bi_bh;
1460 				rgd->rd_flags |= GFS2_RGF_TRIMMED;
1461 				gfs2_trans_add_meta(rgd->rd_gl, bh);
1462 				gfs2_rgrp_out(rgd, bh->b_data);
1463 				gfs2_trans_end(sdp);
1464 			}
1465 		}
1466 		gfs2_glock_dq_uninit(&gh);
1467 
1468 		if (rgd == rgd_end)
1469 			break;
1470 
1471 		rgd = gfs2_rgrpd_get_next(rgd);
1472 	}
1473 
1474 out:
1475 	r.len = trimmed << bs_shift;
1476 	if (copy_to_user(argp, &r, sizeof(r)))
1477 		return -EFAULT;
1478 
1479 	return ret;
1480 }
1481 
1482 /**
1483  * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
1484  * @ip: the inode structure
1485  *
1486  */
1487 static void rs_insert(struct gfs2_inode *ip)
1488 {
1489 	struct rb_node **newn, *parent = NULL;
1490 	int rc;
1491 	struct gfs2_blkreserv *rs = &ip->i_res;
1492 	struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;
1493 	u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm);
1494 
1495 	BUG_ON(gfs2_rs_active(rs));
1496 
1497 	spin_lock(&rgd->rd_rsspin);
1498 	newn = &rgd->rd_rstree.rb_node;
1499 	while (*newn) {
1500 		struct gfs2_blkreserv *cur =
1501 			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
1502 
1503 		parent = *newn;
1504 		rc = rs_cmp(fsblock, rs->rs_free, cur);
1505 		if (rc > 0)
1506 			newn = &((*newn)->rb_right);
1507 		else if (rc < 0)
1508 			newn = &((*newn)->rb_left);
1509 		else {
1510 			spin_unlock(&rgd->rd_rsspin);
1511 			WARN_ON(1);
1512 			return;
1513 		}
1514 	}
1515 
1516 	rb_link_node(&rs->rs_node, parent, newn);
1517 	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
1518 
1519 	/* Do our rgrp accounting for the reservation */
1520 	rgd->rd_reserved += rs->rs_free; /* blocks reserved */
1521 	spin_unlock(&rgd->rd_rsspin);
1522 	trace_gfs2_rs(rs, TRACE_RS_INSERT);
1523 }
1524 
1525 /**
1526  * rgd_free - return the number of free blocks we can allocate.
1527  * @rgd: the resource group
1528  *
1529  * This function returns the number of free blocks for an rgrp.
1530  * That's the clone-free blocks (blocks that are free, not including those
1531  * still being used for unlinked files that haven't been deleted.)
1532  *
1533  * It also subtracts any blocks reserved by someone else, but does not
1534  * include free blocks that are still part of our current reservation,
1535  * because obviously we can (and will) allocate them.
1536  */
1537 static inline u32 rgd_free(struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *rs)
1538 {
1539 	u32 tot_reserved, tot_free;
1540 
1541 	if (WARN_ON_ONCE(rgd->rd_reserved < rs->rs_free))
1542 		return 0;
1543 	tot_reserved = rgd->rd_reserved - rs->rs_free;
1544 
1545 	if (rgd->rd_free_clone < tot_reserved)
1546 		tot_reserved = 0;
1547 
1548 	tot_free = rgd->rd_free_clone - tot_reserved;
1549 
1550 	return tot_free;
1551 }
1552 
1553 /**
1554  * rg_mblk_search - find a group of multiple free blocks to form a reservation
1555  * @rgd: the resource group descriptor
1556  * @ip: pointer to the inode for which we're reserving blocks
1557  * @ap: the allocation parameters
1558  *
1559  */
1560 
1561 static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
1562 			   const struct gfs2_alloc_parms *ap)
1563 {
1564 	struct gfs2_rbm rbm = { .rgd = rgd, };
1565 	u64 goal;
1566 	struct gfs2_blkreserv *rs = &ip->i_res;
1567 	u32 extlen;
1568 	u32 free_blocks = rgd_free(rgd, rs);
1569 	int ret;
1570 	struct inode *inode = &ip->i_inode;
1571 
1572 	if (S_ISDIR(inode->i_mode))
1573 		extlen = 1;
1574 	else {
1575 		extlen = max_t(u32, atomic_read(&ip->i_sizehint), ap->target);
1576 		extlen = clamp(extlen, (u32)RGRP_RSRV_MINBLKS, free_blocks);
1577 	}
1578 	if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
1579 		return;
1580 
1581 	/* Find bitmap block that contains bits for goal block */
1582 	if (rgrp_contains_block(rgd, ip->i_goal))
1583 		goal = ip->i_goal;
1584 	else
1585 		goal = rgd->rd_last_alloc + rgd->rd_data0;
1586 
1587 	if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
1588 		return;
1589 
1590 	ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true);
1591 	if (ret == 0) {
1592 		rs->rs_rbm = rbm;
1593 		rs->rs_free = extlen;
1594 		rs_insert(ip);
1595 	} else {
1596 		if (goal == rgd->rd_last_alloc + rgd->rd_data0)
1597 			rgd->rd_last_alloc = 0;
1598 	}
1599 }
1600 
1601 /**
1602  * gfs2_next_unreserved_block - Return next block that is not reserved
1603  * @rgd: The resource group
1604  * @block: The starting block
1605  * @length: The required length
1606  * @ip: Ignore any reservations for this inode
1607  *
1608  * If the block does not appear in any reservation, then return the
1609  * block number unchanged. If it does appear in the reservation, then
1610  * keep looking through the tree of reservations in order to find the
1611  * first block number which is not reserved.
1612  */
1613 
1614 static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1615 				      u32 length,
1616 				      const struct gfs2_inode *ip)
1617 {
1618 	struct gfs2_blkreserv *rs;
1619 	struct rb_node *n;
1620 	int rc;
1621 
1622 	spin_lock(&rgd->rd_rsspin);
1623 	n = rgd->rd_rstree.rb_node;
1624 	while (n) {
1625 		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1626 		rc = rs_cmp(block, length, rs);
1627 		if (rc < 0)
1628 			n = n->rb_left;
1629 		else if (rc > 0)
1630 			n = n->rb_right;
1631 		else
1632 			break;
1633 	}
1634 
1635 	if (n) {
1636 		while ((rs_cmp(block, length, rs) == 0) && (&ip->i_res != rs)) {
1637 			block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free;
1638 			n = n->rb_right;
1639 			if (n == NULL)
1640 				break;
1641 			rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1642 		}
1643 	}
1644 
1645 	spin_unlock(&rgd->rd_rsspin);
1646 	return block;
1647 }
1648 
1649 /**
1650  * gfs2_reservation_check_and_update - Check for reservations during block alloc
1651  * @rbm: The current position in the resource group
1652  * @ip: The inode for which we are searching for blocks
1653  * @minext: The minimum extent length
1654  * @maxext: A pointer to the maximum extent structure
1655  *
1656  * This checks the current position in the rgrp to see whether there is
1657  * a reservation covering this block. If not then this function is a
1658  * no-op. If there is, then the position is moved to the end of the
1659  * contiguous reservation(s) so that we are pointing at the first
1660  * non-reserved block.
1661  *
1662  * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error
1663  */
1664 
1665 static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1666 					     const struct gfs2_inode *ip,
1667 					     u32 minext,
1668 					     struct gfs2_extent *maxext)
1669 {
1670 	u64 block = gfs2_rbm_to_block(rbm);
1671 	u32 extlen = 1;
1672 	u64 nblock;
1673 	int ret;
1674 
1675 	/*
1676 	 * If we have a minimum extent length, then skip over any extent
1677 	 * which is less than the min extent length in size.
1678 	 */
1679 	if (minext) {
1680 		extlen = gfs2_free_extlen(rbm, minext);
1681 		if (extlen <= maxext->len)
1682 			goto fail;
1683 	}
1684 
1685 	/*
1686 	 * Check the extent which has been found against the reservations
1687 	 * and skip if parts of it are already reserved
1688 	 */
1689 	nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
1690 	if (nblock == block) {
1691 		if (!minext || extlen >= minext)
1692 			return 0;
1693 
1694 		if (extlen > maxext->len) {
1695 			maxext->len = extlen;
1696 			maxext->rbm = *rbm;
1697 		}
1698 fail:
1699 		nblock = block + extlen;
1700 	}
1701 	ret = gfs2_rbm_from_block(rbm, nblock);
1702 	if (ret < 0)
1703 		return ret;
1704 	return 1;
1705 }
1706 
1707 /**
1708  * gfs2_rbm_find - Look for blocks of a particular state
1709  * @rbm: Value/result starting position and final position
1710  * @state: The state which we want to find
1711  * @minext: Pointer to the requested extent length (NULL for a single block)
1712  *          This is updated to be the actual reservation size.
1713  * @ip: If set, check for reservations
1714  * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
1715  *          around until we've reached the starting point.
1716  *
1717  * Side effects:
1718  * - If looking for free blocks, we set GBF_FULL on each bitmap which
1719  *   has no free blocks in it.
1720  * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
1721  *   has come up short on a free block search.
1722  *
1723  * Returns: 0 on success, -ENOSPC if there is no block of the requested state
1724  */
1725 
1726 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
1727 			 const struct gfs2_inode *ip, bool nowrap)
1728 {
1729 	bool scan_from_start = rbm->bii == 0 && rbm->offset == 0;
1730 	struct buffer_head *bh;
1731 	int last_bii;
1732 	u32 offset;
1733 	u8 *buffer;
1734 	bool wrapped = false;
1735 	int ret;
1736 	struct gfs2_bitmap *bi;
1737 	struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
1738 
1739 	/*
1740 	 * Determine the last bitmap to search.  If we're not starting at the
1741 	 * beginning of a bitmap, we need to search that bitmap twice to scan
1742 	 * the entire resource group.
1743 	 */
1744 	last_bii = rbm->bii - (rbm->offset == 0);
1745 
1746 	while(1) {
1747 		bi = rbm_bi(rbm);
1748 		if ((ip == NULL || !gfs2_rs_active(&ip->i_res)) &&
1749 		    test_bit(GBF_FULL, &bi->bi_flags) &&
1750 		    (state == GFS2_BLKST_FREE))
1751 			goto next_bitmap;
1752 
1753 		bh = bi->bi_bh;
1754 		buffer = bh->b_data + bi->bi_offset;
1755 		WARN_ON(!buffer_uptodate(bh));
1756 		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
1757 			buffer = bi->bi_clone + bi->bi_offset;
1758 		offset = gfs2_bitfit(buffer, bi->bi_bytes, rbm->offset, state);
1759 		if (offset == BFITNOENT) {
1760 			if (state == GFS2_BLKST_FREE && rbm->offset == 0)
1761 				set_bit(GBF_FULL, &bi->bi_flags);
1762 			goto next_bitmap;
1763 		}
1764 		rbm->offset = offset;
1765 		if (ip == NULL)
1766 			return 0;
1767 
1768 		ret = gfs2_reservation_check_and_update(rbm, ip,
1769 							minext ? *minext : 0,
1770 							&maxext);
1771 		if (ret == 0)
1772 			return 0;
1773 		if (ret > 0)
1774 			goto next_iter;
1775 		if (ret == -E2BIG) {
1776 			rbm->bii = 0;
1777 			rbm->offset = 0;
1778 			goto res_covered_end_of_rgrp;
1779 		}
1780 		return ret;
1781 
1782 next_bitmap:	/* Find next bitmap in the rgrp */
1783 		rbm->offset = 0;
1784 		rbm->bii++;
1785 		if (rbm->bii == rbm->rgd->rd_length)
1786 			rbm->bii = 0;
1787 res_covered_end_of_rgrp:
1788 		if (rbm->bii == 0) {
1789 			if (wrapped)
1790 				break;
1791 			wrapped = true;
1792 			if (nowrap)
1793 				break;
1794 		}
1795 next_iter:
1796 		/* Have we scanned the entire resource group? */
1797 		if (wrapped && rbm->bii > last_bii)
1798 			break;
1799 	}
1800 
1801 	if (minext == NULL || state != GFS2_BLKST_FREE)
1802 		return -ENOSPC;
1803 
1804 	/* If the extent was too small, and it's smaller than the smallest
1805 	   to have failed before, remember for future reference that it's
1806 	   useless to search this rgrp again for this amount or more. */
1807 	if (wrapped && (scan_from_start || rbm->bii > last_bii) &&
1808 	    *minext < rbm->rgd->rd_extfail_pt)
1809 		rbm->rgd->rd_extfail_pt = *minext;
1810 
1811 	/* If the maximum extent we found is big enough to fulfill the
1812 	   minimum requirements, use it anyway. */
1813 	if (maxext.len) {
1814 		*rbm = maxext.rbm;
1815 		*minext = maxext.len;
1816 		return 0;
1817 	}
1818 
1819 	return -ENOSPC;
1820 }
1821 
1822 /**
1823  * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
1824  * @rgd: The rgrp
1825  * @last_unlinked: block address of the last dinode we unlinked
1826  * @skip: block address we should explicitly not unlink
1827  *
1828  * Returns: 0 if no error
1829  *          The inode, if one has been found, in inode.
1830  */
1831 
1832 static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
1833 {
1834 	u64 block;
1835 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1836 	struct gfs2_glock *gl;
1837 	struct gfs2_inode *ip;
1838 	int error;
1839 	int found = 0;
1840 	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
1841 
1842 	while (1) {
1843 		down_write(&sdp->sd_log_flush_lock);
1844 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
1845 				      true);
1846 		up_write(&sdp->sd_log_flush_lock);
1847 		if (error == -ENOSPC)
1848 			break;
1849 		if (WARN_ON_ONCE(error))
1850 			break;
1851 
1852 		block = gfs2_rbm_to_block(&rbm);
1853 		if (gfs2_rbm_from_block(&rbm, block + 1))
1854 			break;
1855 		if (*last_unlinked != NO_BLOCK && block <= *last_unlinked)
1856 			continue;
1857 		if (block == skip)
1858 			continue;
1859 		*last_unlinked = block;
1860 
1861 		error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl);
1862 		if (error)
1863 			continue;
1864 
1865 		/* If the inode is already in cache, we can ignore it here
1866 		 * because the existing inode disposal code will deal with
1867 		 * it when all refs have gone away. Accessing gl_object like
1868 		 * this is not safe in general. Here it is ok because we do
1869 		 * not dereference the pointer, and we only need an approx
1870 		 * answer to whether it is NULL or not.
1871 		 */
1872 		ip = gl->gl_object;
1873 
1874 		if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
1875 			gfs2_glock_put(gl);
1876 		else
1877 			found++;
1878 
1879 		/* Limit reclaim to sensible number of tasks */
1880 		if (found > NR_CPUS)
1881 			return;
1882 	}
1883 
1884 	rgd->rd_flags &= ~GFS2_RDF_CHECK;
1885 	return;
1886 }
1887 
1888 /**
1889  * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
1890  * @rgd: The rgrp in question
1891  * @loops: An indication of how picky we can be (0=very, 1=less so)
1892  *
1893  * This function uses the recently added glock statistics in order to
1894  * figure out whether a parciular resource group is suffering from
1895  * contention from multiple nodes. This is done purely on the basis
1896  * of timings, since this is the only data we have to work with and
1897  * our aim here is to reject a resource group which is highly contended
1898  * but (very important) not to do this too often in order to ensure that
1899  * we do not land up introducing fragmentation by changing resource
1900  * groups when not actually required.
1901  *
1902  * The calculation is fairly simple, we want to know whether the SRTTB
1903  * (i.e. smoothed round trip time for blocking operations) to acquire
1904  * the lock for this rgrp's glock is significantly greater than the
1905  * time taken for resource groups on average. We introduce a margin in
1906  * the form of the variable @var which is computed as the sum of the two
1907  * respective variences, and multiplied by a factor depending on @loops
1908  * and whether we have a lot of data to base the decision on. This is
1909  * then tested against the square difference of the means in order to
1910  * decide whether the result is statistically significant or not.
1911  *
1912  * Returns: A boolean verdict on the congestion status
1913  */
1914 
1915 static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
1916 {
1917 	const struct gfs2_glock *gl = rgd->rd_gl;
1918 	const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
1919 	struct gfs2_lkstats *st;
1920 	u64 r_dcount, l_dcount;
1921 	u64 l_srttb, a_srttb = 0;
1922 	s64 srttb_diff;
1923 	u64 sqr_diff;
1924 	u64 var;
1925 	int cpu, nonzero = 0;
1926 
1927 	preempt_disable();
1928 	for_each_present_cpu(cpu) {
1929 		st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
1930 		if (st->stats[GFS2_LKS_SRTTB]) {
1931 			a_srttb += st->stats[GFS2_LKS_SRTTB];
1932 			nonzero++;
1933 		}
1934 	}
1935 	st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
1936 	if (nonzero)
1937 		do_div(a_srttb, nonzero);
1938 	r_dcount = st->stats[GFS2_LKS_DCOUNT];
1939 	var = st->stats[GFS2_LKS_SRTTVARB] +
1940 	      gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
1941 	preempt_enable();
1942 
1943 	l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
1944 	l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
1945 
1946 	if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0))
1947 		return false;
1948 
1949 	srttb_diff = a_srttb - l_srttb;
1950 	sqr_diff = srttb_diff * srttb_diff;
1951 
1952 	var *= 2;
1953 	if (l_dcount < 8 || r_dcount < 8)
1954 		var *= 2;
1955 	if (loops == 1)
1956 		var *= 2;
1957 
1958 	return ((srttb_diff < 0) && (sqr_diff > var));
1959 }
1960 
1961 /**
1962  * gfs2_rgrp_used_recently
1963  * @rs: The block reservation with the rgrp to test
1964  * @msecs: The time limit in milliseconds
1965  *
1966  * Returns: True if the rgrp glock has been used within the time limit
1967  */
1968 static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
1969 				    u64 msecs)
1970 {
1971 	u64 tdiff;
1972 
1973 	tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
1974                             rs->rs_rbm.rgd->rd_gl->gl_dstamp));
1975 
1976 	return tdiff > (msecs * 1000 * 1000);
1977 }
1978 
1979 static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
1980 {
1981 	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1982 	u32 skip;
1983 
1984 	get_random_bytes(&skip, sizeof(skip));
1985 	return skip % sdp->sd_rgrps;
1986 }
1987 
1988 static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
1989 {
1990 	struct gfs2_rgrpd *rgd = *pos;
1991 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1992 
1993 	rgd = gfs2_rgrpd_get_next(rgd);
1994 	if (rgd == NULL)
1995 		rgd = gfs2_rgrpd_get_first(sdp);
1996 	*pos = rgd;
1997 	if (rgd != begin) /* If we didn't wrap */
1998 		return true;
1999 	return false;
2000 }
2001 
2002 /**
2003  * fast_to_acquire - determine if a resource group will be fast to acquire
2004  *
2005  * If this is one of our preferred rgrps, it should be quicker to acquire,
2006  * because we tried to set ourselves up as dlm lock master.
2007  */
2008 static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
2009 {
2010 	struct gfs2_glock *gl = rgd->rd_gl;
2011 
2012 	if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
2013 	    !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
2014 	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
2015 		return 1;
2016 	if (rgd->rd_flags & GFS2_RDF_PREFERRED)
2017 		return 1;
2018 	return 0;
2019 }
2020 
2021 /**
2022  * gfs2_inplace_reserve - Reserve space in the filesystem
2023  * @ip: the inode to reserve space for
2024  * @ap: the allocation parameters
2025  *
2026  * We try our best to find an rgrp that has at least ap->target blocks
2027  * available. After a couple of passes (loops == 2), the prospects of finding
2028  * such an rgrp diminish. At this stage, we return the first rgrp that has
2029  * at least ap->min_target blocks available. Either way, we set ap->allowed to
2030  * the number of blocks available in the chosen rgrp.
2031  *
2032  * Returns: 0 on success,
2033  *          -ENOMEM if a suitable rgrp can't be found
2034  *          errno otherwise
2035  */
2036 
2037 int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
2038 {
2039 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2040 	struct gfs2_rgrpd *begin = NULL;
2041 	struct gfs2_blkreserv *rs = &ip->i_res;
2042 	int error = 0, rg_locked, flags = 0;
2043 	u64 last_unlinked = NO_BLOCK;
2044 	int loops = 0;
2045 	u32 free_blocks, skip = 0;
2046 
2047 	if (sdp->sd_args.ar_rgrplvb)
2048 		flags |= GL_SKIP;
2049 	if (gfs2_assert_warn(sdp, ap->target))
2050 		return -EINVAL;
2051 	if (gfs2_rs_active(rs)) {
2052 		begin = rs->rs_rbm.rgd;
2053 	} else if (rs->rs_rbm.rgd &&
2054 		   rgrp_contains_block(rs->rs_rbm.rgd, ip->i_goal)) {
2055 		begin = rs->rs_rbm.rgd;
2056 	} else {
2057 		check_and_update_goal(ip);
2058 		rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
2059 	}
2060 	if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
2061 		skip = gfs2_orlov_skip(ip);
2062 	if (rs->rs_rbm.rgd == NULL)
2063 		return -EBADSLT;
2064 
2065 	while (loops < 3) {
2066 		rg_locked = 1;
2067 
2068 		if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) {
2069 			rg_locked = 0;
2070 			if (skip && skip--)
2071 				goto next_rgrp;
2072 			if (!gfs2_rs_active(rs)) {
2073 				if (loops == 0 &&
2074 				    !fast_to_acquire(rs->rs_rbm.rgd))
2075 					goto next_rgrp;
2076 				if ((loops < 2) &&
2077 				    gfs2_rgrp_used_recently(rs, 1000) &&
2078 				    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2079 					goto next_rgrp;
2080 			}
2081 			error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
2082 						   LM_ST_EXCLUSIVE, flags,
2083 						   &ip->i_rgd_gh);
2084 			if (unlikely(error))
2085 				return error;
2086 			if (!gfs2_rs_active(rs) && (loops < 2) &&
2087 			    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2088 				goto skip_rgrp;
2089 			if (sdp->sd_args.ar_rgrplvb) {
2090 				error = update_rgrp_lvb(rs->rs_rbm.rgd);
2091 				if (unlikely(error)) {
2092 					gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2093 					return error;
2094 				}
2095 			}
2096 		}
2097 
2098 		/* Skip unusable resource groups */
2099 		if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
2100 						 GFS2_RDF_ERROR)) ||
2101 		    (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
2102 			goto skip_rgrp;
2103 
2104 		if (sdp->sd_args.ar_rgrplvb)
2105 			gfs2_rgrp_bh_get(rs->rs_rbm.rgd);
2106 
2107 		/* Get a reservation if we don't already have one */
2108 		if (!gfs2_rs_active(rs))
2109 			rg_mblk_search(rs->rs_rbm.rgd, ip, ap);
2110 
2111 		/* Skip rgrps when we can't get a reservation on first pass */
2112 		if (!gfs2_rs_active(rs) && (loops < 1))
2113 			goto check_rgrp;
2114 
2115 		/* If rgrp has enough free space, use it */
2116 		free_blocks = rgd_free(rs->rs_rbm.rgd, rs);
2117 		if (free_blocks >= ap->target ||
2118 		    (loops == 2 && ap->min_target &&
2119 		     free_blocks >= ap->min_target)) {
2120 			ap->allowed = free_blocks;
2121 			return 0;
2122 		}
2123 check_rgrp:
2124 		/* Check for unlinked inodes which can be reclaimed */
2125 		if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
2126 			try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
2127 					ip->i_no_addr);
2128 skip_rgrp:
2129 		/* Drop reservation, if we couldn't use reserved rgrp */
2130 		if (gfs2_rs_active(rs))
2131 			gfs2_rs_deltree(rs);
2132 
2133 		/* Unlock rgrp if required */
2134 		if (!rg_locked)
2135 			gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2136 next_rgrp:
2137 		/* Find the next rgrp, and continue looking */
2138 		if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin))
2139 			continue;
2140 		if (skip)
2141 			continue;
2142 
2143 		/* If we've scanned all the rgrps, but found no free blocks
2144 		 * then this checks for some less likely conditions before
2145 		 * trying again.
2146 		 */
2147 		loops++;
2148 		/* Check that fs hasn't grown if writing to rindex */
2149 		if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
2150 			error = gfs2_ri_update(ip);
2151 			if (error)
2152 				return error;
2153 		}
2154 		/* Flushing the log may release space */
2155 		if (loops == 2)
2156 			gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
2157 				       GFS2_LFC_INPLACE_RESERVE);
2158 	}
2159 
2160 	return -ENOSPC;
2161 }
2162 
2163 /**
2164  * gfs2_inplace_release - release an inplace reservation
2165  * @ip: the inode the reservation was taken out on
2166  *
2167  * Release a reservation made by gfs2_inplace_reserve().
2168  */
2169 
2170 void gfs2_inplace_release(struct gfs2_inode *ip)
2171 {
2172 	if (gfs2_holder_initialized(&ip->i_rgd_gh))
2173 		gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2174 }
2175 
2176 /**
2177  * gfs2_alloc_extent - allocate an extent from a given bitmap
2178  * @rbm: the resource group information
2179  * @dinode: TRUE if the first block we allocate is for a dinode
2180  * @n: The extent length (value/result)
2181  *
2182  * Add the bitmap buffer to the transaction.
2183  * Set the found bits to @new_state to change block's allocation state.
2184  */
2185 static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
2186 			     unsigned int *n)
2187 {
2188 	struct gfs2_rbm pos = { .rgd = rbm->rgd, };
2189 	const unsigned int elen = *n;
2190 	u64 block;
2191 	int ret;
2192 
2193 	*n = 1;
2194 	block = gfs2_rbm_to_block(rbm);
2195 	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
2196 	gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2197 	block++;
2198 	while (*n < elen) {
2199 		ret = gfs2_rbm_from_block(&pos, block);
2200 		if (ret || gfs2_testbit(&pos, true) != GFS2_BLKST_FREE)
2201 			break;
2202 		gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
2203 		gfs2_setbit(&pos, true, GFS2_BLKST_USED);
2204 		(*n)++;
2205 		block++;
2206 	}
2207 }
2208 
2209 /**
2210  * rgblk_free - Change alloc state of given block(s)
2211  * @sdp: the filesystem
2212  * @rgd: the resource group the blocks are in
2213  * @bstart: the start of a run of blocks to free
2214  * @blen: the length of the block run (all must lie within ONE RG!)
2215  * @new_state: GFS2_BLKST_XXX the after-allocation block state
2216  */
2217 
2218 static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd,
2219 		       u64 bstart, u32 blen, unsigned char new_state)
2220 {
2221 	struct gfs2_rbm rbm;
2222 	struct gfs2_bitmap *bi, *bi_prev = NULL;
2223 
2224 	rbm.rgd = rgd;
2225 	if (WARN_ON_ONCE(gfs2_rbm_from_block(&rbm, bstart)))
2226 		return;
2227 	while (blen--) {
2228 		bi = rbm_bi(&rbm);
2229 		if (bi != bi_prev) {
2230 			if (!bi->bi_clone) {
2231 				bi->bi_clone = kmalloc(bi->bi_bh->b_size,
2232 						      GFP_NOFS | __GFP_NOFAIL);
2233 				memcpy(bi->bi_clone + bi->bi_offset,
2234 				       bi->bi_bh->b_data + bi->bi_offset,
2235 				       bi->bi_bytes);
2236 			}
2237 			gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
2238 			bi_prev = bi;
2239 		}
2240 		gfs2_setbit(&rbm, false, new_state);
2241 		gfs2_rbm_incr(&rbm);
2242 	}
2243 }
2244 
2245 /**
2246  * gfs2_rgrp_dump - print out an rgrp
2247  * @seq: The iterator
2248  * @gl: The glock in question
2249  *
2250  */
2251 
2252 void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_glock *gl)
2253 {
2254 	struct gfs2_rgrpd *rgd = gl->gl_object;
2255 	struct gfs2_blkreserv *trs;
2256 	const struct rb_node *n;
2257 
2258 	if (rgd == NULL)
2259 		return;
2260 	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
2261 		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
2262 		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
2263 		       rgd->rd_reserved, rgd->rd_extfail_pt);
2264 	if (rgd->rd_sbd->sd_args.ar_rgrplvb) {
2265 		struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
2266 
2267 		gfs2_print_dbg(seq, "  L: f:%02x b:%u i:%u\n",
2268 			       be32_to_cpu(rgl->rl_flags),
2269 			       be32_to_cpu(rgl->rl_free),
2270 			       be32_to_cpu(rgl->rl_dinodes));
2271 	}
2272 	spin_lock(&rgd->rd_rsspin);
2273 	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
2274 		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
2275 		dump_rs(seq, trs);
2276 	}
2277 	spin_unlock(&rgd->rd_rsspin);
2278 }
2279 
2280 static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
2281 {
2282 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2283 	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
2284 		(unsigned long long)rgd->rd_addr);
2285 	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
2286 	gfs2_rgrp_dump(NULL, rgd->rd_gl);
2287 	rgd->rd_flags |= GFS2_RDF_ERROR;
2288 }
2289 
2290 /**
2291  * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation
2292  * @ip: The inode we have just allocated blocks for
2293  * @rbm: The start of the allocated blocks
2294  * @len: The extent length
2295  *
2296  * Adjusts a reservation after an allocation has taken place. If the
2297  * reservation does not match the allocation, or if it is now empty
2298  * then it is removed.
2299  */
2300 
2301 static void gfs2_adjust_reservation(struct gfs2_inode *ip,
2302 				    const struct gfs2_rbm *rbm, unsigned len)
2303 {
2304 	struct gfs2_blkreserv *rs = &ip->i_res;
2305 	struct gfs2_rgrpd *rgd = rbm->rgd;
2306 	unsigned rlen;
2307 	u64 block;
2308 	int ret;
2309 
2310 	spin_lock(&rgd->rd_rsspin);
2311 	if (gfs2_rs_active(rs)) {
2312 		if (gfs2_rbm_eq(&rs->rs_rbm, rbm)) {
2313 			block = gfs2_rbm_to_block(rbm);
2314 			ret = gfs2_rbm_from_block(&rs->rs_rbm, block + len);
2315 			rlen = min(rs->rs_free, len);
2316 			rs->rs_free -= rlen;
2317 			rgd->rd_reserved -= rlen;
2318 			trace_gfs2_rs(rs, TRACE_RS_CLAIM);
2319 			if (rs->rs_free && !ret)
2320 				goto out;
2321 			/* We used up our block reservation, so we should
2322 			   reserve more blocks next time. */
2323 			atomic_add(RGRP_RSRV_ADDBLKS, &ip->i_sizehint);
2324 		}
2325 		__rs_deltree(rs);
2326 	}
2327 out:
2328 	spin_unlock(&rgd->rd_rsspin);
2329 }
2330 
2331 /**
2332  * gfs2_set_alloc_start - Set starting point for block allocation
2333  * @rbm: The rbm which will be set to the required location
2334  * @ip: The gfs2 inode
2335  * @dinode: Flag to say if allocation includes a new inode
2336  *
2337  * This sets the starting point from the reservation if one is active
2338  * otherwise it falls back to guessing a start point based on the
2339  * inode's goal block or the last allocation point in the rgrp.
2340  */
2341 
2342 static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
2343 				 const struct gfs2_inode *ip, bool dinode)
2344 {
2345 	u64 goal;
2346 
2347 	if (gfs2_rs_active(&ip->i_res)) {
2348 		*rbm = ip->i_res.rs_rbm;
2349 		return;
2350 	}
2351 
2352 	if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal))
2353 		goal = ip->i_goal;
2354 	else
2355 		goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;
2356 
2357 	if (WARN_ON_ONCE(gfs2_rbm_from_block(rbm, goal))) {
2358 		rbm->bii = 0;
2359 		rbm->offset = 0;
2360 	}
2361 }
2362 
2363 /**
2364  * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
2365  * @ip: the inode to allocate the block for
2366  * @bn: Used to return the starting block number
2367  * @nblocks: requested number of blocks/extent length (value/result)
2368  * @dinode: 1 if we're allocating a dinode block, else 0
2369  * @generation: the generation number of the inode
2370  *
2371  * Returns: 0 or error
2372  */
2373 
2374 int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2375 		      bool dinode, u64 *generation)
2376 {
2377 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2378 	struct buffer_head *dibh;
2379 	struct gfs2_rbm rbm = { .rgd = ip->i_res.rs_rbm.rgd, };
2380 	unsigned int ndata;
2381 	u64 block; /* block, within the file system scope */
2382 	int error;
2383 
2384 	gfs2_set_alloc_start(&rbm, ip, dinode);
2385 	error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false);
2386 
2387 	if (error == -ENOSPC) {
2388 		gfs2_set_alloc_start(&rbm, ip, dinode);
2389 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false);
2390 	}
2391 
2392 	/* Since all blocks are reserved in advance, this shouldn't happen */
2393 	if (error) {
2394 		fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
2395 			(unsigned long long)ip->i_no_addr, error, *nblocks,
2396 			test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
2397 			rbm.rgd->rd_extfail_pt);
2398 		goto rgrp_error;
2399 	}
2400 
2401 	gfs2_alloc_extent(&rbm, dinode, nblocks);
2402 	block = gfs2_rbm_to_block(&rbm);
2403 	rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
2404 	if (gfs2_rs_active(&ip->i_res))
2405 		gfs2_adjust_reservation(ip, &rbm, *nblocks);
2406 	ndata = *nblocks;
2407 	if (dinode)
2408 		ndata--;
2409 
2410 	if (!dinode) {
2411 		ip->i_goal = block + ndata - 1;
2412 		error = gfs2_meta_inode_buffer(ip, &dibh);
2413 		if (error == 0) {
2414 			struct gfs2_dinode *di =
2415 				(struct gfs2_dinode *)dibh->b_data;
2416 			gfs2_trans_add_meta(ip->i_gl, dibh);
2417 			di->di_goal_meta = di->di_goal_data =
2418 				cpu_to_be64(ip->i_goal);
2419 			brelse(dibh);
2420 		}
2421 	}
2422 	if (rbm.rgd->rd_free < *nblocks) {
2423 		fs_warn(sdp, "nblocks=%u\n", *nblocks);
2424 		goto rgrp_error;
2425 	}
2426 
2427 	rbm.rgd->rd_free -= *nblocks;
2428 	if (dinode) {
2429 		rbm.rgd->rd_dinodes++;
2430 		*generation = rbm.rgd->rd_igeneration++;
2431 		if (*generation == 0)
2432 			*generation = rbm.rgd->rd_igeneration++;
2433 	}
2434 
2435 	gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
2436 	gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
2437 
2438 	gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
2439 	if (dinode)
2440 		gfs2_trans_remove_revoke(sdp, block, *nblocks);
2441 
2442 	gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
2443 
2444 	rbm.rgd->rd_free_clone -= *nblocks;
2445 	trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
2446 			       dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2447 	*bn = block;
2448 	return 0;
2449 
2450 rgrp_error:
2451 	gfs2_rgrp_error(rbm.rgd);
2452 	return -EIO;
2453 }
2454 
2455 /**
2456  * __gfs2_free_blocks - free a contiguous run of block(s)
2457  * @ip: the inode these blocks are being freed from
2458  * @rgd: the resource group the blocks are in
2459  * @bstart: first block of a run of contiguous blocks
2460  * @blen: the length of the block run
2461  * @meta: 1 if the blocks represent metadata
2462  *
2463  */
2464 
2465 void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2466 			u64 bstart, u32 blen, int meta)
2467 {
2468 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2469 
2470 	rgblk_free(sdp, rgd, bstart, blen, GFS2_BLKST_FREE);
2471 	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
2472 	rgd->rd_free += blen;
2473 	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
2474 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2475 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2476 
2477 	/* Directories keep their data in the metadata address space */
2478 	if (meta || ip->i_depth)
2479 		gfs2_meta_wipe(ip, bstart, blen);
2480 }
2481 
2482 /**
2483  * gfs2_free_meta - free a contiguous run of data block(s)
2484  * @ip: the inode these blocks are being freed from
2485  * @rgd: the resource group the blocks are in
2486  * @bstart: first block of a run of contiguous blocks
2487  * @blen: the length of the block run
2488  *
2489  */
2490 
2491 void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2492 		    u64 bstart, u32 blen)
2493 {
2494 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2495 
2496 	__gfs2_free_blocks(ip, rgd, bstart, blen, 1);
2497 	gfs2_statfs_change(sdp, 0, +blen, 0);
2498 	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
2499 }
2500 
2501 void gfs2_unlink_di(struct inode *inode)
2502 {
2503 	struct gfs2_inode *ip = GFS2_I(inode);
2504 	struct gfs2_sbd *sdp = GFS2_SB(inode);
2505 	struct gfs2_rgrpd *rgd;
2506 	u64 blkno = ip->i_no_addr;
2507 
2508 	rgd = gfs2_blk2rgrpd(sdp, blkno, true);
2509 	if (!rgd)
2510 		return;
2511 	rgblk_free(sdp, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2512 	trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2513 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2514 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2515 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, 1);
2516 }
2517 
2518 void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
2519 {
2520 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2521 
2522 	rgblk_free(sdp, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2523 	if (!rgd->rd_dinodes)
2524 		gfs2_consist_rgrpd(rgd);
2525 	rgd->rd_dinodes--;
2526 	rgd->rd_free++;
2527 
2528 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2529 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2530 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1);
2531 
2532 	gfs2_statfs_change(sdp, 0, +1, -1);
2533 	trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2534 	gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
2535 	gfs2_meta_wipe(ip, ip->i_no_addr, 1);
2536 }
2537 
2538 /**
2539  * gfs2_check_blk_type - Check the type of a block
2540  * @sdp: The superblock
2541  * @no_addr: The block number to check
2542  * @type: The block type we are looking for
2543  *
2544  * Returns: 0 if the block type matches the expected type
2545  *          -ESTALE if it doesn't match
2546  *          or -ve errno if something went wrong while checking
2547  */
2548 
2549 int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
2550 {
2551 	struct gfs2_rgrpd *rgd;
2552 	struct gfs2_holder rgd_gh;
2553 	struct gfs2_rbm rbm;
2554 	int error = -EINVAL;
2555 
2556 	rgd = gfs2_blk2rgrpd(sdp, no_addr, 1);
2557 	if (!rgd)
2558 		goto fail;
2559 
2560 	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
2561 	if (error)
2562 		goto fail;
2563 
2564 	rbm.rgd = rgd;
2565 	error = gfs2_rbm_from_block(&rbm, no_addr);
2566 	if (WARN_ON_ONCE(error))
2567 		goto fail;
2568 
2569 	if (gfs2_testbit(&rbm, false) != type)
2570 		error = -ESTALE;
2571 
2572 	gfs2_glock_dq_uninit(&rgd_gh);
2573 fail:
2574 	return error;
2575 }
2576 
2577 /**
2578  * gfs2_rlist_add - add a RG to a list of RGs
2579  * @ip: the inode
2580  * @rlist: the list of resource groups
2581  * @block: the block
2582  *
2583  * Figure out what RG a block belongs to and add that RG to the list
2584  *
2585  * FIXME: Don't use NOFAIL
2586  *
2587  */
2588 
2589 void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
2590 		    u64 block)
2591 {
2592 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2593 	struct gfs2_rgrpd *rgd;
2594 	struct gfs2_rgrpd **tmp;
2595 	unsigned int new_space;
2596 	unsigned int x;
2597 
2598 	if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
2599 		return;
2600 
2601 	/*
2602 	 * The resource group last accessed is kept in the last position.
2603 	 */
2604 
2605 	if (rlist->rl_rgrps) {
2606 		rgd = rlist->rl_rgd[rlist->rl_rgrps - 1];
2607 		if (rgrp_contains_block(rgd, block))
2608 			return;
2609 		rgd = gfs2_blk2rgrpd(sdp, block, 1);
2610 	} else {
2611 		rgd = ip->i_res.rs_rbm.rgd;
2612 		if (!rgd || !rgrp_contains_block(rgd, block))
2613 			rgd = gfs2_blk2rgrpd(sdp, block, 1);
2614 	}
2615 
2616 	if (!rgd) {
2617 		fs_err(sdp, "rlist_add: no rgrp for block %llu\n",
2618 		       (unsigned long long)block);
2619 		return;
2620 	}
2621 
2622 	for (x = 0; x < rlist->rl_rgrps; x++) {
2623 		if (rlist->rl_rgd[x] == rgd) {
2624 			swap(rlist->rl_rgd[x],
2625 			     rlist->rl_rgd[rlist->rl_rgrps - 1]);
2626 			return;
2627 		}
2628 	}
2629 
2630 	if (rlist->rl_rgrps == rlist->rl_space) {
2631 		new_space = rlist->rl_space + 10;
2632 
2633 		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
2634 			      GFP_NOFS | __GFP_NOFAIL);
2635 
2636 		if (rlist->rl_rgd) {
2637 			memcpy(tmp, rlist->rl_rgd,
2638 			       rlist->rl_space * sizeof(struct gfs2_rgrpd *));
2639 			kfree(rlist->rl_rgd);
2640 		}
2641 
2642 		rlist->rl_space = new_space;
2643 		rlist->rl_rgd = tmp;
2644 	}
2645 
2646 	rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
2647 }
2648 
2649 /**
2650  * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
2651  *      and initialize an array of glock holders for them
2652  * @rlist: the list of resource groups
2653  *
2654  * FIXME: Don't use NOFAIL
2655  *
2656  */
2657 
2658 void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist)
2659 {
2660 	unsigned int x;
2661 
2662 	rlist->rl_ghs = kmalloc_array(rlist->rl_rgrps,
2663 				      sizeof(struct gfs2_holder),
2664 				      GFP_NOFS | __GFP_NOFAIL);
2665 	for (x = 0; x < rlist->rl_rgrps; x++)
2666 		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
2667 				LM_ST_EXCLUSIVE, 0,
2668 				&rlist->rl_ghs[x]);
2669 }
2670 
2671 /**
2672  * gfs2_rlist_free - free a resource group list
2673  * @rlist: the list of resource groups
2674  *
2675  */
2676 
2677 void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
2678 {
2679 	unsigned int x;
2680 
2681 	kfree(rlist->rl_rgd);
2682 
2683 	if (rlist->rl_ghs) {
2684 		for (x = 0; x < rlist->rl_rgrps; x++)
2685 			gfs2_holder_uninit(&rlist->rl_ghs[x]);
2686 		kfree(rlist->rl_ghs);
2687 		rlist->rl_ghs = NULL;
2688 	}
2689 }
2690 
2691