xref: /linux/drivers/md/dm-zoned-metadata.c (revision 52338415)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2017 Western Digital Corporation or its affiliates.
4  *
5  * This file is released under the GPL.
6  */
7 
8 #include "dm-zoned.h"
9 
10 #include <linux/module.h>
11 #include <linux/crc32.h>
12 #include <linux/sched/mm.h>
13 
14 #define	DM_MSG_PREFIX		"zoned metadata"
15 
/*
 * Version of the on-disk metadata format written to, and expected in,
 * the super block.
 */
#define DMZ_META_VER	1

/*
 * Super block magic: the ASCII characters "DZBD" packed into a 32-bit
 * value, most significant byte first.
 */
#define DMZ_MAGIC	(((unsigned int)'D' << 24) | \
			 ((unsigned int)'Z' << 16) | \
			 ((unsigned int)'B' <<  8) | \
			  (unsigned int)'D')
28 
29 /*
30  * On disk super block.
31  * This uses only 512 B but uses on disk a full 4KB block. This block is
32  * followed on disk by the mapping table of chunks to zones and the bitmap
33  * blocks indicating zone block validity.
34  * The overall resulting metadata format is:
35  *    (1) Super block (1 block)
36  *    (2) Chunk mapping table (nr_map_blocks)
37  *    (3) Bitmap blocks (nr_bitmap_blocks)
38  * All metadata blocks are stored in conventional zones, starting from
39  * the first conventional zone found on disk.
40  */
41 struct dmz_super {
42 	/* Magic number */
43 	__le32		magic;			/*   4 */
44 
45 	/* Metadata version number */
46 	__le32		version;		/*   8 */
47 
48 	/* Generation number */
49 	__le64		gen;			/*  16 */
50 
51 	/* This block number */
52 	__le64		sb_block;		/*  24 */
53 
54 	/* The number of metadata blocks, including this super block */
55 	__le32		nr_meta_blocks;		/*  28 */
56 
57 	/* The number of sequential zones reserved for reclaim */
58 	__le32		nr_reserved_seq;	/*  32 */
59 
60 	/* The number of entries in the mapping table */
61 	__le32		nr_chunks;		/*  36 */
62 
63 	/* The number of blocks used for the chunk mapping table */
64 	__le32		nr_map_blocks;		/*  40 */
65 
66 	/* The number of blocks used for the block bitmaps */
67 	__le32		nr_bitmap_blocks;	/*  44 */
68 
69 	/* Checksum */
70 	__le32		crc;			/*  48 */
71 
72 	/* Padding to full 512B sector */
73 	u8		reserved[464];		/* 512 */
74 };
75 
76 /*
77  * Chunk mapping entry: entries are indexed by chunk number
78  * and give the zone ID (dzone_id) mapping the chunk on disk.
79  * This zone may be sequential or random. If it is a sequential
80  * zone, a second zone (bzone_id) used as a write buffer may
81  * also be specified. This second zone will always be a randomly
82  * writeable zone.
83  */
84 struct dmz_map {
85 	__le32			dzone_id;
86 	__le32			bzone_id;
87 };
88 
89 /*
90  * Chunk mapping table metadata: 512 8-bytes entries per 4KB block.
91  */
92 #define DMZ_MAP_ENTRIES		(DMZ_BLOCK_SIZE / sizeof(struct dmz_map))
93 #define DMZ_MAP_ENTRIES_SHIFT	(ilog2(DMZ_MAP_ENTRIES))
94 #define DMZ_MAP_ENTRIES_MASK	(DMZ_MAP_ENTRIES - 1)
95 #define DMZ_MAP_UNMAPPED	UINT_MAX
96 
97 /*
98  * Meta data block descriptor (for cached metadata blocks).
99  */
100 struct dmz_mblock {
101 	struct rb_node		node;
102 	struct list_head	link;
103 	sector_t		no;
104 	unsigned int		ref;
105 	unsigned long		state;
106 	struct page		*page;
107 	void			*data;
108 };
109 
/*
 * Bit numbers used in the state field of struct dmz_mblock.
 */
enum {
	DMZ_META_DIRTY   = 0,	/* block modified, waiting to be flushed */
	DMZ_META_READING = 1,	/* read BIO in flight */
	DMZ_META_WRITING = 2,	/* write BIO in flight */
	DMZ_META_ERROR   = 3,	/* an I/O on the block failed */
};
119 
120 /*
121  * Super block information (one per metadata set).
122  */
123 struct dmz_sb {
124 	sector_t		block;
125 	struct dmz_mblock	*mblk;
126 	struct dmz_super	*sb;
127 };
128 
129 /*
130  * In-memory metadata.
131  */
132 struct dmz_metadata {
133 	struct dmz_dev		*dev;
134 
135 	sector_t		zone_bitmap_size;
136 	unsigned int		zone_nr_bitmap_blocks;
137 
138 	unsigned int		nr_bitmap_blocks;
139 	unsigned int		nr_map_blocks;
140 
141 	unsigned int		nr_useable_zones;
142 	unsigned int		nr_meta_blocks;
143 	unsigned int		nr_meta_zones;
144 	unsigned int		nr_data_zones;
145 	unsigned int		nr_rnd_zones;
146 	unsigned int		nr_reserved_seq;
147 	unsigned int		nr_chunks;
148 
149 	/* Zone information array */
150 	struct dm_zone		*zones;
151 
152 	struct dm_zone		*sb_zone;
153 	struct dmz_sb		sb[2];
154 	unsigned int		mblk_primary;
155 	u64			sb_gen;
156 	unsigned int		min_nr_mblks;
157 	unsigned int		max_nr_mblks;
158 	atomic_t		nr_mblks;
159 	struct rw_semaphore	mblk_sem;
160 	struct mutex		mblk_flush_lock;
161 	spinlock_t		mblk_lock;
162 	struct rb_root		mblk_rbtree;
163 	struct list_head	mblk_lru_list;
164 	struct list_head	mblk_dirty_list;
165 	struct shrinker		mblk_shrinker;
166 
167 	/* Zone allocation management */
168 	struct mutex		map_lock;
169 	struct dmz_mblock	**map_mblk;
170 	unsigned int		nr_rnd;
171 	atomic_t		unmap_nr_rnd;
172 	struct list_head	unmap_rnd_list;
173 	struct list_head	map_rnd_list;
174 
175 	unsigned int		nr_seq;
176 	atomic_t		unmap_nr_seq;
177 	struct list_head	unmap_seq_list;
178 	struct list_head	map_seq_list;
179 
180 	atomic_t		nr_reserved_seq_zones;
181 	struct list_head	reserved_seq_zones_list;
182 
183 	wait_queue_head_t	free_wq;
184 };
185 
186 /*
187  * Various accessors
188  */
189 unsigned int dmz_id(struct dmz_metadata *zmd, struct dm_zone *zone)
190 {
191 	return ((unsigned int)(zone - zmd->zones));
192 }
193 
194 sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone)
195 {
196 	return (sector_t)dmz_id(zmd, zone) << zmd->dev->zone_nr_sectors_shift;
197 }
198 
199 sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone)
200 {
201 	return (sector_t)dmz_id(zmd, zone) << zmd->dev->zone_nr_blocks_shift;
202 }
203 
204 unsigned int dmz_nr_chunks(struct dmz_metadata *zmd)
205 {
206 	return zmd->nr_chunks;
207 }
208 
209 unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd)
210 {
211 	return zmd->nr_rnd;
212 }
213 
214 unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd)
215 {
216 	return atomic_read(&zmd->unmap_nr_rnd);
217 }
218 
219 /*
220  * Lock/unlock mapping table.
221  * The map lock also protects all the zone lists.
222  */
223 void dmz_lock_map(struct dmz_metadata *zmd)
224 {
225 	mutex_lock(&zmd->map_lock);
226 }
227 
228 void dmz_unlock_map(struct dmz_metadata *zmd)
229 {
230 	mutex_unlock(&zmd->map_lock);
231 }
232 
233 /*
234  * Lock/unlock metadata access. This is a "read" lock on a semaphore
235  * that prevents metadata flush from running while metadata are being
236  * modified. The actual metadata write mutual exclusion is achieved with
237  * the map lock and zone state management (active and reclaim state are
238  * mutually exclusive).
239  */
240 void dmz_lock_metadata(struct dmz_metadata *zmd)
241 {
242 	down_read(&zmd->mblk_sem);
243 }
244 
245 void dmz_unlock_metadata(struct dmz_metadata *zmd)
246 {
247 	up_read(&zmd->mblk_sem);
248 }
249 
250 /*
251  * Lock/unlock flush: prevent concurrent executions
252  * of dmz_flush_metadata as well as metadata modification in reclaim
253  * while flush is being executed.
254  */
255 void dmz_lock_flush(struct dmz_metadata *zmd)
256 {
257 	mutex_lock(&zmd->mblk_flush_lock);
258 }
259 
260 void dmz_unlock_flush(struct dmz_metadata *zmd)
261 {
262 	mutex_unlock(&zmd->mblk_flush_lock);
263 }
264 
265 /*
266  * Allocate a metadata block.
267  */
268 static struct dmz_mblock *dmz_alloc_mblock(struct dmz_metadata *zmd,
269 					   sector_t mblk_no)
270 {
271 	struct dmz_mblock *mblk = NULL;
272 
273 	/* See if we can reuse cached blocks */
274 	if (zmd->max_nr_mblks && atomic_read(&zmd->nr_mblks) > zmd->max_nr_mblks) {
275 		spin_lock(&zmd->mblk_lock);
276 		mblk = list_first_entry_or_null(&zmd->mblk_lru_list,
277 						struct dmz_mblock, link);
278 		if (mblk) {
279 			list_del_init(&mblk->link);
280 			rb_erase(&mblk->node, &zmd->mblk_rbtree);
281 			mblk->no = mblk_no;
282 		}
283 		spin_unlock(&zmd->mblk_lock);
284 		if (mblk)
285 			return mblk;
286 	}
287 
288 	/* Allocate a new block */
289 	mblk = kmalloc(sizeof(struct dmz_mblock), GFP_NOIO);
290 	if (!mblk)
291 		return NULL;
292 
293 	mblk->page = alloc_page(GFP_NOIO);
294 	if (!mblk->page) {
295 		kfree(mblk);
296 		return NULL;
297 	}
298 
299 	RB_CLEAR_NODE(&mblk->node);
300 	INIT_LIST_HEAD(&mblk->link);
301 	mblk->ref = 0;
302 	mblk->state = 0;
303 	mblk->no = mblk_no;
304 	mblk->data = page_address(mblk->page);
305 
306 	atomic_inc(&zmd->nr_mblks);
307 
308 	return mblk;
309 }
310 
311 /*
312  * Free a metadata block.
313  */
314 static void dmz_free_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
315 {
316 	__free_pages(mblk->page, 0);
317 	kfree(mblk);
318 
319 	atomic_dec(&zmd->nr_mblks);
320 }
321 
322 /*
323  * Insert a metadata block in the rbtree.
324  */
325 static void dmz_insert_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
326 {
327 	struct rb_root *root = &zmd->mblk_rbtree;
328 	struct rb_node **new = &(root->rb_node), *parent = NULL;
329 	struct dmz_mblock *b;
330 
331 	/* Figure out where to put the new node */
332 	while (*new) {
333 		b = container_of(*new, struct dmz_mblock, node);
334 		parent = *new;
335 		new = (b->no < mblk->no) ? &((*new)->rb_left) : &((*new)->rb_right);
336 	}
337 
338 	/* Add new node and rebalance tree */
339 	rb_link_node(&mblk->node, parent, new);
340 	rb_insert_color(&mblk->node, root);
341 }
342 
343 /*
344  * Lookup a metadata block in the rbtree. If the block is found, increment
345  * its reference count.
346  */
347 static struct dmz_mblock *dmz_get_mblock_fast(struct dmz_metadata *zmd,
348 					      sector_t mblk_no)
349 {
350 	struct rb_root *root = &zmd->mblk_rbtree;
351 	struct rb_node *node = root->rb_node;
352 	struct dmz_mblock *mblk;
353 
354 	while (node) {
355 		mblk = container_of(node, struct dmz_mblock, node);
356 		if (mblk->no == mblk_no) {
357 			/*
358 			 * If this is the first reference to the block,
359 			 * remove it from the LRU list.
360 			 */
361 			mblk->ref++;
362 			if (mblk->ref == 1 &&
363 			    !test_bit(DMZ_META_DIRTY, &mblk->state))
364 				list_del_init(&mblk->link);
365 			return mblk;
366 		}
367 		node = (mblk->no < mblk_no) ? node->rb_left : node->rb_right;
368 	}
369 
370 	return NULL;
371 }
372 
373 /*
374  * Metadata block BIO end callback.
375  */
376 static void dmz_mblock_bio_end_io(struct bio *bio)
377 {
378 	struct dmz_mblock *mblk = bio->bi_private;
379 	int flag;
380 
381 	if (bio->bi_status)
382 		set_bit(DMZ_META_ERROR, &mblk->state);
383 
384 	if (bio_op(bio) == REQ_OP_WRITE)
385 		flag = DMZ_META_WRITING;
386 	else
387 		flag = DMZ_META_READING;
388 
389 	clear_bit_unlock(flag, &mblk->state);
390 	smp_mb__after_atomic();
391 	wake_up_bit(&mblk->state, flag);
392 
393 	bio_put(bio);
394 }
395 
396 /*
397  * Read an uncached metadata block from disk and add it to the cache.
398  */
399 static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
400 					      sector_t mblk_no)
401 {
402 	struct dmz_mblock *mblk, *m;
403 	sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no;
404 	struct bio *bio;
405 
406 	if (dmz_bdev_is_dying(zmd->dev))
407 		return ERR_PTR(-EIO);
408 
409 	/* Get a new block and a BIO to read it */
410 	mblk = dmz_alloc_mblock(zmd, mblk_no);
411 	if (!mblk)
412 		return ERR_PTR(-ENOMEM);
413 
414 	bio = bio_alloc(GFP_NOIO, 1);
415 	if (!bio) {
416 		dmz_free_mblock(zmd, mblk);
417 		return ERR_PTR(-ENOMEM);
418 	}
419 
420 	spin_lock(&zmd->mblk_lock);
421 
422 	/*
423 	 * Make sure that another context did not start reading
424 	 * the block already.
425 	 */
426 	m = dmz_get_mblock_fast(zmd, mblk_no);
427 	if (m) {
428 		spin_unlock(&zmd->mblk_lock);
429 		dmz_free_mblock(zmd, mblk);
430 		bio_put(bio);
431 		return m;
432 	}
433 
434 	mblk->ref++;
435 	set_bit(DMZ_META_READING, &mblk->state);
436 	dmz_insert_mblock(zmd, mblk);
437 
438 	spin_unlock(&zmd->mblk_lock);
439 
440 	/* Submit read BIO */
441 	bio->bi_iter.bi_sector = dmz_blk2sect(block);
442 	bio_set_dev(bio, zmd->dev->bdev);
443 	bio->bi_private = mblk;
444 	bio->bi_end_io = dmz_mblock_bio_end_io;
445 	bio_set_op_attrs(bio, REQ_OP_READ, REQ_META | REQ_PRIO);
446 	bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
447 	submit_bio(bio);
448 
449 	return mblk;
450 }
451 
452 /*
453  * Free metadata blocks.
454  */
455 static unsigned long dmz_shrink_mblock_cache(struct dmz_metadata *zmd,
456 					     unsigned long limit)
457 {
458 	struct dmz_mblock *mblk;
459 	unsigned long count = 0;
460 
461 	if (!zmd->max_nr_mblks)
462 		return 0;
463 
464 	while (!list_empty(&zmd->mblk_lru_list) &&
465 	       atomic_read(&zmd->nr_mblks) > zmd->min_nr_mblks &&
466 	       count < limit) {
467 		mblk = list_first_entry(&zmd->mblk_lru_list,
468 					struct dmz_mblock, link);
469 		list_del_init(&mblk->link);
470 		rb_erase(&mblk->node, &zmd->mblk_rbtree);
471 		dmz_free_mblock(zmd, mblk);
472 		count++;
473 	}
474 
475 	return count;
476 }
477 
478 /*
479  * For mblock shrinker: get the number of unused metadata blocks in the cache.
480  */
481 static unsigned long dmz_mblock_shrinker_count(struct shrinker *shrink,
482 					       struct shrink_control *sc)
483 {
484 	struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker);
485 
486 	return atomic_read(&zmd->nr_mblks);
487 }
488 
489 /*
490  * For mblock shrinker: scan unused metadata blocks and shrink the cache.
491  */
492 static unsigned long dmz_mblock_shrinker_scan(struct shrinker *shrink,
493 					      struct shrink_control *sc)
494 {
495 	struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker);
496 	unsigned long count;
497 
498 	spin_lock(&zmd->mblk_lock);
499 	count = dmz_shrink_mblock_cache(zmd, sc->nr_to_scan);
500 	spin_unlock(&zmd->mblk_lock);
501 
502 	return count ? count : SHRINK_STOP;
503 }
504 
505 /*
506  * Release a metadata block.
507  */
508 static void dmz_release_mblock(struct dmz_metadata *zmd,
509 			       struct dmz_mblock *mblk)
510 {
511 
512 	if (!mblk)
513 		return;
514 
515 	spin_lock(&zmd->mblk_lock);
516 
517 	mblk->ref--;
518 	if (mblk->ref == 0) {
519 		if (test_bit(DMZ_META_ERROR, &mblk->state)) {
520 			rb_erase(&mblk->node, &zmd->mblk_rbtree);
521 			dmz_free_mblock(zmd, mblk);
522 		} else if (!test_bit(DMZ_META_DIRTY, &mblk->state)) {
523 			list_add_tail(&mblk->link, &zmd->mblk_lru_list);
524 			dmz_shrink_mblock_cache(zmd, 1);
525 		}
526 	}
527 
528 	spin_unlock(&zmd->mblk_lock);
529 }
530 
531 /*
532  * Get a metadata block from the rbtree. If the block
533  * is not present, read it from disk.
534  */
535 static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
536 					 sector_t mblk_no)
537 {
538 	struct dmz_mblock *mblk;
539 
540 	/* Check rbtree */
541 	spin_lock(&zmd->mblk_lock);
542 	mblk = dmz_get_mblock_fast(zmd, mblk_no);
543 	spin_unlock(&zmd->mblk_lock);
544 
545 	if (!mblk) {
546 		/* Cache miss: read the block from disk */
547 		mblk = dmz_get_mblock_slow(zmd, mblk_no);
548 		if (IS_ERR(mblk))
549 			return mblk;
550 	}
551 
552 	/* Wait for on-going read I/O and check for error */
553 	wait_on_bit_io(&mblk->state, DMZ_META_READING,
554 		       TASK_UNINTERRUPTIBLE);
555 	if (test_bit(DMZ_META_ERROR, &mblk->state)) {
556 		dmz_release_mblock(zmd, mblk);
557 		return ERR_PTR(-EIO);
558 	}
559 
560 	return mblk;
561 }
562 
563 /*
564  * Mark a metadata block dirty.
565  */
566 static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
567 {
568 	spin_lock(&zmd->mblk_lock);
569 	if (!test_and_set_bit(DMZ_META_DIRTY, &mblk->state))
570 		list_add_tail(&mblk->link, &zmd->mblk_dirty_list);
571 	spin_unlock(&zmd->mblk_lock);
572 }
573 
574 /*
575  * Issue a metadata block write BIO.
576  */
577 static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
578 			    unsigned int set)
579 {
580 	sector_t block = zmd->sb[set].block + mblk->no;
581 	struct bio *bio;
582 
583 	if (dmz_bdev_is_dying(zmd->dev))
584 		return -EIO;
585 
586 	bio = bio_alloc(GFP_NOIO, 1);
587 	if (!bio) {
588 		set_bit(DMZ_META_ERROR, &mblk->state);
589 		return -ENOMEM;
590 	}
591 
592 	set_bit(DMZ_META_WRITING, &mblk->state);
593 
594 	bio->bi_iter.bi_sector = dmz_blk2sect(block);
595 	bio_set_dev(bio, zmd->dev->bdev);
596 	bio->bi_private = mblk;
597 	bio->bi_end_io = dmz_mblock_bio_end_io;
598 	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO);
599 	bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
600 	submit_bio(bio);
601 
602 	return 0;
603 }
604 
605 /*
606  * Read/write a metadata block.
607  */
608 static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block,
609 			  struct page *page)
610 {
611 	struct bio *bio;
612 	int ret;
613 
614 	if (dmz_bdev_is_dying(zmd->dev))
615 		return -EIO;
616 
617 	bio = bio_alloc(GFP_NOIO, 1);
618 	if (!bio)
619 		return -ENOMEM;
620 
621 	bio->bi_iter.bi_sector = dmz_blk2sect(block);
622 	bio_set_dev(bio, zmd->dev->bdev);
623 	bio_set_op_attrs(bio, op, REQ_SYNC | REQ_META | REQ_PRIO);
624 	bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0);
625 	ret = submit_bio_wait(bio);
626 	bio_put(bio);
627 
628 	return ret;
629 }
630 
631 /*
632  * Write super block of the specified metadata set.
633  */
634 static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set)
635 {
636 	sector_t block = zmd->sb[set].block;
637 	struct dmz_mblock *mblk = zmd->sb[set].mblk;
638 	struct dmz_super *sb = zmd->sb[set].sb;
639 	u64 sb_gen = zmd->sb_gen + 1;
640 	int ret;
641 
642 	sb->magic = cpu_to_le32(DMZ_MAGIC);
643 	sb->version = cpu_to_le32(DMZ_META_VER);
644 
645 	sb->gen = cpu_to_le64(sb_gen);
646 
647 	sb->sb_block = cpu_to_le64(block);
648 	sb->nr_meta_blocks = cpu_to_le32(zmd->nr_meta_blocks);
649 	sb->nr_reserved_seq = cpu_to_le32(zmd->nr_reserved_seq);
650 	sb->nr_chunks = cpu_to_le32(zmd->nr_chunks);
651 
652 	sb->nr_map_blocks = cpu_to_le32(zmd->nr_map_blocks);
653 	sb->nr_bitmap_blocks = cpu_to_le32(zmd->nr_bitmap_blocks);
654 
655 	sb->crc = 0;
656 	sb->crc = cpu_to_le32(crc32_le(sb_gen, (unsigned char *)sb, DMZ_BLOCK_SIZE));
657 
658 	ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, block, mblk->page);
659 	if (ret == 0)
660 		ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
661 
662 	return ret;
663 }
664 
665 /*
666  * Write dirty metadata blocks to the specified set.
667  */
668 static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
669 				   struct list_head *write_list,
670 				   unsigned int set)
671 {
672 	struct dmz_mblock *mblk;
673 	struct blk_plug plug;
674 	int ret = 0, nr_mblks_submitted = 0;
675 
676 	/* Issue writes */
677 	blk_start_plug(&plug);
678 	list_for_each_entry(mblk, write_list, link) {
679 		ret = dmz_write_mblock(zmd, mblk, set);
680 		if (ret)
681 			break;
682 		nr_mblks_submitted++;
683 	}
684 	blk_finish_plug(&plug);
685 
686 	/* Wait for completion */
687 	list_for_each_entry(mblk, write_list, link) {
688 		if (!nr_mblks_submitted)
689 			break;
690 		wait_on_bit_io(&mblk->state, DMZ_META_WRITING,
691 			       TASK_UNINTERRUPTIBLE);
692 		if (test_bit(DMZ_META_ERROR, &mblk->state)) {
693 			clear_bit(DMZ_META_ERROR, &mblk->state);
694 			ret = -EIO;
695 		}
696 		nr_mblks_submitted--;
697 	}
698 
699 	/* Flush drive cache (this will also sync data) */
700 	if (ret == 0)
701 		ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
702 
703 	return ret;
704 }
705 
706 /*
707  * Log dirty metadata blocks.
708  */
709 static int dmz_log_dirty_mblocks(struct dmz_metadata *zmd,
710 				 struct list_head *write_list)
711 {
712 	unsigned int log_set = zmd->mblk_primary ^ 0x1;
713 	int ret;
714 
715 	/* Write dirty blocks to the log */
716 	ret = dmz_write_dirty_mblocks(zmd, write_list, log_set);
717 	if (ret)
718 		return ret;
719 
720 	/*
721 	 * No error so far: now validate the log by updating the
722 	 * log index super block generation.
723 	 */
724 	ret = dmz_write_sb(zmd, log_set);
725 	if (ret)
726 		return ret;
727 
728 	return 0;
729 }
730 
731 /*
732  * Flush dirty metadata blocks.
733  */
734 int dmz_flush_metadata(struct dmz_metadata *zmd)
735 {
736 	struct dmz_mblock *mblk;
737 	struct list_head write_list;
738 	int ret;
739 
740 	if (WARN_ON(!zmd))
741 		return 0;
742 
743 	INIT_LIST_HEAD(&write_list);
744 
745 	/*
746 	 * Make sure that metadata blocks are stable before logging: take
747 	 * the write lock on the metadata semaphore to prevent target BIOs
748 	 * from modifying metadata.
749 	 */
750 	down_write(&zmd->mblk_sem);
751 
752 	/*
753 	 * This is called from the target flush work and reclaim work.
754 	 * Concurrent execution is not allowed.
755 	 */
756 	dmz_lock_flush(zmd);
757 
758 	if (dmz_bdev_is_dying(zmd->dev)) {
759 		ret = -EIO;
760 		goto out;
761 	}
762 
763 	/* Get dirty blocks */
764 	spin_lock(&zmd->mblk_lock);
765 	list_splice_init(&zmd->mblk_dirty_list, &write_list);
766 	spin_unlock(&zmd->mblk_lock);
767 
768 	/* If there are no dirty metadata blocks, just flush the device cache */
769 	if (list_empty(&write_list)) {
770 		ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
771 		goto out;
772 	}
773 
774 	/*
775 	 * The primary metadata set is still clean. Keep it this way until
776 	 * all updates are successful in the secondary set. That is, use
777 	 * the secondary set as a log.
778 	 */
779 	ret = dmz_log_dirty_mblocks(zmd, &write_list);
780 	if (ret)
781 		goto out;
782 
783 	/*
784 	 * The log is on disk. It is now safe to update in place
785 	 * in the primary metadata set.
786 	 */
787 	ret = dmz_write_dirty_mblocks(zmd, &write_list, zmd->mblk_primary);
788 	if (ret)
789 		goto out;
790 
791 	ret = dmz_write_sb(zmd, zmd->mblk_primary);
792 	if (ret)
793 		goto out;
794 
795 	while (!list_empty(&write_list)) {
796 		mblk = list_first_entry(&write_list, struct dmz_mblock, link);
797 		list_del_init(&mblk->link);
798 
799 		spin_lock(&zmd->mblk_lock);
800 		clear_bit(DMZ_META_DIRTY, &mblk->state);
801 		if (mblk->ref == 0)
802 			list_add_tail(&mblk->link, &zmd->mblk_lru_list);
803 		spin_unlock(&zmd->mblk_lock);
804 	}
805 
806 	zmd->sb_gen++;
807 out:
808 	if (ret && !list_empty(&write_list)) {
809 		spin_lock(&zmd->mblk_lock);
810 		list_splice(&write_list, &zmd->mblk_dirty_list);
811 		spin_unlock(&zmd->mblk_lock);
812 	}
813 
814 	dmz_unlock_flush(zmd);
815 	up_write(&zmd->mblk_sem);
816 
817 	return ret;
818 }
819 
820 /*
821  * Check super block.
822  */
823 static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_super *sb)
824 {
825 	unsigned int nr_meta_zones, nr_data_zones;
826 	struct dmz_dev *dev = zmd->dev;
827 	u32 crc, stored_crc;
828 	u64 gen;
829 
830 	gen = le64_to_cpu(sb->gen);
831 	stored_crc = le32_to_cpu(sb->crc);
832 	sb->crc = 0;
833 	crc = crc32_le(gen, (unsigned char *)sb, DMZ_BLOCK_SIZE);
834 	if (crc != stored_crc) {
835 		dmz_dev_err(dev, "Invalid checksum (needed 0x%08x, got 0x%08x)",
836 			    crc, stored_crc);
837 		return -ENXIO;
838 	}
839 
840 	if (le32_to_cpu(sb->magic) != DMZ_MAGIC) {
841 		dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)",
842 			    DMZ_MAGIC, le32_to_cpu(sb->magic));
843 		return -ENXIO;
844 	}
845 
846 	if (le32_to_cpu(sb->version) != DMZ_META_VER) {
847 		dmz_dev_err(dev, "Invalid meta version (needed %d, got %d)",
848 			    DMZ_META_VER, le32_to_cpu(sb->version));
849 		return -ENXIO;
850 	}
851 
852 	nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + dev->zone_nr_blocks - 1)
853 		>> dev->zone_nr_blocks_shift;
854 	if (!nr_meta_zones ||
855 	    nr_meta_zones >= zmd->nr_rnd_zones) {
856 		dmz_dev_err(dev, "Invalid number of metadata blocks");
857 		return -ENXIO;
858 	}
859 
860 	if (!le32_to_cpu(sb->nr_reserved_seq) ||
861 	    le32_to_cpu(sb->nr_reserved_seq) >= (zmd->nr_useable_zones - nr_meta_zones)) {
862 		dmz_dev_err(dev, "Invalid number of reserved sequential zones");
863 		return -ENXIO;
864 	}
865 
866 	nr_data_zones = zmd->nr_useable_zones -
867 		(nr_meta_zones * 2 + le32_to_cpu(sb->nr_reserved_seq));
868 	if (le32_to_cpu(sb->nr_chunks) > nr_data_zones) {
869 		dmz_dev_err(dev, "Invalid number of chunks %u / %u",
870 			    le32_to_cpu(sb->nr_chunks), nr_data_zones);
871 		return -ENXIO;
872 	}
873 
874 	/* OK */
875 	zmd->nr_meta_blocks = le32_to_cpu(sb->nr_meta_blocks);
876 	zmd->nr_reserved_seq = le32_to_cpu(sb->nr_reserved_seq);
877 	zmd->nr_chunks = le32_to_cpu(sb->nr_chunks);
878 	zmd->nr_map_blocks = le32_to_cpu(sb->nr_map_blocks);
879 	zmd->nr_bitmap_blocks = le32_to_cpu(sb->nr_bitmap_blocks);
880 	zmd->nr_meta_zones = nr_meta_zones;
881 	zmd->nr_data_zones = nr_data_zones;
882 
883 	return 0;
884 }
885 
886 /*
887  * Read the first or second super block from disk.
888  */
889 static int dmz_read_sb(struct dmz_metadata *zmd, unsigned int set)
890 {
891 	return dmz_rdwr_block(zmd, REQ_OP_READ, zmd->sb[set].block,
892 			      zmd->sb[set].mblk->page);
893 }
894 
895 /*
896  * Determine the position of the secondary super blocks on disk.
897  * This is used only if a corruption of the primary super block
898  * is detected.
899  */
900 static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd)
901 {
902 	unsigned int zone_nr_blocks = zmd->dev->zone_nr_blocks;
903 	struct dmz_mblock *mblk;
904 	int i;
905 
906 	/* Allocate a block */
907 	mblk = dmz_alloc_mblock(zmd, 0);
908 	if (!mblk)
909 		return -ENOMEM;
910 
911 	zmd->sb[1].mblk = mblk;
912 	zmd->sb[1].sb = mblk->data;
913 
914 	/* Bad first super block: search for the second one */
915 	zmd->sb[1].block = zmd->sb[0].block + zone_nr_blocks;
916 	for (i = 0; i < zmd->nr_rnd_zones - 1; i++) {
917 		if (dmz_read_sb(zmd, 1) != 0)
918 			break;
919 		if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC)
920 			return 0;
921 		zmd->sb[1].block += zone_nr_blocks;
922 	}
923 
924 	dmz_free_mblock(zmd, mblk);
925 	zmd->sb[1].mblk = NULL;
926 
927 	return -EIO;
928 }
929 
930 /*
931  * Read the first or second super block from disk.
932  */
933 static int dmz_get_sb(struct dmz_metadata *zmd, unsigned int set)
934 {
935 	struct dmz_mblock *mblk;
936 	int ret;
937 
938 	/* Allocate a block */
939 	mblk = dmz_alloc_mblock(zmd, 0);
940 	if (!mblk)
941 		return -ENOMEM;
942 
943 	zmd->sb[set].mblk = mblk;
944 	zmd->sb[set].sb = mblk->data;
945 
946 	/* Read super block */
947 	ret = dmz_read_sb(zmd, set);
948 	if (ret) {
949 		dmz_free_mblock(zmd, mblk);
950 		zmd->sb[set].mblk = NULL;
951 		return ret;
952 	}
953 
954 	return 0;
955 }
956 
957 /*
958  * Recover a metadata set.
959  */
960 static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set)
961 {
962 	unsigned int src_set = dst_set ^ 0x1;
963 	struct page *page;
964 	int i, ret;
965 
966 	dmz_dev_warn(zmd->dev, "Metadata set %u invalid: recovering", dst_set);
967 
968 	if (dst_set == 0)
969 		zmd->sb[0].block = dmz_start_block(zmd, zmd->sb_zone);
970 	else {
971 		zmd->sb[1].block = zmd->sb[0].block +
972 			(zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift);
973 	}
974 
975 	page = alloc_page(GFP_NOIO);
976 	if (!page)
977 		return -ENOMEM;
978 
979 	/* Copy metadata blocks */
980 	for (i = 1; i < zmd->nr_meta_blocks; i++) {
981 		ret = dmz_rdwr_block(zmd, REQ_OP_READ,
982 				     zmd->sb[src_set].block + i, page);
983 		if (ret)
984 			goto out;
985 		ret = dmz_rdwr_block(zmd, REQ_OP_WRITE,
986 				     zmd->sb[dst_set].block + i, page);
987 		if (ret)
988 			goto out;
989 	}
990 
991 	/* Finalize with the super block */
992 	if (!zmd->sb[dst_set].mblk) {
993 		zmd->sb[dst_set].mblk = dmz_alloc_mblock(zmd, 0);
994 		if (!zmd->sb[dst_set].mblk) {
995 			ret = -ENOMEM;
996 			goto out;
997 		}
998 		zmd->sb[dst_set].sb = zmd->sb[dst_set].mblk->data;
999 	}
1000 
1001 	ret = dmz_write_sb(zmd, dst_set);
1002 out:
1003 	__free_pages(page, 0);
1004 
1005 	return ret;
1006 }
1007 
1008 /*
1009  * Get super block from disk.
1010  */
1011 static int dmz_load_sb(struct dmz_metadata *zmd)
1012 {
1013 	bool sb_good[2] = {false, false};
1014 	u64 sb_gen[2] = {0, 0};
1015 	int ret;
1016 
1017 	/* Read and check the primary super block */
1018 	zmd->sb[0].block = dmz_start_block(zmd, zmd->sb_zone);
1019 	ret = dmz_get_sb(zmd, 0);
1020 	if (ret) {
1021 		dmz_dev_err(zmd->dev, "Read primary super block failed");
1022 		return ret;
1023 	}
1024 
1025 	ret = dmz_check_sb(zmd, zmd->sb[0].sb);
1026 
1027 	/* Read and check secondary super block */
1028 	if (ret == 0) {
1029 		sb_good[0] = true;
1030 		zmd->sb[1].block = zmd->sb[0].block +
1031 			(zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift);
1032 		ret = dmz_get_sb(zmd, 1);
1033 	} else
1034 		ret = dmz_lookup_secondary_sb(zmd);
1035 
1036 	if (ret) {
1037 		dmz_dev_err(zmd->dev, "Read secondary super block failed");
1038 		return ret;
1039 	}
1040 
1041 	ret = dmz_check_sb(zmd, zmd->sb[1].sb);
1042 	if (ret == 0)
1043 		sb_good[1] = true;
1044 
1045 	/* Use highest generation sb first */
1046 	if (!sb_good[0] && !sb_good[1]) {
1047 		dmz_dev_err(zmd->dev, "No valid super block found");
1048 		return -EIO;
1049 	}
1050 
1051 	if (sb_good[0])
1052 		sb_gen[0] = le64_to_cpu(zmd->sb[0].sb->gen);
1053 	else
1054 		ret = dmz_recover_mblocks(zmd, 0);
1055 
1056 	if (sb_good[1])
1057 		sb_gen[1] = le64_to_cpu(zmd->sb[1].sb->gen);
1058 	else
1059 		ret = dmz_recover_mblocks(zmd, 1);
1060 
1061 	if (ret) {
1062 		dmz_dev_err(zmd->dev, "Recovery failed");
1063 		return -EIO;
1064 	}
1065 
1066 	if (sb_gen[0] >= sb_gen[1]) {
1067 		zmd->sb_gen = sb_gen[0];
1068 		zmd->mblk_primary = 0;
1069 	} else {
1070 		zmd->sb_gen = sb_gen[1];
1071 		zmd->mblk_primary = 1;
1072 	}
1073 
1074 	dmz_dev_debug(zmd->dev, "Using super block %u (gen %llu)",
1075 		      zmd->mblk_primary, zmd->sb_gen);
1076 
1077 	return 0;
1078 }
1079 
1080 /*
1081  * Initialize a zone descriptor.
1082  */
1083 static int dmz_init_zone(struct dmz_metadata *zmd, struct dm_zone *zone,
1084 			 struct blk_zone *blkz)
1085 {
1086 	struct dmz_dev *dev = zmd->dev;
1087 
1088 	/* Ignore the eventual last runt (smaller) zone */
1089 	if (blkz->len != dev->zone_nr_sectors) {
1090 		if (blkz->start + blkz->len == dev->capacity)
1091 			return 0;
1092 		return -ENXIO;
1093 	}
1094 
1095 	INIT_LIST_HEAD(&zone->link);
1096 	atomic_set(&zone->refcount, 0);
1097 	zone->chunk = DMZ_MAP_UNMAPPED;
1098 
1099 	if (blkz->type == BLK_ZONE_TYPE_CONVENTIONAL) {
1100 		set_bit(DMZ_RND, &zone->flags);
1101 		zmd->nr_rnd_zones++;
1102 	} else if (blkz->type == BLK_ZONE_TYPE_SEQWRITE_REQ ||
1103 		   blkz->type == BLK_ZONE_TYPE_SEQWRITE_PREF) {
1104 		set_bit(DMZ_SEQ, &zone->flags);
1105 	} else
1106 		return -ENXIO;
1107 
1108 	if (blkz->cond == BLK_ZONE_COND_OFFLINE)
1109 		set_bit(DMZ_OFFLINE, &zone->flags);
1110 	else if (blkz->cond == BLK_ZONE_COND_READONLY)
1111 		set_bit(DMZ_READ_ONLY, &zone->flags);
1112 
1113 	if (dmz_is_rnd(zone))
1114 		zone->wp_block = 0;
1115 	else
1116 		zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start);
1117 
1118 	if (!dmz_is_offline(zone) && !dmz_is_readonly(zone)) {
1119 		zmd->nr_useable_zones++;
1120 		if (dmz_is_rnd(zone)) {
1121 			zmd->nr_rnd_zones++;
1122 			if (!zmd->sb_zone) {
1123 				/* Super block zone */
1124 				zmd->sb_zone = zone;
1125 			}
1126 		}
1127 	}
1128 
1129 	return 0;
1130 }
1131 
1132 /*
1133  * Free zones descriptors.
1134  */
1135 static void dmz_drop_zones(struct dmz_metadata *zmd)
1136 {
1137 	kfree(zmd->zones);
1138 	zmd->zones = NULL;
1139 }
1140 
1141 /*
1142  * The size of a zone report in number of zones.
1143  * This results in 4096*64B=256KB report zones commands.
1144  */
1145 #define DMZ_REPORT_NR_ZONES	4096
1146 
1147 /*
1148  * Allocate and initialize zone descriptors using the zone
1149  * information from disk.
1150  */
1151 static int dmz_init_zones(struct dmz_metadata *zmd)
1152 {
1153 	struct dmz_dev *dev = zmd->dev;
1154 	struct dm_zone *zone;
1155 	struct blk_zone *blkz;
1156 	unsigned int nr_blkz;
1157 	sector_t sector = 0;
1158 	int i, ret = 0;
1159 
1160 	/* Init */
1161 	zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3;
1162 	zmd->zone_nr_bitmap_blocks = zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT;
1163 
1164 	/* Allocate zone array */
1165 	zmd->zones = kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL);
1166 	if (!zmd->zones)
1167 		return -ENOMEM;
1168 
1169 	dmz_dev_info(dev, "Using %zu B for zone information",
1170 		     sizeof(struct dm_zone) * dev->nr_zones);
1171 
1172 	/* Get zone information */
1173 	nr_blkz = DMZ_REPORT_NR_ZONES;
1174 	blkz = kcalloc(nr_blkz, sizeof(struct blk_zone), GFP_KERNEL);
1175 	if (!blkz) {
1176 		ret = -ENOMEM;
1177 		goto out;
1178 	}
1179 
1180 	/*
1181 	 * Get zone information and initialize zone descriptors.
1182 	 * At the same time, determine where the super block
1183 	 * should be: first block of the first randomly writable
1184 	 * zone.
1185 	 */
1186 	zone = zmd->zones;
1187 	while (sector < dev->capacity) {
1188 		/* Get zone information */
1189 		nr_blkz = DMZ_REPORT_NR_ZONES;
1190 		ret = blkdev_report_zones(dev->bdev, sector, blkz, &nr_blkz);
1191 		if (ret) {
1192 			dmz_dev_err(dev, "Report zones failed %d", ret);
1193 			goto out;
1194 		}
1195 
1196 		if (!nr_blkz)
1197 			break;
1198 
1199 		/* Process report */
1200 		for (i = 0; i < nr_blkz; i++) {
1201 			ret = dmz_init_zone(zmd, zone, &blkz[i]);
1202 			if (ret)
1203 				goto out;
1204 			sector += dev->zone_nr_sectors;
1205 			zone++;
1206 		}
1207 	}
1208 
1209 	/* The entire zone configuration of the disk should now be known */
1210 	if (sector < dev->capacity) {
1211 		dmz_dev_err(dev, "Failed to get correct zone information");
1212 		ret = -ENXIO;
1213 	}
1214 out:
1215 	kfree(blkz);
1216 	if (ret)
1217 		dmz_drop_zones(zmd);
1218 
1219 	return ret;
1220 }
1221 
1222 /*
1223  * Update a zone information.
1224  */
1225 static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
1226 {
1227 	unsigned int nr_blkz = 1;
1228 	unsigned int noio_flag;
1229 	struct blk_zone blkz;
1230 	int ret;
1231 
1232 	/*
1233 	 * Get zone information from disk. Since blkdev_report_zones() uses
1234 	 * GFP_KERNEL by default for memory allocations, set the per-task
1235 	 * PF_MEMALLOC_NOIO flag so that all allocations are done as if
1236 	 * GFP_NOIO was specified.
1237 	 */
1238 	noio_flag = memalloc_noio_save();
1239 	ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone),
1240 				  &blkz, &nr_blkz);
1241 	memalloc_noio_restore(noio_flag);
1242 	if (!nr_blkz)
1243 		ret = -EIO;
1244 	if (ret) {
1245 		dmz_dev_err(zmd->dev, "Get zone %u report failed",
1246 			    dmz_id(zmd, zone));
1247 		return ret;
1248 	}
1249 
1250 	clear_bit(DMZ_OFFLINE, &zone->flags);
1251 	clear_bit(DMZ_READ_ONLY, &zone->flags);
1252 	if (blkz.cond == BLK_ZONE_COND_OFFLINE)
1253 		set_bit(DMZ_OFFLINE, &zone->flags);
1254 	else if (blkz.cond == BLK_ZONE_COND_READONLY)
1255 		set_bit(DMZ_READ_ONLY, &zone->flags);
1256 
1257 	if (dmz_is_seq(zone))
1258 		zone->wp_block = dmz_sect2blk(blkz.wp - blkz.start);
1259 	else
1260 		zone->wp_block = 0;
1261 
1262 	return 0;
1263 }
1264 
1265 /*
1266  * Check a zone write pointer position when the zone is marked
1267  * with the sequential write error flag.
1268  */
1269 static int dmz_handle_seq_write_err(struct dmz_metadata *zmd,
1270 				    struct dm_zone *zone)
1271 {
1272 	unsigned int wp = 0;
1273 	int ret;
1274 
1275 	wp = zone->wp_block;
1276 	ret = dmz_update_zone(zmd, zone);
1277 	if (ret)
1278 		return ret;
1279 
1280 	dmz_dev_warn(zmd->dev, "Processing zone %u write error (zone wp %u/%u)",
1281 		     dmz_id(zmd, zone), zone->wp_block, wp);
1282 
1283 	if (zone->wp_block < wp) {
1284 		dmz_invalidate_blocks(zmd, zone, zone->wp_block,
1285 				      wp - zone->wp_block);
1286 	}
1287 
1288 	return 0;
1289 }
1290 
1291 static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id)
1292 {
1293 	return &zmd->zones[zone_id];
1294 }
1295 
1296 /*
1297  * Reset a zone write pointer.
1298  */
1299 static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
1300 {
1301 	int ret;
1302 
1303 	/*
1304 	 * Ignore offline zones, read only zones,
1305 	 * and conventional zones.
1306 	 */
1307 	if (dmz_is_offline(zone) ||
1308 	    dmz_is_readonly(zone) ||
1309 	    dmz_is_rnd(zone))
1310 		return 0;
1311 
1312 	if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) {
1313 		struct dmz_dev *dev = zmd->dev;
1314 
1315 		ret = blkdev_reset_zones(dev->bdev,
1316 					 dmz_start_sect(zmd, zone),
1317 					 dev->zone_nr_sectors, GFP_NOIO);
1318 		if (ret) {
1319 			dmz_dev_err(dev, "Reset zone %u failed %d",
1320 				    dmz_id(zmd, zone), ret);
1321 			return ret;
1322 		}
1323 	}
1324 
1325 	/* Clear write error bit and rewind write pointer position */
1326 	clear_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
1327 	zone->wp_block = 0;
1328 
1329 	return 0;
1330 }
1331 
1332 static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone);
1333 
1334 /*
1335  * Initialize chunk mapping.
1336  */
1337 static int dmz_load_mapping(struct dmz_metadata *zmd)
1338 {
1339 	struct dmz_dev *dev = zmd->dev;
1340 	struct dm_zone *dzone, *bzone;
1341 	struct dmz_mblock *dmap_mblk = NULL;
1342 	struct dmz_map *dmap;
1343 	unsigned int i = 0, e = 0, chunk = 0;
1344 	unsigned int dzone_id;
1345 	unsigned int bzone_id;
1346 
1347 	/* Metadata block array for the chunk mapping table */
1348 	zmd->map_mblk = kcalloc(zmd->nr_map_blocks,
1349 				sizeof(struct dmz_mblk *), GFP_KERNEL);
1350 	if (!zmd->map_mblk)
1351 		return -ENOMEM;
1352 
1353 	/* Get chunk mapping table blocks and initialize zone mapping */
1354 	while (chunk < zmd->nr_chunks) {
1355 		if (!dmap_mblk) {
1356 			/* Get mapping block */
1357 			dmap_mblk = dmz_get_mblock(zmd, i + 1);
1358 			if (IS_ERR(dmap_mblk))
1359 				return PTR_ERR(dmap_mblk);
1360 			zmd->map_mblk[i] = dmap_mblk;
1361 			dmap = (struct dmz_map *) dmap_mblk->data;
1362 			i++;
1363 			e = 0;
1364 		}
1365 
1366 		/* Check data zone */
1367 		dzone_id = le32_to_cpu(dmap[e].dzone_id);
1368 		if (dzone_id == DMZ_MAP_UNMAPPED)
1369 			goto next;
1370 
1371 		if (dzone_id >= dev->nr_zones) {
1372 			dmz_dev_err(dev, "Chunk %u mapping: invalid data zone ID %u",
1373 				    chunk, dzone_id);
1374 			return -EIO;
1375 		}
1376 
1377 		dzone = dmz_get(zmd, dzone_id);
1378 		set_bit(DMZ_DATA, &dzone->flags);
1379 		dzone->chunk = chunk;
1380 		dmz_get_zone_weight(zmd, dzone);
1381 
1382 		if (dmz_is_rnd(dzone))
1383 			list_add_tail(&dzone->link, &zmd->map_rnd_list);
1384 		else
1385 			list_add_tail(&dzone->link, &zmd->map_seq_list);
1386 
1387 		/* Check buffer zone */
1388 		bzone_id = le32_to_cpu(dmap[e].bzone_id);
1389 		if (bzone_id == DMZ_MAP_UNMAPPED)
1390 			goto next;
1391 
1392 		if (bzone_id >= dev->nr_zones) {
1393 			dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone ID %u",
1394 				    chunk, bzone_id);
1395 			return -EIO;
1396 		}
1397 
1398 		bzone = dmz_get(zmd, bzone_id);
1399 		if (!dmz_is_rnd(bzone)) {
1400 			dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone %u",
1401 				    chunk, bzone_id);
1402 			return -EIO;
1403 		}
1404 
1405 		set_bit(DMZ_DATA, &bzone->flags);
1406 		set_bit(DMZ_BUF, &bzone->flags);
1407 		bzone->chunk = chunk;
1408 		bzone->bzone = dzone;
1409 		dzone->bzone = bzone;
1410 		dmz_get_zone_weight(zmd, bzone);
1411 		list_add_tail(&bzone->link, &zmd->map_rnd_list);
1412 next:
1413 		chunk++;
1414 		e++;
1415 		if (e >= DMZ_MAP_ENTRIES)
1416 			dmap_mblk = NULL;
1417 	}
1418 
1419 	/*
1420 	 * At this point, only meta zones and mapped data zones were
1421 	 * fully initialized. All remaining zones are unmapped data
1422 	 * zones. Finish initializing those here.
1423 	 */
1424 	for (i = 0; i < dev->nr_zones; i++) {
1425 		dzone = dmz_get(zmd, i);
1426 		if (dmz_is_meta(dzone))
1427 			continue;
1428 
1429 		if (dmz_is_rnd(dzone))
1430 			zmd->nr_rnd++;
1431 		else
1432 			zmd->nr_seq++;
1433 
1434 		if (dmz_is_data(dzone)) {
1435 			/* Already initialized */
1436 			continue;
1437 		}
1438 
1439 		/* Unmapped data zone */
1440 		set_bit(DMZ_DATA, &dzone->flags);
1441 		dzone->chunk = DMZ_MAP_UNMAPPED;
1442 		if (dmz_is_rnd(dzone)) {
1443 			list_add_tail(&dzone->link, &zmd->unmap_rnd_list);
1444 			atomic_inc(&zmd->unmap_nr_rnd);
1445 		} else if (atomic_read(&zmd->nr_reserved_seq_zones) < zmd->nr_reserved_seq) {
1446 			list_add_tail(&dzone->link, &zmd->reserved_seq_zones_list);
1447 			atomic_inc(&zmd->nr_reserved_seq_zones);
1448 			zmd->nr_seq--;
1449 		} else {
1450 			list_add_tail(&dzone->link, &zmd->unmap_seq_list);
1451 			atomic_inc(&zmd->unmap_nr_seq);
1452 		}
1453 	}
1454 
1455 	return 0;
1456 }
1457 
1458 /*
1459  * Set a data chunk mapping.
1460  */
1461 static void dmz_set_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk,
1462 				  unsigned int dzone_id, unsigned int bzone_id)
1463 {
1464 	struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT];
1465 	struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data;
1466 	int map_idx = chunk & DMZ_MAP_ENTRIES_MASK;
1467 
1468 	dmap[map_idx].dzone_id = cpu_to_le32(dzone_id);
1469 	dmap[map_idx].bzone_id = cpu_to_le32(bzone_id);
1470 	dmz_dirty_mblock(zmd, dmap_mblk);
1471 }
1472 
1473 /*
1474  * The list of mapped zones is maintained in LRU order.
1475  * This rotates a zone at the end of its map list.
1476  */
1477 static void __dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
1478 {
1479 	if (list_empty(&zone->link))
1480 		return;
1481 
1482 	list_del_init(&zone->link);
1483 	if (dmz_is_seq(zone)) {
1484 		/* LRU rotate sequential zone */
1485 		list_add_tail(&zone->link, &zmd->map_seq_list);
1486 	} else {
1487 		/* LRU rotate random zone */
1488 		list_add_tail(&zone->link, &zmd->map_rnd_list);
1489 	}
1490 }
1491 
1492 /*
1493  * The list of mapped random zones is maintained
1494  * in LRU order. This rotates a zone at the end of the list.
1495  */
1496 static void dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
1497 {
1498 	__dmz_lru_zone(zmd, zone);
1499 	if (zone->bzone)
1500 		__dmz_lru_zone(zmd, zone->bzone);
1501 }
1502 
/*
 * Wait for any zone to be freed.
 *
 * The caller must hold both the metadata lock and the map lock: they
 * are dropped while sleeping and taken again before returning. The
 * sleep ends when zmd->free_wq is woken up (dmz_free_zone() does so)
 * or after a timeout of HZ jiffies, whichever comes first. No free
 * zone is guaranteed to be available on return, so callers must retry
 * their allocation.
 */
static void dmz_wait_for_free_zones(struct dmz_metadata *zmd)
{
	DEFINE_WAIT(wait);

	/* Queue on the wait queue before dropping the locks */
	prepare_to_wait(&zmd->free_wq, &wait, TASK_UNINTERRUPTIBLE);
	dmz_unlock_map(zmd);
	dmz_unlock_metadata(zmd);

	io_schedule_timeout(HZ);

	/* Take the locks back in the reverse order they were released */
	dmz_lock_metadata(zmd);
	dmz_lock_map(zmd);
	finish_wait(&zmd->free_wq, &wait);
}
1520 
1521 /*
1522  * Lock a zone for reclaim (set the zone RECLAIM bit).
1523  * Returns false if the zone cannot be locked or if it is already locked
1524  * and 1 otherwise.
1525  */
1526 int dmz_lock_zone_reclaim(struct dm_zone *zone)
1527 {
1528 	/* Active zones cannot be reclaimed */
1529 	if (dmz_is_active(zone))
1530 		return 0;
1531 
1532 	return !test_and_set_bit(DMZ_RECLAIM, &zone->flags);
1533 }
1534 
/*
 * Clear a zone reclaim flag and wake up the tasks waiting on it
 * (see dmz_wait_for_reclaim()).
 *
 * A warning is issued if the zone is active or is not locked for
 * reclaim.
 */
void dmz_unlock_zone_reclaim(struct dm_zone *zone)
{
	WARN_ON(dmz_is_active(zone));
	WARN_ON(!dmz_in_reclaim(zone));

	clear_bit_unlock(DMZ_RECLAIM, &zone->flags);
	/* Order the bit clear before the waiter wake up */
	smp_mb__after_atomic();
	wake_up_bit(&zone->flags, DMZ_RECLAIM);
}
1547 
/*
 * Wait for a zone reclaim to complete.
 *
 * The caller must hold both the metadata lock and the map lock: they
 * are dropped while waiting and taken again before returning. The wait
 * is bounded by a timeout of HZ jiffies, so the zone may still be in
 * reclaim on return and callers must check again.
 */
static void dmz_wait_for_reclaim(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	dmz_unlock_map(zmd);
	dmz_unlock_metadata(zmd);
	wait_on_bit_timeout(&zone->flags, DMZ_RECLAIM, TASK_UNINTERRUPTIBLE, HZ);
	/* Take the locks back in the reverse order they were released */
	dmz_lock_metadata(zmd);
	dmz_lock_map(zmd);
}
1559 
1560 /*
1561  * Select a random write zone for reclaim.
1562  */
1563 static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd)
1564 {
1565 	struct dm_zone *dzone = NULL;
1566 	struct dm_zone *zone;
1567 
1568 	if (list_empty(&zmd->map_rnd_list))
1569 		return ERR_PTR(-EBUSY);
1570 
1571 	list_for_each_entry(zone, &zmd->map_rnd_list, link) {
1572 		if (dmz_is_buf(zone))
1573 			dzone = zone->bzone;
1574 		else
1575 			dzone = zone;
1576 		if (dmz_lock_zone_reclaim(dzone))
1577 			return dzone;
1578 	}
1579 
1580 	return ERR_PTR(-EBUSY);
1581 }
1582 
1583 /*
1584  * Select a buffered sequential zone for reclaim.
1585  */
1586 static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd)
1587 {
1588 	struct dm_zone *zone;
1589 
1590 	if (list_empty(&zmd->map_seq_list))
1591 		return ERR_PTR(-EBUSY);
1592 
1593 	list_for_each_entry(zone, &zmd->map_seq_list, link) {
1594 		if (!zone->bzone)
1595 			continue;
1596 		if (dmz_lock_zone_reclaim(zone))
1597 			return zone;
1598 	}
1599 
1600 	return ERR_PTR(-EBUSY);
1601 }
1602 
1603 /*
1604  * Select a zone for reclaim.
1605  */
1606 struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd)
1607 {
1608 	struct dm_zone *zone;
1609 
1610 	/*
1611 	 * Search for a zone candidate to reclaim: 2 cases are possible.
1612 	 * (1) There is no free sequential zones. Then a random data zone
1613 	 *     cannot be reclaimed. So choose a sequential zone to reclaim so
1614 	 *     that afterward a random zone can be reclaimed.
1615 	 * (2) At least one free sequential zone is available, then choose
1616 	 *     the oldest random zone (data or buffer) that can be locked.
1617 	 */
1618 	dmz_lock_map(zmd);
1619 	if (list_empty(&zmd->reserved_seq_zones_list))
1620 		zone = dmz_get_seq_zone_for_reclaim(zmd);
1621 	else
1622 		zone = dmz_get_rnd_zone_for_reclaim(zmd);
1623 	dmz_unlock_map(zmd);
1624 
1625 	return zone;
1626 }
1627 
1628 /*
1629  * Get the zone mapping a chunk, if the chunk is mapped already.
1630  * If no mapping exist and the operation is WRITE, a zone is
1631  * allocated and used to map the chunk.
1632  * The zone returned will be set to the active state.
1633  */
1634 struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk, int op)
1635 {
1636 	struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT];
1637 	struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data;
1638 	int dmap_idx = chunk & DMZ_MAP_ENTRIES_MASK;
1639 	unsigned int dzone_id;
1640 	struct dm_zone *dzone = NULL;
1641 	int ret = 0;
1642 
1643 	dmz_lock_map(zmd);
1644 again:
1645 	/* Get the chunk mapping */
1646 	dzone_id = le32_to_cpu(dmap[dmap_idx].dzone_id);
1647 	if (dzone_id == DMZ_MAP_UNMAPPED) {
1648 		/*
1649 		 * Read or discard in unmapped chunks are fine. But for
1650 		 * writes, we need a mapping, so get one.
1651 		 */
1652 		if (op != REQ_OP_WRITE)
1653 			goto out;
1654 
1655 		/* Allocate a random zone */
1656 		dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
1657 		if (!dzone) {
1658 			if (dmz_bdev_is_dying(zmd->dev)) {
1659 				dzone = ERR_PTR(-EIO);
1660 				goto out;
1661 			}
1662 			dmz_wait_for_free_zones(zmd);
1663 			goto again;
1664 		}
1665 
1666 		dmz_map_zone(zmd, dzone, chunk);
1667 
1668 	} else {
1669 		/* The chunk is already mapped: get the mapping zone */
1670 		dzone = dmz_get(zmd, dzone_id);
1671 		if (dzone->chunk != chunk) {
1672 			dzone = ERR_PTR(-EIO);
1673 			goto out;
1674 		}
1675 
1676 		/* Repair write pointer if the sequential dzone has error */
1677 		if (dmz_seq_write_err(dzone)) {
1678 			ret = dmz_handle_seq_write_err(zmd, dzone);
1679 			if (ret) {
1680 				dzone = ERR_PTR(-EIO);
1681 				goto out;
1682 			}
1683 			clear_bit(DMZ_SEQ_WRITE_ERR, &dzone->flags);
1684 		}
1685 	}
1686 
1687 	/*
1688 	 * If the zone is being reclaimed, the chunk mapping may change
1689 	 * to a different zone. So wait for reclaim and retry. Otherwise,
1690 	 * activate the zone (this will prevent reclaim from touching it).
1691 	 */
1692 	if (dmz_in_reclaim(dzone)) {
1693 		dmz_wait_for_reclaim(zmd, dzone);
1694 		goto again;
1695 	}
1696 	dmz_activate_zone(dzone);
1697 	dmz_lru_zone(zmd, dzone);
1698 out:
1699 	dmz_unlock_map(zmd);
1700 
1701 	return dzone;
1702 }
1703 
1704 /*
1705  * Write and discard change the block validity of data zones and their buffer
1706  * zones. Check here that valid blocks are still present. If all blocks are
1707  * invalid, the zones can be unmapped on the fly without waiting for reclaim
1708  * to do it.
1709  */
1710 void dmz_put_chunk_mapping(struct dmz_metadata *zmd, struct dm_zone *dzone)
1711 {
1712 	struct dm_zone *bzone;
1713 
1714 	dmz_lock_map(zmd);
1715 
1716 	bzone = dzone->bzone;
1717 	if (bzone) {
1718 		if (dmz_weight(bzone))
1719 			dmz_lru_zone(zmd, bzone);
1720 		else {
1721 			/* Empty buffer zone: reclaim it */
1722 			dmz_unmap_zone(zmd, bzone);
1723 			dmz_free_zone(zmd, bzone);
1724 			bzone = NULL;
1725 		}
1726 	}
1727 
1728 	/* Deactivate the data zone */
1729 	dmz_deactivate_zone(dzone);
1730 	if (dmz_is_active(dzone) || bzone || dmz_weight(dzone))
1731 		dmz_lru_zone(zmd, dzone);
1732 	else {
1733 		/* Unbuffered inactive empty data zone: reclaim it */
1734 		dmz_unmap_zone(zmd, dzone);
1735 		dmz_free_zone(zmd, dzone);
1736 	}
1737 
1738 	dmz_unlock_map(zmd);
1739 }
1740 
1741 /*
1742  * Allocate and map a random zone to buffer a chunk
1743  * already mapped to a sequential zone.
1744  */
1745 struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd,
1746 				     struct dm_zone *dzone)
1747 {
1748 	struct dm_zone *bzone;
1749 
1750 	dmz_lock_map(zmd);
1751 again:
1752 	bzone = dzone->bzone;
1753 	if (bzone)
1754 		goto out;
1755 
1756 	/* Allocate a random zone */
1757 	bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
1758 	if (!bzone) {
1759 		if (dmz_bdev_is_dying(zmd->dev)) {
1760 			bzone = ERR_PTR(-EIO);
1761 			goto out;
1762 		}
1763 		dmz_wait_for_free_zones(zmd);
1764 		goto again;
1765 	}
1766 
1767 	/* Update the chunk mapping */
1768 	dmz_set_chunk_mapping(zmd, dzone->chunk, dmz_id(zmd, dzone),
1769 			      dmz_id(zmd, bzone));
1770 
1771 	set_bit(DMZ_BUF, &bzone->flags);
1772 	bzone->chunk = dzone->chunk;
1773 	bzone->bzone = dzone;
1774 	dzone->bzone = bzone;
1775 	list_add_tail(&bzone->link, &zmd->map_rnd_list);
1776 out:
1777 	dmz_unlock_map(zmd);
1778 
1779 	return bzone;
1780 }
1781 
1782 /*
1783  * Get an unmapped (free) zone.
1784  * This must be called with the mapping lock held.
1785  */
1786 struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags)
1787 {
1788 	struct list_head *list;
1789 	struct dm_zone *zone;
1790 
1791 	if (flags & DMZ_ALLOC_RND)
1792 		list = &zmd->unmap_rnd_list;
1793 	else
1794 		list = &zmd->unmap_seq_list;
1795 again:
1796 	if (list_empty(list)) {
1797 		/*
1798 		 * No free zone: if this is for reclaim, allow using the
1799 		 * reserved sequential zones.
1800 		 */
1801 		if (!(flags & DMZ_ALLOC_RECLAIM) ||
1802 		    list_empty(&zmd->reserved_seq_zones_list))
1803 			return NULL;
1804 
1805 		zone = list_first_entry(&zmd->reserved_seq_zones_list,
1806 					struct dm_zone, link);
1807 		list_del_init(&zone->link);
1808 		atomic_dec(&zmd->nr_reserved_seq_zones);
1809 		return zone;
1810 	}
1811 
1812 	zone = list_first_entry(list, struct dm_zone, link);
1813 	list_del_init(&zone->link);
1814 
1815 	if (dmz_is_rnd(zone))
1816 		atomic_dec(&zmd->unmap_nr_rnd);
1817 	else
1818 		atomic_dec(&zmd->unmap_nr_seq);
1819 
1820 	if (dmz_is_offline(zone)) {
1821 		dmz_dev_warn(zmd->dev, "Zone %u is offline", dmz_id(zmd, zone));
1822 		zone = NULL;
1823 		goto again;
1824 	}
1825 
1826 	return zone;
1827 }
1828 
1829 /*
1830  * Free a zone.
1831  * This must be called with the mapping lock held.
1832  */
1833 void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
1834 {
1835 	/* If this is a sequential zone, reset it */
1836 	if (dmz_is_seq(zone))
1837 		dmz_reset_zone(zmd, zone);
1838 
1839 	/* Return the zone to its type unmap list */
1840 	if (dmz_is_rnd(zone)) {
1841 		list_add_tail(&zone->link, &zmd->unmap_rnd_list);
1842 		atomic_inc(&zmd->unmap_nr_rnd);
1843 	} else if (atomic_read(&zmd->nr_reserved_seq_zones) <
1844 		   zmd->nr_reserved_seq) {
1845 		list_add_tail(&zone->link, &zmd->reserved_seq_zones_list);
1846 		atomic_inc(&zmd->nr_reserved_seq_zones);
1847 	} else {
1848 		list_add_tail(&zone->link, &zmd->unmap_seq_list);
1849 		atomic_inc(&zmd->unmap_nr_seq);
1850 	}
1851 
1852 	wake_up_all(&zmd->free_wq);
1853 }
1854 
1855 /*
1856  * Map a chunk to a zone.
1857  * This must be called with the mapping lock held.
1858  */
1859 void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *dzone,
1860 		  unsigned int chunk)
1861 {
1862 	/* Set the chunk mapping */
1863 	dmz_set_chunk_mapping(zmd, chunk, dmz_id(zmd, dzone),
1864 			      DMZ_MAP_UNMAPPED);
1865 	dzone->chunk = chunk;
1866 	if (dmz_is_rnd(dzone))
1867 		list_add_tail(&dzone->link, &zmd->map_rnd_list);
1868 	else
1869 		list_add_tail(&dzone->link, &zmd->map_seq_list);
1870 }
1871 
1872 /*
1873  * Unmap a zone.
1874  * This must be called with the mapping lock held.
1875  */
1876 void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
1877 {
1878 	unsigned int chunk = zone->chunk;
1879 	unsigned int dzone_id;
1880 
1881 	if (chunk == DMZ_MAP_UNMAPPED) {
1882 		/* Already unmapped */
1883 		return;
1884 	}
1885 
1886 	if (test_and_clear_bit(DMZ_BUF, &zone->flags)) {
1887 		/*
1888 		 * Unmapping the chunk buffer zone: clear only
1889 		 * the chunk buffer mapping
1890 		 */
1891 		dzone_id = dmz_id(zmd, zone->bzone);
1892 		zone->bzone->bzone = NULL;
1893 		zone->bzone = NULL;
1894 
1895 	} else {
1896 		/*
1897 		 * Unmapping the chunk data zone: the zone must
1898 		 * not be buffered.
1899 		 */
1900 		if (WARN_ON(zone->bzone)) {
1901 			zone->bzone->bzone = NULL;
1902 			zone->bzone = NULL;
1903 		}
1904 		dzone_id = DMZ_MAP_UNMAPPED;
1905 	}
1906 
1907 	dmz_set_chunk_mapping(zmd, chunk, dzone_id, DMZ_MAP_UNMAPPED);
1908 
1909 	zone->chunk = DMZ_MAP_UNMAPPED;
1910 	list_del_init(&zone->link);
1911 }
1912 
1913 /*
1914  * Set @nr_bits bits in @bitmap starting from @bit.
1915  * Return the number of bits changed from 0 to 1.
1916  */
1917 static unsigned int dmz_set_bits(unsigned long *bitmap,
1918 				 unsigned int bit, unsigned int nr_bits)
1919 {
1920 	unsigned long *addr;
1921 	unsigned int end = bit + nr_bits;
1922 	unsigned int n = 0;
1923 
1924 	while (bit < end) {
1925 		if (((bit & (BITS_PER_LONG - 1)) == 0) &&
1926 		    ((end - bit) >= BITS_PER_LONG)) {
1927 			/* Try to set the whole word at once */
1928 			addr = bitmap + BIT_WORD(bit);
1929 			if (*addr == 0) {
1930 				*addr = ULONG_MAX;
1931 				n += BITS_PER_LONG;
1932 				bit += BITS_PER_LONG;
1933 				continue;
1934 			}
1935 		}
1936 
1937 		if (!test_and_set_bit(bit, bitmap))
1938 			n++;
1939 		bit++;
1940 	}
1941 
1942 	return n;
1943 }
1944 
1945 /*
1946  * Get the bitmap block storing the bit for chunk_block in zone.
1947  */
1948 static struct dmz_mblock *dmz_get_bitmap(struct dmz_metadata *zmd,
1949 					 struct dm_zone *zone,
1950 					 sector_t chunk_block)
1951 {
1952 	sector_t bitmap_block = 1 + zmd->nr_map_blocks +
1953 		(sector_t)(dmz_id(zmd, zone) * zmd->zone_nr_bitmap_blocks) +
1954 		(chunk_block >> DMZ_BLOCK_SHIFT_BITS);
1955 
1956 	return dmz_get_mblock(zmd, bitmap_block);
1957 }
1958 
1959 /*
1960  * Copy the valid blocks bitmap of from_zone to the bitmap of to_zone.
1961  */
1962 int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
1963 			  struct dm_zone *to_zone)
1964 {
1965 	struct dmz_mblock *from_mblk, *to_mblk;
1966 	sector_t chunk_block = 0;
1967 
1968 	/* Get the zones bitmap blocks */
1969 	while (chunk_block < zmd->dev->zone_nr_blocks) {
1970 		from_mblk = dmz_get_bitmap(zmd, from_zone, chunk_block);
1971 		if (IS_ERR(from_mblk))
1972 			return PTR_ERR(from_mblk);
1973 		to_mblk = dmz_get_bitmap(zmd, to_zone, chunk_block);
1974 		if (IS_ERR(to_mblk)) {
1975 			dmz_release_mblock(zmd, from_mblk);
1976 			return PTR_ERR(to_mblk);
1977 		}
1978 
1979 		memcpy(to_mblk->data, from_mblk->data, DMZ_BLOCK_SIZE);
1980 		dmz_dirty_mblock(zmd, to_mblk);
1981 
1982 		dmz_release_mblock(zmd, to_mblk);
1983 		dmz_release_mblock(zmd, from_mblk);
1984 
1985 		chunk_block += DMZ_BLOCK_SIZE_BITS;
1986 	}
1987 
1988 	to_zone->weight = from_zone->weight;
1989 
1990 	return 0;
1991 }
1992 
1993 /*
1994  * Merge the valid blocks bitmap of from_zone into the bitmap of to_zone,
1995  * starting from chunk_block.
1996  */
1997 int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
1998 			   struct dm_zone *to_zone, sector_t chunk_block)
1999 {
2000 	unsigned int nr_blocks;
2001 	int ret;
2002 
2003 	/* Get the zones bitmap blocks */
2004 	while (chunk_block < zmd->dev->zone_nr_blocks) {
2005 		/* Get a valid region from the source zone */
2006 		ret = dmz_first_valid_block(zmd, from_zone, &chunk_block);
2007 		if (ret <= 0)
2008 			return ret;
2009 
2010 		nr_blocks = ret;
2011 		ret = dmz_validate_blocks(zmd, to_zone, chunk_block, nr_blocks);
2012 		if (ret)
2013 			return ret;
2014 
2015 		chunk_block += nr_blocks;
2016 	}
2017 
2018 	return 0;
2019 }
2020 
2021 /*
2022  * Validate all the blocks in the range [block..block+nr_blocks-1].
2023  */
2024 int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
2025 			sector_t chunk_block, unsigned int nr_blocks)
2026 {
2027 	unsigned int count, bit, nr_bits;
2028 	unsigned int zone_nr_blocks = zmd->dev->zone_nr_blocks;
2029 	struct dmz_mblock *mblk;
2030 	unsigned int n = 0;
2031 
2032 	dmz_dev_debug(zmd->dev, "=> VALIDATE zone %u, block %llu, %u blocks",
2033 		      dmz_id(zmd, zone), (unsigned long long)chunk_block,
2034 		      nr_blocks);
2035 
2036 	WARN_ON(chunk_block + nr_blocks > zone_nr_blocks);
2037 
2038 	while (nr_blocks) {
2039 		/* Get bitmap block */
2040 		mblk = dmz_get_bitmap(zmd, zone, chunk_block);
2041 		if (IS_ERR(mblk))
2042 			return PTR_ERR(mblk);
2043 
2044 		/* Set bits */
2045 		bit = chunk_block & DMZ_BLOCK_MASK_BITS;
2046 		nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
2047 
2048 		count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits);
2049 		if (count) {
2050 			dmz_dirty_mblock(zmd, mblk);
2051 			n += count;
2052 		}
2053 		dmz_release_mblock(zmd, mblk);
2054 
2055 		nr_blocks -= nr_bits;
2056 		chunk_block += nr_bits;
2057 	}
2058 
2059 	if (likely(zone->weight + n <= zone_nr_blocks))
2060 		zone->weight += n;
2061 	else {
2062 		dmz_dev_warn(zmd->dev, "Zone %u: weight %u should be <= %u",
2063 			     dmz_id(zmd, zone), zone->weight,
2064 			     zone_nr_blocks - n);
2065 		zone->weight = zone_nr_blocks;
2066 	}
2067 
2068 	return 0;
2069 }
2070 
2071 /*
2072  * Clear nr_bits bits in bitmap starting from bit.
2073  * Return the number of bits cleared.
2074  */
2075 static int dmz_clear_bits(unsigned long *bitmap, int bit, int nr_bits)
2076 {
2077 	unsigned long *addr;
2078 	int end = bit + nr_bits;
2079 	int n = 0;
2080 
2081 	while (bit < end) {
2082 		if (((bit & (BITS_PER_LONG - 1)) == 0) &&
2083 		    ((end - bit) >= BITS_PER_LONG)) {
2084 			/* Try to clear whole word at once */
2085 			addr = bitmap + BIT_WORD(bit);
2086 			if (*addr == ULONG_MAX) {
2087 				*addr = 0;
2088 				n += BITS_PER_LONG;
2089 				bit += BITS_PER_LONG;
2090 				continue;
2091 			}
2092 		}
2093 
2094 		if (test_and_clear_bit(bit, bitmap))
2095 			n++;
2096 		bit++;
2097 	}
2098 
2099 	return n;
2100 }
2101 
2102 /*
2103  * Invalidate all the blocks in the range [block..block+nr_blocks-1].
2104  */
2105 int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
2106 			  sector_t chunk_block, unsigned int nr_blocks)
2107 {
2108 	unsigned int count, bit, nr_bits;
2109 	struct dmz_mblock *mblk;
2110 	unsigned int n = 0;
2111 
2112 	dmz_dev_debug(zmd->dev, "=> INVALIDATE zone %u, block %llu, %u blocks",
2113 		      dmz_id(zmd, zone), (u64)chunk_block, nr_blocks);
2114 
2115 	WARN_ON(chunk_block + nr_blocks > zmd->dev->zone_nr_blocks);
2116 
2117 	while (nr_blocks) {
2118 		/* Get bitmap block */
2119 		mblk = dmz_get_bitmap(zmd, zone, chunk_block);
2120 		if (IS_ERR(mblk))
2121 			return PTR_ERR(mblk);
2122 
2123 		/* Clear bits */
2124 		bit = chunk_block & DMZ_BLOCK_MASK_BITS;
2125 		nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
2126 
2127 		count = dmz_clear_bits((unsigned long *)mblk->data,
2128 				       bit, nr_bits);
2129 		if (count) {
2130 			dmz_dirty_mblock(zmd, mblk);
2131 			n += count;
2132 		}
2133 		dmz_release_mblock(zmd, mblk);
2134 
2135 		nr_blocks -= nr_bits;
2136 		chunk_block += nr_bits;
2137 	}
2138 
2139 	if (zone->weight >= n)
2140 		zone->weight -= n;
2141 	else {
2142 		dmz_dev_warn(zmd->dev, "Zone %u: weight %u should be >= %u",
2143 			     dmz_id(zmd, zone), zone->weight, n);
2144 		zone->weight = 0;
2145 	}
2146 
2147 	return 0;
2148 }
2149 
2150 /*
2151  * Get a block bit value.
2152  */
2153 static int dmz_test_block(struct dmz_metadata *zmd, struct dm_zone *zone,
2154 			  sector_t chunk_block)
2155 {
2156 	struct dmz_mblock *mblk;
2157 	int ret;
2158 
2159 	WARN_ON(chunk_block >= zmd->dev->zone_nr_blocks);
2160 
2161 	/* Get bitmap block */
2162 	mblk = dmz_get_bitmap(zmd, zone, chunk_block);
2163 	if (IS_ERR(mblk))
2164 		return PTR_ERR(mblk);
2165 
2166 	/* Get offset */
2167 	ret = test_bit(chunk_block & DMZ_BLOCK_MASK_BITS,
2168 		       (unsigned long *) mblk->data) != 0;
2169 
2170 	dmz_release_mblock(zmd, mblk);
2171 
2172 	return ret;
2173 }
2174 
2175 /*
2176  * Return the number of blocks from chunk_block to the first block with a bit
2177  * value specified by set. Search at most nr_blocks blocks from chunk_block.
2178  */
2179 static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone,
2180 				 sector_t chunk_block, unsigned int nr_blocks,
2181 				 int set)
2182 {
2183 	struct dmz_mblock *mblk;
2184 	unsigned int bit, set_bit, nr_bits;
2185 	unsigned long *bitmap;
2186 	int n = 0;
2187 
2188 	WARN_ON(chunk_block + nr_blocks > zmd->dev->zone_nr_blocks);
2189 
2190 	while (nr_blocks) {
2191 		/* Get bitmap block */
2192 		mblk = dmz_get_bitmap(zmd, zone, chunk_block);
2193 		if (IS_ERR(mblk))
2194 			return PTR_ERR(mblk);
2195 
2196 		/* Get offset */
2197 		bitmap = (unsigned long *) mblk->data;
2198 		bit = chunk_block & DMZ_BLOCK_MASK_BITS;
2199 		nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
2200 		if (set)
2201 			set_bit = find_next_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit);
2202 		else
2203 			set_bit = find_next_zero_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit);
2204 		dmz_release_mblock(zmd, mblk);
2205 
2206 		n += set_bit - bit;
2207 		if (set_bit < DMZ_BLOCK_SIZE_BITS)
2208 			break;
2209 
2210 		nr_blocks -= nr_bits;
2211 		chunk_block += nr_bits;
2212 	}
2213 
2214 	return n;
2215 }
2216 
2217 /*
2218  * Test if chunk_block is valid. If it is, the number of consecutive
2219  * valid blocks from chunk_block will be returned.
2220  */
2221 int dmz_block_valid(struct dmz_metadata *zmd, struct dm_zone *zone,
2222 		    sector_t chunk_block)
2223 {
2224 	int valid;
2225 
2226 	valid = dmz_test_block(zmd, zone, chunk_block);
2227 	if (valid <= 0)
2228 		return valid;
2229 
2230 	/* The block is valid: get the number of valid blocks from block */
2231 	return dmz_to_next_set_block(zmd, zone, chunk_block,
2232 				     zmd->dev->zone_nr_blocks - chunk_block, 0);
2233 }
2234 
2235 /*
2236  * Find the first valid block from @chunk_block in @zone.
2237  * If such a block is found, its number is returned using
2238  * @chunk_block and the total number of valid blocks from @chunk_block
2239  * is returned.
2240  */
2241 int dmz_first_valid_block(struct dmz_metadata *zmd, struct dm_zone *zone,
2242 			  sector_t *chunk_block)
2243 {
2244 	sector_t start_block = *chunk_block;
2245 	int ret;
2246 
2247 	ret = dmz_to_next_set_block(zmd, zone, start_block,
2248 				    zmd->dev->zone_nr_blocks - start_block, 1);
2249 	if (ret < 0)
2250 		return ret;
2251 
2252 	start_block += ret;
2253 	*chunk_block = start_block;
2254 
2255 	return dmz_to_next_set_block(zmd, zone, start_block,
2256 				     zmd->dev->zone_nr_blocks - start_block, 0);
2257 }
2258 
2259 /*
2260  * Count the number of bits set starting from bit up to bit + nr_bits - 1.
2261  */
2262 static int dmz_count_bits(void *bitmap, int bit, int nr_bits)
2263 {
2264 	unsigned long *addr;
2265 	int end = bit + nr_bits;
2266 	int n = 0;
2267 
2268 	while (bit < end) {
2269 		if (((bit & (BITS_PER_LONG - 1)) == 0) &&
2270 		    ((end - bit) >= BITS_PER_LONG)) {
2271 			addr = (unsigned long *)bitmap + BIT_WORD(bit);
2272 			if (*addr == ULONG_MAX) {
2273 				n += BITS_PER_LONG;
2274 				bit += BITS_PER_LONG;
2275 				continue;
2276 			}
2277 		}
2278 
2279 		if (test_bit(bit, bitmap))
2280 			n++;
2281 		bit++;
2282 	}
2283 
2284 	return n;
2285 }
2286 
2287 /*
2288  * Get a zone weight.
2289  */
2290 static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone)
2291 {
2292 	struct dmz_mblock *mblk;
2293 	sector_t chunk_block = 0;
2294 	unsigned int bit, nr_bits;
2295 	unsigned int nr_blocks = zmd->dev->zone_nr_blocks;
2296 	void *bitmap;
2297 	int n = 0;
2298 
2299 	while (nr_blocks) {
2300 		/* Get bitmap block */
2301 		mblk = dmz_get_bitmap(zmd, zone, chunk_block);
2302 		if (IS_ERR(mblk)) {
2303 			n = 0;
2304 			break;
2305 		}
2306 
2307 		/* Count bits in this block */
2308 		bitmap = mblk->data;
2309 		bit = chunk_block & DMZ_BLOCK_MASK_BITS;
2310 		nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
2311 		n += dmz_count_bits(bitmap, bit, nr_bits);
2312 
2313 		dmz_release_mblock(zmd, mblk);
2314 
2315 		nr_blocks -= nr_bits;
2316 		chunk_block += nr_bits;
2317 	}
2318 
2319 	zone->weight = n;
2320 }
2321 
2322 /*
2323  * Cleanup the zoned metadata resources.
2324  */
2325 static void dmz_cleanup_metadata(struct dmz_metadata *zmd)
2326 {
2327 	struct rb_root *root;
2328 	struct dmz_mblock *mblk, *next;
2329 	int i;
2330 
2331 	/* Release zone mapping resources */
2332 	if (zmd->map_mblk) {
2333 		for (i = 0; i < zmd->nr_map_blocks; i++)
2334 			dmz_release_mblock(zmd, zmd->map_mblk[i]);
2335 		kfree(zmd->map_mblk);
2336 		zmd->map_mblk = NULL;
2337 	}
2338 
2339 	/* Release super blocks */
2340 	for (i = 0; i < 2; i++) {
2341 		if (zmd->sb[i].mblk) {
2342 			dmz_free_mblock(zmd, zmd->sb[i].mblk);
2343 			zmd->sb[i].mblk = NULL;
2344 		}
2345 	}
2346 
2347 	/* Free cached blocks */
2348 	while (!list_empty(&zmd->mblk_dirty_list)) {
2349 		mblk = list_first_entry(&zmd->mblk_dirty_list,
2350 					struct dmz_mblock, link);
2351 		dmz_dev_warn(zmd->dev, "mblock %llu still in dirty list (ref %u)",
2352 			     (u64)mblk->no, mblk->ref);
2353 		list_del_init(&mblk->link);
2354 		rb_erase(&mblk->node, &zmd->mblk_rbtree);
2355 		dmz_free_mblock(zmd, mblk);
2356 	}
2357 
2358 	while (!list_empty(&zmd->mblk_lru_list)) {
2359 		mblk = list_first_entry(&zmd->mblk_lru_list,
2360 					struct dmz_mblock, link);
2361 		list_del_init(&mblk->link);
2362 		rb_erase(&mblk->node, &zmd->mblk_rbtree);
2363 		dmz_free_mblock(zmd, mblk);
2364 	}
2365 
2366 	/* Sanity checks: the mblock rbtree should now be empty */
2367 	root = &zmd->mblk_rbtree;
2368 	rbtree_postorder_for_each_entry_safe(mblk, next, root, node) {
2369 		dmz_dev_warn(zmd->dev, "mblock %llu ref %u still in rbtree",
2370 			     (u64)mblk->no, mblk->ref);
2371 		mblk->ref = 0;
2372 		dmz_free_mblock(zmd, mblk);
2373 	}
2374 
2375 	/* Free the zone descriptors */
2376 	dmz_drop_zones(zmd);
2377 
2378 	mutex_destroy(&zmd->mblk_flush_lock);
2379 	mutex_destroy(&zmd->map_lock);
2380 }
2381 
2382 /*
2383  * Initialize the zoned metadata.
2384  */
2385 int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata)
2386 {
2387 	struct dmz_metadata *zmd;
2388 	unsigned int i, zid;
2389 	struct dm_zone *zone;
2390 	int ret;
2391 
2392 	zmd = kzalloc(sizeof(struct dmz_metadata), GFP_KERNEL);
2393 	if (!zmd)
2394 		return -ENOMEM;
2395 
2396 	zmd->dev = dev;
2397 	zmd->mblk_rbtree = RB_ROOT;
2398 	init_rwsem(&zmd->mblk_sem);
2399 	mutex_init(&zmd->mblk_flush_lock);
2400 	spin_lock_init(&zmd->mblk_lock);
2401 	INIT_LIST_HEAD(&zmd->mblk_lru_list);
2402 	INIT_LIST_HEAD(&zmd->mblk_dirty_list);
2403 
2404 	mutex_init(&zmd->map_lock);
2405 	atomic_set(&zmd->unmap_nr_rnd, 0);
2406 	INIT_LIST_HEAD(&zmd->unmap_rnd_list);
2407 	INIT_LIST_HEAD(&zmd->map_rnd_list);
2408 
2409 	atomic_set(&zmd->unmap_nr_seq, 0);
2410 	INIT_LIST_HEAD(&zmd->unmap_seq_list);
2411 	INIT_LIST_HEAD(&zmd->map_seq_list);
2412 
2413 	atomic_set(&zmd->nr_reserved_seq_zones, 0);
2414 	INIT_LIST_HEAD(&zmd->reserved_seq_zones_list);
2415 
2416 	init_waitqueue_head(&zmd->free_wq);
2417 
2418 	/* Initialize zone descriptors */
2419 	ret = dmz_init_zones(zmd);
2420 	if (ret)
2421 		goto err;
2422 
2423 	/* Get super block */
2424 	ret = dmz_load_sb(zmd);
2425 	if (ret)
2426 		goto err;
2427 
2428 	/* Set metadata zones starting from sb_zone */
2429 	zid = dmz_id(zmd, zmd->sb_zone);
2430 	for (i = 0; i < zmd->nr_meta_zones << 1; i++) {
2431 		zone = dmz_get(zmd, zid + i);
2432 		if (!dmz_is_rnd(zone))
2433 			goto err;
2434 		set_bit(DMZ_META, &zone->flags);
2435 	}
2436 
2437 	/* Load mapping table */
2438 	ret = dmz_load_mapping(zmd);
2439 	if (ret)
2440 		goto err;
2441 
2442 	/*
2443 	 * Cache size boundaries: allow at least 2 super blocks, the chunk map
2444 	 * blocks and enough blocks to be able to cache the bitmap blocks of
2445 	 * up to 16 zones when idle (min_nr_mblks). Otherwise, if busy, allow
2446 	 * the cache to add 512 more metadata blocks.
2447 	 */
2448 	zmd->min_nr_mblks = 2 + zmd->nr_map_blocks + zmd->zone_nr_bitmap_blocks * 16;
2449 	zmd->max_nr_mblks = zmd->min_nr_mblks + 512;
2450 	zmd->mblk_shrinker.count_objects = dmz_mblock_shrinker_count;
2451 	zmd->mblk_shrinker.scan_objects = dmz_mblock_shrinker_scan;
2452 	zmd->mblk_shrinker.seeks = DEFAULT_SEEKS;
2453 
2454 	/* Metadata cache shrinker */
2455 	ret = register_shrinker(&zmd->mblk_shrinker);
2456 	if (ret) {
2457 		dmz_dev_err(dev, "Register metadata cache shrinker failed");
2458 		goto err;
2459 	}
2460 
2461 	dmz_dev_info(dev, "Host-%s zoned block device",
2462 		     bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ?
2463 		     "aware" : "managed");
2464 	dmz_dev_info(dev, "  %llu 512-byte logical sectors",
2465 		     (u64)dev->capacity);
2466 	dmz_dev_info(dev, "  %u zones of %llu 512-byte logical sectors",
2467 		     dev->nr_zones, (u64)dev->zone_nr_sectors);
2468 	dmz_dev_info(dev, "  %u metadata zones",
2469 		     zmd->nr_meta_zones * 2);
2470 	dmz_dev_info(dev, "  %u data zones for %u chunks",
2471 		     zmd->nr_data_zones, zmd->nr_chunks);
2472 	dmz_dev_info(dev, "    %u random zones (%u unmapped)",
2473 		     zmd->nr_rnd, atomic_read(&zmd->unmap_nr_rnd));
2474 	dmz_dev_info(dev, "    %u sequential zones (%u unmapped)",
2475 		     zmd->nr_seq, atomic_read(&zmd->unmap_nr_seq));
2476 	dmz_dev_info(dev, "  %u reserved sequential data zones",
2477 		     zmd->nr_reserved_seq);
2478 
2479 	dmz_dev_debug(dev, "Format:");
2480 	dmz_dev_debug(dev, "%u metadata blocks per set (%u max cache)",
2481 		      zmd->nr_meta_blocks, zmd->max_nr_mblks);
2482 	dmz_dev_debug(dev, "  %u data zone mapping blocks",
2483 		      zmd->nr_map_blocks);
2484 	dmz_dev_debug(dev, "  %u bitmap blocks",
2485 		      zmd->nr_bitmap_blocks);
2486 
2487 	*metadata = zmd;
2488 
2489 	return 0;
2490 err:
2491 	dmz_cleanup_metadata(zmd);
2492 	kfree(zmd);
2493 	*metadata = NULL;
2494 
2495 	return ret;
2496 }
2497 
/*
 * Destroy a metadata descriptor: unregister the metadata cache shrinker
 * (the one registered by dmz_ctr_metadata()), release all the resources
 * attached to the descriptor and free the descriptor itself.
 * @zmd must not be used after this call.
 */
void dmz_dtr_metadata(struct dmz_metadata *zmd)
{
	unregister_shrinker(&zmd->mblk_shrinker);
	dmz_cleanup_metadata(zmd);
	/* dmz_cleanup_metadata() does not free the descriptor */
	kfree(zmd);
}
2507 
2508 /*
2509  * Check zone information on resume.
2510  */
2511 int dmz_resume_metadata(struct dmz_metadata *zmd)
2512 {
2513 	struct dmz_dev *dev = zmd->dev;
2514 	struct dm_zone *zone;
2515 	sector_t wp_block;
2516 	unsigned int i;
2517 	int ret;
2518 
2519 	/* Check zones */
2520 	for (i = 0; i < dev->nr_zones; i++) {
2521 		zone = dmz_get(zmd, i);
2522 		if (!zone) {
2523 			dmz_dev_err(dev, "Unable to get zone %u", i);
2524 			return -EIO;
2525 		}
2526 
2527 		wp_block = zone->wp_block;
2528 
2529 		ret = dmz_update_zone(zmd, zone);
2530 		if (ret) {
2531 			dmz_dev_err(dev, "Broken zone %u", i);
2532 			return ret;
2533 		}
2534 
2535 		if (dmz_is_offline(zone)) {
2536 			dmz_dev_warn(dev, "Zone %u is offline", i);
2537 			continue;
2538 		}
2539 
2540 		/* Check write pointer */
2541 		if (!dmz_is_seq(zone))
2542 			zone->wp_block = 0;
2543 		else if (zone->wp_block != wp_block) {
2544 			dmz_dev_err(dev, "Zone %u: Invalid wp (%llu / %llu)",
2545 				    i, (u64)zone->wp_block, (u64)wp_block);
2546 			zone->wp_block = wp_block;
2547 			dmz_invalidate_blocks(zmd, zone, zone->wp_block,
2548 					      dev->zone_nr_blocks - zone->wp_block);
2549 		}
2550 	}
2551 
2552 	return 0;
2553 }
2554