xref: /linux/block/genhd.c (revision 0b6e522c)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  gendisk handling
4  *
5  * Portions Copyright (C) 2020 Christoph Hellwig
6  */
7 
8 #include <linux/module.h>
9 #include <linux/ctype.h>
10 #include <linux/fs.h>
11 #include <linux/genhd.h>
12 #include <linux/kdev_t.h>
13 #include <linux/kernel.h>
14 #include <linux/blkdev.h>
15 #include <linux/backing-dev.h>
16 #include <linux/init.h>
17 #include <linux/spinlock.h>
18 #include <linux/proc_fs.h>
19 #include <linux/seq_file.h>
20 #include <linux/slab.h>
21 #include <linux/kmod.h>
22 #include <linux/mutex.h>
23 #include <linux/idr.h>
24 #include <linux/log2.h>
25 #include <linux/pm_runtime.h>
26 #include <linux/badblocks.h>
27 
28 #include "blk.h"
29 
/*
 * kobject for the top-level "block" directory; created in
 * genhd_device_init() when sysfs_deprecated is not set and used as the
 * location of the per-disk links added in register_disk().
 */
static struct kobject *block_depr;

/* taken for write in del_gendisk() to block lookups while a disk is torn down */
DECLARE_RWSEM(bdev_lookup_sem);

/* for extended dynamic devt allocation, currently only one major is used */
#define NR_EXT_DEVT		(1 << MINORBITS)
/* allocator for the minors handed out under BLOCK_EXT_MAJOR */
static DEFINE_IDA(ext_devt_ida);

/* forward declarations of the disk event helpers */
static void disk_check_events(struct disk_events *ev,
			      unsigned int *clearing_ptr);
static void disk_alloc_events(struct gendisk *disk);
static void disk_add_events(struct gendisk *disk);
static void disk_del_events(struct gendisk *disk);
static void disk_release_events(struct gendisk *disk);
44 
45 void set_capacity(struct gendisk *disk, sector_t sectors)
46 {
47 	struct block_device *bdev = disk->part0;
48 
49 	spin_lock(&bdev->bd_size_lock);
50 	i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
51 	spin_unlock(&bdev->bd_size_lock);
52 }
53 EXPORT_SYMBOL(set_capacity);
54 
55 /*
56  * Set disk capacity and notify if the size is not currently zero and will not
57  * be set to zero.  Returns true if a uevent was sent, otherwise false.
58  */
59 bool set_capacity_and_notify(struct gendisk *disk, sector_t size)
60 {
61 	sector_t capacity = get_capacity(disk);
62 	char *envp[] = { "RESIZE=1", NULL };
63 
64 	set_capacity(disk, size);
65 
66 	/*
67 	 * Only print a message and send a uevent if the gendisk is user visible
68 	 * and alive.  This avoids spamming the log and udev when setting the
69 	 * initial capacity during probing.
70 	 */
71 	if (size == capacity ||
72 	    (disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP)
73 		return false;
74 
75 	pr_info("%s: detected capacity change from %lld to %lld\n",
76 		disk->disk_name, size, capacity);
77 
78 	/*
79 	 * Historically we did not send a uevent for changes to/from an empty
80 	 * device.
81 	 */
82 	if (!capacity || !size)
83 		return false;
84 	kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
85 	return true;
86 }
87 EXPORT_SYMBOL_GPL(set_capacity_and_notify);
88 
89 /*
90  * Format the device name of the indicated disk into the supplied buffer and
91  * return a pointer to that same buffer for convenience.
92  */
93 char *disk_name(struct gendisk *hd, int partno, char *buf)
94 {
95 	if (!partno)
96 		snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
97 	else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
98 		snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
99 	else
100 		snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);
101 
102 	return buf;
103 }
104 
105 const char *bdevname(struct block_device *bdev, char *buf)
106 {
107 	return disk_name(bdev->bd_disk, bdev->bd_partno, buf);
108 }
109 EXPORT_SYMBOL(bdevname);
110 
111 static void part_stat_read_all(struct block_device *part,
112 		struct disk_stats *stat)
113 {
114 	int cpu;
115 
116 	memset(stat, 0, sizeof(struct disk_stats));
117 	for_each_possible_cpu(cpu) {
118 		struct disk_stats *ptr = per_cpu_ptr(part->bd_stats, cpu);
119 		int group;
120 
121 		for (group = 0; group < NR_STAT_GROUPS; group++) {
122 			stat->nsecs[group] += ptr->nsecs[group];
123 			stat->sectors[group] += ptr->sectors[group];
124 			stat->ios[group] += ptr->ios[group];
125 			stat->merges[group] += ptr->merges[group];
126 		}
127 
128 		stat->io_ticks += ptr->io_ticks;
129 	}
130 }
131 
132 static unsigned int part_in_flight(struct block_device *part)
133 {
134 	unsigned int inflight = 0;
135 	int cpu;
136 
137 	for_each_possible_cpu(cpu) {
138 		inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) +
139 			    part_stat_local_read_cpu(part, in_flight[1], cpu);
140 	}
141 	if ((int)inflight < 0)
142 		inflight = 0;
143 
144 	return inflight;
145 }
146 
147 static void part_in_flight_rw(struct block_device *part,
148 		unsigned int inflight[2])
149 {
150 	int cpu;
151 
152 	inflight[0] = 0;
153 	inflight[1] = 0;
154 	for_each_possible_cpu(cpu) {
155 		inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu);
156 		inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu);
157 	}
158 	if ((int)inflight[0] < 0)
159 		inflight[0] = 0;
160 	if ((int)inflight[1] < 0)
161 		inflight[1] = 0;
162 }
163 
164 static struct block_device *__disk_get_part(struct gendisk *disk, int partno)
165 {
166 	struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl);
167 
168 	if (unlikely(partno < 0 || partno >= ptbl->len))
169 		return NULL;
170 	return rcu_dereference(ptbl->part[partno]);
171 }
172 
173 /**
174  * disk_part_iter_init - initialize partition iterator
175  * @piter: iterator to initialize
176  * @disk: disk to iterate over
177  * @flags: DISK_PITER_* flags
178  *
179  * Initialize @piter so that it iterates over partitions of @disk.
180  *
181  * CONTEXT:
182  * Don't care.
183  */
184 void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk,
185 			  unsigned int flags)
186 {
187 	struct disk_part_tbl *ptbl;
188 
189 	rcu_read_lock();
190 	ptbl = rcu_dereference(disk->part_tbl);
191 
192 	piter->disk = disk;
193 	piter->part = NULL;
194 
195 	if (flags & DISK_PITER_REVERSE)
196 		piter->idx = ptbl->len - 1;
197 	else if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0))
198 		piter->idx = 0;
199 	else
200 		piter->idx = 1;
201 
202 	piter->flags = flags;
203 
204 	rcu_read_unlock();
205 }
206 EXPORT_SYMBOL_GPL(disk_part_iter_init);
207 
/**
 * disk_part_iter_next - proceed iterator to the next partition and return it
 * @piter: iterator of interest
 *
 * Proceed @piter to the next partition and return it.  The reference held
 * on the previously returned partition is dropped first; the partition
 * returned by this call has had a reference taken with bdgrab(), which is
 * dropped by the next call or by disk_part_iter_exit().
 *
 * CONTEXT:
 * Don't care.
 *
 * RETURNS:
 * The next partition accepted by the iterator flags, or NULL when the end
 * of the iteration range has been reached.
 */
struct block_device *disk_part_iter_next(struct disk_part_iter *piter)
{
	struct disk_part_tbl *ptbl;
	int inc, end;

	/* put the last partition */
	disk_part_iter_exit(piter);

	/* get part_tbl */
	rcu_read_lock();
	ptbl = rcu_dereference(piter->disk->part_tbl);

	/* determine iteration parameters */
	if (piter->flags & DISK_PITER_REVERSE) {
		inc = -1;
		/* going backwards, stop before or after slot 0 depending on flags */
		if (piter->flags & (DISK_PITER_INCL_PART0 |
				    DISK_PITER_INCL_EMPTY_PART0))
			end = -1;
		else
			end = 0;
	} else {
		inc = 1;
		end = ptbl->len;
	}

	/* iterate to the next partition */
	for (; piter->idx != end; piter->idx += inc) {
		struct block_device *part;

		part = rcu_dereference(ptbl->part[piter->idx]);
		/* unused table slot */
		if (!part)
			continue;
		/* skip entries on which no reference could be taken */
		piter->part = bdgrab(part);
		if (!piter->part)
			continue;
		/*
		 * Zero-sized entries are skipped unless DISK_PITER_INCL_EMPTY
		 * is set, or this is slot 0 and DISK_PITER_INCL_EMPTY_PART0
		 * is set.  The reference just taken is dropped again.
		 */
		if (!bdev_nr_sectors(part) &&
		    !(piter->flags & DISK_PITER_INCL_EMPTY) &&
		    !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 &&
		      piter->idx == 0)) {
			bdput(piter->part);
			piter->part = NULL;
			continue;
		}

		/* step past the returned slot so the next call resumes after it */
		piter->idx += inc;
		break;
	}

	rcu_read_unlock();

	return piter->part;
}
EXPORT_SYMBOL_GPL(disk_part_iter_next);
270 
271 /**
272  * disk_part_iter_exit - finish up partition iteration
273  * @piter: iter of interest
274  *
275  * Called when iteration is over.  Cleans up @piter.
276  *
277  * CONTEXT:
278  * Don't care.
279  */
280 void disk_part_iter_exit(struct disk_part_iter *piter)
281 {
282 	if (piter->part)
283 		bdput(piter->part);
284 	piter->part = NULL;
285 }
286 EXPORT_SYMBOL_GPL(disk_part_iter_exit);
287 
288 /**
289  * disk_has_partitions
290  * @disk: gendisk of interest
291  *
292  * Walk through the partition table and check if valid partition exists.
293  *
294  * CONTEXT:
295  * Don't care.
296  *
297  * RETURNS:
298  * True if the gendisk has at least one valid non-zero size partition.
299  * Otherwise false.
300  */
301 bool disk_has_partitions(struct gendisk *disk)
302 {
303 	struct disk_part_tbl *ptbl;
304 	int i;
305 	bool ret = false;
306 
307 	rcu_read_lock();
308 	ptbl = rcu_dereference(disk->part_tbl);
309 
310 	/* Iterate partitions skipping the whole device at index 0 */
311 	for (i = 1; i < ptbl->len; i++) {
312 		if (rcu_dereference(ptbl->part[i])) {
313 			ret = true;
314 			break;
315 		}
316 	}
317 
318 	rcu_read_unlock();
319 
320 	return ret;
321 }
322 EXPORT_SYMBOL_GPL(disk_has_partitions);
323 
/*
 * Can be deleted altogether. Later.
 *
 * Table of registered block majors.  Entries are bucketed by
 * major_to_index() and chained through ->next within a bucket.  The
 * functions in this file that walk or modify the table do so with
 * major_names_lock held.
 */
#define BLKDEV_MAJOR_HASH_SIZE 255
static struct blk_major_name {
	struct blk_major_name *next;	/* next entry in the same bucket */
	int major;			/* registered major number */
	char name[16];			/* driver name, truncated to fit */
	void (*probe)(dev_t devt);	/* optional; called by blk_request_module() */
} *major_names[BLKDEV_MAJOR_HASH_SIZE];
static DEFINE_MUTEX(major_names_lock);
336 
337 /* index in the above - for now: assume no multimajor ranges */
338 static inline int major_to_index(unsigned major)
339 {
340 	return major % BLKDEV_MAJOR_HASH_SIZE;
341 }
342 
#ifdef CONFIG_PROC_FS
/*
 * Print "<major> <name>" to @seqf for every registered entry whose major
 * number equals @offset.  Only the bucket that @offset maps to is searched.
 */
void blkdev_show(struct seq_file *seqf, off_t offset)
{
	struct blk_major_name *entry;

	mutex_lock(&major_names_lock);
	entry = major_names[major_to_index(offset)];
	while (entry) {
		if (entry->major == offset)
			seq_printf(seqf, "%3d %s\n", entry->major, entry->name);
		entry = entry->next;
	}
	mutex_unlock(&major_names_lock);
}
#endif /* CONFIG_PROC_FS */
355 
/**
 * __register_blkdev - register a new block device
 *
 * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
 *         @major = 0, try to allocate any unused major number.
 * @name: the name of the new block device as a zero terminated string
 * @probe: callback that is called on access to any minor number of @major
 *
 * The @name must be unique within the system.
 *
 * The return value depends on the @major input parameter:
 *
 *  - if a major device number was requested in range [1..BLKDEV_MAJOR_MAX-1]
 *    then the function returns zero on success, or a negative error code
 *  - if any unused major number was requested with @major = 0 parameter
 *    then the return value is the allocated major number in range
 *    [1..BLKDEV_MAJOR_MAX-1] or a negative error code otherwise
 *
 * See Documentation/admin-guide/devices.txt for the list of allocated
 * major numbers.
 *
 * Use register_blkdev instead for any new code.
 */
int __register_blkdev(unsigned int major, const char *name,
		void (*probe)(dev_t devt))
{
	struct blk_major_name **n, *p;
	int index, ret = 0;

	mutex_lock(&major_names_lock);

	/* temporary */
	if (major == 0) {
		/* pick the highest-numbered empty bucket; bucket 0 is never handed out */
		for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) {
			if (major_names[index] == NULL)
				break;
		}

		if (index == 0) {
			printk("%s: failed to get major for %s\n",
			       __func__, name);
			ret = -EBUSY;
			goto out;
		}
		major = index;
		/* dynamic allocation reports the chosen major on success */
		ret = major;
	}

	if (major >= BLKDEV_MAJOR_MAX) {
		pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n",
		       __func__, major, BLKDEV_MAJOR_MAX-1, name);

		ret = -EINVAL;
		goto out;
	}

	p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL);
	if (p == NULL) {
		ret = -ENOMEM;
		goto out;
	}

	p->major = major;
	p->probe = probe;
	strlcpy(p->name, name, sizeof(p->name));
	p->next = NULL;
	index = major_to_index(major);

	/* walk the bucket; stop early if this major is already registered */
	for (n = &major_names[index]; *n; n = &(*n)->next) {
		if ((*n)->major == major)
			break;
	}
	/* reached the tail without a match: append, otherwise the major is taken */
	if (!*n)
		*n = p;
	else
		ret = -EBUSY;

	if (ret < 0) {
		printk("register_blkdev: cannot get major %u for %s\n",
		       major, name);
		/* the new entry was not linked in, so release it */
		kfree(p);
	}
out:
	mutex_unlock(&major_names_lock);
	return ret;
}
EXPORT_SYMBOL(__register_blkdev);
443 
444 void unregister_blkdev(unsigned int major, const char *name)
445 {
446 	struct blk_major_name **n;
447 	struct blk_major_name *p = NULL;
448 	int index = major_to_index(major);
449 
450 	mutex_lock(&major_names_lock);
451 	for (n = &major_names[index]; *n; n = &(*n)->next)
452 		if ((*n)->major == major)
453 			break;
454 	if (!*n || strcmp((*n)->name, name)) {
455 		WARN_ON(1);
456 	} else {
457 		p = *n;
458 		*n = p->next;
459 	}
460 	mutex_unlock(&major_names_lock);
461 	kfree(p);
462 }
463 
464 EXPORT_SYMBOL(unregister_blkdev);
465 
/**
 * blk_mangle_minor - scatter minor numbers apart
 * @minor: minor number to mangle
 *
 * With CONFIG_DEBUG_BLOCK_EXT_DEVT, mirror the low MINORBITS bits of
 * @minor around their centre (bit i trades places with bit
 * MINORBITS - 1 - i), so consecutively allocated minors end up far apart.
 * Applying the function twice restores the input.  Without the option
 * @minor is returned unchanged.
 *
 * RETURNS:
 * Mangled value.
 *
 * CONTEXT:
 * Don't care.
 */
static int blk_mangle_minor(int minor)
{
#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT
	int lo_pos;

	for (lo_pos = 0; lo_pos < MINORBITS / 2; lo_pos++) {
		int hi_pos = MINORBITS - 1 - lo_pos;
		int span = hi_pos - lo_pos;
		int lo_bit = minor & (1 << lo_pos);
		int hi_bit = minor & (1 << hi_pos);

		/* drop both bits, then put each back in the other's slot */
		minor &= ~(lo_bit | hi_bit);
		minor |= (lo_bit << span) | (hi_bit >> span);
	}
#endif
	return minor;
}
497 
498 /**
499  * blk_alloc_devt - allocate a dev_t for a block device
500  * @bdev: block device to allocate dev_t for
501  * @devt: out parameter for resulting dev_t
502  *
503  * Allocate a dev_t for block device.
504  *
505  * RETURNS:
506  * 0 on success, allocated dev_t is returned in *@devt.  -errno on
507  * failure.
508  *
509  * CONTEXT:
510  * Might sleep.
511  */
512 int blk_alloc_devt(struct block_device *bdev, dev_t *devt)
513 {
514 	struct gendisk *disk = bdev->bd_disk;
515 	int idx;
516 
517 	/* in consecutive minor range? */
518 	if (bdev->bd_partno < disk->minors) {
519 		*devt = MKDEV(disk->major, disk->first_minor + bdev->bd_partno);
520 		return 0;
521 	}
522 
523 	idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT, GFP_KERNEL);
524 	if (idx < 0)
525 		return idx == -ENOSPC ? -EBUSY : idx;
526 
527 	*devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx));
528 	return 0;
529 }
530 
531 /**
532  * blk_free_devt - free a dev_t
533  * @devt: dev_t to free
534  *
535  * Free @devt which was allocated using blk_alloc_devt().
536  *
537  * CONTEXT:
538  * Might sleep.
539  */
540 void blk_free_devt(dev_t devt)
541 {
542 	if (MAJOR(devt) == BLOCK_EXT_MAJOR)
543 		ida_free(&ext_devt_ida, blk_mangle_minor(MINOR(devt)));
544 }
545 
546 static char *bdevt_str(dev_t devt, char *buf)
547 {
548 	if (MAJOR(devt) <= 0xff && MINOR(devt) <= 0xff) {
549 		char tbuf[BDEVT_SIZE];
550 		snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt));
551 		snprintf(buf, BDEVT_SIZE, "%-9s", tbuf);
552 	} else
553 		snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt));
554 
555 	return buf;
556 }
557 
558 static void disk_scan_partitions(struct gendisk *disk)
559 {
560 	struct block_device *bdev;
561 
562 	if (!get_capacity(disk) || !disk_part_scan_enabled(disk))
563 		return;
564 
565 	set_bit(GD_NEED_PART_SCAN, &disk->state);
566 	bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL);
567 	if (!IS_ERR(bdev))
568 		blkdev_put(bdev, FMODE_READ);
569 }
570 
/*
 * Add the device of @disk to the driver model below @parent, create its
 * sysfs helper objects, scan for partitions and announce the disk and its
 * partitions to userspace.
 *
 * @groups, if non-NULL, is installed as the device's attribute groups.
 * Failures of device_add() or of creating the link under block_depr make
 * the function return early without reporting an error to the caller.
 */
static void register_disk(struct device *parent, struct gendisk *disk,
			  const struct attribute_group **groups)
{
	struct device *ddev = disk_to_dev(disk);
	struct disk_part_iter piter;
	struct block_device *part;
	int err;

	ddev->parent = parent;

	dev_set_name(ddev, "%s", disk->disk_name);

	/* delay uevents, until we scanned partition table */
	dev_set_uevent_suppress(ddev, 1);

	if (groups) {
		/* the device is not expected to carry groups already */
		WARN_ON(ddev->groups);
		ddev->groups = groups;
	}
	if (device_add(ddev))
		return;
	if (!sysfs_deprecated) {
		err = sysfs_create_link(block_depr, &ddev->kobj,
					kobject_name(&ddev->kobj));
		if (err) {
			/* undo device_add() before bailing out */
			device_del(ddev);
			return;
		}
	}

	/*
	 * avoid probable deadlock caused by allocating memory with
	 * GFP_KERNEL in runtime_resume callback of its all ancestor
	 * devices
	 */
	pm_runtime_set_memalloc_noio(ddev, true);

	/* NOTE(review): the results of these two creations are not checked */
	disk->part0->bd_holder_dir =
		kobject_create_and_add("holders", &ddev->kobj);
	disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);

	/* hidden disks: no partition scan and no KOBJ_ADD from here */
	if (disk->flags & GENHD_FL_HIDDEN) {
		dev_set_uevent_suppress(ddev, 0);
		return;
	}

	disk_scan_partitions(disk);

	/* announce disk after possible partitions are created */
	dev_set_uevent_suppress(ddev, 0);
	kobject_uevent(&ddev->kobj, KOBJ_ADD);

	/* announce possible partitions */
	disk_part_iter_init(&piter, disk, 0);
	while ((part = disk_part_iter_next(&piter)))
		kobject_uevent(bdev_kobj(part), KOBJ_ADD);
	disk_part_iter_exit(&piter);

	/* link to the backing device info device, if it has one */
	if (disk->queue->backing_dev_info->dev) {
		err = sysfs_create_link(&ddev->kobj,
			  &disk->queue->backing_dev_info->dev->kobj,
			  "bdi");
		WARN_ON(err);
	}
}
636 
/**
 * __device_add_disk - add disk information to kernel list
 * @parent: parent device for the disk
 * @disk: per-device partitioning information
 * @groups: Additional per-device sysfs groups
 * @register_queue: register the queue if set to true
 *
 * This function registers the partitioning information in @disk
 * with the kernel.
 *
 * Nothing is returned: a failure to allocate the device number triggers a
 * warning and an early return, other failures only trigger warnings.
 *
 * FIXME: error handling
 */
static void __device_add_disk(struct device *parent, struct gendisk *disk,
			      const struct attribute_group **groups,
			      bool register_queue)
{
	dev_t devt;
	int retval;

	/*
	 * The disk queue should now be all set with enough information about
	 * the device for the elevator code to pick an adequate default
	 * elevator if one is needed, that is, for devices requesting queue
	 * registration.
	 */
	if (register_queue)
		elevator_init_mq(disk->queue);

	/* minors == 0 indicates to use ext devt from part0 and should
	 * be accompanied with EXT_DEVT flag.  Make sure all
	 * parameters make sense.
	 */
	WARN_ON(disk->minors && !(disk->major || disk->first_minor));
	WARN_ON(!disk->minors &&
		!(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN)));

	/* note: the flag is set before the device number allocation below */
	disk->flags |= GENHD_FL_UP;

	retval = blk_alloc_devt(disk->part0, &devt);
	if (retval) {
		WARN_ON(1);
		return;
	}
	/* record the numbers that were actually assigned */
	disk->major = MAJOR(devt);
	disk->first_minor = MINOR(devt);

	disk_alloc_events(disk);

	if (disk->flags & GENHD_FL_HIDDEN) {
		/*
		 * Don't let hidden disks show up in /proc/partitions,
		 * and don't bother scanning for partitions either.
		 */
		disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
		disk->flags |= GENHD_FL_NO_PART_SCAN;
	} else {
		struct backing_dev_info *bdi = disk->queue->backing_dev_info;
		struct device *dev = disk_to_dev(disk);
		int ret;

		/* Register BDI before referencing it from bdev */
		dev->devt = devt;
		ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt));
		WARN_ON(ret);
		bdi_set_owner(bdi, dev);
		bdev_add(disk->part0, devt);
	}
	register_disk(parent, disk, groups);
	if (register_queue)
		blk_register_queue(disk);

	/*
	 * Take an extra ref on queue which will be put on disk_release()
	 * so that it sticks around as long as @disk is there.
	 */
	WARN_ON_ONCE(!blk_get_queue(disk->queue));

	disk_add_events(disk);
	blk_integrity_add(disk);
}
717 
/*
 * Register @disk below @parent with the additional sysfs @groups.  This is
 * __device_add_disk() with queue registration enabled.
 */
void device_add_disk(struct device *parent, struct gendisk *disk,
		     const struct attribute_group **groups)

{
	__device_add_disk(parent, disk, groups, true);
}
EXPORT_SYMBOL(device_add_disk);
725 
/*
 * Register @disk below @parent without extra sysfs groups and without
 * registering its queue (__device_add_disk() with register_queue false).
 */
void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk)
{
	__device_add_disk(parent, disk, NULL, false);
}
EXPORT_SYMBOL(device_add_disk_no_queue_reg);
731 
/*
 * Prepare @bdev for removal: sync it, invalidate it, and unhash its inode.
 * Used by del_gendisk() for every partition and for the whole device.
 */
static void invalidate_partition(struct block_device *bdev)
{
	/* sync first, then invalidate */
	fsync_bdev(bdev);
	__invalidate_device(bdev, true);

	/*
	 * Unhash the bdev inode for this device so that it can't be looked
	 * up any more even if openers still hold references to it.
	 */
	remove_inode_hash(bdev->bd_inode);
}
743 
/**
 * del_gendisk - remove the gendisk
 * @disk: the struct gendisk to remove
 *
 * Removes the gendisk and all its associated resources. This deletes the
 * partitions associated with the gendisk, and unregisters the associated
 * request_queue.
 *
 * This is the counter to the respective __device_add_disk() call.
 *
 * The final removal of the struct gendisk happens when its refcount reaches 0
 * with put_disk(), which should be called after del_gendisk(), if
 * __device_add_disk() was used.
 *
 * Drivers exist which depend on the release of the gendisk to be synchronous,
 * it should not be deferred.
 *
 * Context: can sleep
 */
void del_gendisk(struct gendisk *disk)
{
	struct disk_part_iter piter;
	struct block_device *part;

	might_sleep();

	/* a disk without a queue is not torn down here; warn once and bail */
	if (WARN_ON_ONCE(!disk->queue))
		return;

	blk_integrity_del(disk);
	disk_del_events(disk);

	/*
	 * Block lookups of the disk until all bdevs are unhashed and the
	 * disk is marked as dead (GENHD_FL_UP cleared).
	 */
	down_write(&bdev_lookup_sem);

	/* invalidate stuff */
	/* partitions only (slot 0 excluded), empty ones included, last to first */
	disk_part_iter_init(&piter, disk,
			     DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
	while ((part = disk_part_iter_next(&piter))) {
		invalidate_partition(part);
		delete_partition(part);
	}
	disk_part_iter_exit(&piter);

	/* finally the whole device itself */
	invalidate_partition(disk->part0);
	set_capacity(disk, 0);
	disk->flags &= ~GENHD_FL_UP;
	up_write(&bdev_lookup_sem);

	if (!(disk->flags & GENHD_FL_HIDDEN)) {
		sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");

		/*
		 * Unregister bdi before releasing device numbers (as they can
		 * get reused and we'd get clashes in sysfs).
		 */
		bdi_unregister(disk->queue->backing_dev_info);
	}

	blk_unregister_queue(disk);

	/* drop the "holders" and "slaves" kobjects created in register_disk() */
	kobject_put(disk->part0->bd_holder_dir);
	kobject_put(disk->slave_dir);

	/* reset the statistics of the whole device */
	part_stat_set_all(disk->part0, 0);
	disk->part0->bd_stamp = 0;
	if (!sysfs_deprecated)
		sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
	pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
	device_del(disk_to_dev(disk));
}
EXPORT_SYMBOL(del_gendisk);
819 
820 /* sysfs access to bad-blocks list. */
821 static ssize_t disk_badblocks_show(struct device *dev,
822 					struct device_attribute *attr,
823 					char *page)
824 {
825 	struct gendisk *disk = dev_to_disk(dev);
826 
827 	if (!disk->bb)
828 		return sprintf(page, "\n");
829 
830 	return badblocks_show(disk->bb, page, 0);
831 }
832 
833 static ssize_t disk_badblocks_store(struct device *dev,
834 					struct device_attribute *attr,
835 					const char *page, size_t len)
836 {
837 	struct gendisk *disk = dev_to_disk(dev);
838 
839 	if (!disk->bb)
840 		return -ENXIO;
841 
842 	return badblocks_store(disk->bb, page, len, 0);
843 }
844 
845 void blk_request_module(dev_t devt)
846 {
847 	unsigned int major = MAJOR(devt);
848 	struct blk_major_name **n;
849 
850 	mutex_lock(&major_names_lock);
851 	for (n = &major_names[major_to_index(major)]; *n; n = &(*n)->next) {
852 		if ((*n)->major == major && (*n)->probe) {
853 			(*n)->probe(devt);
854 			mutex_unlock(&major_names_lock);
855 			return;
856 		}
857 	}
858 	mutex_unlock(&major_names_lock);
859 
860 	if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0)
861 		/* Make old-style 2.4 aliases work */
862 		request_module("block-major-%d", MAJOR(devt));
863 }
864 
/**
 * bdget_disk - do bdget() by gendisk and partition number
 * @disk: gendisk of interest
 * @partno: partition number
 *
 * Look up partition @partno of @disk under the RCU read lock and try to
 * take a reference on it with bdgrab().
 *
 * CONTEXT:
 * Don't care.
 *
 * RETURNS:
 * Resulting block_device on success, NULL on failure.
 */
struct block_device *bdget_disk(struct gendisk *disk, int partno)
{
	struct block_device *found;

	rcu_read_lock();
	found = __disk_get_part(disk, partno);
	if (found != NULL) {
		/* no partition is returned without a reference on it */
		if (!bdgrab(found))
			found = NULL;
	}
	rcu_read_unlock();

	return found;
}
890 
/*
 * print a full list of all partitions - intended for places where the root
 * filesystem can't be mounted and thus to give the victim some idea of what
 * went wrong
 *
 * Walks every disk device in block_class.  For each disk and partition one
 * line is printed with the device number, bdev_nr_sectors() >> 1, the name
 * and the partition uuid (if meta info is present); the whole-disk line
 * additionally names the parent's driver.
 */
void __init printk_all_partitions(void)
{
	struct class_dev_iter iter;
	struct device *dev;

	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
	while ((dev = class_dev_iter_next(&iter))) {
		struct gendisk *disk = dev_to_disk(dev);
		struct disk_part_iter piter;
		struct block_device *part;
		char name_buf[BDEVNAME_SIZE];
		char devt_buf[BDEVT_SIZE];

		/*
		 * Don't show empty devices or things that have been
		 * suppressed
		 */
		if (get_capacity(disk) == 0 ||
		    (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
			continue;

		/*
		 * Note, unlike /proc/partitions, I am showing the
		 * numbers in hex - the same format as the root=
		 * option takes.
		 */
		disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
		while ((part = disk_part_iter_next(&piter))) {
			bool is_part0 = part == disk->part0;

			/* partitions are indented below their disk; no newline yet */
			printk("%s%s %10llu %s %s", is_part0 ? "" : "  ",
			       bdevt_str(part->bd_dev, devt_buf),
			       bdev_nr_sectors(part) >> 1,
			       disk_name(disk, part->bd_partno, name_buf),
			       part->bd_meta_info ?
					part->bd_meta_info->uuid : "");
			/* finish the line: driver name for the disk, bare newline otherwise */
			if (is_part0) {
				if (dev->parent && dev->parent->driver)
					printk(" driver: %s\n",
					      dev->parent->driver->name);
				else
					printk(" (driver?)\n");
			} else
				printk("\n");
		}
		disk_part_iter_exit(&piter);
	}
	class_dev_iter_exit(&iter);
}
945 
946 #ifdef CONFIG_PROC_FS
947 /* iterator */
/*
 * seq_file ->start helper: allocate a class device iterator over the disks
 * in block_class, store it in @seqf->private and advance to the disk at
 * position *@pos.
 *
 * Returns the gendisk at that position, NULL if there are fewer disks, or
 * ERR_PTR(-ENOMEM) if the iterator cannot be allocated.  On the NULL return
 * the iterator stays in @seqf->private; disk_seqf_stop() releases it.
 */
static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos)
{
	loff_t skip = *pos;
	struct class_dev_iter *iter;
	struct device *dev;

	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
	if (!iter)
		return ERR_PTR(-ENOMEM);

	seqf->private = iter;
	class_dev_iter_init(iter, &block_class, NULL, &disk_type);
	/* fetch skip + 1 devices; the last one fetched is the one at *pos */
	do {
		dev = class_dev_iter_next(iter);
		if (!dev)
			return NULL;
	} while (skip--);

	return dev_to_disk(dev);
}
968 
969 static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos)
970 {
971 	struct device *dev;
972 
973 	(*pos)++;
974 	dev = class_dev_iter_next(seqf->private);
975 	if (dev)
976 		return dev_to_disk(dev);
977 
978 	return NULL;
979 }
980 
981 static void disk_seqf_stop(struct seq_file *seqf, void *v)
982 {
983 	struct class_dev_iter *iter = seqf->private;
984 
985 	/* stop is called even after start failed :-( */
986 	if (iter) {
987 		class_dev_iter_exit(iter);
988 		kfree(iter);
989 		seqf->private = NULL;
990 	}
991 }
992 
993 static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
994 {
995 	void *p;
996 
997 	p = disk_seqf_start(seqf, pos);
998 	if (!IS_ERR_OR_NULL(p) && !*pos)
999 		seq_puts(seqf, "major minor  #blocks  name\n\n");
1000 	return p;
1001 }
1002 
1003 static int show_partition(struct seq_file *seqf, void *v)
1004 {
1005 	struct gendisk *sgp = v;
1006 	struct disk_part_iter piter;
1007 	struct block_device *part;
1008 	char buf[BDEVNAME_SIZE];
1009 
1010 	/* Don't show non-partitionable removeable devices or empty devices */
1011 	if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
1012 				   (sgp->flags & GENHD_FL_REMOVABLE)))
1013 		return 0;
1014 	if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
1015 		return 0;
1016 
1017 	/* show the full disk and all non-0 size partitions of it */
1018 	disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0);
1019 	while ((part = disk_part_iter_next(&piter)))
1020 		seq_printf(seqf, "%4d  %7d %10llu %s\n",
1021 			   MAJOR(part->bd_dev), MINOR(part->bd_dev),
1022 			   bdev_nr_sectors(part) >> 1,
1023 			   disk_name(sgp, part->bd_partno, buf));
1024 	disk_part_iter_exit(&piter);
1025 
1026 	return 0;
1027 }
1028 
/*
 * seq_file operations for the partitions listing: the disk iterator
 * helpers above, with a ->start that also prints the column header.
 */
static const struct seq_operations partitions_op = {
	.start	= show_partition_start,
	.next	= disk_seqf_next,
	.stop	= disk_seqf_stop,
	.show	= show_partition
};
1035 #endif
1036 
/*
 * Subsystem init: register block_class, initialise the block layer via
 * blk_dev_init(), register the "blkext" major used for extended device
 * numbers and create the top-level "block" kobject unless sysfs_deprecated
 * is set.
 *
 * Returns 0, or the error from class_register().
 */
static int __init genhd_device_init(void)
{
	int error;

	block_class.dev_kobj = sysfs_dev_block_kobj;
	error = class_register(&block_class);
	if (unlikely(error))
		return error;
	blk_dev_init();

	/* NOTE(review): the return value of register_blkdev() is not checked */
	register_blkdev(BLOCK_EXT_MAJOR, "blkext");

	/* create top-level block dir */
	if (!sysfs_deprecated)
		block_depr = kobject_create_and_add("block", NULL);
	return 0;
}

subsys_initcall(genhd_device_init);
1056 
1057 static ssize_t disk_range_show(struct device *dev,
1058 			       struct device_attribute *attr, char *buf)
1059 {
1060 	struct gendisk *disk = dev_to_disk(dev);
1061 
1062 	return sprintf(buf, "%d\n", disk->minors);
1063 }
1064 
1065 static ssize_t disk_ext_range_show(struct device *dev,
1066 				   struct device_attribute *attr, char *buf)
1067 {
1068 	struct gendisk *disk = dev_to_disk(dev);
1069 
1070 	return sprintf(buf, "%d\n", disk_max_parts(disk));
1071 }
1072 
1073 static ssize_t disk_removable_show(struct device *dev,
1074 				   struct device_attribute *attr, char *buf)
1075 {
1076 	struct gendisk *disk = dev_to_disk(dev);
1077 
1078 	return sprintf(buf, "%d\n",
1079 		       (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
1080 }
1081 
1082 static ssize_t disk_hidden_show(struct device *dev,
1083 				   struct device_attribute *attr, char *buf)
1084 {
1085 	struct gendisk *disk = dev_to_disk(dev);
1086 
1087 	return sprintf(buf, "%d\n",
1088 		       (disk->flags & GENHD_FL_HIDDEN ? 1 : 0));
1089 }
1090 
1091 static ssize_t disk_ro_show(struct device *dev,
1092 				   struct device_attribute *attr, char *buf)
1093 {
1094 	struct gendisk *disk = dev_to_disk(dev);
1095 
1096 	return sprintf(buf, "%d\n", get_disk_ro(disk) ? 1 : 0);
1097 }
1098 
1099 ssize_t part_size_show(struct device *dev,
1100 		       struct device_attribute *attr, char *buf)
1101 {
1102 	return sprintf(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev)));
1103 }
1104 
/*
 * sysfs show handler for the I/O statistics of a block device.
 *
 * Prints one line with, in order:
 *   reads:    ios, merges, sectors, time in ms
 *   writes:   ios, merges, sectors, time in ms
 *   in-flight count, io_ticks in ms, summed time of all four groups in ms
 *   discards: ios, merges, sectors, time in ms
 *   flushes:  ios, time in ms
 *
 * Times are kept in nanoseconds and divided by NSEC_PER_MSEC for output.
 */
ssize_t part_stat_show(struct device *dev,
		       struct device_attribute *attr, char *buf)
{
	struct block_device *bdev = dev_to_bdev(dev);
	struct request_queue *q = bdev->bd_disk->queue;
	struct disk_stats stat;
	unsigned int inflight;

	part_stat_read_all(bdev, &stat);
	/* mq queues are asked directly; otherwise sum the per-cpu counters */
	if (queue_is_mq(q))
		inflight = blk_mq_in_flight(q, bdev);
	else
		inflight = part_in_flight(bdev);

	return sprintf(buf,
		"%8lu %8lu %8llu %8u "
		"%8lu %8lu %8llu %8u "
		"%8u %8u %8u "
		"%8lu %8lu %8llu %8u "
		"%8lu %8u"
		"\n",
		stat.ios[STAT_READ],
		stat.merges[STAT_READ],
		(unsigned long long)stat.sectors[STAT_READ],
		(unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC),
		stat.ios[STAT_WRITE],
		stat.merges[STAT_WRITE],
		(unsigned long long)stat.sectors[STAT_WRITE],
		(unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC),
		inflight,
		jiffies_to_msecs(stat.io_ticks),
		/* total time across reads, writes, discards and flushes */
		(unsigned int)div_u64(stat.nsecs[STAT_READ] +
				      stat.nsecs[STAT_WRITE] +
				      stat.nsecs[STAT_DISCARD] +
				      stat.nsecs[STAT_FLUSH],
						NSEC_PER_MSEC),
		stat.ios[STAT_DISCARD],
		stat.merges[STAT_DISCARD],
		(unsigned long long)stat.sectors[STAT_DISCARD],
		(unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC),
		stat.ios[STAT_FLUSH],
		(unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC));
}
1148 
1149 ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
1150 			   char *buf)
1151 {
1152 	struct block_device *bdev = dev_to_bdev(dev);
1153 	struct request_queue *q = bdev->bd_disk->queue;
1154 	unsigned int inflight[2];
1155 
1156 	if (queue_is_mq(q))
1157 		blk_mq_in_flight_rw(q, bdev, inflight);
1158 	else
1159 		part_in_flight_rw(bdev, inflight);
1160 
1161 	return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]);
1162 }
1163 
1164 static ssize_t disk_capability_show(struct device *dev,
1165 				    struct device_attribute *attr, char *buf)
1166 {
1167 	struct gendisk *disk = dev_to_disk(dev);
1168 
1169 	return sprintf(buf, "%x\n", disk->flags);
1170 }
1171 
1172 static ssize_t disk_alignment_offset_show(struct device *dev,
1173 					  struct device_attribute *attr,
1174 					  char *buf)
1175 {
1176 	struct gendisk *disk = dev_to_disk(dev);
1177 
1178 	return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
1179 }
1180 
1181 static ssize_t disk_discard_alignment_show(struct device *dev,
1182 					   struct device_attribute *attr,
1183 					   char *buf)
1184 {
1185 	struct gendisk *disk = dev_to_disk(dev);
1186 
1187 	return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
1188 }
1189 
1190 static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
1191 static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
1192 static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
1193 static DEVICE_ATTR(hidden, 0444, disk_hidden_show, NULL);
1194 static DEVICE_ATTR(ro, 0444, disk_ro_show, NULL);
1195 static DEVICE_ATTR(size, 0444, part_size_show, NULL);
1196 static DEVICE_ATTR(alignment_offset, 0444, disk_alignment_offset_show, NULL);
1197 static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL);
1198 static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL);
1199 static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
1200 static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
1201 static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
1202 
1203 #ifdef CONFIG_FAIL_MAKE_REQUEST
1204 ssize_t part_fail_show(struct device *dev,
1205 		       struct device_attribute *attr, char *buf)
1206 {
1207 	return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_make_it_fail);
1208 }
1209 
1210 ssize_t part_fail_store(struct device *dev,
1211 			struct device_attribute *attr,
1212 			const char *buf, size_t count)
1213 {
1214 	int i;
1215 
1216 	if (count > 0 && sscanf(buf, "%d", &i) > 0)
1217 		dev_to_bdev(dev)->bd_make_it_fail = i;
1218 
1219 	return count;
1220 }
1221 
1222 static struct device_attribute dev_attr_fail =
1223 	__ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
1224 #endif /* CONFIG_FAIL_MAKE_REQUEST */
1225 
#if defined(CONFIG_FAIL_IO_TIMEOUT)
/* "io-timeout-fail" attribute; only built with CONFIG_FAIL_IO_TIMEOUT. */
static struct device_attribute dev_attr_fail_timeout = __ATTR(io-timeout-fail,
							      0644,
							      part_timeout_show,
							      part_timeout_store);
#endif /* CONFIG_FAIL_IO_TIMEOUT */
1230 
1231 static struct attribute *disk_attrs[] = {
1232 	&dev_attr_range.attr,
1233 	&dev_attr_ext_range.attr,
1234 	&dev_attr_removable.attr,
1235 	&dev_attr_hidden.attr,
1236 	&dev_attr_ro.attr,
1237 	&dev_attr_size.attr,
1238 	&dev_attr_alignment_offset.attr,
1239 	&dev_attr_discard_alignment.attr,
1240 	&dev_attr_capability.attr,
1241 	&dev_attr_stat.attr,
1242 	&dev_attr_inflight.attr,
1243 	&dev_attr_badblocks.attr,
1244 #ifdef CONFIG_FAIL_MAKE_REQUEST
1245 	&dev_attr_fail.attr,
1246 #endif
1247 #ifdef CONFIG_FAIL_IO_TIMEOUT
1248 	&dev_attr_fail_timeout.attr,
1249 #endif
1250 	NULL
1251 };
1252 
1253 static umode_t disk_visible(struct kobject *kobj, struct attribute *a, int n)
1254 {
1255 	struct device *dev = container_of(kobj, typeof(*dev), kobj);
1256 	struct gendisk *disk = dev_to_disk(dev);
1257 
1258 	if (a == &dev_attr_badblocks.attr && !disk->bb)
1259 		return 0;
1260 	return a->mode;
1261 }
1262 
1263 static struct attribute_group disk_attr_group = {
1264 	.attrs = disk_attrs,
1265 	.is_visible = disk_visible,
1266 };
1267 
1268 static const struct attribute_group *disk_attr_groups[] = {
1269 	&disk_attr_group,
1270 	NULL
1271 };
1272 
1273 /**
1274  * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way
1275  * @disk: disk to replace part_tbl for
1276  * @new_ptbl: new part_tbl to install
1277  *
1278  * Replace disk->part_tbl with @new_ptbl in RCU-safe way.  The
1279  * original ptbl is freed using RCU callback.
1280  *
1281  * LOCKING:
1282  * Matching bd_mutex locked or the caller is the only user of @disk.
1283  */
1284 static void disk_replace_part_tbl(struct gendisk *disk,
1285 				  struct disk_part_tbl *new_ptbl)
1286 {
1287 	struct disk_part_tbl *old_ptbl =
1288 		rcu_dereference_protected(disk->part_tbl, 1);
1289 
1290 	rcu_assign_pointer(disk->part_tbl, new_ptbl);
1291 
1292 	if (old_ptbl) {
1293 		rcu_assign_pointer(old_ptbl->last_lookup, NULL);
1294 		kfree_rcu(old_ptbl, rcu_head);
1295 	}
1296 }
1297 
1298 /**
1299  * disk_expand_part_tbl - expand disk->part_tbl
1300  * @disk: disk to expand part_tbl for
1301  * @partno: expand such that this partno can fit in
1302  *
1303  * Expand disk->part_tbl such that @partno can fit in.  disk->part_tbl
1304  * uses RCU to allow unlocked dereferencing for stats and other stuff.
1305  *
1306  * LOCKING:
1307  * Matching bd_mutex locked or the caller is the only user of @disk.
1308  * Might sleep.
1309  *
1310  * RETURNS:
1311  * 0 on success, -errno on failure.
1312  */
1313 int disk_expand_part_tbl(struct gendisk *disk, int partno)
1314 {
1315 	struct disk_part_tbl *old_ptbl =
1316 		rcu_dereference_protected(disk->part_tbl, 1);
1317 	struct disk_part_tbl *new_ptbl;
1318 	int len = old_ptbl ? old_ptbl->len : 0;
1319 	int i, target;
1320 
1321 	/*
1322 	 * check for int overflow, since we can get here from blkpg_ioctl()
1323 	 * with a user passed 'partno'.
1324 	 */
1325 	target = partno + 1;
1326 	if (target < 0)
1327 		return -EINVAL;
1328 
1329 	/* disk_max_parts() is zero during initialization, ignore if so */
1330 	if (disk_max_parts(disk) && target > disk_max_parts(disk))
1331 		return -EINVAL;
1332 
1333 	if (target <= len)
1334 		return 0;
1335 
1336 	new_ptbl = kzalloc_node(struct_size(new_ptbl, part, target), GFP_KERNEL,
1337 				disk->node_id);
1338 	if (!new_ptbl)
1339 		return -ENOMEM;
1340 
1341 	new_ptbl->len = target;
1342 
1343 	for (i = 0; i < len; i++)
1344 		rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]);
1345 
1346 	disk_replace_part_tbl(disk, new_ptbl);
1347 	return 0;
1348 }
1349 
1350 /**
1351  * disk_release - releases all allocated resources of the gendisk
1352  * @dev: the device representing this disk
1353  *
1354  * This function releases all allocated resources of the gendisk.
1355  *
1356  * Drivers which used __device_add_disk() have a gendisk with a request_queue
1357  * assigned. Since the request_queue sits on top of the gendisk for these
1358  * drivers we also call blk_put_queue() for them, and we expect the
1359  * request_queue refcount to reach 0 at this point, and so the request_queue
1360  * will also be freed prior to the disk.
1361  *
1362  * Context: can sleep
1363  */
1364 static void disk_release(struct device *dev)
1365 {
1366 	struct gendisk *disk = dev_to_disk(dev);
1367 
1368 	might_sleep();
1369 
1370 	blk_free_devt(dev->devt);
1371 	disk_release_events(disk);
1372 	kfree(disk->random);
1373 	disk_replace_part_tbl(disk, NULL);
1374 	bdput(disk->part0);
1375 	if (disk->queue)
1376 		blk_put_queue(disk->queue);
1377 	kfree(disk);
1378 }
1379 struct class block_class = {
1380 	.name		= "block",
1381 };
1382 
1383 static char *block_devnode(struct device *dev, umode_t *mode,
1384 			   kuid_t *uid, kgid_t *gid)
1385 {
1386 	struct gendisk *disk = dev_to_disk(dev);
1387 
1388 	if (disk->fops->devnode)
1389 		return disk->fops->devnode(disk, mode);
1390 	return NULL;
1391 }
1392 
1393 const struct device_type disk_type = {
1394 	.name		= "disk",
1395 	.groups		= disk_attr_groups,
1396 	.release	= disk_release,
1397 	.devnode	= block_devnode,
1398 };
1399 
1400 #ifdef CONFIG_PROC_FS
1401 /*
1402  * aggregate disk stat collector.  Uses the same stats that the sysfs
1403  * entries do, above, but makes them available through one seq_file.
1404  *
1405  * The output looks suspiciously like /proc/partitions with a bunch of
1406  * extra fields.
1407  */
1408 static int diskstats_show(struct seq_file *seqf, void *v)
1409 {
1410 	struct gendisk *gp = v;
1411 	struct disk_part_iter piter;
1412 	struct block_device *hd;
1413 	char buf[BDEVNAME_SIZE];
1414 	unsigned int inflight;
1415 	struct disk_stats stat;
1416 
1417 	/*
1418 	if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
1419 		seq_puts(seqf,	"major minor name"
1420 				"     rio rmerge rsect ruse wio wmerge "
1421 				"wsect wuse running use aveq"
1422 				"\n\n");
1423 	*/
1424 
1425 	disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
1426 	while ((hd = disk_part_iter_next(&piter))) {
1427 		part_stat_read_all(hd, &stat);
1428 		if (queue_is_mq(gp->queue))
1429 			inflight = blk_mq_in_flight(gp->queue, hd);
1430 		else
1431 			inflight = part_in_flight(hd);
1432 
1433 		seq_printf(seqf, "%4d %7d %s "
1434 			   "%lu %lu %lu %u "
1435 			   "%lu %lu %lu %u "
1436 			   "%u %u %u "
1437 			   "%lu %lu %lu %u "
1438 			   "%lu %u"
1439 			   "\n",
1440 			   MAJOR(hd->bd_dev), MINOR(hd->bd_dev),
1441 			   disk_name(gp, hd->bd_partno, buf),
1442 			   stat.ios[STAT_READ],
1443 			   stat.merges[STAT_READ],
1444 			   stat.sectors[STAT_READ],
1445 			   (unsigned int)div_u64(stat.nsecs[STAT_READ],
1446 							NSEC_PER_MSEC),
1447 			   stat.ios[STAT_WRITE],
1448 			   stat.merges[STAT_WRITE],
1449 			   stat.sectors[STAT_WRITE],
1450 			   (unsigned int)div_u64(stat.nsecs[STAT_WRITE],
1451 							NSEC_PER_MSEC),
1452 			   inflight,
1453 			   jiffies_to_msecs(stat.io_ticks),
1454 			   (unsigned int)div_u64(stat.nsecs[STAT_READ] +
1455 						 stat.nsecs[STAT_WRITE] +
1456 						 stat.nsecs[STAT_DISCARD] +
1457 						 stat.nsecs[STAT_FLUSH],
1458 							NSEC_PER_MSEC),
1459 			   stat.ios[STAT_DISCARD],
1460 			   stat.merges[STAT_DISCARD],
1461 			   stat.sectors[STAT_DISCARD],
1462 			   (unsigned int)div_u64(stat.nsecs[STAT_DISCARD],
1463 						 NSEC_PER_MSEC),
1464 			   stat.ios[STAT_FLUSH],
1465 			   (unsigned int)div_u64(stat.nsecs[STAT_FLUSH],
1466 						 NSEC_PER_MSEC)
1467 			);
1468 	}
1469 	disk_part_iter_exit(&piter);
1470 
1471 	return 0;
1472 }
1473 
1474 static const struct seq_operations diskstats_op = {
1475 	.start	= disk_seqf_start,
1476 	.next	= disk_seqf_next,
1477 	.stop	= disk_seqf_stop,
1478 	.show	= diskstats_show
1479 };
1480 
1481 static int __init proc_genhd_init(void)
1482 {
1483 	proc_create_seq("diskstats", 0, NULL, &diskstats_op);
1484 	proc_create_seq("partitions", 0, NULL, &partitions_op);
1485 	return 0;
1486 }
1487 module_init(proc_genhd_init);
1488 #endif /* CONFIG_PROC_FS */
1489 
1490 dev_t blk_lookup_devt(const char *name, int partno)
1491 {
1492 	dev_t devt = MKDEV(0, 0);
1493 	struct class_dev_iter iter;
1494 	struct device *dev;
1495 
1496 	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
1497 	while ((dev = class_dev_iter_next(&iter))) {
1498 		struct gendisk *disk = dev_to_disk(dev);
1499 		struct block_device *part;
1500 
1501 		if (strcmp(dev_name(dev), name))
1502 			continue;
1503 
1504 		if (partno < disk->minors) {
1505 			/* We need to return the right devno, even
1506 			 * if the partition doesn't exist yet.
1507 			 */
1508 			devt = MKDEV(MAJOR(dev->devt),
1509 				     MINOR(dev->devt) + partno);
1510 			break;
1511 		}
1512 		part = bdget_disk(disk, partno);
1513 		if (part) {
1514 			devt = part->bd_dev;
1515 			bdput(part);
1516 			break;
1517 		}
1518 	}
1519 	class_dev_iter_exit(&iter);
1520 	return devt;
1521 }
1522 
1523 struct gendisk *__alloc_disk_node(int minors, int node_id)
1524 {
1525 	struct gendisk *disk;
1526 	struct disk_part_tbl *ptbl;
1527 
1528 	if (minors > DISK_MAX_PARTS) {
1529 		printk(KERN_ERR
1530 			"block: can't allocate more than %d partitions\n",
1531 			DISK_MAX_PARTS);
1532 		minors = DISK_MAX_PARTS;
1533 	}
1534 
1535 	disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
1536 	if (!disk)
1537 		return NULL;
1538 
1539 	disk->part0 = bdev_alloc(disk, 0);
1540 	if (!disk->part0)
1541 		goto out_free_disk;
1542 
1543 	disk->node_id = node_id;
1544 	if (disk_expand_part_tbl(disk, 0))
1545 		goto out_bdput;
1546 
1547 	ptbl = rcu_dereference_protected(disk->part_tbl, 1);
1548 	rcu_assign_pointer(ptbl->part[0], disk->part0);
1549 
1550 	disk->minors = minors;
1551 	rand_initialize_disk(disk);
1552 	disk_to_dev(disk)->class = &block_class;
1553 	disk_to_dev(disk)->type = &disk_type;
1554 	device_initialize(disk_to_dev(disk));
1555 	return disk;
1556 
1557 out_bdput:
1558 	bdput(disk->part0);
1559 out_free_disk:
1560 	kfree(disk);
1561 	return NULL;
1562 }
1563 EXPORT_SYMBOL(__alloc_disk_node);
1564 
/**
 * put_disk - drop a reference on a gendisk
 * @disk: the struct gendisk to release a reference on, may be %NULL
 *
 * Drops one reference on the device embedded in @disk; a %NULL @disk is
 * ignored.  When the count reaches 0, disk_release() is called.
 *
 * Context: Any context, but the last reference must not be dropped from
 *          atomic context.
 */
void put_disk(struct gendisk *disk)
{
	if (!disk)
		return;

	put_device(disk_to_dev(disk));
}
EXPORT_SYMBOL(put_disk);
1581 
1582 static void set_disk_ro_uevent(struct gendisk *gd, int ro)
1583 {
1584 	char event[] = "DISK_RO=1";
1585 	char *envp[] = { event, NULL };
1586 
1587 	if (!ro)
1588 		event[8] = '0';
1589 	kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
1590 }
1591 
1592 /**
1593  * set_disk_ro - set a gendisk read-only
1594  * @disk:	gendisk to operate on
1595  * @ready_only:	%true to set the disk read-only, %false set the disk read/write
1596  *
1597  * This function is used to indicate whether a given disk device should have its
1598  * read-only flag set. set_disk_ro() is typically used by device drivers to
1599  * indicate whether the underlying physical device is write-protected.
1600  */
1601 void set_disk_ro(struct gendisk *disk, bool read_only)
1602 {
1603 	if (read_only) {
1604 		if (test_and_set_bit(GD_READ_ONLY, &disk->state))
1605 			return;
1606 	} else {
1607 		if (!test_and_clear_bit(GD_READ_ONLY, &disk->state))
1608 			return;
1609 	}
1610 	set_disk_ro_uevent(disk, read_only);
1611 }
1612 EXPORT_SYMBOL(set_disk_ro);
1613 
1614 int bdev_read_only(struct block_device *bdev)
1615 {
1616 	return bdev->bd_read_only || get_disk_ro(bdev->bd_disk);
1617 }
1618 EXPORT_SYMBOL(bdev_read_only);
1619 
1620 /*
1621  * Disk events - monitor disk events like media change and eject request.
1622  */
1623 struct disk_events {
1624 	struct list_head	node;		/* all disk_event's */
1625 	struct gendisk		*disk;		/* the associated disk */
1626 	spinlock_t		lock;
1627 
1628 	struct mutex		block_mutex;	/* protects blocking */
1629 	int			block;		/* event blocking depth */
1630 	unsigned int		pending;	/* events already sent out */
1631 	unsigned int		clearing;	/* events being cleared */
1632 
1633 	long			poll_msecs;	/* interval, -1 for default */
1634 	struct delayed_work	dwork;
1635 };
1636 
1637 static const char *disk_events_strs[] = {
1638 	[ilog2(DISK_EVENT_MEDIA_CHANGE)]	= "media_change",
1639 	[ilog2(DISK_EVENT_EJECT_REQUEST)]	= "eject_request",
1640 };
1641 
1642 static char *disk_uevents[] = {
1643 	[ilog2(DISK_EVENT_MEDIA_CHANGE)]	= "DISK_MEDIA_CHANGE=1",
1644 	[ilog2(DISK_EVENT_EJECT_REQUEST)]	= "DISK_EJECT_REQUEST=1",
1645 };
1646 
/* every registered struct disk_events, linked through ->node ... */
static LIST_HEAD(disk_events);
/* ... and the mutex taken around additions, removals and walks of the list */
static DEFINE_MUTEX(disk_events_mutex);

/* default poll interval in msecs; zero-initialised, so polling is off by default */
static unsigned long disk_events_dfl_poll_msecs;
1653 
1654 static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
1655 {
1656 	struct disk_events *ev = disk->ev;
1657 	long intv_msecs = 0;
1658 
1659 	/*
1660 	 * If device-specific poll interval is set, always use it.  If
1661 	 * the default is being used, poll if the POLL flag is set.
1662 	 */
1663 	if (ev->poll_msecs >= 0)
1664 		intv_msecs = ev->poll_msecs;
1665 	else if (disk->event_flags & DISK_EVENT_FLAG_POLL)
1666 		intv_msecs = disk_events_dfl_poll_msecs;
1667 
1668 	return msecs_to_jiffies(intv_msecs);
1669 }
1670 
1671 /**
1672  * disk_block_events - block and flush disk event checking
1673  * @disk: disk to block events for
1674  *
1675  * On return from this function, it is guaranteed that event checking
1676  * isn't in progress and won't happen until unblocked by
1677  * disk_unblock_events().  Events blocking is counted and the actual
1678  * unblocking happens after the matching number of unblocks are done.
1679  *
1680  * Note that this intentionally does not block event checking from
1681  * disk_clear_events().
1682  *
1683  * CONTEXT:
1684  * Might sleep.
1685  */
1686 void disk_block_events(struct gendisk *disk)
1687 {
1688 	struct disk_events *ev = disk->ev;
1689 	unsigned long flags;
1690 	bool cancel;
1691 
1692 	if (!ev)
1693 		return;
1694 
1695 	/*
1696 	 * Outer mutex ensures that the first blocker completes canceling
1697 	 * the event work before further blockers are allowed to finish.
1698 	 */
1699 	mutex_lock(&ev->block_mutex);
1700 
1701 	spin_lock_irqsave(&ev->lock, flags);
1702 	cancel = !ev->block++;
1703 	spin_unlock_irqrestore(&ev->lock, flags);
1704 
1705 	if (cancel)
1706 		cancel_delayed_work_sync(&disk->ev->dwork);
1707 
1708 	mutex_unlock(&ev->block_mutex);
1709 }
1710 
1711 static void __disk_unblock_events(struct gendisk *disk, bool check_now)
1712 {
1713 	struct disk_events *ev = disk->ev;
1714 	unsigned long intv;
1715 	unsigned long flags;
1716 
1717 	spin_lock_irqsave(&ev->lock, flags);
1718 
1719 	if (WARN_ON_ONCE(ev->block <= 0))
1720 		goto out_unlock;
1721 
1722 	if (--ev->block)
1723 		goto out_unlock;
1724 
1725 	intv = disk_events_poll_jiffies(disk);
1726 	if (check_now)
1727 		queue_delayed_work(system_freezable_power_efficient_wq,
1728 				&ev->dwork, 0);
1729 	else if (intv)
1730 		queue_delayed_work(system_freezable_power_efficient_wq,
1731 				&ev->dwork, intv);
1732 out_unlock:
1733 	spin_unlock_irqrestore(&ev->lock, flags);
1734 }
1735 
1736 /**
1737  * disk_unblock_events - unblock disk event checking
1738  * @disk: disk to unblock events for
1739  *
1740  * Undo disk_block_events().  When the block count reaches zero, it
1741  * starts events polling if configured.
1742  *
1743  * CONTEXT:
1744  * Don't care.  Safe to call from irq context.
1745  */
1746 void disk_unblock_events(struct gendisk *disk)
1747 {
1748 	if (disk->ev)
1749 		__disk_unblock_events(disk, false);
1750 }
1751 
1752 /**
1753  * disk_flush_events - schedule immediate event checking and flushing
1754  * @disk: disk to check and flush events for
1755  * @mask: events to flush
1756  *
1757  * Schedule immediate event checking on @disk if not blocked.  Events in
1758  * @mask are scheduled to be cleared from the driver.  Note that this
1759  * doesn't clear the events from @disk->ev.
1760  *
1761  * CONTEXT:
1762  * If @mask is non-zero must be called with bdev->bd_mutex held.
1763  */
1764 void disk_flush_events(struct gendisk *disk, unsigned int mask)
1765 {
1766 	struct disk_events *ev = disk->ev;
1767 
1768 	if (!ev)
1769 		return;
1770 
1771 	spin_lock_irq(&ev->lock);
1772 	ev->clearing |= mask;
1773 	if (!ev->block)
1774 		mod_delayed_work(system_freezable_power_efficient_wq,
1775 				&ev->dwork, 0);
1776 	spin_unlock_irq(&ev->lock);
1777 }
1778 
1779 /**
1780  * disk_clear_events - synchronously check, clear and return pending events
1781  * @disk: disk to fetch and clear events from
1782  * @mask: mask of events to be fetched and cleared
1783  *
1784  * Disk events are synchronously checked and pending events in @mask
1785  * are cleared and returned.  This ignores the block count.
1786  *
1787  * CONTEXT:
1788  * Might sleep.
1789  */
1790 static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
1791 {
1792 	struct disk_events *ev = disk->ev;
1793 	unsigned int pending;
1794 	unsigned int clearing = mask;
1795 
1796 	if (!ev)
1797 		return 0;
1798 
1799 	disk_block_events(disk);
1800 
1801 	/*
1802 	 * store the union of mask and ev->clearing on the stack so that the
1803 	 * race with disk_flush_events does not cause ambiguity (ev->clearing
1804 	 * can still be modified even if events are blocked).
1805 	 */
1806 	spin_lock_irq(&ev->lock);
1807 	clearing |= ev->clearing;
1808 	ev->clearing = 0;
1809 	spin_unlock_irq(&ev->lock);
1810 
1811 	disk_check_events(ev, &clearing);
1812 	/*
1813 	 * if ev->clearing is not 0, the disk_flush_events got called in the
1814 	 * middle of this function, so we want to run the workfn without delay.
1815 	 */
1816 	__disk_unblock_events(disk, ev->clearing ? true : false);
1817 
1818 	/* then, fetch and clear pending events */
1819 	spin_lock_irq(&ev->lock);
1820 	pending = ev->pending & mask;
1821 	ev->pending &= ~mask;
1822 	spin_unlock_irq(&ev->lock);
1823 	WARN_ON_ONCE(clearing & mask);
1824 
1825 	return pending;
1826 }
1827 
1828 /**
1829  * bdev_check_media_change - check if a removable media has been changed
1830  * @bdev: block device to check
1831  *
1832  * Check whether a removable media has been changed, and attempt to free all
1833  * dentries and inodes and invalidates all block device page cache entries in
1834  * that case.
1835  *
1836  * Returns %true if the block device changed, or %false if not.
1837  */
1838 bool bdev_check_media_change(struct block_device *bdev)
1839 {
1840 	unsigned int events;
1841 
1842 	events = disk_clear_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE |
1843 				   DISK_EVENT_EJECT_REQUEST);
1844 	if (!(events & DISK_EVENT_MEDIA_CHANGE))
1845 		return false;
1846 
1847 	if (__invalidate_device(bdev, true))
1848 		pr_warn("VFS: busy inodes on changed media %s\n",
1849 			bdev->bd_disk->disk_name);
1850 	set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
1851 	return true;
1852 }
1853 EXPORT_SYMBOL(bdev_check_media_change);
1854 
1855 /*
1856  * Separate this part out so that a different pointer for clearing_ptr can be
1857  * passed in for disk_clear_events.
1858  */
1859 static void disk_events_workfn(struct work_struct *work)
1860 {
1861 	struct delayed_work *dwork = to_delayed_work(work);
1862 	struct disk_events *ev = container_of(dwork, struct disk_events, dwork);
1863 
1864 	disk_check_events(ev, &ev->clearing);
1865 }
1866 
1867 static void disk_check_events(struct disk_events *ev,
1868 			      unsigned int *clearing_ptr)
1869 {
1870 	struct gendisk *disk = ev->disk;
1871 	char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
1872 	unsigned int clearing = *clearing_ptr;
1873 	unsigned int events;
1874 	unsigned long intv;
1875 	int nr_events = 0, i;
1876 
1877 	/* check events */
1878 	events = disk->fops->check_events(disk, clearing);
1879 
1880 	/* accumulate pending events and schedule next poll if necessary */
1881 	spin_lock_irq(&ev->lock);
1882 
1883 	events &= ~ev->pending;
1884 	ev->pending |= events;
1885 	*clearing_ptr &= ~clearing;
1886 
1887 	intv = disk_events_poll_jiffies(disk);
1888 	if (!ev->block && intv)
1889 		queue_delayed_work(system_freezable_power_efficient_wq,
1890 				&ev->dwork, intv);
1891 
1892 	spin_unlock_irq(&ev->lock);
1893 
1894 	/*
1895 	 * Tell userland about new events.  Only the events listed in
1896 	 * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT
1897 	 * is set. Otherwise, events are processed internally but never
1898 	 * get reported to userland.
1899 	 */
1900 	for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
1901 		if ((events & disk->events & (1 << i)) &&
1902 		    (disk->event_flags & DISK_EVENT_FLAG_UEVENT))
1903 			envp[nr_events++] = disk_uevents[i];
1904 
1905 	if (nr_events)
1906 		kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
1907 }
1908 
1909 /*
1910  * A disk events enabled device has the following sysfs nodes under
1911  * its /sys/block/X/ directory.
1912  *
1913  * events		: list of all supported events
1914  * events_async		: list of events which can be detected w/o polling
1915  *			  (always empty, only for backwards compatibility)
1916  * events_poll_msecs	: polling interval, 0: disable, -1: system default
1917  */
1918 static ssize_t __disk_events_show(unsigned int events, char *buf)
1919 {
1920 	const char *delim = "";
1921 	ssize_t pos = 0;
1922 	int i;
1923 
1924 	for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++)
1925 		if (events & (1 << i)) {
1926 			pos += sprintf(buf + pos, "%s%s",
1927 				       delim, disk_events_strs[i]);
1928 			delim = " ";
1929 		}
1930 	if (pos)
1931 		pos += sprintf(buf + pos, "\n");
1932 	return pos;
1933 }
1934 
1935 static ssize_t disk_events_show(struct device *dev,
1936 				struct device_attribute *attr, char *buf)
1937 {
1938 	struct gendisk *disk = dev_to_disk(dev);
1939 
1940 	if (!(disk->event_flags & DISK_EVENT_FLAG_UEVENT))
1941 		return 0;
1942 
1943 	return __disk_events_show(disk->events, buf);
1944 }
1945 
1946 static ssize_t disk_events_async_show(struct device *dev,
1947 				      struct device_attribute *attr, char *buf)
1948 {
1949 	return 0;
1950 }
1951 
1952 static ssize_t disk_events_poll_msecs_show(struct device *dev,
1953 					   struct device_attribute *attr,
1954 					   char *buf)
1955 {
1956 	struct gendisk *disk = dev_to_disk(dev);
1957 
1958 	if (!disk->ev)
1959 		return sprintf(buf, "-1\n");
1960 
1961 	return sprintf(buf, "%ld\n", disk->ev->poll_msecs);
1962 }
1963 
1964 static ssize_t disk_events_poll_msecs_store(struct device *dev,
1965 					    struct device_attribute *attr,
1966 					    const char *buf, size_t count)
1967 {
1968 	struct gendisk *disk = dev_to_disk(dev);
1969 	long intv;
1970 
1971 	if (!count || !sscanf(buf, "%ld", &intv))
1972 		return -EINVAL;
1973 
1974 	if (intv < 0 && intv != -1)
1975 		return -EINVAL;
1976 
1977 	if (!disk->ev)
1978 		return -ENODEV;
1979 
1980 	disk_block_events(disk);
1981 	disk->ev->poll_msecs = intv;
1982 	__disk_unblock_events(disk, true);
1983 
1984 	return count;
1985 }
1986 
1987 static const DEVICE_ATTR(events, 0444, disk_events_show, NULL);
1988 static const DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL);
1989 static const DEVICE_ATTR(events_poll_msecs, 0644,
1990 			 disk_events_poll_msecs_show,
1991 			 disk_events_poll_msecs_store);
1992 
1993 static const struct attribute *disk_events_attrs[] = {
1994 	&dev_attr_events.attr,
1995 	&dev_attr_events_async.attr,
1996 	&dev_attr_events_poll_msecs.attr,
1997 	NULL,
1998 };
1999 
2000 /*
2001  * The default polling interval can be specified by the kernel
2002  * parameter block.events_dfl_poll_msecs which defaults to 0
2003  * (disable).  This can also be modified runtime by writing to
2004  * /sys/module/block/parameters/events_dfl_poll_msecs.
2005  */
2006 static int disk_events_set_dfl_poll_msecs(const char *val,
2007 					  const struct kernel_param *kp)
2008 {
2009 	struct disk_events *ev;
2010 	int ret;
2011 
2012 	ret = param_set_ulong(val, kp);
2013 	if (ret < 0)
2014 		return ret;
2015 
2016 	mutex_lock(&disk_events_mutex);
2017 
2018 	list_for_each_entry(ev, &disk_events, node)
2019 		disk_flush_events(ev->disk, 0);
2020 
2021 	mutex_unlock(&disk_events_mutex);
2022 
2023 	return 0;
2024 }
2025 
2026 static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = {
2027 	.set	= disk_events_set_dfl_poll_msecs,
2028 	.get	= param_get_ulong,
2029 };
2030 
2031 #undef MODULE_PARAM_PREFIX
2032 #define MODULE_PARAM_PREFIX	"block."
2033 
2034 module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
2035 		&disk_events_dfl_poll_msecs, 0644);
2036 
2037 /*
2038  * disk_{alloc|add|del|release}_events - initialize and destroy disk_events.
2039  */
2040 static void disk_alloc_events(struct gendisk *disk)
2041 {
2042 	struct disk_events *ev;
2043 
2044 	if (!disk->fops->check_events || !disk->events)
2045 		return;
2046 
2047 	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
2048 	if (!ev) {
2049 		pr_warn("%s: failed to initialize events\n", disk->disk_name);
2050 		return;
2051 	}
2052 
2053 	INIT_LIST_HEAD(&ev->node);
2054 	ev->disk = disk;
2055 	spin_lock_init(&ev->lock);
2056 	mutex_init(&ev->block_mutex);
2057 	ev->block = 1;
2058 	ev->poll_msecs = -1;
2059 	INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
2060 
2061 	disk->ev = ev;
2062 }
2063 
2064 static void disk_add_events(struct gendisk *disk)
2065 {
2066 	/* FIXME: error handling */
2067 	if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0)
2068 		pr_warn("%s: failed to create sysfs files for events\n",
2069 			disk->disk_name);
2070 
2071 	if (!disk->ev)
2072 		return;
2073 
2074 	mutex_lock(&disk_events_mutex);
2075 	list_add_tail(&disk->ev->node, &disk_events);
2076 	mutex_unlock(&disk_events_mutex);
2077 
2078 	/*
2079 	 * Block count is initialized to 1 and the following initial
2080 	 * unblock kicks it into action.
2081 	 */
2082 	__disk_unblock_events(disk, true);
2083 }
2084 
2085 static void disk_del_events(struct gendisk *disk)
2086 {
2087 	if (disk->ev) {
2088 		disk_block_events(disk);
2089 
2090 		mutex_lock(&disk_events_mutex);
2091 		list_del_init(&disk->ev->node);
2092 		mutex_unlock(&disk_events_mutex);
2093 	}
2094 
2095 	sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs);
2096 }
2097 
2098 static void disk_release_events(struct gendisk *disk)
2099 {
2100 	/* the block count should be 1 from disk_del_events() */
2101 	WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
2102 	kfree(disk->ev);
2103 }
2104