/*
   md.c : Multiple Devices driver for Linux
	  Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/module.h>
#include <linux/config.h>
#include <linux/linkage.h>
#include <linux/raid/md.h>
#include <linux/sysctl.h>
#include <linux/bio.h>
#include <linux/devfs_fs_kernel.h>
#include <linux/buffer_head.h>	/* for invalidate_bdev */
#include <linux/suspend.h>

#include <linux/init.h>

#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>

#include <asm/unaligned.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER
#define DEVICE_NR(device) (minor(device))

#include <linux/blk.h>

#define DEBUG 0
#define dprintk(x...) ((void)(DEBUG && printk(x)))


#ifndef MODULE
static void autostart_arrays (void);
#endif

static mdk_personality_t *pers[MAX_PERSONALITY];
static spinlock_t pers_lock = SPIN_LOCK_UNLOCKED;

/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 */
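
/*
 * Illustrative example (not part of the driver): with the sysctl table
 * below these limits can be read and tuned at runtime, e.g.
 *
 *	# cat /proc/sys/dev/raid/speed_limit_min
 *	1000
 *	# echo 10000 > /proc/sys/dev/raid/speed_limit_min
 *
 * Values are in KB/sec.
 */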

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;

static struct ctl_table_header *raid_table_header;

static ctl_table raid_table[] = {
	{
		.ctl_name	= DEV_RAID_SPEED_LIMIT_MIN,
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= DEV_RAID_SPEED_LIMIT_MAX,
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }
};

static ctl_table raid_dir_table[] = {
	{
		.ctl_name	= DEV_RAID,
		.procname	= "raid",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_table,
	},
	{ .ctl_name = 0 }
};

static ctl_table raid_root_table[] = {
	{
		.ctl_name	= CTL_DEV,
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{ .ctl_name = 0 }
};

static struct block_device_operations md_fops;

static struct gendisk *disks[MAX_MD_DEVS];

/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list as well as mddev_map.
 */
static LIST_HEAD(all_mddevs);
static spinlock_t all_mddevs_lock = SPIN_LOCK_UNLOCKED;


/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put it.
 */
#define ITERATE_MDDEV(mddev,tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock);				\
		tmp = all_mddevs.next;					\
		mddev = NULL;});					\
	     ({ if (tmp != &all_mddevs)					\
			mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (mddev) mddev_put(mddev);				\
		mddev = list_entry(tmp, mddev_t, all_mddevs);		\
		tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		tmp = tmp->next;})					\
		)

static mddev_t *mddev_map[MAX_MD_DEVS];

static int md_fail_request (request_queue_t *q, struct bio *bio)
{
	bio_io_error(bio, bio->bi_size);
	return 0;
}

static inline mddev_t *mddev_get(mddev_t *mddev)
{
	atomic_inc(&mddev->active);
	return mddev;
}

static void mddev_put(mddev_t *mddev)
{
	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;
	if (!mddev->raid_disks && list_empty(&mddev->disks)) {
		list_del(&mddev->all_mddevs);
		mddev_map[mdidx(mddev)] = NULL;
		kfree(mddev);
		MOD_DEC_USE_COUNT;
	}
	spin_unlock(&all_mddevs_lock);
}
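
/*
 * Sketch of the intended use of ITERATE_MDDEV (illustrative only):
 *
 *	mddev_t *mddev;
 *	struct list_head *tmp;
 *
 *	ITERATE_MDDEV(mddev,tmp) {
 *		... use mddev: a reference is held while in the body ...
 *	}
 *
 * The macro drops the reference to the previous mddev at the top of
 * each iteration, so code that breaks out early still owns a reference
 * and must call mddev_put() itself.
 */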

static mddev_t * mddev_find(int unit)
{
	mddev_t *mddev, *new = NULL;

 retry:
	spin_lock(&all_mddevs_lock);
	if (mddev_map[unit]) {
		mddev = mddev_get(mddev_map[unit]);
		spin_unlock(&all_mddevs_lock);
		if (new)
			kfree(new);
		return mddev;
	}
	if (new) {
		mddev_map[unit] = new;
		list_add(&new->all_mddevs, &all_mddevs);
		spin_unlock(&all_mddevs_lock);
		MOD_INC_USE_COUNT;
		return new;
	}
	spin_unlock(&all_mddevs_lock);

	new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return NULL;

	memset(new, 0, sizeof(*new));

	new->__minor = unit;
	init_MUTEX(&new->reconfig_sem);
	INIT_LIST_HEAD(&new->disks);
	INIT_LIST_HEAD(&new->all_mddevs);
	init_timer(&new->safemode_timer);
	atomic_set(&new->active, 1);
	blk_queue_make_request(&new->queue, md_fail_request);

	goto retry;
}

static inline int mddev_lock(mddev_t * mddev)
{
	return down_interruptible(&mddev->reconfig_sem);
}

static inline void mddev_lock_uninterruptible(mddev_t * mddev)
{
	down(&mddev->reconfig_sem);
}

static inline int mddev_trylock(mddev_t * mddev)
{
	return down_trylock(&mddev->reconfig_sem);
}

static inline void mddev_unlock(mddev_t * mddev)
{
	up(&mddev->reconfig_sem);
}

mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
{
	mdk_rdev_t * rdev;
	struct list_head *tmp;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->desc_nr == nr)
			return rdev;
	}
	return NULL;
}

static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
{
	struct list_head *tmp;
	mdk_rdev_t *rdev;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->bdev->bd_dev == dev)
			return rdev;
	}
	return NULL;
}

inline static sector_t calc_dev_sboffset(struct block_device *bdev)
{
	sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
	return MD_NEW_SIZE_BLOCKS(size);
}

static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
{
	sector_t size;

	size = rdev->sb_offset;

	if (chunk_size)
		size &= ~((sector_t)chunk_size/1024 - 1);
	return size;
}

static int alloc_disk_sb(mdk_rdev_t * rdev)
{
	if (rdev->sb_page)
		MD_BUG();

	rdev->sb_page = alloc_page(GFP_KERNEL);
	if (!rdev->sb_page) {
		printk(KERN_ALERT "md: out of memory.\n");
		return -EINVAL;
	}

	return 0;
}

static void free_disk_sb(mdk_rdev_t * rdev)
{
	if (rdev->sb_page) {
		page_cache_release(rdev->sb_page);
		rdev->sb_loaded = 0;
		rdev->sb_page = NULL;
		rdev->sb_offset = 0;
		rdev->size = 0;
	}
}


static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
{
	if (bio->bi_size)
		return 1;

	complete((struct completion*)bio->bi_private);
	return 0;
}

static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
			struct page *page, int rw)
{
	struct bio bio;
	struct bio_vec vec;
	struct completion event;

	bio_init(&bio);
	bio.bi_io_vec = &vec;
	vec.bv_page = page;
	vec.bv_len = size;
	vec.bv_offset = 0;
	bio.bi_vcnt = 1;
	bio.bi_idx = 0;
	bio.bi_size = size;
	bio.bi_bdev = bdev;
	bio.bi_sector = sector;
	init_completion(&event);
	bio.bi_private = &event;
	bio.bi_end_io = bi_complete;
	submit_bio(rw, &bio);
	blk_run_queues();
	wait_for_completion(&event);

	return test_bit(BIO_UPTODATE, &bio.bi_flags);
}

static int read_disk_sb(mdk_rdev_t * rdev)
{

	if (!rdev->sb_page) {
		MD_BUG();
		return -EINVAL;
	}
	if (rdev->sb_loaded)
		return 0;


	if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ))
		goto fail;
	rdev->sb_loaded = 1;
	return 0;

fail:
	printk(KERN_ERR "md: disabled device %s, could not read superblock.\n",
		bdev_partition_name(rdev->bdev));
	return -EINVAL;
}

static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	if (	(sb1->set_uuid0 == sb2->set_uuid0) &&
		(sb1->set_uuid1 == sb2->set_uuid1) &&
		(sb1->set_uuid2 == sb2->set_uuid2) &&
		(sb1->set_uuid3 == sb2->set_uuid3))

		return 1;

	return 0;
}

static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	int ret;
	mdp_super_t *tmp1, *tmp2;

	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);

	if (!tmp1 || !tmp2) {
		ret = 0;
		printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
		goto abort;
	}

	*tmp1 = *sb1;
	*tmp2 = *sb2;

	/*
	 * nr_disks is not constant
	 */
	tmp1->nr_disks = 0;
	tmp2->nr_disks = 0;

	if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
		ret = 0;
	else
		ret = 1;

abort:
	if (tmp1)
		kfree(tmp1);
	if (tmp2)
		kfree(tmp2);

	return ret;
}

static unsigned int calc_sb_csum(mdp_super_t * sb)
{
	unsigned int disk_csum, csum;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
	sb->sb_csum = disk_csum;
	return csum;
}

/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
 *      Verify that dev is acceptable into mddev.
 *      The first time, mddev->raid_disks will be 0, and data from
 *      dev should be merged in.  Subsequent calls check that dev
 *      is new enough.  Return 0 or -EINVAL
 *
 *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
 *      Update the superblock for rdev with data in mddev
 *      This does not write to disc.
 *
 */

struct super_type {
	char		*name;
	struct module	*owner;
	int		(*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
	int		(*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
	void		(*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
};
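
/*
 * The methods are dispatched through this table by format number, e.g.
 * (illustrative):
 *
 *	err = super_types[mddev->major_version].
 *		load_super(rdev, refdev, mddev->minor_version);
 *
 * so supporting another superblock format only means adding an entry
 * to super_types[] below.
 */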

/*
 * load_super for 0.90.0
 */
static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
	mdp_super_t *sb;
	int ret;
	sector_t sb_offset;

	/*
	 * Calculate the position of the superblock,
	 * it's at the end of the disk.
	 *
	 * It also happens to be a multiple of 4Kb.
	 */
	sb_offset = calc_dev_sboffset(rdev->bdev);
	rdev->sb_offset = sb_offset;

	ret = read_disk_sb(rdev);
	if (ret) return ret;

	ret = -EINVAL;

	sb = (mdp_super_t*)page_address(rdev->sb_page);

	if (sb->md_magic != MD_SB_MAGIC) {
		printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
			bdev_partition_name(rdev->bdev));
		goto abort;
	}

	if (sb->major_version != 0 ||
	    sb->minor_version != 90) {
		printk(KERN_WARNING "Bad version number %d.%d on %s\n",
			sb->major_version, sb->minor_version,
			bdev_partition_name(rdev->bdev));
		goto abort;
	}

	if (sb->md_minor >= MAX_MD_DEVS) {
		printk(KERN_ERR "md: %s: invalid raid minor (%x)\n",
			bdev_partition_name(rdev->bdev), sb->md_minor);
		goto abort;
	}
	if (sb->raid_disks <= 0)
		goto abort;

	if (calc_sb_csum(sb) != sb->sb_csum) {
		printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
			bdev_partition_name(rdev->bdev));
		goto abort;
	}

	rdev->preferred_minor = sb->md_minor;
	rdev->data_offset = 0;

	if (sb->level == MULTIPATH)
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = sb->this_disk.number;

	if (refdev == 0)
		ret = 1;
	else {
		__u64 ev1, ev2;
		mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
		if (!uuid_equal(refsb, sb)) {
			printk(KERN_WARNING "md: %s has different UUID to %s\n",
				bdev_partition_name(rdev->bdev),
				bdev_partition_name(refdev->bdev));
			goto abort;
		}
		if (!sb_equal(refsb, sb)) {
			printk(KERN_WARNING "md: %s has same UUID"
				" but different superblock to %s\n",
				bdev_partition_name(rdev->bdev),
				bdev_partition_name(refdev->bdev));
			goto abort;
		}
		ev1 = md_event(sb);
		ev2 = md_event(refsb);
		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	rdev->size = calc_dev_size(rdev, sb->chunk_size);

 abort:
	return ret;
}
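
/*
 * Worked example for the superblock placement above, assuming the
 * usual MD_RESERVED_BYTES of 64K from md_p.h: for a 10000 KB device,
 * MD_NEW_SIZE_BLOCKS() rounds down to a multiple of 64 blocks and
 * steps back one reservation, (10000 & ~63) - 64 = 9920, so the
 * superblock occupies the last complete 64K chunk of the device.
 */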

/*
 * validate_super for 0.90.0
 */
static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
	mdp_disk_t *desc;
	mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 0;
		mddev->minor_version = sb->minor_version;
		mddev->patch_version = sb->patch_version;
		mddev->persistent = !sb->not_persistent;
		mddev->chunk_size = sb->chunk_size;
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
		mddev->size = sb->size;
		mddev->events = md_event(sb);

		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->recovery_cp = MaxSector;
		else {
			if (sb->events_hi == sb->cp_events_hi &&
			    sb->events_lo == sb->cp_events_lo) {
				mddev->recovery_cp = sb->recovery_cp;
			} else
				mddev->recovery_cp = 0;
		}

		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;
	} else {
		__u64 ev1;
		ev1 = md_event(sb);
		++ev1;
		if (ev1 < mddev->events)
			return -EINVAL;
	}
	if (mddev->level != LEVEL_MULTIPATH) {
		rdev->raid_disk = -1;
		rdev->in_sync = rdev->faulty = 0;
		desc = sb->disks + rdev->desc_nr;

		if (desc->state & (1<<MD_DISK_FAULTY))
			rdev->faulty = 1;
		else if (desc->state & (1<<MD_DISK_SYNC) &&
			 desc->raid_disk < mddev->raid_disks) {
			rdev->in_sync = 1;
			rdev->raid_disk = desc->raid_disk;
		}
	}
	return 0;
}
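
/*
 * Note on the event-count check above (worked example): a device may
 * be exactly one event behind the array and still be accepted.  With
 * mddev->events == 42, a superblock recording 41 passes (41+1 == 42),
 * while one recording 40 fails the ev1 < mddev->events test and the
 * device is kicked as non-fresh.
 */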

/*
 * sync_super for 0.90.0
 */
static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
	mdp_super_t *sb;
	struct list_head *tmp;
	mdk_rdev_t *rdev2;
	int next_spare = mddev->raid_disks;

	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero out disks
	 * 2/ Add info for each disk, keeping track of highest desc_nr
	 * 3/ any empty disks < highest become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
	 */
	int highest = 0;
	int i;
	int active=0, working=0,failed=0,spare=0,nr_disks=0;

	sb = (mdp_super_t*)page_address(rdev->sb_page);

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->minor_version = mddev->minor_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words  = 0; /* ignored */
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12,4);

	sb->ctime = mddev->ctime;
	sb->level = mddev->level;
	sb->size = mddev->size;
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->__minor;
	sb->not_persistent = !mddev->persistent;
	sb->utime = mddev->utime;
	sb->state = 0;
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

	if (mddev->in_sync)
	{
		sb->recovery_cp = mddev->recovery_cp;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->recovery_cp == MaxSector)
			sb->state = (1<< MD_SB_CLEAN);
	} else
		sb->recovery_cp = 0;

	sb->layout = mddev->layout;
	sb->chunk_size = mddev->chunk_size;

	sb->disks[0].state = (1<<MD_DISK_REMOVED);
	ITERATE_RDEV(mddev,rdev2,tmp) {
		mdp_disk_t *d;
		if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
			rdev2->desc_nr = rdev2->raid_disk;
		else
			rdev2->desc_nr = next_spare++;
		d = &sb->disks[rdev2->desc_nr];
		nr_disks++;
		d->number = rdev2->desc_nr;
		d->major = MAJOR(rdev2->bdev->bd_dev);
		d->minor = MINOR(rdev2->bdev->bd_dev);
		if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
			d->raid_disk = rdev2->raid_disk;
		else
			d->raid_disk = rdev2->desc_nr; /* compatibility */
		if (rdev2->faulty) {
			d->state = (1<<MD_DISK_FAULTY);
			failed++;
		} else if (rdev2->in_sync) {
			d->state = (1<<MD_DISK_ACTIVE);
			d->state |= (1<<MD_DISK_SYNC);
			active++;
			working++;
		} else {
			d->state = 0;
			spare++;
			working++;
		}
		if (rdev2->desc_nr > highest)
			highest = rdev2->desc_nr;
	}

	/* now set the "removed" bit on any non-trailing holes */
	for (i=0; i<highest; i++) {
		mdp_disk_t *d = &sb->disks[i];
		if (d->state == 0 && d->number == 0) {
			d->number = i;
			d->raid_disk = i;
			d->state = (1<<MD_DISK_REMOVED);
		}
	}
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	sb->this_disk = sb->disks[rdev->desc_nr];
	sb->sb_csum = calc_sb_csum(sb);
}

/*
 * version 1 superblock
 */

static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
{
	unsigned int disk_csum, csum;
	int size = 256 + sb->max_dev*2;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	csum = csum_partial((void *)sb, size, 0);
	sb->sb_csum = disk_csum;
	return csum;
}
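
/*
 * Note (illustrative): a version-1 superblock is a 256-byte fixed part
 * followed by a 2-byte role entry per device, which is why the checksum
 * above covers 256 + max_dev*2 bytes and why super_1_load() below caps
 * max_dev at (4096-256)/2 - the whole superblock has to fit within one
 * 4K block.
 */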

static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
	struct mdp_superblock_1 *sb;
	int ret;
	sector_t sb_offset;

	/*
	 * Calculate the position of the superblock.
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
	 */
	switch(minor_version) {
	case 0:
		sb_offset = rdev->bdev->bd_inode->i_size >> 9;
		sb_offset -= 8*2;
		sb_offset &= ~(sector_t)(4*2 - 1);	/* align down to 8 sectors (4K) */
		/* convert from sectors to K */
		sb_offset /= 2;
		break;
	case 1:
		sb_offset = 0;
		break;
	case 2:
		sb_offset = 4;
		break;
	default:
		return -EINVAL;
	}
	rdev->sb_offset = sb_offset;

	ret = read_disk_sb(rdev);
	if (ret) return ret;


	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
	    sb->major_version != cpu_to_le32(1) ||
	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
	    le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
	    sb->feature_map != 0)
		return -EINVAL;

	if (calc_sb_1_csum(sb) != sb->sb_csum) {
		printk("md: invalid superblock checksum on %s\n",
			bdev_partition_name(rdev->bdev));
		return -EINVAL;
	}
	rdev->preferred_minor = 0xffff;
	rdev->data_offset = le64_to_cpu(sb->data_offset);

	if (refdev == 0)
		return 1;
	else {
		__u64 ev1, ev2;
		struct mdp_superblock_1 *refsb =
			(struct mdp_superblock_1*)page_address(refdev->sb_page);

		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
		    sb->level != refsb->level ||
		    sb->layout != refsb->layout ||
		    sb->chunksize != refsb->chunksize) {
			printk(KERN_WARNING "md: %s has strangely different"
				" superblock to %s\n",
				bdev_partition_name(rdev->bdev),
				bdev_partition_name(refdev->bdev));
			return -EINVAL;
		}
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		if (ev1 > ev2)
			return 1;
	}
	if (minor_version)
		rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
	else
		rdev->size = rdev->sb_offset;
	if (rdev->size < le64_to_cpu(sb->data_size)/2)
		return -EINVAL;
	rdev->size = le64_to_cpu(sb->data_size)/2;
	if (le32_to_cpu(sb->chunksize))
		rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
	return 0;
}
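
/*
 * Worked example for the minor_version 0 case above (illustrative):
 * on a device of 8000000 512-byte sectors, sb_offset starts as
 * 8000000 - 16 = 7999984 sectors, is aligned down to an 8-sector (4K)
 * boundary, and 7999984/2 = 3999992K is stored - i.e. the superblock
 * sits 8K from the end of the device.
 */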

static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
	struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 1;
		mddev->minor_version = 0;
		mddev->patch_version = 0;
		mddev->persistent = 1;
		mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
		mddev->level = le32_to_cpu(sb->level);
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->size = (u32)le64_to_cpu(sb->size);
		mddev->events = le64_to_cpu(sb->events);

		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		mddev->max_disks = (4096-256)/2;
	} else {
		__u64 ev1;
		ev1 = le64_to_cpu(sb->events);
		++ev1;
		if (ev1 < mddev->events)
			return -EINVAL;
	}

	if (mddev->level != LEVEL_MULTIPATH) {
		int role;
		rdev->desc_nr = le32_to_cpu(sb->dev_number);
		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
		switch(role) {
		case 0xffff: /* spare */
			rdev->in_sync = 0;
			rdev->faulty = 0;
			rdev->raid_disk = -1;
			break;
		case 0xfffe: /* faulty */
			rdev->in_sync = 0;
			rdev->faulty = 1;
			rdev->raid_disk = -1;
			break;
		default:
			rdev->in_sync = 1;
			rdev->faulty = 0;
			rdev->raid_disk = role;
			break;
		}
	}
	return 0;
}

static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
	struct mdp_superblock_1 *sb;
	struct list_head *tmp;
	mdk_rdev_t *rdev2;
	int max_dev, i;
	/* make rdev->sb match mddev and rdev data. */

	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

	sb->feature_map = 0;
	sb->pad0 = 0;
	memset(sb->pad1, 0, sizeof(sb->pad1));
	memset(sb->pad2, 0, sizeof(sb->pad2));
	memset(sb->pad3, 0, sizeof(sb->pad3));

	sb->utime = cpu_to_le64((__u64)mddev->utime);
	sb->events = cpu_to_le64(mddev->events);
	if (mddev->in_sync)
		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
	else
		sb->resync_offset = cpu_to_le64(0);

	max_dev = 0;
	ITERATE_RDEV(mddev,rdev2,tmp)
		if (rdev2->desc_nr > max_dev)
			max_dev = rdev2->desc_nr;

	sb->max_dev = max_dev;
	for (i=0; i<max_dev;i++)
		sb->dev_roles[i] = cpu_to_le16(0xfffe);	/* default every slot to faulty */

	ITERATE_RDEV(mddev,rdev2,tmp) {
		i = rdev2->desc_nr;
		if (rdev2->faulty)
			sb->dev_roles[i] = cpu_to_le16(0xfffe);
		else if (rdev2->in_sync)
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else
			sb->dev_roles[i] = cpu_to_le16(0xffff);
	}

	sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
}


struct super_type super_types[] = {
	[0] = {
		.name	= "0.90.0",
		.owner	= THIS_MODULE,
		.load_super	= super_90_load,
		.validate_super	= super_90_validate,
		.sync_super	= super_90_sync,
	},
	[1] = {
		.name	= "md-1",
		.owner	= THIS_MODULE,
		.load_super	= super_1_load,
		.validate_super	= super_1_validate,
		.sync_super	= super_1_sync,
	},
};

static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
{
	struct list_head *tmp;
	mdk_rdev_t *rdev;

	ITERATE_RDEV(mddev,rdev,tmp)
		if (rdev->bdev->bd_contains == dev->bdev->bd_contains)
			return rdev;

	return NULL;
}

static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
{
	struct list_head *tmp;
	mdk_rdev_t *rdev;

	ITERATE_RDEV(mddev1,rdev,tmp)
		if (match_dev_unit(mddev2, rdev))
			return 1;

	return 0;
}

static LIST_HEAD(pending_raid_disks);

static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
{
	mdk_rdev_t *same_pdev;

	if (rdev->mddev) {
		MD_BUG();
		return -EINVAL;
	}
	same_pdev = match_dev_unit(mddev, rdev);
	if (same_pdev)
		printk(KERN_WARNING
			"md%d: WARNING: %s appears to be on the same physical"
			" disk as %s. True protection against single-disk"
			" failure might be compromised.\n",
			mdidx(mddev), bdev_partition_name(rdev->bdev),
			bdev_partition_name(same_pdev->bdev));

	/* Verify rdev->desc_nr is unique.
	 * If it is -1, assign a free number, else
	 * check number is not in use
	 */
	if (rdev->desc_nr < 0) {
		int choice = 0;
		if (mddev->pers) choice = mddev->raid_disks;
		while (find_rdev_nr(mddev, choice))
			choice++;
		rdev->desc_nr = choice;
	} else {
		if (find_rdev_nr(mddev, rdev->desc_nr))
			return -EBUSY;
	}

	list_add(&rdev->same_set, &mddev->disks);
	rdev->mddev = mddev;
	printk(KERN_INFO "md: bind<%s>\n", bdev_partition_name(rdev->bdev));
	return 0;
}

static void unbind_rdev_from_array(mdk_rdev_t * rdev)
{
	if (!rdev->mddev) {
		MD_BUG();
		return;
	}
	list_del_init(&rdev->same_set);
	printk(KERN_INFO "md: unbind<%s>\n", bdev_partition_name(rdev->bdev));
	rdev->mddev = NULL;
}

/*
 * prevent the device from being mounted, repartitioned or
 * otherwise reused by a RAID array (or any other kernel
 * subsystem), by opening the device. [simply getting an
 * inode is not enough, the SCSI module usage code needs
 * an explicit open() on the device]
 */
static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
{
	int err = 0;
	struct block_device *bdev;

	bdev = bdget(dev);
	if (!bdev)
		return -ENOMEM;
	err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
	if (err)
		return err;
	err = bd_claim(bdev, rdev);
	if (err) {
		blkdev_put(bdev, BDEV_RAW);
		return err;
	}
	rdev->bdev = bdev;
	return err;
}

static void unlock_rdev(mdk_rdev_t *rdev)
{
	struct block_device *bdev = rdev->bdev;
	rdev->bdev = NULL;
	if (!bdev)
		MD_BUG();
	bd_release(bdev);
	blkdev_put(bdev, BDEV_RAW);
}

void md_autodetect_dev(dev_t dev);

static void export_rdev(mdk_rdev_t * rdev)
{
	printk(KERN_INFO "md: export_rdev(%s)\n",
		bdev_partition_name(rdev->bdev));
	if (rdev->mddev)
		MD_BUG();
	free_disk_sb(rdev);
	list_del_init(&rdev->same_set);
#ifndef MODULE
	md_autodetect_dev(rdev->bdev->bd_dev);
#endif
	unlock_rdev(rdev);
	kfree(rdev);
}

static void kick_rdev_from_array(mdk_rdev_t * rdev)
{
	unbind_rdev_from_array(rdev);
	export_rdev(rdev);
}

static void export_array(mddev_t *mddev)
{
	struct list_head *tmp;
	mdk_rdev_t *rdev;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (!rdev->mddev) {
			MD_BUG();
			continue;
		}
		kick_rdev_from_array(rdev);
	}
	if (!list_empty(&mddev->disks))
		MD_BUG();
	mddev->raid_disks = 0;
	mddev->major_version = 0;
}

static void print_desc(mdp_disk_t *desc)
{
	printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
		partition_name(MKDEV(desc->major,desc->minor)),
		desc->major,desc->minor,desc->raid_disk,desc->state);
}

static void print_sb(mdp_super_t *sb)
{
	int i;

	printk(KERN_INFO
		"md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
		sb->major_version, sb->minor_version, sb->patch_version,
		sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
		sb->ctime);
	printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
		sb->level, sb->size, sb->nr_disks, sb->raid_disks,
		sb->md_minor, sb->layout, sb->chunk_size);
	printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d"
		" FD:%d SD:%d CSUM:%08x E:%08lx\n",
		sb->utime, sb->state, sb->active_disks, sb->working_disks,
		sb->failed_disks, sb->spare_disks,
		sb->sb_csum, (unsigned long)sb->events_lo);

	printk(KERN_INFO);
	for (i = 0; i < MD_SB_DISKS; i++) {
		mdp_disk_t *desc;

		desc = sb->disks + i;
		if (desc->number || desc->major || desc->minor ||
		    desc->raid_disk || (desc->state && (desc->state != 4))) {
			printk("     D %2d: ", i);
			print_desc(desc);
		}
	}
	printk(KERN_INFO "md:     THIS: ");
	print_desc(&sb->this_disk);

}

static void print_rdev(mdk_rdev_t *rdev)
{
	printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%d ",
		bdev_partition_name(rdev->bdev), (unsigned long long)rdev->size,
		rdev->faulty, rdev->in_sync, rdev->desc_nr);
	if (rdev->sb_loaded) {
		printk(KERN_INFO "md: rdev superblock:\n");
		print_sb((mdp_super_t*)page_address(rdev->sb_page));
	} else
		printk(KERN_INFO "md: no rdev superblock!\n");
}

void md_print_devices(void)
{
	struct list_head *tmp, *tmp2;
	mdk_rdev_t *rdev;
	mddev_t *mddev;

	printk("\n");
	printk("md:	**********************************\n");
	printk("md:	* <COMPLETE RAID STATE PRINTOUT> *\n");
	printk("md:	**********************************\n");
	ITERATE_MDDEV(mddev,tmp) {
		printk("md%d: ", mdidx(mddev));

		ITERATE_RDEV(mddev,rdev,tmp2)
			printk("<%s>", bdev_partition_name(rdev->bdev));

		ITERATE_RDEV(mddev,rdev,tmp2)
			print_rdev(rdev);
	}
	printk("md:	**********************************\n");
	printk("\n");
}


static int write_disk_sb(mdk_rdev_t * rdev)
{

	if (!rdev->sb_loaded) {
		MD_BUG();
		return 1;
	}
	if (rdev->faulty) {
		MD_BUG();
		return 1;
	}

	dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
		bdev_partition_name(rdev->bdev),
		(unsigned long long)rdev->sb_offset);

	if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE))
		return 0;

	printk("md: write_disk_sb failed for device %s\n",
		bdev_partition_name(rdev->bdev));
	return 1;
}

static void sync_sbs(mddev_t * mddev)
{
	mdk_rdev_t *rdev;
	struct list_head *tmp;

	ITERATE_RDEV(mddev,rdev,tmp) {
		super_types[mddev->major_version].
			sync_super(mddev, rdev);
		rdev->sb_loaded = 1;
	}
}

static void md_update_sb(mddev_t * mddev)
{
	int err, count = 100;
	struct list_head *tmp;
	mdk_rdev_t *rdev;

	mddev->sb_dirty = 0;
repeat:
	mddev->utime = get_seconds();
	mddev->events ++;

	if (!mddev->events) {
		/*
		 * oops, this 64-bit counter should never wrap.
		 * Either we are in around ~1 trillion A.C., assuming
		 * 1 reboot per second, or we have a bug:
		 */
		MD_BUG();
		mddev->events --;
	}
	sync_sbs(mddev);

	/*
	 * do not write anything to disk if using
	 * nonpersistent superblocks
	 */
	if (!mddev->persistent)
		return;

	dprintk(KERN_INFO
		"md: updating md%d RAID superblock on device (in sync %d)\n",
		mdidx(mddev),mddev->in_sync);

	err = 0;
	ITERATE_RDEV(mddev,rdev,tmp) {
		dprintk(KERN_INFO "md: ");
		if (rdev->faulty)
			dprintk("(skipping faulty ");

		dprintk("%s ", bdev_partition_name(rdev->bdev));
		if (!rdev->faulty) {
			err += write_disk_sb(rdev);
		} else
			dprintk(")\n");
		if (!err && mddev->level == LEVEL_MULTIPATH)
			/* only need to write one superblock... */
			break;
	}
	if (err) {
		if (--count) {
			printk(KERN_ERR "md: errors occurred during superblock"
				" update, repeating\n");
			goto repeat;
		}
		printk(KERN_ERR \
			"md: excessive errors occurred during superblock update, exiting\n");
	}
}
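
/*
 * The routines above implement the usual update protocol: bump the
 * event counter, regenerate every in-core superblock via sync_sbs(),
 * then write a copy to each non-faulty member, retrying a bounded
 * number of times so that the members cannot end up disagreeing about
 * the event count.
 */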

/*
 * Import a device. If 'super_format' >= 0, then sanity check the superblock
 *
 * mark the device faulty if:
 *
 *   - the device is nonexistent (zero size)
 *   - the device has no valid superblock
 *
 * a faulty rdev _never_ has rdev->sb set.
 */
static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
{
	int err;
	mdk_rdev_t *rdev;
	sector_t size;

	rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
	if (!rdev) {
		printk(KERN_ERR "md: could not alloc mem for %s!\n",
			partition_name(newdev));
		return ERR_PTR(-ENOMEM);
	}
	memset(rdev, 0, sizeof(*rdev));

	if ((err = alloc_disk_sb(rdev)))
		goto abort_free;

	err = lock_rdev(rdev, newdev);
	if (err) {
		printk(KERN_ERR "md: could not lock %s.\n",
			partition_name(newdev));
		goto abort_free;
	}
	rdev->desc_nr = -1;
	rdev->faulty = 0;
	rdev->in_sync = 0;
	rdev->data_offset = 0;
	atomic_set(&rdev->nr_pending, 0);

	size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
	if (!size) {
		printk(KERN_WARNING
			"md: %s has zero or unknown size, marking faulty!\n",
			bdev_partition_name(rdev->bdev));
		err = -EINVAL;
		goto abort_free;
	}

	if (super_format >= 0) {
		err = super_types[super_format].
			load_super(rdev, NULL, super_minor);
		if (err == -EINVAL) {
			printk(KERN_WARNING
				"md: %s has invalid sb, not importing!\n",
				bdev_partition_name(rdev->bdev));
			goto abort_free;
		}
		if (err < 0) {
			printk(KERN_WARNING
				"md: could not read %s's sb, not importing!\n",
				bdev_partition_name(rdev->bdev));
			goto abort_free;
		}
	}
	INIT_LIST_HEAD(&rdev->same_set);

	return rdev;

abort_free:
	if (rdev->sb_page) {
		if (rdev->bdev)
			unlock_rdev(rdev);
		free_disk_sb(rdev);
	}
	kfree(rdev);
	return ERR_PTR(err);
}

/*
 * Check a full RAID array for plausibility
 */

static int analyze_sbs(mddev_t * mddev)
{
	int i;
	struct list_head *tmp;
	mdk_rdev_t *rdev, *freshest;

	freshest = NULL;
	ITERATE_RDEV(mddev,rdev,tmp)
		switch (super_types[mddev->major_version].
			load_super(rdev, freshest, mddev->minor_version)) {
		case 1:
			freshest = rdev;
			break;
		case 0:
			break;
		default:
			printk( KERN_ERR \
				"md: fatal superblock inconsistency in %s"
				" -- removing from array\n",
				bdev_partition_name(rdev->bdev));
			kick_rdev_from_array(rdev);
		}


	super_types[mddev->major_version].
		validate_super(mddev, freshest);

	i = 0;
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev != freshest)
			if (super_types[mddev->major_version].
			    validate_super(mddev, rdev)) {
				printk(KERN_WARNING "md: kicking non-fresh %s"
					" from array!\n",
					bdev_partition_name(rdev->bdev));
				kick_rdev_from_array(rdev);
				continue;
			}
		if (mddev->level == LEVEL_MULTIPATH) {
			rdev->desc_nr = i++;
			rdev->raid_disk = rdev->desc_nr;
			rdev->in_sync = 1;
		}
	}


	/*
	 * Check if we can support this RAID array
	 */
	if (mddev->major_version != MD_MAJOR_VERSION ||
	    mddev->minor_version > MD_MINOR_VERSION) {
		printk(KERN_ALERT
			"md: md%d: unsupported raid array version %d.%d.%d\n",
			mdidx(mddev), mddev->major_version,
			mddev->minor_version, mddev->patch_version);
		goto abort;
	}

	if ((mddev->recovery_cp != MaxSector) && ((mddev->level == 1) ||
	    (mddev->level == 4) || (mddev->level == 5)))
		printk(KERN_ERR "md: md%d: raid array is not clean"
			" -- starting background reconstruction\n",
			mdidx(mddev));

	return 0;
abort:
	return 1;
}

static struct gendisk *md_probe(dev_t dev, int *part, void *data)
{
	static DECLARE_MUTEX(disks_sem);
	int unit = MINOR(dev);
	mddev_t *mddev = mddev_find(unit);
	struct gendisk *disk;

	if (!mddev)
		return NULL;

	down(&disks_sem);
	if (disks[unit]) {
		up(&disks_sem);
		mddev_put(mddev);
		return NULL;
	}
	disk = alloc_disk(1);
	if (!disk) {
		up(&disks_sem);
		mddev_put(mddev);
		return NULL;
	}
	disk->major = MD_MAJOR;
	disk->first_minor = mdidx(mddev);
	sprintf(disk->disk_name, "md%d", mdidx(mddev));
	disk->fops = &md_fops;
	disk->private_data = mddev;
	disk->queue = &mddev->queue;
	add_disk(disk);
	disks[mdidx(mddev)] = disk;
	up(&disks_sem);
	return NULL;
}

void md_wakeup_thread(mdk_thread_t *thread);

static void md_safemode_timeout(unsigned long data)
{
	mddev_t *mddev = (mddev_t *) data;

	mddev->safemode = 1;
	md_wakeup_thread(mddev->thread);
}


static int do_md_run(mddev_t * mddev)
{
	int pnum, err;
	int chunk_size;
	struct list_head *tmp;
	mdk_rdev_t *rdev;
	struct gendisk *disk;

	if (list_empty(&mddev->disks)) {
		MD_BUG();
		return -EINVAL;
	}

	if (mddev->pers)
		return -EBUSY;

	/*
	 * Analyze all RAID superblock(s)
	 */
	if (!mddev->raid_disks && analyze_sbs(mddev)) {
		MD_BUG();
		return -EINVAL;
	}

	chunk_size = mddev->chunk_size;
	pnum = level_to_pers(mddev->level);

	if ((pnum != MULTIPATH) && (pnum != RAID1)) {
		if (!chunk_size) {
			/*
			 * 'default chunksize' in the old md code used to
			 * be PAGE_SIZE, baaad.
			 * we abort here to be on the safe side. We don't
			 * want to continue the bad practice.
1518 */ 1519 printk(KERN_ERR 1520 "no chunksize specified, see 'man raidtab'\n"); 1521 return -EINVAL; 1522 } 1523 if (chunk_size > MAX_CHUNK_SIZE) { 1524 printk(KERN_ERR "too big chunk_size: %d > %d\n", 1525 chunk_size, MAX_CHUNK_SIZE); 1526 return -EINVAL; 1527 } 1528 /* 1529 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE 1530 */ 1531 if ( (1 << ffz(~chunk_size)) != chunk_size) { 1532 MD_BUG(); 1533 return -EINVAL; 1534 } 1535 if (chunk_size < PAGE_SIZE) { 1536 printk(KERN_ERR "too small chunk_size: %d < %ld\n", 1537 chunk_size, PAGE_SIZE); 1538 return -EINVAL; 1539 } 1540 1541 /* devices must have minimum size of one chunk */ 1542 ITERATE_RDEV(mddev,rdev,tmp) { 1543 if (rdev->faulty) 1544 continue; 1545 if (rdev->size < chunk_size / 1024) { 1546 printk(KERN_WARNING 1547 "md: Dev %s smaller than chunk_size:" 1548 " %lluk < %dk\n", 1549 bdev_partition_name(rdev->bdev), 1550 (unsigned long long)rdev->size, 1551 chunk_size / 1024); 1552 return -EINVAL; 1553 } 1554 } 1555 } 1556 if (pnum >= MAX_PERSONALITY) { 1557 MD_BUG(); 1558 return -EINVAL; 1559 } 1560 1561#ifdef CONFIG_KMOD 1562 if (!pers[pnum]) 1563 { 1564 char module_name[80]; 1565 sprintf (module_name, "md-personality-%d", pnum); 1566 request_module (module_name); 1567 } 1568#endif 1569 1570 /* 1571 * Drop all container device buffers, from now on 1572 * the only valid external interface is through the md 1573 * device. 1574 * Also find largest hardsector size 1575 */ 1576 ITERATE_RDEV(mddev,rdev,tmp) { 1577 if (rdev->faulty) 1578 continue; 1579 sync_blockdev(rdev->bdev); 1580 invalidate_bdev(rdev->bdev, 0); 1581 } 1582 1583 md_probe(mdidx(mddev), NULL, NULL); 1584 disk = disks[mdidx(mddev)]; 1585 if (!disk) 1586 return -ENOMEM; 1587 1588 spin_lock(&pers_lock); 1589 if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) { 1590 spin_unlock(&pers_lock); 1591 printk(KERN_ERR "md: personality %d is not loaded!\n", 1592 pnum); 1593 return -EINVAL; 1594 } 1595 1596 mddev->pers = pers[pnum]; 1597 spin_unlock(&pers_lock); 1598 1599 blk_queue_make_request(&mddev->queue, mddev->pers->make_request); 1600 printk("%s: setting max_sectors to %d, segment boundary to %d\n", 1601 disk->disk_name, 1602 chunk_size >> 9, 1603 (chunk_size>>1)-1); 1604 blk_queue_max_sectors(&mddev->queue, chunk_size >> 9); 1605 blk_queue_segment_boundary(&mddev->queue, (chunk_size>>1) - 1); 1606 mddev->queue.queuedata = mddev; 1607 1608 err = mddev->pers->run(mddev); 1609 if (err) { 1610 printk(KERN_ERR "md: pers->run() failed ...\n"); 1611 module_put(mddev->pers->owner); 1612 mddev->pers = NULL; 1613 return -EINVAL; 1614 } 1615 atomic_set(&mddev->writes_pending,0); 1616 mddev->safemode = 0; 1617 mddev->safemode_timer.function = md_safemode_timeout; 1618 mddev->safemode_timer.data = (unsigned long) mddev; 1619 mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */ 1620 mddev->in_sync = 1; 1621 1622 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 1623 md_wakeup_thread(mddev->thread); 1624 set_capacity(disk, mddev->array_size<<1); 1625 return 0; 1626} 1627 1628static int restart_array(mddev_t *mddev) 1629{ 1630 struct gendisk *disk = disks[mdidx(mddev)]; 1631 int err; 1632 1633 /* 1634 * Complain if it has no devices 1635 */ 1636 err = -ENXIO; 1637 if (list_empty(&mddev->disks)) 1638 goto out; 1639 1640 if (mddev->pers) { 1641 err = -EBUSY; 1642 if (!mddev->ro) 1643 goto out; 1644 1645 mddev->safemode = 0; 1646 mddev->ro = 0; 1647 set_disk_ro(disk, 0); 1648 1649 printk(KERN_INFO "md: md%d switched to read-write mode.\n", 1650 mdidx(mddev)); 1651 /* 

static int restart_array(mddev_t *mddev)
{
	struct gendisk *disk = disks[mdidx(mddev)];
	int err;

	/*
	 * Complain if it has no devices
	 */
	err = -ENXIO;
	if (list_empty(&mddev->disks))
		goto out;

	if (mddev->pers) {
		err = -EBUSY;
		if (!mddev->ro)
			goto out;

		mddev->safemode = 0;
		mddev->ro = 0;
		set_disk_ro(disk, 0);

		printk(KERN_INFO "md: md%d switched to read-write mode.\n",
			mdidx(mddev));
		/*
		 * Kick recovery or resync if necessary
		 */
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		err = 0;
	} else {
		printk(KERN_ERR "md: md%d has no personality assigned.\n",
			mdidx(mddev));
		err = -EINVAL;
	}

out:
	return err;
}

static int do_md_stop(mddev_t * mddev, int ro)
{
	int err = 0;
	struct gendisk *disk = disks[mdidx(mddev)];

	if (atomic_read(&mddev->active)>2) {
		printk("md: md%d still in use.\n",mdidx(mddev));
		err = -EBUSY;
		goto out;
	}

	if (mddev->pers) {
		if (mddev->sync_thread) {
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			md_unregister_thread(mddev->sync_thread);
			mddev->sync_thread = NULL;
		}

		del_timer_sync(&mddev->safemode_timer);

		invalidate_device(mk_kdev(disk->major, disk->first_minor), 1);

		if (ro) {
			err = -ENXIO;
			if (mddev->ro)
				goto out;
			mddev->ro = 1;
		} else {
			if (mddev->ro)
				set_disk_ro(disk, 0);
			if (mddev->pers->stop(mddev)) {
				err = -EBUSY;
				if (mddev->ro)
					set_disk_ro(disk, 1);
				goto out;
			}
			module_put(mddev->pers->owner);
			mddev->pers = NULL;
			if (mddev->ro)
				mddev->ro = 0;
		}
		if (mddev->raid_disks) {
			/* mark array as shutdown cleanly */
			mddev->in_sync = 1;
			md_update_sb(mddev);
		}
		if (ro)
			set_disk_ro(disk, 1);
	}
	/*
	 * Free resources if final stop
	 */
	if (!ro) {
		struct gendisk *disk;
		printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev));

		export_array(mddev);

		mddev->array_size = 0;
		disk = disks[mdidx(mddev)];
		if (disk)
			set_capacity(disk, 0);
	} else
		printk(KERN_INFO "md: md%d switched to read-only mode.\n",
			mdidx(mddev));
	err = 0;
out:
	return err;
}

static void autorun_array(mddev_t *mddev)
{
	mdk_rdev_t *rdev;
	struct list_head *tmp;
	int err;

	if (list_empty(&mddev->disks)) {
		MD_BUG();
		return;
	}

	printk(KERN_INFO "md: running: ");

	ITERATE_RDEV(mddev,rdev,tmp) {
		printk("<%s>", bdev_partition_name(rdev->bdev));
	}
	printk("\n");

	err = do_md_run (mddev);
	if (err) {
		printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
		do_md_stop (mddev, 0);
	}
}

/*
 * lets try to run arrays based on all disks that have arrived
 * until now. (those are in pending_raid_disks)
 *
 * the method: pick the first pending disk, collect all disks with
 * the same UUID, remove all from the pending list and put them into
 * the 'same_array' list. Then order this list based on superblock
 * update time (freshest comes first), kick out 'old' disks and
 * compare superblocks. If everything's fine then run it.
1771 * 1772 * If "unit" is allocated, then bump its reference count 1773 */ 1774static void autorun_devices(void) 1775{ 1776 struct list_head candidates; 1777 struct list_head *tmp; 1778 mdk_rdev_t *rdev0, *rdev; 1779 mddev_t *mddev; 1780 1781 printk(KERN_INFO "md: autorun ...\n"); 1782 while (!list_empty(&pending_raid_disks)) { 1783 rdev0 = list_entry(pending_raid_disks.next, 1784 mdk_rdev_t, same_set); 1785 1786 printk(KERN_INFO "md: considering %s ...\n", 1787 bdev_partition_name(rdev0->bdev)); 1788 INIT_LIST_HEAD(&candidates); 1789 ITERATE_RDEV_PENDING(rdev,tmp) 1790 if (super_90_load(rdev, rdev0, 0) >= 0) { 1791 printk(KERN_INFO "md: adding %s ...\n", 1792 bdev_partition_name(rdev->bdev)); 1793 list_move(&rdev->same_set, &candidates); 1794 } 1795 /* 1796 * now we have a set of devices, with all of them having 1797 * mostly sane superblocks. It's time to allocate the 1798 * mddev. 1799 */ 1800 1801 mddev = mddev_find(rdev0->preferred_minor); 1802 if (!mddev) { 1803 printk(KERN_ERR 1804 "md: cannot allocate memory for md drive.\n"); 1805 break; 1806 } 1807 if (mddev_lock(mddev)) 1808 printk(KERN_WARNING "md: md%d locked, cannot run\n", 1809 mdidx(mddev)); 1810 else if (mddev->raid_disks || mddev->major_version 1811 || !list_empty(&mddev->disks)) { 1812 printk(KERN_WARNING 1813 "md: md%d already running, cannot run %s\n", 1814 mdidx(mddev), bdev_partition_name(rdev0->bdev)); 1815 mddev_unlock(mddev); 1816 } else { 1817 printk(KERN_INFO "md: created md%d\n", mdidx(mddev)); 1818 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { 1819 list_del_init(&rdev->same_set); 1820 if (bind_rdev_to_array(rdev, mddev)) 1821 export_rdev(rdev); 1822 } 1823 autorun_array(mddev); 1824 mddev_unlock(mddev); 1825 } 1826 /* on success, candidates will be empty, on error 1827 * it won't... 1828 */ 1829 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) 1830 export_rdev(rdev); 1831 mddev_put(mddev); 1832 } 1833 printk(KERN_INFO "md: ... autorun DONE.\n"); 1834} 1835 1836/* 1837 * import RAID devices based on one partition 1838 * if possible, the array gets run as well. 
1839 */ 1840 1841static int autostart_array(dev_t startdev) 1842{ 1843 int err = -EINVAL, i; 1844 mdp_super_t *sb = NULL; 1845 mdk_rdev_t *start_rdev = NULL, *rdev; 1846 1847 start_rdev = md_import_device(startdev, 0, 0); 1848 if (IS_ERR(start_rdev)) { 1849 printk(KERN_WARNING "md: could not import %s!\n", 1850 partition_name(startdev)); 1851 return err; 1852 } 1853 1854 /* NOTE: this can only work for 0.90.0 superblocks */ 1855 sb = (mdp_super_t*)page_address(start_rdev->sb_page); 1856 if (sb->major_version != 0 || 1857 sb->minor_version != 90 ) { 1858 printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n"); 1859 export_rdev(start_rdev); 1860 return err; 1861 } 1862 1863 if (start_rdev->faulty) { 1864 printk(KERN_WARNING 1865 "md: can not autostart based on faulty %s!\n", 1866 bdev_partition_name(start_rdev->bdev)); 1867 export_rdev(start_rdev); 1868 return err; 1869 } 1870 list_add(&start_rdev->same_set, &pending_raid_disks); 1871 1872 for (i = 0; i < MD_SB_DISKS; i++) { 1873 mdp_disk_t *desc; 1874 dev_t dev; 1875 1876 desc = sb->disks + i; 1877 dev = MKDEV(desc->major, desc->minor); 1878 1879 if (!dev) 1880 continue; 1881 if (dev == startdev) 1882 continue; 1883 rdev = md_import_device(dev, 0, 0); 1884 if (IS_ERR(rdev)) { 1885 printk(KERN_WARNING "md: could not import %s," 1886 " trying to run array nevertheless.\n", 1887 partition_name(dev)); 1888 continue; 1889 } 1890 list_add(&rdev->same_set, &pending_raid_disks); 1891 } 1892 1893 /* 1894 * possibly return codes 1895 */ 1896 autorun_devices(); 1897 return 0; 1898 1899} 1900 1901 1902static int get_version(void * arg) 1903{ 1904 mdu_version_t ver; 1905 1906 ver.major = MD_MAJOR_VERSION; 1907 ver.minor = MD_MINOR_VERSION; 1908 ver.patchlevel = MD_PATCHLEVEL_VERSION; 1909 1910 if (copy_to_user(arg, &ver, sizeof(ver))) 1911 return -EFAULT; 1912 1913 return 0; 1914} 1915 1916static int get_array_info(mddev_t * mddev, void * arg) 1917{ 1918 mdu_array_info_t info; 1919 int nr,working,active,failed,spare; 1920 mdk_rdev_t *rdev; 1921 struct list_head *tmp; 1922 1923 nr=working=active=failed=spare=0; 1924 ITERATE_RDEV(mddev,rdev,tmp) { 1925 nr++; 1926 if (rdev->faulty) 1927 failed++; 1928 else { 1929 working++; 1930 if (rdev->in_sync) 1931 active++; 1932 else 1933 spare++; 1934 } 1935 } 1936 1937 info.major_version = mddev->major_version; 1938 info.minor_version = mddev->minor_version; 1939 info.patch_version = 1; 1940 info.ctime = mddev->ctime; 1941 info.level = mddev->level; 1942 info.size = mddev->size; 1943 info.nr_disks = nr; 1944 info.raid_disks = mddev->raid_disks; 1945 info.md_minor = mddev->__minor; 1946 info.not_persistent= !mddev->persistent; 1947 1948 info.utime = mddev->utime; 1949 info.state = 0; 1950 if (mddev->in_sync) 1951 info.state = (1<<MD_SB_CLEAN); 1952 info.active_disks = active; 1953 info.working_disks = working; 1954 info.failed_disks = failed; 1955 info.spare_disks = spare; 1956 1957 info.layout = mddev->layout; 1958 info.chunk_size = mddev->chunk_size; 1959 1960 if (copy_to_user(arg, &info, sizeof(info))) 1961 return -EFAULT; 1962 1963 return 0; 1964} 1965 1966static int get_disk_info(mddev_t * mddev, void * arg) 1967{ 1968 mdu_disk_info_t info; 1969 unsigned int nr; 1970 mdk_rdev_t *rdev; 1971 1972 if (copy_from_user(&info, arg, sizeof(info))) 1973 return -EFAULT; 1974 1975 nr = info.number; 1976 1977 rdev = find_rdev_nr(mddev, nr); 1978 if (rdev) { 1979 info.major = MAJOR(rdev->bdev->bd_dev); 1980 info.minor = MINOR(rdev->bdev->bd_dev); 1981 info.raid_disk = rdev->raid_disk; 1982 info.state = 0; 1983 if 
		if (rdev->faulty)
			info.state |= (1<<MD_DISK_FAULTY);
		else if (rdev->in_sync) {
			info.state |= (1<<MD_DISK_ACTIVE);
			info.state |= (1<<MD_DISK_SYNC);
		}
	} else {
		info.major = info.minor = 0;
		info.raid_disk = -1;
		info.state = (1<<MD_DISK_REMOVED);
	}

	if (copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}
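
/*
 * User-space view of the two queries above (illustrative sketch; the
 * ioctl numbers come from <linux/raid/md_u.h>):
 *
 *	mdu_array_info_t a;
 *	mdu_disk_info_t d;
 *	int fd = open("/dev/md0", O_RDONLY);
 *
 *	ioctl(fd, GET_ARRAY_INFO, &a);
 *	for (d.number = 0; d.number < MD_SB_DISKS; d.number++)
 *		ioctl(fd, GET_DISK_INFO, &d);
 *
 * get_disk_info() fills in major/minor, raid_disk and state for the
 * requested slot, or reports MD_DISK_REMOVED if the slot is unused.
 */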

static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
{
	mdk_rdev_t *rdev;
	dev_t dev;
	dev = MKDEV(info->major,info->minor);
	if (!mddev->raid_disks) {
		int err;
		/* expecting a device which has a superblock */
		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
		if (IS_ERR(rdev)) {
			printk(KERN_WARNING
				"md: md_import_device returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		if (!list_empty(&mddev->disks)) {
			mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
							mdk_rdev_t, same_set);
			int err = super_types[mddev->major_version]
				.load_super(rdev, rdev0, mddev->minor_version);
			if (err < 0) {
				printk(KERN_WARNING
					"md: %s has different UUID to %s\n",
					bdev_partition_name(rdev->bdev),
					bdev_partition_name(rdev0->bdev));
				export_rdev(rdev);
				return -EINVAL;
			}
		}
		err = bind_rdev_to_array(rdev, mddev);
		if (err)
			export_rdev(rdev);
		return err;
	}

	/*
	 * add_new_disk can be used once the array is assembled
	 * to add "hot spares".  They must already have a superblock
	 * written
	 */
	if (mddev->pers) {
		int err;
		if (!mddev->pers->hot_add_disk) {
			printk(KERN_WARNING
				"md%d: personality does not support diskops!\n",
				mdidx(mddev));
			return -EINVAL;
		}
		rdev = md_import_device(dev, mddev->major_version,
					mddev->minor_version);
		if (IS_ERR(rdev)) {
			printk(KERN_WARNING
				"md: md_import_device returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		rdev->in_sync = 0; /* just to be sure */
		rdev->raid_disk = -1;
		err = bind_rdev_to_array(rdev, mddev);
		if (err)
			export_rdev(rdev);
		if (mddev->thread)
			md_wakeup_thread(mddev->thread);
		return err;
	}

	/* otherwise, add_new_disk is only allowed
	 * for major_version==0 superblocks
	 */
	if (mddev->major_version != 0) {
		printk(KERN_WARNING "md%d: ADD_NEW_DISK not supported\n",
			mdidx(mddev));
		return -EINVAL;
	}

	if (!(info->state & (1<<MD_DISK_FAULTY))) {
		int err;
		rdev = md_import_device (dev, -1, 0);
		if (IS_ERR(rdev)) {
			printk(KERN_WARNING
				"md: error, md_import_device() returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		rdev->desc_nr = info->number;
		if (info->raid_disk < mddev->raid_disks)
			rdev->raid_disk = info->raid_disk;
		else
			rdev->raid_disk = -1;

		rdev->faulty = 0;
		if (rdev->raid_disk < mddev->raid_disks)
			rdev->in_sync = (info->state & (1<<MD_DISK_SYNC));
		else
			rdev->in_sync = 0;

		err = bind_rdev_to_array(rdev, mddev);
		if (err) {
			export_rdev(rdev);
			return err;
		}

		if (!mddev->persistent) {
			printk(KERN_INFO "md: nonpersistent superblock ...\n");
			rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
		} else
			rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
		rdev->size = calc_dev_size(rdev, mddev->chunk_size);

		if (!mddev->size || (mddev->size > rdev->size))
			mddev->size = rdev->size;
	}

	return 0;
}

static int hot_generate_error(mddev_t * mddev, dev_t dev)
{
	struct request_queue *q;
	mdk_rdev_t *rdev;

	if (!mddev->pers)
		return -ENODEV;

	printk(KERN_INFO "md: trying to generate %s error in md%d ... \n",
		partition_name(dev), mdidx(mddev));

	rdev = find_rdev(mddev, dev);
	if (!rdev) {
		MD_BUG();
		return -ENXIO;
	}

	if (rdev->desc_nr == -1) {
		MD_BUG();
		return -EINVAL;
	}
	if (!rdev->in_sync)
		return -ENODEV;

	q = bdev_get_queue(rdev->bdev);
	if (!q) {
		MD_BUG();
		return -ENODEV;
	}
	printk(KERN_INFO "md: okay, generating error!\n");
//	q->oneshot_error = 1; // disabled for now

	return 0;
}

static int hot_remove_disk(mddev_t * mddev, dev_t dev)
{
	mdk_rdev_t *rdev;

	if (!mddev->pers)
		return -ENODEV;

	printk(KERN_INFO "md: trying to remove %s from md%d ... \n",
		partition_name(dev), mdidx(mddev));

	rdev = find_rdev(mddev, dev);
	if (!rdev)
		return -ENXIO;

	if (rdev->raid_disk >= 0)
		goto busy;

	kick_rdev_from_array(rdev);
	md_update_sb(mddev);

	return 0;
busy:
	printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n",
		bdev_partition_name(rdev->bdev), mdidx(mddev));
	return -EBUSY;
}
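
/*
 * hot_remove_disk() above and hot_add_disk() below are driven from
 * user-space through ioctls on the array device, with the component
 * device encoded in the argument (illustrative sketch):
 *
 *	ioctl(md_fd, HOT_REMOVE_DISK, (unsigned long)makedev(8, 17));
 *	ioctl(md_fd, HOT_ADD_DISK,    (unsigned long)makedev(8, 17));
 *
 * Removal is refused (-EBUSY) while the disk is still an active
 * raid_disk; a hot-added disk comes in as a spare and is activated by
 * the recovery thread.
 */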

/*
 * set_array_info is used in two different ways.
 * The original usage is when creating a new array.
 * In this usage, raid_disks is > 0 and it, together with
 *  level, size, not_persistent, layout and chunksize, determines the
 *  shape of the array.
 *  This will always create an array with a type-0.90.0 superblock.
 * The newer usage is when assembling an array.
 *  In this case raid_disks will be 0, and the major_version field is
 *  used to determine which style super-blocks are to be found on the devices.
 *  The minor and patch _version numbers are also kept in case the
 *  super_block handler wishes to interpret them.
 */
static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
{

	if (info->raid_disks == 0) {
		/* just setting version number for superblock loading */
		if (info->major_version < 0 ||
		    info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
		    super_types[info->major_version].name == NULL) {
			/* maybe try to auto-load a module? */
			printk(KERN_INFO
				"md: superblock version %d not known\n",
				info->major_version);
			return -EINVAL;
		}
		mddev->major_version = info->major_version;
		mddev->minor_version = info->minor_version;
		mddev->patch_version = info->patch_version;
		return 0;
	}
	mddev->major_version = MD_MAJOR_VERSION;
	mddev->minor_version = MD_MINOR_VERSION;
	mddev->patch_version = MD_PATCHLEVEL_VERSION;
	mddev->ctime = get_seconds();

	mddev->level = info->level;
	mddev->size = info->size;
	mddev->raid_disks = info->raid_disks;
	/* don't set __minor, it is determined by which /dev/md* was
	 * opened
	 */
	if (info->state & (1<<MD_SB_CLEAN))
		mddev->recovery_cp = MaxSector;
	else
		mddev->recovery_cp = 0;
	mddev->persistent = !info->not_persistent;

	mddev->layout = info->layout;
	mddev->chunk_size = info->chunk_size;

	mddev->max_disks = MD_SB_DISKS;


	/*
	 * Generate a 128 bit UUID
	 */
	get_random_bytes(mddev->uuid, 16);

	return 0;
}
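
/*
 * For illustration only (not part of the driver): a userspace tool
 * would drive the two usages described above roughly as follows.
 * This is a hedged sketch -- the ioctl numbers and mdu_*_info_t types
 * are from <linux/raid/md_u.h>, the device numbers are invented, and
 * all error handling is omitted:
 *
 *	#include <fcntl.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/raid/md_u.h>
 *
 *	int fd = open("/dev/md0", O_RDWR);
 *
 *	mdu_array_info_t info = { 0 };		// raid_disks == 0, so this
 *	info.major_version = 0;			// just selects 0.90.0 supers
 *	ioctl(fd, SET_ARRAY_INFO, &info);
 *
 *	mdu_disk_info_t disk = { 0 };
 *	disk.major = 8;				// e.g. /dev/sda1
 *	disk.minor = 1;
 *	ioctl(fd, ADD_NEW_DISK, &disk);		// repeat per component
 *
 *	ioctl(fd, RUN_ARRAY, 0);		// assemble and start
 */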

static int set_disk_faulty(mddev_t *mddev, dev_t dev)
{
	mdk_rdev_t *rdev;

	rdev = find_rdev(mddev, dev);
	if (!rdev)
		return 0;

	md_error(mddev, rdev);
	return 1;
}

static int md_ioctl(struct inode *inode, struct file *file,
			unsigned int cmd, unsigned long arg)
{
	unsigned int minor;
	int err = 0;
	struct hd_geometry *loc = (struct hd_geometry *) arg;
	mddev_t *mddev = NULL;
	kdev_t dev;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;

	dev = inode->i_rdev;
	minor = minor(dev);
	if (minor >= MAX_MD_DEVS) {
		MD_BUG();
		return -EINVAL;
	}

	/*
	 * Commands dealing with the RAID driver but not any
	 * particular array:
	 */
	switch (cmd)
	{
		case RAID_VERSION:
			err = get_version((void *)arg);
			goto done;

		case PRINT_RAID_DEBUG:
			err = 0;
			md_print_devices();
			goto done;

#ifndef MODULE
		case RAID_AUTORUN:
			err = 0;
			autostart_arrays();
			goto done;
#endif
		default:;
	}

	/*
	 * Commands creating/starting a new array:
	 */

	mddev = inode->i_bdev->bd_inode->u.generic_ip;

	if (!mddev) {
		BUG();
		goto abort;
	}


	if (cmd == START_ARRAY) {
		/* START_ARRAY doesn't need to lock the array as autostart_array
		 * does the locking, and it could even be a different array
		 */
		err = autostart_array(arg);
		if (err) {
			printk(KERN_WARNING "md: autostart %s failed!\n",
				partition_name(arg));
			goto abort;
		}
		goto done;
	}

	err = mddev_lock(mddev);
	if (err) {
		printk(KERN_INFO
			"md: ioctl lock interrupted, reason %d, cmd %d\n",
			err, cmd);
		goto abort;
	}

	switch (cmd)
	{
		case SET_ARRAY_INFO:

			if (!list_empty(&mddev->disks)) {
				printk(KERN_WARNING
					"md: array md%d already has disks!\n",
					mdidx(mddev));
				err = -EBUSY;
				goto abort_unlock;
			}
			if (mddev->raid_disks) {
				printk(KERN_WARNING
					"md: array md%d already initialised!\n",
					mdidx(mddev));
				err = -EBUSY;
				goto abort_unlock;
			}
			{
				mdu_array_info_t info;
				if (!arg)
					memset(&info, 0, sizeof(info));
				else if (copy_from_user(&info, (void*)arg, sizeof(info))) {
					err = -EFAULT;
					goto abort_unlock;
				}
				err = set_array_info(mddev, &info);
				if (err) {
					printk(KERN_WARNING "md: couldn't set"
						" array info. %d\n", err);
					goto abort_unlock;
				}
			}
			goto done_unlock;

		default:;
	}

	/*
	 * Commands querying/configuring an existing array:
	 */
	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY
	 * or RUN_ARRAY is allowed */
	if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
		err = -ENODEV;
		goto abort_unlock;
	}

	/*
	 * Commands even a read-only array can execute:
	 */
	switch (cmd)
	{
		case GET_ARRAY_INFO:
			err = get_array_info(mddev, (void *)arg);
			goto done_unlock;

		case GET_DISK_INFO:
			err = get_disk_info(mddev, (void *)arg);
			goto done_unlock;

		case RESTART_ARRAY_RW:
			err = restart_array(mddev);
			goto done_unlock;

		case STOP_ARRAY:
			err = do_md_stop (mddev, 0);
			goto done_unlock;

		case STOP_ARRAY_RO:
			err = do_md_stop (mddev, 1);
			goto done_unlock;

	/*
	 * We have a problem here : there is no easy way to give a CHS
	 * virtual geometry. We currently pretend that we have 2 heads,
	 * 4 sectors (with a BIG number of cylinders...). This drives
	 * dosfs just mad... ;-)
	 * (With 2 heads x 4 sectors per cylinder, cylinders are simply
	 * capacity/8; e.g. an 8388608-sector array reports 1048576
	 * cylinders.)
	 */
		case HDIO_GETGEO:
			if (!loc) {
				err = -EINVAL;
				goto abort_unlock;
			}
			err = put_user (2, (char *) &loc->heads);
			if (err)
				goto abort_unlock;
			err = put_user (4, (char *) &loc->sectors);
			if (err)
				goto abort_unlock;
			err = put_user(get_capacity(disks[mdidx(mddev)])/8,
					(short *) &loc->cylinders);
			if (err)
				goto abort_unlock;
			err = put_user (get_start_sect(inode->i_bdev),
						(long *) &loc->start);
			goto done_unlock;
	}

	/*
	 * The remaining ioctls are changing the state of the
	 * superblock, so we do not allow read-only arrays
	 * here:
	 */
	if (mddev->ro) {
		err = -EROFS;
		goto abort_unlock;
	}

	switch (cmd)
	{
		case ADD_NEW_DISK:
		{
			mdu_disk_info_t info;
			if (copy_from_user(&info, (void*)arg, sizeof(info)))
				err = -EFAULT;
			else
				err = add_new_disk(mddev, &info);
			goto done_unlock;
		}
		case HOT_GENERATE_ERROR:
			err = hot_generate_error(mddev, arg);
			goto done_unlock;
		case HOT_REMOVE_DISK:
			err = hot_remove_disk(mddev, arg);
			goto done_unlock;

		case HOT_ADD_DISK:
			err = hot_add_disk(mddev, arg);
			goto done_unlock;

		case SET_DISK_FAULTY:
			err = set_disk_faulty(mddev, arg);
			goto done_unlock;

		case RUN_ARRAY:
		{
			err = do_md_run (mddev);
			/*
			 * we have to clean up the mess if
			 * the array cannot be run for some
			 * reason ...
			 * ->pers will not be set, so the superblock
			 * will not be updated.
			 */
			if (err)
				do_md_stop (mddev, 0);
			goto done_unlock;
		}

		default:
			if (_IOC_TYPE(cmd) == MD_MAJOR)
				printk(KERN_WARNING "md: %s(pid %d) used"
					" obsolete MD ioctl, upgrade your"
					" software to use new ioctls.\n",
					current->comm, current->pid);
			err = -EINVAL;
			goto abort_unlock;
	}

done_unlock:
abort_unlock:
	mddev_unlock(mddev);

	return err;
done:
	if (err)
		MD_BUG();
abort:
	return err;
}

static int md_open(struct inode *inode, struct file *file)
{
	/*
	 * Succeed if we can find or allocate a mddev structure.
	 */
	mddev_t *mddev = mddev_find(minor(inode->i_rdev));
	int err = -ENOMEM;

	if (!mddev)
		goto out;

	if ((err = mddev_lock(mddev)))
		goto put;

	err = 0;
	mddev_unlock(mddev);
	inode->i_bdev->bd_inode->u.generic_ip = mddev_get(mddev);
 put:
	mddev_put(mddev);
 out:
	return err;
}

static int md_release(struct inode *inode, struct file * file)
{
	mddev_t *mddev = inode->i_bdev->bd_inode->u.generic_ip;

	if (!mddev)
		BUG();
	mddev_put(mddev);

	return 0;
}

static struct block_device_operations md_fops =
{
	.owner		= THIS_MODULE,
	.open		= md_open,
	.release	= md_release,
	.ioctl		= md_ioctl,
};

int md_thread(void * arg)
{
	mdk_thread_t *thread = arg;

	lock_kernel();

	/*
	 * Detach thread
	 */

	daemonize(thread->name, mdidx(thread->mddev));

	current->exit_signal = SIGCHLD;
	allow_signal(SIGKILL);
	thread->tsk = current;

	/*
	 * md_thread is a 'system-thread', its priority should be very
	 * high. We avoid resource deadlocks individually in each
	 * raid personality. (RAID5 does preallocation) We also use RR and
	 * the very same RT priority as kswapd, thus we will never get
	 * into a priority inversion deadlock.
	 *
	 * we definitely have to have equal or higher priority than
	 * bdflush, otherwise bdflush will deadlock if there are too
	 * many dirty RAID5 blocks.
	 */
	unlock_kernel();

	complete(thread->event);
	while (thread->run) {
		void (*run)(mddev_t *);

		wait_event_interruptible(thread->wqueue,
					 test_bit(THREAD_WAKEUP, &thread->flags));
		if (current->flags & PF_FREEZE)
			refrigerator(PF_IOTHREAD);

		clear_bit(THREAD_WAKEUP, &thread->flags);

		run = thread->run;
		if (run) {
			run(thread->mddev);
			blk_run_queues();
		}
		if (signal_pending(current))
			flush_signals(current);
	}
	complete(thread->event);
	return 0;
}

void md_wakeup_thread(mdk_thread_t *thread)
{
	if (thread) {
		dprintk("md: waking up MD thread %p.\n", thread);
		set_bit(THREAD_WAKEUP, &thread->flags);
		wake_up(&thread->wqueue);
	}
}

mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
				 const char *name)
{
	mdk_thread_t *thread;
	int ret;
	struct completion event;

	thread = (mdk_thread_t *) kmalloc(sizeof(mdk_thread_t), GFP_KERNEL);
	if (!thread)
		return NULL;

	memset(thread, 0, sizeof(mdk_thread_t));
	init_waitqueue_head(&thread->wqueue);

	init_completion(&event);
	thread->event = &event;
	thread->run = run;
	thread->mddev = mddev;
	thread->name = name;
	ret = kernel_thread(md_thread, thread, 0);
	if (ret < 0) {
		kfree(thread);
		return NULL;
	}
	wait_for_completion(&event);
	return thread;
}

void md_interrupt_thread(mdk_thread_t *thread)
{
	if (!thread->tsk) {
		MD_BUG();
		return;
	}
	dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
	send_sig(SIGKILL, thread->tsk, 1);
}

void md_unregister_thread(mdk_thread_t *thread)
{
	struct completion event;

	init_completion(&event);

	thread->event = &event;
	thread->run = NULL;
	thread->name = NULL;
	md_interrupt_thread(thread);
	wait_for_completion(&event);
	kfree(thread);
}
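
/*
 * Illustrative sketch (assumed names, raid1-style) of how a personality
 * uses the thread API above.  md_register_thread() returns NULL on
 * failure, and every registered thread must eventually be reaped with
 * md_unregister_thread():
 *
 *	static void raid1d(mddev_t *mddev);	// the thread's run method
 *
 *	// in the personality's run() method:
 *	mddev->thread = md_register_thread(raid1d, mddev, "md%d_raid1");
 *	if (!mddev->thread)
 *		goto out_free_conf;
 *
 *	// whenever new work is queued for the array:
 *	md_wakeup_thread(mddev->thread);
 *
 *	// in the personality's stop() method:
 *	md_unregister_thread(mddev->thread);
 *	mddev->thread = NULL;
 */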

void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
{
	dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
		MD_MAJOR,mdidx(mddev),
		MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
		__builtin_return_address(0),__builtin_return_address(1),
		__builtin_return_address(2),__builtin_return_address(3));

	if (!mddev) {
		MD_BUG();
		return;
	}

	if (!rdev || rdev->faulty)
		return;
	if (!mddev->pers->error_handler)
		return;
	mddev->pers->error_handler(mddev,rdev);
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
}
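
/*
 * Illustrative sketch (assumed names, in the style of the raid1
 * personality): a personality reports a failing component device by
 * calling md_error() from its I/O completion path, which may run in
 * interrupt context:
 *
 *	static int raid1_end_write_request(struct bio *bio,
 *					   unsigned int bytes_done, int error)
 *	{
 *		...
 *		if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 *			md_error(mddev, rdev);	// mark rdev faulty, kick recovery
 *		...
 *	}
 */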
2825 * 2826 * dt: time from mark until now 2827 * db: blocks written from mark until now 2828 * rt: remaining time 2829 */ 2830 dt = ((jiffies - mddev->resync_mark) / HZ); 2831 if (!dt) dt++; 2832 db = resync - (mddev->resync_mark_cnt/2); 2833 rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; 2834 2835 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); 2836 2837 seq_printf(seq, " speed=%ldK/sec", db/dt); 2838} 2839 2840static void *md_seq_start(struct seq_file *seq, loff_t *pos) 2841{ 2842 struct list_head *tmp; 2843 loff_t l = *pos; 2844 mddev_t *mddev; 2845 2846 if (l > 0x10000) 2847 return NULL; 2848 if (!l--) 2849 /* header */ 2850 return (void*)1; 2851 2852 spin_lock(&all_mddevs_lock); 2853 list_for_each(tmp,&all_mddevs) 2854 if (!l--) { 2855 mddev = list_entry(tmp, mddev_t, all_mddevs); 2856 mddev_get(mddev); 2857 spin_unlock(&all_mddevs_lock); 2858 return mddev; 2859 } 2860 spin_unlock(&all_mddevs_lock); 2861 return (void*)2;/* tail */ 2862} 2863 2864static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2865{ 2866 struct list_head *tmp; 2867 mddev_t *next_mddev, *mddev = v; 2868 2869 ++*pos; 2870 if (v == (void*)2) 2871 return NULL; 2872 2873 spin_lock(&all_mddevs_lock); 2874 if (v == (void*)1) 2875 tmp = all_mddevs.next; 2876 else 2877 tmp = mddev->all_mddevs.next; 2878 if (tmp != &all_mddevs) 2879 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 2880 else { 2881 next_mddev = (void*)2; 2882 *pos = 0x10000; 2883 } 2884 spin_unlock(&all_mddevs_lock); 2885 2886 if (v != (void*)1) 2887 mddev_put(mddev); 2888 return next_mddev; 2889 2890} 2891 2892static void md_seq_stop(struct seq_file *seq, void *v) 2893{ 2894 mddev_t *mddev = v; 2895 2896 if (mddev && v != (void*)1 && v != (void*)2) 2897 mddev_put(mddev); 2898} 2899 2900static int md_seq_show(struct seq_file *seq, void *v) 2901{ 2902 mddev_t *mddev = v; 2903 sector_t size; 2904 struct list_head *tmp2; 2905 mdk_rdev_t *rdev; 2906 int i; 2907 2908 if (v == (void*)1) { 2909 seq_printf(seq, "Personalities : "); 2910 spin_lock(&pers_lock); 2911 for (i = 0; i < MAX_PERSONALITY; i++) 2912 if (pers[i]) 2913 seq_printf(seq, "[%s] ", pers[i]->name); 2914 2915 spin_unlock(&pers_lock); 2916 seq_printf(seq, "\n"); 2917 return 0; 2918 } 2919 if (v == (void*)2) { 2920 status_unused(seq); 2921 return 0; 2922 } 2923 2924 if (mddev_lock(mddev)!=0) 2925 return -EINTR; 2926 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 2927 seq_printf(seq, "md%d : %sactive", mdidx(mddev), 2928 mddev->pers ? 
"" : "in"); 2929 if (mddev->pers) { 2930 if (mddev->ro) 2931 seq_printf(seq, " (read-only)"); 2932 seq_printf(seq, " %s", mddev->pers->name); 2933 } 2934 2935 size = 0; 2936 ITERATE_RDEV(mddev,rdev,tmp2) { 2937 seq_printf(seq, " %s[%d]", 2938 bdev_partition_name(rdev->bdev), rdev->desc_nr); 2939 if (rdev->faulty) { 2940 seq_printf(seq, "(F)"); 2941 continue; 2942 } 2943 size += rdev->size; 2944 } 2945 2946 if (!list_empty(&mddev->disks)) { 2947 if (mddev->pers) 2948 seq_printf(seq, "\n %llu blocks", 2949 (unsigned long long)mddev->array_size); 2950 else 2951 seq_printf(seq, "\n %llu blocks", 2952 (unsigned long long)size); 2953 } 2954 2955 if (mddev->pers) { 2956 mddev->pers->status (seq, mddev); 2957 seq_printf(seq, "\n "); 2958 if (mddev->curr_resync > 2) 2959 status_resync (seq, mddev); 2960 else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 2961 seq_printf(seq, " resync=DELAYED"); 2962 } 2963 2964 seq_printf(seq, "\n"); 2965 } 2966 mddev_unlock(mddev); 2967 2968 return 0; 2969} 2970 2971static struct seq_operations md_seq_ops = { 2972 .start = md_seq_start, 2973 .next = md_seq_next, 2974 .stop = md_seq_stop, 2975 .show = md_seq_show, 2976}; 2977 2978static int md_seq_open(struct inode *inode, struct file *file) 2979{ 2980 int error; 2981 2982 error = seq_open(file, &md_seq_ops); 2983 return error; 2984} 2985 2986static struct file_operations md_seq_fops = { 2987 .open = md_seq_open, 2988 .read = seq_read, 2989 .llseek = seq_lseek, 2990 .release = seq_release, 2991}; 2992 2993int register_md_personality(int pnum, mdk_personality_t *p) 2994{ 2995 if (pnum >= MAX_PERSONALITY) { 2996 MD_BUG(); 2997 return -EINVAL; 2998 } 2999 3000 spin_lock(&pers_lock); 3001 if (pers[pnum]) { 3002 spin_unlock(&pers_lock); 3003 MD_BUG(); 3004 return -EBUSY; 3005 } 3006 3007 pers[pnum] = p; 3008 printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum); 3009 spin_unlock(&pers_lock); 3010 return 0; 3011} 3012 3013int unregister_md_personality(int pnum) 3014{ 3015 if (pnum >= MAX_PERSONALITY) { 3016 MD_BUG(); 3017 return -EINVAL; 3018 } 3019 3020 printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name); 3021 spin_lock(&pers_lock); 3022 pers[pnum] = NULL; 3023 spin_unlock(&pers_lock); 3024 return 0; 3025} 3026 3027void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors) 3028{ 3029 rdev->bdev->bd_contains->bd_disk->sync_io += nr_sectors; 3030} 3031 3032static int is_mddev_idle(mddev_t *mddev) 3033{ 3034 mdk_rdev_t * rdev; 3035 struct list_head *tmp; 3036 int idle; 3037 unsigned long curr_events; 3038 3039 idle = 1; 3040 ITERATE_RDEV(mddev,rdev,tmp) { 3041 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 3042 curr_events = disk_stat_read(disk, read_sectors) + 3043 disk_stat_read(disk, write_sectors) - 3044 disk->sync_io; 3045 if ((curr_events - rdev->last_events) > 32) { 3046 rdev->last_events = curr_events; 3047 idle = 0; 3048 } 3049 } 3050 return idle; 3051} 3052 3053void md_done_sync(mddev_t *mddev, int blocks, int ok) 3054{ 3055 /* another "blocks" (512byte) blocks have been synced */ 3056 atomic_sub(blocks, &mddev->recovery_active); 3057 wake_up(&mddev->recovery_wait); 3058 if (!ok) { 3059 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 3060 md_wakeup_thread(mddev->thread); 3061 // stop recovery, signal do_sync .... 

int register_md_personality(int pnum, mdk_personality_t *p)
{
	if (pnum >= MAX_PERSONALITY) {
		MD_BUG();
		return -EINVAL;
	}

	spin_lock(&pers_lock);
	if (pers[pnum]) {
		spin_unlock(&pers_lock);
		MD_BUG();
		return -EBUSY;
	}

	pers[pnum] = p;
	printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
	spin_unlock(&pers_lock);
	return 0;
}

int unregister_md_personality(int pnum)
{
	if (pnum >= MAX_PERSONALITY) {
		MD_BUG();
		return -EINVAL;
	}

	printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
	spin_lock(&pers_lock);
	pers[pnum] = NULL;
	spin_unlock(&pers_lock);
	return 0;
}
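
/*
 * Illustrative sketch (raid1-style names): a personality module pairs
 * the two calls above in its init/exit hooks:
 *
 *	static mdk_personality_t raid1_personality = {
 *		.name		= "raid1",
 *		// ... method pointers ...
 *	};
 *
 *	static int __init raid1_init(void)
 *	{
 *		return register_md_personality(RAID1, &raid1_personality);
 *	}
 *
 *	static void __exit raid1_exit(void)
 *	{
 *		unregister_md_personality(RAID1);
 *	}
 */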
3138 * 0 == not engaged in resync at all 3139 * 2 == checking that there is no conflict with another sync 3140 * 1 == like 2, but have yielded to allow conflicting resync to 3141 * commense 3142 * other == active in resync - this many blocks 3143 */ 3144 do { 3145 mddev->curr_resync = 2; 3146 3147 ITERATE_MDDEV(mddev2,tmp) { 3148 if (mddev2 == mddev) 3149 continue; 3150 if (mddev2->curr_resync && 3151 match_mddev_units(mddev,mddev2)) { 3152 printk(KERN_INFO "md: delaying resync of md%d" 3153 " until md%d has finished resync (they" 3154 " share one or more physical units)\n", 3155 mdidx(mddev), mdidx(mddev2)); 3156 if (mddev < mddev2) {/* arbitrarily yield */ 3157 mddev->curr_resync = 1; 3158 wake_up(&resync_wait); 3159 } 3160 if (wait_event_interruptible(resync_wait, 3161 mddev2->curr_resync < mddev->curr_resync)) { 3162 flush_signals(current); 3163 mddev_put(mddev2); 3164 goto skip; 3165 } 3166 } 3167 if (mddev->curr_resync == 1) { 3168 mddev_put(mddev2); 3169 break; 3170 } 3171 } 3172 } while (mddev->curr_resync < 2); 3173 3174 max_sectors = mddev->size << 1; 3175 3176 printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev)); 3177 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" 3178 " %d KB/sec/disc.\n", sysctl_speed_limit_min); 3179 printk(KERN_INFO "md: using maximum available idle IO bandwith " 3180 "(but not more than %d KB/sec) for reconstruction.\n", 3181 sysctl_speed_limit_max); 3182 3183 is_mddev_idle(mddev); /* this also initializes IO event counters */ 3184 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3185 j = mddev->recovery_cp; 3186 else 3187 j = 0; 3188 for (m = 0; m < SYNC_MARKS; m++) { 3189 mark[m] = jiffies; 3190 mark_cnt[m] = j; 3191 } 3192 last_mark = 0; 3193 mddev->resync_mark = mark[last_mark]; 3194 mddev->resync_mark_cnt = mark_cnt[last_mark]; 3195 3196 /* 3197 * Tune reconstruction: 3198 */ 3199 window = 32*(PAGE_SIZE/512); 3200 printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n", 3201 window/2,max_sectors/2); 3202 3203 atomic_set(&mddev->recovery_active, 0); 3204 init_waitqueue_head(&mddev->recovery_wait); 3205 last_check = 0; 3206 3207 if (j) 3208 printk(KERN_INFO 3209 "md: resuming recovery of md%d from checkpoint.\n", 3210 mdidx(mddev)); 3211 3212 while (j < max_sectors) { 3213 int sectors; 3214 3215 sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min); 3216 if (sectors < 0) { 3217 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 3218 goto out; 3219 } 3220 atomic_add(sectors, &mddev->recovery_active); 3221 j += sectors; 3222 if (j>1) mddev->curr_resync = j; 3223 3224 if (last_check + window > j) 3225 continue; 3226 3227 last_check = j; 3228 3229 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || 3230 test_bit(MD_RECOVERY_ERR, &mddev->recovery)) 3231 break; 3232 3233 blk_run_queues(); 3234 3235 repeat: 3236 if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) { 3237 /* step marks */ 3238 int next = (last_mark+1) % SYNC_MARKS; 3239 3240 mddev->resync_mark = mark[next]; 3241 mddev->resync_mark_cnt = mark_cnt[next]; 3242 mark[next] = jiffies; 3243 mark_cnt[next] = j - atomic_read(&mddev->recovery_active); 3244 last_mark = next; 3245 } 3246 3247 3248 if (signal_pending(current)) { 3249 /* 3250 * got a signal, exit. 3251 */ 3252 printk(KERN_INFO 3253 "md: md_do_sync() got signal ... 

DECLARE_WAIT_QUEUE_HEAD(resync_wait);

#define SYNC_MARKS	10
#define	SYNC_MARK_STEP	(3*HZ)
static void md_do_sync(mddev_t *mddev)
{
	mddev_t *mddev2;
	unsigned int max_sectors, currspeed = 0,
		j, window;
	unsigned long mark[SYNC_MARKS];
	unsigned long mark_cnt[SYNC_MARKS];
	int last_mark,m;
	struct list_head *tmp;
	unsigned long last_check;

	/* just in case thread restarts... */
	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
		return;

	/* we overload curr_resync somewhat here.
	 * 0 == not engaged in resync at all
	 * 2 == checking that there is no conflict with another sync
	 * 1 == like 2, but have yielded to allow conflicting resync to
	 *		commence
	 * other == active in resync - this many blocks
	 */
	do {
		mddev->curr_resync = 2;

		ITERATE_MDDEV(mddev2,tmp) {
			if (mddev2 == mddev)
				continue;
			if (mddev2->curr_resync &&
			    match_mddev_units(mddev,mddev2)) {
				printk(KERN_INFO "md: delaying resync of md%d"
					" until md%d has finished resync (they"
					" share one or more physical units)\n",
				       mdidx(mddev), mdidx(mddev2));
				if (mddev < mddev2) {/* arbitrarily yield */
					mddev->curr_resync = 1;
					wake_up(&resync_wait);
				}
				if (wait_event_interruptible(resync_wait,
					 mddev2->curr_resync < mddev->curr_resync)) {
					flush_signals(current);
					mddev_put(mddev2);
					goto skip;
				}
			}
			if (mddev->curr_resync == 1) {
				mddev_put(mddev2);
				break;
			}
		}
	} while (mddev->curr_resync < 2);

	max_sectors = mddev->size << 1;

	printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
	printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
		" %d KB/sec/disc.\n", sysctl_speed_limit_min);
	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
	       "(but not more than %d KB/sec) for reconstruction.\n",
	       sysctl_speed_limit_max);

	is_mddev_idle(mddev); /* this also initializes IO event counters */
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
		j = mddev->recovery_cp;
	else
		j = 0;
	for (m = 0; m < SYNC_MARKS; m++) {
		mark[m] = jiffies;
		mark_cnt[m] = j;
	}
	last_mark = 0;
	mddev->resync_mark = mark[last_mark];
	mddev->resync_mark_cnt = mark_cnt[last_mark];

	/*
	 * Tune reconstruction:
	 */
	window = 32*(PAGE_SIZE/512);
	printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",
		window/2,max_sectors/2);

	atomic_set(&mddev->recovery_active, 0);
	init_waitqueue_head(&mddev->recovery_wait);
	last_check = 0;

	if (j)
		printk(KERN_INFO
			"md: resuming recovery of md%d from checkpoint.\n",
			mdidx(mddev));

	while (j < max_sectors) {
		int sectors;

		sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min);
		if (sectors < 0) {
			set_bit(MD_RECOVERY_ERR, &mddev->recovery);
			goto out;
		}
		atomic_add(sectors, &mddev->recovery_active);
		j += sectors;
		if (j>1) mddev->curr_resync = j;

		if (last_check + window > j)
			continue;

		last_check = j;

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
		    test_bit(MD_RECOVERY_ERR, &mddev->recovery))
			break;

		blk_run_queues();

	repeat:
		if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
			/* step marks */
			int next = (last_mark+1) % SYNC_MARKS;

			mddev->resync_mark = mark[next];
			mddev->resync_mark_cnt = mark_cnt[next];
			mark[next] = jiffies;
			mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
			last_mark = next;
		}


		if (signal_pending(current)) {
			/*
			 * got a signal, exit.
			 */
			printk(KERN_INFO
				"md: md_do_sync() got signal ... exiting\n");
			flush_signals(current);
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			goto out;
		}

		/*
		 * this loop exits only when we are slower than the
		 * 'hard' speed limit, or the system was IO-idle for
		 * a jiffy.
		 * the system might be non-idle CPU-wise, but we only care
		 * about not overloading the IO subsystem. (things like an
		 * e2fsck being done on the RAID array should execute fast)
		 */
		cond_resched();

		currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;

		if (currspeed > sysctl_speed_limit_min) {
			if ((currspeed > sysctl_speed_limit_max) ||
					!is_mddev_idle(mddev)) {
				current->state = TASK_INTERRUPTIBLE;
				schedule_timeout(HZ/4);
				goto repeat;
			}
		}
	}
	printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
	/*
	 * this also signals 'finished resyncing' to md_stop
	 */
 out:
	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

	/* tell personality that we are finished */
	mddev->pers->sync_request(mddev, max_sectors, 1);

	if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
	    mddev->curr_resync > 2 &&
	    mddev->curr_resync > mddev->recovery_cp) {
		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
			printk(KERN_INFO
				"md: checkpointing recovery of md%d.\n",
				mdidx(mddev));
			mddev->recovery_cp = mddev->curr_resync;
		} else
			mddev->recovery_cp = MaxSector;
	}

	if (mddev->safemode)
		md_enter_safemode(mddev);
 skip:
	mddev->curr_resync = 0;
	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
}
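
/*
 * Worked example of the currspeed computation above (illustrative
 * numbers): with j - resync_mark_cnt = 1200000 sectors synced since
 * the oldest mark and (jiffies - resync_mark)/HZ = 29 seconds elapsed,
 * currspeed = 1200000/2 / (29+1) + 1 = 20001 KB/sec, which is then
 * checked against sysctl_speed_limit_min and _max.
 */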

/*
 * This routine is regularly called by all per-raid-array threads to
 * deal with generic issues like resync and super-block update.
 * Raid personalities that don't have a thread (linear/raid0) do not
 * need this as they never do any recovery or update the superblock.
 *
 * It does not do any resync itself, but rather "forks" off other threads
 * to do that as needed.
 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
 * "->recovery" and create a thread at ->sync_thread.
 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
 * and wakes up this thread, which will reap the thread and finish up.
 * This thread also removes any faulty devices (with nr_pending == 0).
 *
 * The overall approach is:
 *  1/ if the superblock needs updating, update it.
 *  2/ If a recovery thread is running, don't do anything else.
 *  3/ If recovery has finished, clean up, possibly marking spares active.
 *  4/ If there are any faulty devices, remove them.
 *  5/ If array is degraded, try to add spare devices.
 *  6/ If array has spares or is not in-sync, start a resync thread.
 */
void md_check_recovery(mddev_t *mddev)
{
	mdk_rdev_t *rdev;
	struct list_head *rtmp;


	dprintk(KERN_INFO "md: recovery thread got woken up ...\n");

	if (mddev->ro)
		return;
	if ( ! (
		mddev->sb_dirty ||
		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		test_bit(MD_RECOVERY_DONE, &mddev->recovery)
		))
		return;
	if (mddev_trylock(mddev)==0) {
		int spares = 0;
		if (mddev->sb_dirty)
			md_update_sb(mddev);
		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery))
			/* resync/recovery still happening */
			goto unlock;
		if (mddev->sync_thread) {
			/* resync has finished, collect result */
			md_unregister_thread(mddev->sync_thread);
			mddev->sync_thread = NULL;
			if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery)) {
				/* success...*/
				/* activate any spares */
				mddev->pers->spare_active(mddev);
			}
			md_update_sb(mddev);
			mddev->recovery = 0;
			wake_up(&resync_wait);
			goto unlock;
		}
		if (mddev->recovery) {
			/* that's odd.. */
			mddev->recovery = 0;
			wake_up(&resync_wait);
		}

		/* no recovery is running.
		 * remove any failed drives, then
		 * add spares if possible
		 */
		ITERATE_RDEV(mddev,rdev,rtmp) {
			if (rdev->raid_disk >= 0 &&
			    rdev->faulty &&
			    atomic_read(&rdev->nr_pending)==0) {
				mddev->pers->hot_remove_disk(mddev, rdev->raid_disk);
				rdev->raid_disk = -1;
			}
			if (!rdev->faulty && rdev->raid_disk >= 0 && !rdev->in_sync)
				spares++;
		}
		if (mddev->degraded) {
			ITERATE_RDEV(mddev,rdev,rtmp)
				if (rdev->raid_disk < 0
				    && !rdev->faulty) {
					if (mddev->pers->hot_add_disk(mddev,rdev))
						spares++;
					else
						break;
				}
		}

		if (!spares && (mddev->recovery_cp == MaxSector)) {
			/* nothing we can do ... */
			goto unlock;
		}
		if (mddev->pers->sync_request) {
			set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
			if (!spares)
				set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
			mddev->sync_thread = md_register_thread(md_do_sync,
								mddev,
								"md%d_resync");
			if (!mddev->sync_thread) {
				printk(KERN_ERR "md%d: could not start resync"
					" thread...\n",
					mdidx(mddev));
				/* leave the spares where they are, it shouldn't hurt */
				mddev->recovery = 0;
			} else {
				md_wakeup_thread(mddev->sync_thread);
			}
		}
	unlock:
		mddev_unlock(mddev);
	}
}
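
/*
 * Illustrative sketch (assumed name): each personality's thread run
 * method calls md_check_recovery() so that the generic work described
 * above happens in that array's thread context:
 *
 *	static void raid1d(mddev_t *mddev)
 *	{
 *		md_check_recovery(mddev);
 *		// ... personality-specific retry/resync work ...
 *	}
 */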

int md_notify_reboot(struct notifier_block *this,
		     unsigned long code, void *x)
{
	struct list_head *tmp;
	mddev_t *mddev;

	if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {

		printk(KERN_INFO "md: stopping all md devices.\n");

		ITERATE_MDDEV(mddev,tmp)
			if (mddev_trylock(mddev)==0)
				do_md_stop (mddev, 1);
		/*
		 * certain more exotic SCSI devices are known to be
		 * volatile wrt too early system reboots. While the
		 * right place to handle this issue is the given
		 * driver, we do want to have a safe RAID driver ...
		 */
		mdelay(1000*1);
	}
	return NOTIFY_DONE;
}

struct notifier_block md_notifier = {
	.notifier_call	= md_notify_reboot,
	.next		= NULL,
	.priority	= INT_MAX, /* before any real devices */
};

static void md_geninit(void)
{
	struct proc_dir_entry *p;

	dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));

#ifdef CONFIG_PROC_FS
	p = create_proc_entry("mdstat", S_IRUGO, NULL);
	if (p)
		p->proc_fops = &md_seq_fops;
#endif
}

int __init md_init(void)
{
	int minor;

	printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
			" MD_SB_DISKS=%d\n",
			MD_MAJOR_VERSION, MD_MINOR_VERSION,
			MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);

	if (register_blkdev(MAJOR_NR, "md"))
		return -1;

	devfs_mk_dir("md");
	blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
				md_probe, NULL, NULL);
	for (minor=0; minor < MAX_MD_DEVS; ++minor) {
		char name[16];
		sprintf(name, "md/%d", minor);
		devfs_register(NULL, name, DEVFS_FL_DEFAULT, MAJOR_NR, minor,
			       S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL);
	}

	register_reboot_notifier(&md_notifier);
	raid_table_header = register_sysctl_table(raid_root_table, 1);

	md_geninit();
	return (0);
}


#ifndef MODULE

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */
static dev_t detected_devices[128];
static int dev_cnt;

void md_autodetect_dev(dev_t dev)
{
	if (dev_cnt >= 0 && dev_cnt < 127)
		detected_devices[dev_cnt++] = dev;
}


static void autostart_arrays(void)
{
	mdk_rdev_t *rdev;
	int i;

	printk(KERN_INFO "md: Autodetecting RAID arrays.\n");

	for (i = 0; i < dev_cnt; i++) {
		dev_t dev = detected_devices[i];

		rdev = md_import_device(dev,0, 0);
		if (IS_ERR(rdev)) {
			printk(KERN_ALERT "md: could not import %s!\n",
				partition_name(dev));
			continue;
		}
		if (rdev->faulty) {
			MD_BUG();
			continue;
		}
		list_add(&rdev->same_set, &pending_raid_disks);
	}
	dev_cnt = 0;

	autorun_devices();
}

#endif

static __exit void md_exit(void)
{
	int i;
	blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
	for (i=0; i < MAX_MD_DEVS; i++)
		devfs_remove("md/%d", i);
	devfs_remove("md");

	unregister_blkdev(MAJOR_NR,"md");
	unregister_reboot_notifier(&md_notifier);
	unregister_sysctl_table(raid_table_header);
#ifdef CONFIG_PROC_FS
	remove_proc_entry("mdstat", NULL);
#endif
	for (i = 0; i < MAX_MD_DEVS; i++) {
		struct gendisk *disk = disks[i];
		mddev_t *mddev;
		if (!disks[i])
			continue;
		mddev = disk->private_data;
		del_gendisk(disk);
		put_disk(disk);
		mddev_put(mddev);
	}
}

module_init(md_init)
module_exit(md_exit)

EXPORT_SYMBOL(register_md_personality);
EXPORT_SYMBOL(unregister_md_personality);
EXPORT_SYMBOL(md_error);
EXPORT_SYMBOL(md_sync_acct);
EXPORT_SYMBOL(md_done_sync);
EXPORT_SYMBOL(md_write_start);
EXPORT_SYMBOL(md_write_end);
EXPORT_SYMBOL(md_handle_safemode);
EXPORT_SYMBOL(md_register_thread);
EXPORT_SYMBOL(md_unregister_thread);
EXPORT_SYMBOL(md_wakeup_thread);
EXPORT_SYMBOL(md_print_devices);
EXPORT_SYMBOL(md_interrupt_thread);
EXPORT_SYMBOL(md_check_recovery);
MODULE_LICENSE("GPL");