/*
   md.c : Multiple Devices driver for Linux
	  Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/module.h>
#include <linux/config.h>
#include <linux/raid/md.h>
#include <linux/sysctl.h>
#include <linux/raid/xor.h>
#include <linux/devfs_fs_kernel.h>

#include <linux/init.h>

#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>

#include <asm/unaligned.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER

#include <linux/blk.h>

#define DEBUG 0
#if DEBUG
# define dprintk(x...) printk(x)
#else
# define dprintk(x...) do { } while(0)
#endif

#ifndef MODULE
static void autostart_arrays (void);
#endif

static mdk_personality_t *pers[MAX_PERSONALITY];

/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 100 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
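 *
 * For example, to guarantee a minimum rebuild rate of 10 MB/sec even
 * on a loaded system (values are in KB/sec):
 *
 *	echo 10000 > /proc/sys/dev/raid/speed_limit_min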
 */

static int sysctl_speed_limit_min = 100;
static int sysctl_speed_limit_max = 100000;

static struct ctl_table_header *raid_table_header;

static ctl_table raid_table[] = {
	{DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min",
	 &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
	{DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max",
	 &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
	{0}
};

static ctl_table raid_dir_table[] = {
	{DEV_RAID, "raid", NULL, 0, 0555, raid_table},
	{0}
};

static ctl_table raid_root_table[] = {
	{CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table},
	{0}
};

/*
 * these have to be allocated separately because external
 * subsystems want to have a pre-defined structure
 */
struct hd_struct md_hd_struct[MAX_MD_DEVS];
static int md_blocksizes[MAX_MD_DEVS];
static int md_hardsect_sizes[MAX_MD_DEVS];
static mdk_thread_t *md_recovery_thread;

int md_size[MAX_MD_DEVS];

static struct block_device_operations md_fops;
static devfs_handle_t devfs_handle;

static struct gendisk md_gendisk =
{
	major: MD_MAJOR,
	major_name: "md",
	minor_shift: 0,
	max_p: 1,
	part: md_hd_struct,
	sizes: md_size,
	nr_real: MAX_MD_DEVS,
	real_devices: NULL,
	next: NULL,
	fops: &md_fops,
};

/*
 * Enables iteration over all existing md arrays
 */
static MD_LIST_HEAD(all_mddevs);

static mddev_t *mddev_map[MAX_MD_DEVS];

static inline mddev_t * kdev_to_mddev (kdev_t dev)
{
	if (MAJOR(dev) != MD_MAJOR)
		BUG();
	return mddev_map[MINOR(dev)];
}

static int md_fail_request (request_queue_t *q, struct bio *bio)
{
	bio_io_error(bio);
	return 0;
}

static mddev_t * alloc_mddev(kdev_t dev)
{
	mddev_t *mddev;

	if (MAJOR(dev) != MD_MAJOR) {
		MD_BUG();
		return NULL;
	}
	mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
	if (!mddev)
		return NULL;

	memset(mddev, 0, sizeof(*mddev));

	mddev->__minor = MINOR(dev);
	init_MUTEX(&mddev->reconfig_sem);
	init_MUTEX(&mddev->recovery_sem);
	init_MUTEX(&mddev->resync_sem);
	MD_INIT_LIST_HEAD(&mddev->disks);
	MD_INIT_LIST_HEAD(&mddev->all_mddevs);
	atomic_set(&mddev->active, 0);

	mddev_map[mdidx(mddev)] = mddev;
	md_list_add(&mddev->all_mddevs, &all_mddevs);

	MOD_INC_USE_COUNT;

	return mddev;
}

mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
{
	mdk_rdev_t * rdev;
	struct md_list_head *tmp;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->desc_nr == nr)
			return rdev;
	}
	return NULL;
}

mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
{
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->dev == dev)
			return rdev;
	}
	return NULL;
}

static MD_LIST_HEAD(device_names);

char * partition_name(kdev_t dev)
{
	struct gendisk *hd;
	static char nomem [] = "<nomem>";
	dev_name_t *dname;
	struct md_list_head *tmp;

	list_for_each(tmp, &device_names) {
		dname = md_list_entry(tmp, dev_name_t, list);
		if (dname->dev == dev)
			return dname->name;
	}

	dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);

	if (!dname)
		return nomem;
	/*
	 * ok, add this new device name to the list
	 */
	hd = get_gendisk (dev);
	dname->name = NULL;
	if (hd)
		dname->name = disk_name (hd, MINOR(dev), dname->namebuf);
	if (!dname->name) {
		sprintf (dname->namebuf, "[dev %s]", kdevname(dev));
		dname->name = dname->namebuf;
	}

	dname->dev = dev;
	md_list_add(&dname->list, &device_names);

	return dname->name;
}

static unsigned int calc_dev_sboffset(kdev_t dev, mddev_t *mddev,
						int persistent)
{
	unsigned int size = 0;

	if (blk_size[MAJOR(dev)])
		size = blk_size[MAJOR(dev)][MINOR(dev)];
	if (persistent)
		size = MD_NEW_SIZE_BLOCKS(size);
	return size;
}

static unsigned int calc_dev_size(kdev_t dev, mddev_t *mddev, int persistent)
{
	unsigned int size;

	size = calc_dev_sboffset(dev, mddev, persistent);
	if (!mddev->sb) {
		MD_BUG();
		return size;
	}
	if (mddev->sb->chunk_size)
		size &= ~(mddev->sb->chunk_size/1024 - 1);
	return size;
}

static unsigned int zoned_raid_size(mddev_t *mddev)
{
	unsigned int mask;
	mdk_rdev_t * rdev;
	struct md_list_head *tmp;

	if (!mddev->sb) {
		MD_BUG();
		return -EINVAL;
	}
	/*
	 * do size and offset calculations.
	 */
	mask = ~(mddev->sb->chunk_size/1024 - 1);

	ITERATE_RDEV(mddev,rdev,tmp) {
		rdev->size &= mask;
		md_size[mdidx(mddev)] += rdev->size;
	}
	return 0;
}

static void remove_descriptor(mdp_disk_t *disk, mdp_super_t *sb)
{
	if (disk_active(disk)) {
		sb->working_disks--;
	} else {
		if (disk_spare(disk)) {
			sb->spare_disks--;
			sb->working_disks--;
		} else {
			sb->failed_disks--;
		}
	}
	sb->nr_disks--;
	disk->major = 0;
	disk->minor = 0;
	mark_disk_removed(disk);
}

#define BAD_MAGIC KERN_ERR \
"md: invalid raid superblock magic on %s\n"

#define BAD_MINOR KERN_ERR \
"md: %s: invalid raid minor (%x)\n"

#define OUT_OF_MEM KERN_ALERT \
"md: out of memory.\n"

#define NO_SB KERN_ERR \
"md: disabled device %s, could not read superblock.\n"

#define BAD_CSUM KERN_WARNING \
"md: invalid superblock checksum on %s\n"

static int alloc_array_sb(mddev_t * mddev)
{
	if (mddev->sb) {
		MD_BUG();
		return 0;
	}

	mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
	if (!mddev->sb)
		return -ENOMEM;
	md_clear_page(mddev->sb);
	return 0;
}

static int alloc_disk_sb(mdk_rdev_t * rdev)
{
	if (rdev->sb)
		MD_BUG();

	rdev->sb_page = alloc_page(GFP_KERNEL);
	if (!rdev->sb_page) {
		printk(OUT_OF_MEM);
		return -EINVAL;
	}
	rdev->sb = (mdp_super_t *) page_address(rdev->sb_page);

	return 0;
}

static void free_disk_sb(mdk_rdev_t * rdev)
{
	if (rdev->sb_page) {
		page_cache_release(rdev->sb_page);
		rdev->sb = NULL;
		rdev->sb_page = NULL;
		rdev->sb_offset = 0;
		rdev->size = 0;
	} else {
		if (!rdev->faulty)
			MD_BUG();
	}
}


static void bh_complete(struct buffer_head *bh, int uptodate)
{

	if (uptodate)
		set_bit(BH_Uptodate, &bh->b_state);

	complete((struct completion*)bh->b_private);
}

static int sync_page_io(kdev_t dev, unsigned long sector, int size,
			struct page *page, int rw)
{
	struct buffer_head bh;
	struct completion event;

	init_completion(&event);
	init_buffer(&bh, bh_complete, &event);
	bh.b_rdev = dev;
	bh.b_rsector = sector;
	bh.b_state = (1 << BH_Req) | (1 << BH_Mapped) | (1 << BH_Lock);
	bh.b_size = size;
	bh.b_page = page;
	bh.b_reqnext = NULL;
	bh.b_data = page_address(page);
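	/*
	 * Submit the on-stack buffer_head, kick the disk task queue so
	 * the request is actually issued, then sleep until bh_complete()
	 * above signals the completion.
	 */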
	generic_make_request(rw, &bh);

	run_task_queue(&tq_disk);
	wait_for_completion(&event);

	return test_bit(BH_Uptodate, &bh.b_state);
}

static int read_disk_sb(mdk_rdev_t * rdev)
{
	int ret = -EINVAL;
	kdev_t dev = rdev->dev;
	unsigned long sb_offset;

	if (!rdev->sb) {
		MD_BUG();
		goto abort;
	}

	/*
	 * Calculate the position of the superblock,
	 * it's at the end of the disk
	 */
	sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1);
	rdev->sb_offset = sb_offset;

	if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) {
		printk(NO_SB,partition_name(dev));
		return -EINVAL;
	}
	printk(KERN_INFO " [events: %08lx]\n", (unsigned long)rdev->sb->events_lo);
	ret = 0;
abort:
	return ret;
}

static unsigned int calc_sb_csum(mdp_super_t * sb)
{
	unsigned int disk_csum, csum;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
	sb->sb_csum = disk_csum;
	return csum;
}

/*
 * Check one RAID superblock for generic plausibility
 */

static int check_disk_sb(mdk_rdev_t * rdev)
{
	mdp_super_t *sb;
	int ret = -EINVAL;

	sb = rdev->sb;
	if (!sb) {
		MD_BUG();
		goto abort;
	}

	if (sb->md_magic != MD_SB_MAGIC) {
		printk(BAD_MAGIC, partition_name(rdev->dev));
		goto abort;
	}

	if (sb->md_minor >= MAX_MD_DEVS) {
		printk(BAD_MINOR, partition_name(rdev->dev), sb->md_minor);
		goto abort;
	}

	if (calc_sb_csum(sb) != sb->sb_csum) {
		printk(BAD_CSUM, partition_name(rdev->dev));
		goto abort;
	}
	ret = 0;
abort:
	return ret;
}

static kdev_t dev_unit(kdev_t dev)
{
	unsigned int mask;
	struct gendisk *hd = get_gendisk(dev);

	if (!hd)
		return 0;
	mask = ~((1 << hd->minor_shift) - 1);

	return MKDEV(MAJOR(dev), MINOR(dev) & mask);
}

static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
{
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;

	ITERATE_RDEV(mddev,rdev,tmp)
		if (dev_unit(rdev->dev) == dev_unit(dev))
			return rdev;

	return NULL;
}

static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
{
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;

	ITERATE_RDEV(mddev1,rdev,tmp)
		if (match_dev_unit(mddev2, rdev->dev))
			return 1;

	return 0;
}

static MD_LIST_HEAD(all_raid_disks);
static MD_LIST_HEAD(pending_raid_disks);

static void bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
{
	mdk_rdev_t *same_pdev;

	if (rdev->mddev) {
		MD_BUG();
		return;
	}
	same_pdev = match_dev_unit(mddev, rdev->dev);
	if (same_pdev)
		printk( KERN_WARNING
"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
True\n" 522" protection against single-disk failure might be compromised.\n", 523 mdidx(mddev), partition_name(rdev->dev), 524 partition_name(same_pdev->dev)); 525 526 md_list_add(&rdev->same_set, &mddev->disks); 527 rdev->mddev = mddev; 528 printk(KERN_INFO "md: bind<%s>\n", partition_name(rdev->dev)); 529} 530 531static void unbind_rdev_from_array(mdk_rdev_t * rdev) 532{ 533 if (!rdev->mddev) { 534 MD_BUG(); 535 return; 536 } 537 list_del_init(&rdev->same_set); 538 printk(KERN_INFO "md: unbind<%s>\n", partition_name(rdev->dev)); 539 rdev->mddev = NULL; 540} 541 542/* 543 * prevent the device from being mounted, repartitioned or 544 * otherwise reused by a RAID array (or any other kernel 545 * subsystem), by opening the device. [simply getting an 546 * inode is not enough, the SCSI module usage code needs 547 * an explicit open() on the device] 548 */ 549static int lock_rdev(mdk_rdev_t *rdev) 550{ 551 int err = 0; 552 struct block_device *bdev; 553 554 bdev = bdget(rdev->dev); 555 if (!bdev) 556 return -ENOMEM; 557 err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW); 558 if (!err) 559 rdev->bdev = bdev; 560 return err; 561} 562 563static void unlock_rdev(mdk_rdev_t *rdev) 564{ 565 struct block_device *bdev = rdev->bdev; 566 rdev->bdev = NULL; 567 if (!bdev) 568 MD_BUG(); 569 blkdev_put(bdev, BDEV_RAW); 570} 571 572void md_autodetect_dev(kdev_t dev); 573 574static void export_rdev(mdk_rdev_t * rdev) 575{ 576 printk(KERN_INFO "md: export_rdev(%s)\n",partition_name(rdev->dev)); 577 if (rdev->mddev) 578 MD_BUG(); 579 unlock_rdev(rdev); 580 free_disk_sb(rdev); 581 list_del_init(&rdev->all); 582 if (!list_empty(&rdev->pending)) { 583 printk(KERN_INFO "md: (%s was pending)\n", 584 partition_name(rdev->dev)); 585 list_del_init(&rdev->pending); 586 } 587#ifndef MODULE 588 md_autodetect_dev(rdev->dev); 589#endif 590 rdev->dev = 0; 591 rdev->faulty = 0; 592 kfree(rdev); 593} 594 595static void kick_rdev_from_array(mdk_rdev_t * rdev) 596{ 597 unbind_rdev_from_array(rdev); 598 export_rdev(rdev); 599} 600 601static void export_array(mddev_t *mddev) 602{ 603 struct md_list_head *tmp; 604 mdk_rdev_t *rdev; 605 mdp_super_t *sb = mddev->sb; 606 607 if (mddev->sb) { 608 mddev->sb = NULL; 609 free_page((unsigned long) sb); 610 } 611 612 ITERATE_RDEV(mddev,rdev,tmp) { 613 if (!rdev->mddev) { 614 MD_BUG(); 615 continue; 616 } 617 kick_rdev_from_array(rdev); 618 } 619 if (!list_empty(&mddev->disks)) 620 MD_BUG(); 621} 622 623static void free_mddev(mddev_t *mddev) 624{ 625 if (!mddev) { 626 MD_BUG(); 627 return; 628 } 629 630 export_array(mddev); 631 md_size[mdidx(mddev)] = 0; 632 md_hd_struct[mdidx(mddev)].nr_sects = 0; 633 634 /* 635 * Make sure nobody else is using this mddev 636 * (careful, we rely on the global kernel lock here) 637 */ 638 while (sem_getcount(&mddev->resync_sem) != 1) 639 schedule(); 640 while (sem_getcount(&mddev->recovery_sem) != 1) 641 schedule(); 642 643 del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev))); 644 md_list_del(&mddev->all_mddevs); 645 kfree(mddev); 646 MOD_DEC_USE_COUNT; 647} 648 649#undef BAD_CSUM 650#undef BAD_MAGIC 651#undef OUT_OF_MEM 652#undef NO_SB 653 654static void print_desc(mdp_disk_t *desc) 655{ 656 printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number, 657 partition_name(MKDEV(desc->major,desc->minor)), 658 desc->major,desc->minor,desc->raid_disk,desc->state); 659} 660 661static void print_sb(mdp_super_t *sb) 662{ 663 int i; 664 665 printk(KERN_INFO "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 666 sb->major_version, sb->minor_version, 
		sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
		sb->ctime);
	printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
		sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
		sb->layout, sb->chunk_size);
	printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
		sb->utime, sb->state, sb->active_disks, sb->working_disks,
		sb->failed_disks, sb->spare_disks,
		sb->sb_csum, (unsigned long)sb->events_lo);

	printk(KERN_INFO);
	for (i = 0; i < MD_SB_DISKS; i++) {
		mdp_disk_t *desc;

		desc = sb->disks + i;
		if (desc->number || desc->major || desc->minor ||
		    desc->raid_disk || (desc->state && (desc->state != 4))) {
			printk("     D %2d: ", i);
			print_desc(desc);
		}
	}
	printk(KERN_INFO "md: THIS: ");
	print_desc(&sb->this_disk);

}

static void print_rdev(mdk_rdev_t *rdev)
{
	printk(KERN_INFO "md: rdev %s: O:%s, SZ:%08ld F:%d DN:%d ",
		partition_name(rdev->dev), partition_name(rdev->old_dev),
		rdev->size, rdev->faulty, rdev->desc_nr);
	if (rdev->sb) {
		printk(KERN_INFO "md: rdev superblock:\n");
		print_sb(rdev->sb);
	} else
		printk(KERN_INFO "md: no rdev superblock!\n");
}

void md_print_devices(void)
{
	struct md_list_head *tmp, *tmp2;
	mdk_rdev_t *rdev;
	mddev_t *mddev;

	printk("\n");
	printk("md:	**********************************\n");
	printk("md:	* <COMPLETE RAID STATE PRINTOUT> *\n");
	printk("md:	**********************************\n");
	ITERATE_MDDEV(mddev,tmp) {
		printk("md%d: ", mdidx(mddev));

		ITERATE_RDEV(mddev,rdev,tmp2)
			printk("<%s>", partition_name(rdev->dev));

		if (mddev->sb) {
			printk(" array superblock:\n");
			print_sb(mddev->sb);
		} else
			printk(" no array superblock.\n");

		ITERATE_RDEV(mddev,rdev,tmp2)
			print_rdev(rdev);
	}
	printk("md:	**********************************\n");
	printk("\n");
}

static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	int ret;
	mdp_super_t *tmp1, *tmp2;

	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);

	if (!tmp1 || !tmp2) {
		ret = 0;
		printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
		goto abort;
	}

	*tmp1 = *sb1;
	*tmp2 = *sb2;

	/*
	 * nr_disks is not constant
	 */
	tmp1->nr_disks = 0;
	tmp2->nr_disks = 0;

	if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
		ret = 0;
	else
		ret = 1;

abort:
	if (tmp1)
		kfree(tmp1);
	if (tmp2)
		kfree(tmp2);

	return ret;
}

static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
{
	if (	(rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
		(rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
		(rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
		(rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))

		return 1;

	return 0;
}

static mdk_rdev_t * find_rdev_all(kdev_t dev)
{
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;

	list_for_each(tmp, &all_raid_disks) {
		rdev = md_list_entry(tmp, mdk_rdev_t, all);
		if (rdev->dev == dev)
			return rdev;
	}
	return NULL;
}

#define GETBLK_FAILED KERN_ERR \
"md: getblk failed for device %s\n"

static int write_disk_sb(mdk_rdev_t * rdev)
{
	kdev_t dev;
	unsigned long sb_offset, size;

	if (!rdev->sb) {
		MD_BUG();
		return 1;
	}
	if (rdev->faulty) {
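		/*
		 * never write the superblock of a device we already
		 * consider faulty
		 */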
		MD_BUG();
		return 1;
	}
	if (rdev->sb->md_magic != MD_SB_MAGIC) {
		MD_BUG();
		return 1;
	}

	dev = rdev->dev;
	sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
	if (rdev->sb_offset != sb_offset) {
		printk(KERN_INFO "%s's sb offset has changed from %ld to %ld, skipping\n",
		       partition_name(dev), rdev->sb_offset, sb_offset);
		goto skip;
	}
	/*
	 * If the disk went offline meanwhile and it's just a spare, then
	 * its size has changed to zero silently, and the MD code does
	 * not yet know that it's faulty.
	 */
	size = calc_dev_size(dev, rdev->mddev, 1);
	if (size != rdev->size) {
		printk(KERN_INFO "%s's size has changed from %ld to %ld since import, skipping\n",
		       partition_name(dev), rdev->size, size);
		goto skip;
	}

	printk(KERN_INFO "(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset);

	if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) {
		printk("md: write_disk_sb failed for device %s\n", partition_name(dev));
		return 1;
	}
skip:
	return 0;
}
#undef GETBLK_FAILED

static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{
	int i, ok = 0;
	mdp_disk_t *desc;

	for (i = 0; i < MD_SB_DISKS; i++) {
		desc = mddev->sb->disks + i;
#if 0
		if (disk_faulty(desc)) {
			if (MKDEV(desc->major,desc->minor) == rdev->dev)
				ok = 1;
			continue;
		}
#endif
		if (MKDEV(desc->major,desc->minor) == rdev->dev) {
			rdev->sb->this_disk = *desc;
			rdev->desc_nr = desc->number;
			ok = 1;
			break;
		}
	}

	if (!ok) {
		MD_BUG();
	}
}

static int sync_sbs(mddev_t * mddev)
{
	mdk_rdev_t *rdev;
	mdp_super_t *sb;
	struct md_list_head *tmp;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty || rdev->alias_device)
			continue;
		sb = rdev->sb;
		*sb = *mddev->sb;
		set_this_disk(mddev, rdev);
		sb->sb_csum = calc_sb_csum(sb);
	}
	return 0;
}

int md_update_sb(mddev_t * mddev)
{
	int err, count = 100;
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;

	if (!mddev->sb_dirty) {
		printk("hm, md_update_sb() called without ->sb_dirty == 1, from %p.\n", __builtin_return_address(0));
		return 0;
	}
	mddev->sb_dirty = 0;
repeat:
	mddev->sb->utime = CURRENT_TIME;
	if ((++mddev->sb->events_lo)==0)
		++mddev->sb->events_hi;

	if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) {
		/*
		 * oops, this 64-bit counter should never wrap.
		 * Either we are in around ~1 trillion A.C., assuming
		 * 1 reboot per second, or we have a bug:
		 */
		MD_BUG();
		mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff;
	}
	sync_sbs(mddev);

	/*
	 * do not write anything to disk if using
	 * nonpersistent superblocks
	 */
	if (mddev->sb->not_persistent)
		return 0;

	printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
					mdidx(mddev));

	err = 0;
	ITERATE_RDEV(mddev,rdev,tmp) {
		printk(KERN_INFO "md: ");
		if (rdev->faulty)
			printk("(skipping faulty ");
		if (rdev->alias_device)
			printk("(skipping alias ");
		if (!rdev->faulty && disk_faulty(&rdev->sb->this_disk)) {
			printk("(skipping new-faulty %s )\n",
			       partition_name(rdev->dev));
			continue;
		}
		printk("%s ", partition_name(rdev->dev));
		if (!rdev->faulty && !rdev->alias_device) {
			printk("[events: %08lx]",
				(unsigned long)rdev->sb->events_lo);
			err += write_disk_sb(rdev);
		} else
			printk(")\n");
	}
	if (err) {
		if (--count) {
			printk(KERN_ERR "md: errors occurred during superblock update, repeating\n");
			goto repeat;
		}
		printk(KERN_ERR "md: excessive errors occurred during superblock update, exiting\n");
	}
	return 0;
}

/*
 * Import a device. If 'on_disk', then sanity check the superblock
 *
 * mark the device faulty if:
 *
 *   - the device is nonexistent (zero size)
 *   - the device has no valid superblock
 *
 */
static int md_import_device(kdev_t newdev, int on_disk)
{
	int err;
	mdk_rdev_t *rdev;
	unsigned int size;

	if (find_rdev_all(newdev))
		return -EEXIST;

	rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
	if (!rdev) {
		printk(KERN_ERR "md: could not alloc mem for %s!\n", partition_name(newdev));
		return -ENOMEM;
	}
	memset(rdev, 0, sizeof(*rdev));

	if (is_mounted(newdev)) {
		printk(KERN_WARNING "md: can not import %s, has active inodes!\n",
			partition_name(newdev));
		err = -EBUSY;
		goto abort_free;
	}

	if ((err = alloc_disk_sb(rdev)))
		goto abort_free;

	rdev->dev = newdev;
	if (lock_rdev(rdev)) {
		printk(KERN_ERR "md: could not lock %s, zero-size? Marking faulty.\n",
Marking faulty.\n", 996 partition_name(newdev)); 997 err = -EINVAL; 998 goto abort_free; 999 } 1000 rdev->desc_nr = -1; 1001 rdev->faulty = 0; 1002 1003 size = 0; 1004 if (blk_size[MAJOR(newdev)]) 1005 size = blk_size[MAJOR(newdev)][MINOR(newdev)]; 1006 if (!size) { 1007 printk(KERN_WARNING "md: %s has zero size, marking faulty!\n", 1008 partition_name(newdev)); 1009 err = -EINVAL; 1010 goto abort_free; 1011 } 1012 1013 if (on_disk) { 1014 if ((err = read_disk_sb(rdev))) { 1015 printk(KERN_WARNING "md: could not read %s's sb, not importing!\n", 1016 partition_name(newdev)); 1017 goto abort_free; 1018 } 1019 if ((err = check_disk_sb(rdev))) { 1020 printk(KERN_WARNING "md: %s has invalid sb, not importing!\n", 1021 partition_name(newdev)); 1022 goto abort_free; 1023 } 1024 1025 if (rdev->sb->level != -4) { 1026 rdev->old_dev = MKDEV(rdev->sb->this_disk.major, 1027 rdev->sb->this_disk.minor); 1028 rdev->desc_nr = rdev->sb->this_disk.number; 1029 } else { 1030 rdev->old_dev = MKDEV(0, 0); 1031 rdev->desc_nr = -1; 1032 } 1033 } 1034 md_list_add(&rdev->all, &all_raid_disks); 1035 MD_INIT_LIST_HEAD(&rdev->pending); 1036 INIT_LIST_HEAD(&rdev->same_set); 1037 1038 return 0; 1039 1040abort_free: 1041 if (rdev->sb) { 1042 if (rdev->bdev) 1043 unlock_rdev(rdev); 1044 free_disk_sb(rdev); 1045 } 1046 kfree(rdev); 1047 return err; 1048} 1049 1050/* 1051 * Check a full RAID array for plausibility 1052 */ 1053 1054#define INCONSISTENT KERN_ERR \ 1055"md: fatal superblock inconsistency in %s -- removing from array\n" 1056 1057#define OUT_OF_DATE KERN_ERR \ 1058"md: superblock update time inconsistency -- using the most recent one\n" 1059 1060#define OLD_VERSION KERN_ALERT \ 1061"md: md%d: unsupported raid array version %d.%d.%d\n" 1062 1063#define NOT_CLEAN_IGNORE KERN_ERR \ 1064"md: md%d: raid array is not clean -- starting background reconstruction\n" 1065 1066#define UNKNOWN_LEVEL KERN_ERR \ 1067"md: md%d: unsupported raid level %d\n" 1068 1069static int analyze_sbs(mddev_t * mddev) 1070{ 1071 int out_of_date = 0, i, first; 1072 struct md_list_head *tmp, *tmp2; 1073 mdk_rdev_t *rdev, *rdev2, *freshest; 1074 mdp_super_t *sb; 1075 1076 /* 1077 * Verify the RAID superblock on each real device 1078 */ 1079 ITERATE_RDEV(mddev,rdev,tmp) { 1080 if (rdev->faulty) { 1081 MD_BUG(); 1082 goto abort; 1083 } 1084 if (!rdev->sb) { 1085 MD_BUG(); 1086 goto abort; 1087 } 1088 if (check_disk_sb(rdev)) 1089 goto abort; 1090 } 1091 1092 /* 1093 * The superblock constant part has to be the same 1094 * for all disks in the array. 1095 */ 1096 sb = NULL; 1097 1098 ITERATE_RDEV(mddev,rdev,tmp) { 1099 if (!sb) { 1100 sb = rdev->sb; 1101 continue; 1102 } 1103 if (!sb_equal(sb, rdev->sb)) { 1104 printk(INCONSISTENT, partition_name(rdev->dev)); 1105 kick_rdev_from_array(rdev); 1106 continue; 1107 } 1108 } 1109 1110 /* 1111 * OK, we have all disks and the array is ready to run. Let's 1112 * find the freshest superblock, that one will be the superblock 1113 * that represents the whole array. 1114 */ 1115 if (!mddev->sb) 1116 if (alloc_array_sb(mddev)) 1117 goto abort; 1118 sb = mddev->sb; 1119 freshest = NULL; 1120 1121 ITERATE_RDEV(mddev,rdev,tmp) { 1122 __u64 ev1, ev2; 1123 /* 1124 * if the checksum is invalid, use the superblock 1125 * only as a last resort. 
		 * (decrease its age by one event)
		 */
		if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
			if (rdev->sb->events_lo || rdev->sb->events_hi)
				if ((rdev->sb->events_lo--)==0)
					rdev->sb->events_hi--;
		}

		printk(KERN_INFO "md: %s's event counter: %08lx\n",
		       partition_name(rdev->dev),
			(unsigned long)rdev->sb->events_lo);
		if (!freshest) {
			freshest = rdev;
			continue;
		}
		/*
		 * Find the newest superblock version
		 */
		ev1 = md_event(rdev->sb);
		ev2 = md_event(freshest->sb);
		if (ev1 != ev2) {
			out_of_date = 1;
			if (ev1 > ev2)
				freshest = rdev;
		}
	}
	if (out_of_date) {
		printk(OUT_OF_DATE);
		printk(KERN_INFO "md: freshest: %s\n", partition_name(freshest->dev));
	}
	memcpy (sb, freshest->sb, sizeof(*sb));

	/*
	 * at this point we have picked the 'best' superblock
	 * from all available superblocks.
	 * now we validate this superblock and kick out possibly
	 * failed disks.
	 */
	ITERATE_RDEV(mddev,rdev,tmp) {
		/*
		 * Kick all non-fresh devices
		 */
		__u64 ev1, ev2;
		ev1 = md_event(rdev->sb);
		ev2 = md_event(sb);
		++ev1;
		if (ev1 < ev2) {
			printk(KERN_WARNING "md: kicking non-fresh %s from array!\n",
						partition_name(rdev->dev));
			kick_rdev_from_array(rdev);
			continue;
		}
	}

	/*
	 * Fix up changed device names ... but only if this disk has a
	 * recent update time. Use faulty checksum ones too.
	 */
	if (mddev->sb->level != -4)
	ITERATE_RDEV(mddev,rdev,tmp) {
		__u64 ev1, ev2, ev3;
		if (rdev->faulty || rdev->alias_device) {
			MD_BUG();
			goto abort;
		}
		ev1 = md_event(rdev->sb);
		ev2 = md_event(sb);
		ev3 = ev2;
		--ev3;
		if ((rdev->dev != rdev->old_dev) &&
			((ev1 == ev2) || (ev1 == ev3))) {
			mdp_disk_t *desc;

			printk(KERN_WARNING "md: device name has changed from %s to %s since last import!\n",
			       partition_name(rdev->old_dev), partition_name(rdev->dev));
			if (rdev->desc_nr == -1) {
				MD_BUG();
				goto abort;
			}
			desc = &sb->disks[rdev->desc_nr];
			if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
				MD_BUG();
				goto abort;
			}
			desc->major = MAJOR(rdev->dev);
			desc->minor = MINOR(rdev->dev);
			desc = &rdev->sb->this_disk;
			desc->major = MAJOR(rdev->dev);
			desc->minor = MINOR(rdev->dev);
		}
	}

	/*
	 * Remove unavailable and faulty devices ...
	 *
	 * note that if an array becomes completely unrunnable due to
	 * missing devices, we do not write the superblock back, so the
	 * administrator has a chance to fix things up. The removal thus
	 * only happens if it's nonfatal to the contents of the array.
	 */
	for (i = 0; i < MD_SB_DISKS; i++) {
		int found;
		mdp_disk_t *desc;
		kdev_t dev;

		desc = sb->disks + i;
		dev = MKDEV(desc->major, desc->minor);

		/*
		 * We kick faulty devices/descriptors immediately.
		 *
		 * Note: multipath devices are a special case. Since we
		 * were able to read the superblock on the path, we don't
		 * care if it was previously marked as faulty, it's up now
		 * so enable it.
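		 *
		 * (a level of -4 in the superblock denotes a multipath
		 * set throughout this file; do_md_run() below maps it
		 * to the MULTIPATH personality)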
		 */
		if (disk_faulty(desc) && mddev->sb->level != -4) {
			found = 0;
			ITERATE_RDEV(mddev,rdev,tmp) {
				if (rdev->desc_nr != desc->number)
					continue;
				printk(KERN_WARNING "md%d: kicking faulty %s!\n",
					mdidx(mddev),partition_name(rdev->dev));
				kick_rdev_from_array(rdev);
				found = 1;
				break;
			}
			if (!found) {
				if (dev == MKDEV(0,0))
					continue;
				printk(KERN_WARNING "md%d: removing former faulty %s!\n",
					mdidx(mddev), partition_name(dev));
			}
			remove_descriptor(desc, sb);
			continue;
		} else if (disk_faulty(desc)) {
			/*
			 * multipath entry marked as faulty, unfaulty it
			 */
			rdev = find_rdev(mddev, dev);
			if (rdev)
				mark_disk_spare(desc);
			else
				remove_descriptor(desc, sb);
		}

		if (dev == MKDEV(0,0))
			continue;
		/*
		 * Is this device present in the rdev ring?
		 */
		found = 0;
		ITERATE_RDEV(mddev,rdev,tmp) {
			/*
			 * Multi-path IO special-case: since we have no
			 * this_disk descriptor at auto-detect time,
			 * we cannot check rdev->number.
			 * We can check the device though.
			 */
			if ((sb->level == -4) && (rdev->dev ==
					MKDEV(desc->major,desc->minor))) {
				found = 1;
				break;
			}
			if (rdev->desc_nr == desc->number) {
				found = 1;
				break;
			}
		}
		if (found)
			continue;

		printk(KERN_WARNING "md%d: former device %s is unavailable, removing from array!\n",
		       mdidx(mddev), partition_name(dev));
		remove_descriptor(desc, sb);
	}

	/*
	 * Double check whether all devices mentioned in the
	 * superblock are in the rdev ring.
	 */
	first = 1;
	for (i = 0; i < MD_SB_DISKS; i++) {
		mdp_disk_t *desc;
		kdev_t dev;

		desc = sb->disks + i;
		dev = MKDEV(desc->major, desc->minor);

		if (dev == MKDEV(0,0))
			continue;

		if (disk_faulty(desc)) {
			MD_BUG();
			goto abort;
		}

		rdev = find_rdev(mddev, dev);
		if (!rdev) {
			MD_BUG();
			goto abort;
		}
		/*
		 * In the case of Multipath-IO, we have no
		 * other information source to find out which
		 * disk is which, only the position of the device
		 * in the superblock:
		 */
		if (mddev->sb->level == -4) {
			if ((rdev->desc_nr != -1) && (rdev->desc_nr != i)) {
				MD_BUG();
				goto abort;
			}
			rdev->desc_nr = i;
			if (!first)
				rdev->alias_device = 1;
			else
				first = 0;
		}
	}

	/*
	 * Kick all rdevs that are not in the
	 * descriptor array:
	 */
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->desc_nr == -1)
			kick_rdev_from_array(rdev);
	}

	/*
	 * Do a final reality check.
	 */
	if (mddev->sb->level != -4) {
		ITERATE_RDEV(mddev,rdev,tmp) {
			if (rdev->desc_nr == -1) {
				MD_BUG();
				goto abort;
			}
			/*
			 * is the desc_nr unique?
			 */
			ITERATE_RDEV(mddev,rdev2,tmp2) {
				if ((rdev2 != rdev) &&
						(rdev2->desc_nr == rdev->desc_nr)) {
					MD_BUG();
					goto abort;
				}
			}
			/*
			 * is the device unique?
1377 */ 1378 ITERATE_RDEV(mddev,rdev2,tmp2) { 1379 if ((rdev2 != rdev) && 1380 (rdev2->dev == rdev->dev)) { 1381 MD_BUG(); 1382 goto abort; 1383 } 1384 } 1385 } 1386 } 1387 1388 /* 1389 * Check if we can support this RAID array 1390 */ 1391 if (sb->major_version != MD_MAJOR_VERSION || 1392 sb->minor_version > MD_MINOR_VERSION) { 1393 1394 printk(OLD_VERSION, mdidx(mddev), sb->major_version, 1395 sb->minor_version, sb->patch_version); 1396 goto abort; 1397 } 1398 1399 if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) || 1400 (sb->level == 4) || (sb->level == 5))) 1401 printk(NOT_CLEAN_IGNORE, mdidx(mddev)); 1402 1403 return 0; 1404abort: 1405 return 1; 1406} 1407 1408#undef INCONSISTENT 1409#undef OUT_OF_DATE 1410#undef OLD_VERSION 1411#undef OLD_LEVEL 1412 1413static int device_size_calculation(mddev_t * mddev) 1414{ 1415 int data_disks = 0, persistent; 1416 unsigned int readahead; 1417 mdp_super_t *sb = mddev->sb; 1418 struct md_list_head *tmp; 1419 mdk_rdev_t *rdev; 1420 1421 /* 1422 * Do device size calculation. Bail out if too small. 1423 * (we have to do this after having validated chunk_size, 1424 * because device size has to be modulo chunk_size) 1425 */ 1426 persistent = !mddev->sb->not_persistent; 1427 ITERATE_RDEV(mddev,rdev,tmp) { 1428 if (rdev->faulty) 1429 continue; 1430 if (rdev->size) { 1431 MD_BUG(); 1432 continue; 1433 } 1434 rdev->size = calc_dev_size(rdev->dev, mddev, persistent); 1435 if (rdev->size < sb->chunk_size / 1024) { 1436 printk(KERN_WARNING 1437 "md: Dev %s smaller than chunk_size: %ldk < %dk\n", 1438 partition_name(rdev->dev), 1439 rdev->size, sb->chunk_size / 1024); 1440 return -EINVAL; 1441 } 1442 } 1443 1444 switch (sb->level) { 1445 case -4: 1446 data_disks = 1; 1447 break; 1448 case -3: 1449 data_disks = 1; 1450 break; 1451 case -2: 1452 data_disks = 1; 1453 break; 1454 case -1: 1455 zoned_raid_size(mddev); 1456 data_disks = 1; 1457 break; 1458 case 0: 1459 zoned_raid_size(mddev); 1460 data_disks = sb->raid_disks; 1461 break; 1462 case 1: 1463 data_disks = 1; 1464 break; 1465 case 4: 1466 case 5: 1467 data_disks = sb->raid_disks-1; 1468 break; 1469 default: 1470 printk(UNKNOWN_LEVEL, mdidx(mddev), sb->level); 1471 goto abort; 1472 } 1473 if (!md_size[mdidx(mddev)]) 1474 md_size[mdidx(mddev)] = sb->size * data_disks; 1475 1476 readahead = MD_READAHEAD; 1477 if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) { 1478 readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks; 1479 if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2) 1480 readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2; 1481 } else { 1482 // (no multipath branch - it uses the default setting) 1483 if (sb->level == -3) 1484 readahead = 0; 1485 } 1486 1487 printk(KERN_INFO "md%d: max total readahead window set to %ldk\n", 1488 mdidx(mddev), readahead*(PAGE_SIZE/1024)); 1489 1490 printk(KERN_INFO 1491 "md%d: %d data-disks, max readahead per data-disk: %ldk\n", 1492 mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024)); 1493 return 0; 1494abort: 1495 return 1; 1496} 1497 1498 1499#define TOO_BIG_CHUNKSIZE KERN_ERR \ 1500"too big chunk_size: %d > %d\n" 1501 1502#define TOO_SMALL_CHUNKSIZE KERN_ERR \ 1503"too small chunk_size: %d < %ld\n" 1504 1505#define BAD_CHUNKSIZE KERN_ERR \ 1506"no chunksize specified, see 'man raidtab'\n" 1507 1508static int do_md_run(mddev_t * mddev) 1509{ 1510 int pnum, err; 1511 int chunk_size; 1512 struct md_list_head *tmp; 1513 mdk_rdev_t *rdev; 1514 1515 1516 if (list_empty(&mddev->disks)) { 1517 MD_BUG(); 1518 
		return -EINVAL;
	}

	if (mddev->pers)
		return -EBUSY;

	/*
	 * Resize disks to align partitions size on a given
	 * chunk size.
	 */
	md_size[mdidx(mddev)] = 0;

	/*
	 * Analyze all RAID superblock(s)
	 */
	if (analyze_sbs(mddev)) {
		MD_BUG();
		return -EINVAL;
	}

	chunk_size = mddev->sb->chunk_size;
	pnum = level_to_pers(mddev->sb->level);

	if ((pnum != MULTIPATH) && (pnum != RAID1)) {
		if (!chunk_size) {
			/*
			 * 'default chunksize' in the old md code used to
			 * be PAGE_SIZE, baaad.
			 * we abort here to be on the safe side. We don't
			 * want to continue the bad practice.
			 */
			printk(BAD_CHUNKSIZE);
			return -EINVAL;
		}
		if (chunk_size > MAX_CHUNK_SIZE) {
			printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
			return -EINVAL;
		}
		/*
		 * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
		 */
		if ( (1 << ffz(~chunk_size)) != chunk_size) {
			MD_BUG();
			return -EINVAL;
		}
		if (chunk_size < PAGE_SIZE) {
			printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
			return -EINVAL;
		}
	} else
		if (chunk_size)
			printk(KERN_INFO "md: RAID level %d does not need chunksize! Continuing anyway.\n",
			       mddev->sb->level);

	if (pnum >= MAX_PERSONALITY) {
		MD_BUG();
		return -EINVAL;
	}

	if (!pers[pnum])
	{
#ifdef CONFIG_KMOD
		char module_name[80];
		sprintf (module_name, "md-personality-%d", pnum);
		request_module (module_name);
		if (!pers[pnum])
#endif
		{
			printk(KERN_ERR "md: personality %d is not loaded!\n",
				pnum);
			return -EINVAL;
		}
	}

	if (device_size_calculation(mddev))
		return -EINVAL;

	/*
	 * Drop all container device buffers, from now on
	 * the only valid external interface is through the md
	 * device.
	 * Also find largest hardsector size
	 */
	md_hardsect_sizes[mdidx(mddev)] = 512;
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty)
			continue;
		invalidate_device(rdev->dev, 1);
		if (get_hardsect_size(rdev->dev)
			> md_hardsect_sizes[mdidx(mddev)])
			md_hardsect_sizes[mdidx(mddev)] =
				get_hardsect_size(rdev->dev);
	}
	md_blocksizes[mdidx(mddev)] = 1024;
	if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)])
		md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)];
	mddev->pers = pers[pnum];

	blk_queue_make_request(&mddev->queue, mddev->pers->make_request);
	mddev->queue.queuedata = mddev;

	err = mddev->pers->run(mddev);
	if (err) {
		printk(KERN_ERR "md: pers->run() failed ...\n");
		mddev->pers = NULL;
		return -EINVAL;
	}

	mddev->sb->state &= ~(1 << MD_SB_CLEAN);
	mddev->sb_dirty = 1;
	md_update_sb(mddev);

	/*
	 * md_size has units of 1K blocks, which are
	 * twice as large as sectors.
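	 * Hence the <<1 when the size is handed to register_disk() below.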
	 */
	md_hd_struct[mdidx(mddev)].start_sect = 0;
	register_disk(&md_gendisk, MKDEV(MAJOR_NR,mdidx(mddev)),
			1, &md_fops, md_size[mdidx(mddev)]<<1);

	read_ahead[MD_MAJOR] = 1024;
	return (0);
}

#undef TOO_BIG_CHUNKSIZE
#undef BAD_CHUNKSIZE

static int restart_array(mddev_t *mddev)
{
	int err;

	/*
	 * Complain if it has no devices
	 */
	err = -ENXIO;
	if (list_empty(&mddev->disks))
		goto out;

	if (mddev->pers) {
		err = -EBUSY;
		if (!mddev->ro)
			goto out;

		mddev->ro = 0;
		set_device_ro(mddev_to_kdev(mddev), 0);

		printk(KERN_INFO
			"md: md%d switched to read-write mode.\n", mdidx(mddev));
		/*
		 * Kick recovery or resync if necessary
		 */
		md_recover_arrays();
		if (mddev->pers->restart_resync)
			mddev->pers->restart_resync(mddev);
		err = 0;
	} else {
		printk(KERN_ERR "md: md%d has no personality assigned.\n",
			mdidx(mddev));
		err = -EINVAL;
	}

out:
	return err;
}

#define STILL_MOUNTED KERN_WARNING \
"md: md%d still mounted.\n"
#define	STILL_IN_USE \
"md: md%d still in use.\n"

static int do_md_stop(mddev_t * mddev, int ro)
{
	int err = 0, resync_interrupted = 0;
	kdev_t dev = mddev_to_kdev(mddev);

	if (atomic_read(&mddev->active)>1) {
		printk(STILL_IN_USE, mdidx(mddev));
		err = -EBUSY;
		goto out;
	}

	if (mddev->pers) {
		/*
		 * It is safe to call stop here, it only frees private
		 * data. Also, it tells us if a device is unstoppable
		 * (eg. resyncing is in progress)
		 */
		if (mddev->pers->stop_resync)
			if (mddev->pers->stop_resync(mddev))
				resync_interrupted = 1;

		if (mddev->recovery_running)
			md_interrupt_thread(md_recovery_thread);

		/*
		 * This synchronizes with signal delivery to the
		 * resync or reconstruction thread. It also nicely
		 * hangs the process if some reconstruction has not
		 * finished.
		 */
		down(&mddev->recovery_sem);
		up(&mddev->recovery_sem);

		invalidate_device(dev, 1);

		if (ro) {
			err = -ENXIO;
			if (mddev->ro)
				goto out;
			mddev->ro = 1;
		} else {
			if (mddev->ro)
				set_device_ro(dev, 0);
			if (mddev->pers->stop(mddev)) {
				err = -EBUSY;
				if (mddev->ro)
					set_device_ro(dev, 1);
				goto out;
			}
			if (mddev->ro)
				mddev->ro = 0;
		}
		if (mddev->sb) {
			/*
			 * mark it clean only if there was no resync
			 * interrupted.
			 */
			if (!mddev->recovery_running && !resync_interrupted) {
				printk(KERN_INFO "md: marking sb clean...\n");
				mddev->sb->state |= 1 << MD_SB_CLEAN;
			}
			mddev->sb_dirty = 1;
			md_update_sb(mddev);
		}
		if (ro)
			set_device_ro(dev, 1);
	}

	/*
	 * Free resources if final stop
	 */
	if (!ro) {
		printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev));
		free_mddev(mddev);
	} else
		printk(KERN_INFO "md: md%d switched to read-only mode.\n", mdidx(mddev));
	err = 0;
out:
	return err;
}

/*
 * We have to safely support old arrays too.
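 * Autostart only works with 0.90+ superblocks; detect_old_array()
 * refuses anything older (see BAD_VERSION below).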
 */
int detect_old_array(mdp_super_t *sb)
{
	if (sb->major_version > 0)
		return 0;
	if (sb->minor_version >= 90)
		return 0;

	return -EINVAL;
}


static void autorun_array(mddev_t *mddev)
{
	mdk_rdev_t *rdev;
	struct md_list_head *tmp;
	int err;

	if (list_empty(&mddev->disks)) {
		MD_BUG();
		return;
	}

	printk(KERN_INFO "md: running: ");

	ITERATE_RDEV(mddev,rdev,tmp) {
		printk("<%s>", partition_name(rdev->dev));
	}
	printk("\n");

	err = do_md_run (mddev);
	if (err) {
		printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
		/*
		 * prevent the writeback of an unrunnable array
		 */
		mddev->sb_dirty = 0;
		do_md_stop (mddev, 0);
	}
}

/*
 * let's try to run arrays based on all disks that have arrived
 * until now. (those are in the ->pending list)
 *
 * the method: pick the first pending disk, collect all disks with
 * the same UUID, remove all from the pending list and put them into
 * the 'same_array' list. Then order this list based on superblock
 * update time (freshest comes first), kick out 'old' disks and
 * compare superblocks. If everything's fine then run it.
 *
 * If "unit" is allocated, then bump its reference count
 */
static void autorun_devices(kdev_t countdev)
{
	struct md_list_head candidates;
	struct md_list_head *tmp;
	mdk_rdev_t *rdev0, *rdev;
	mddev_t *mddev;
	kdev_t md_kdev;


	printk(KERN_INFO "md: autorun ...\n");
	while (!list_empty(&pending_raid_disks)) {
		rdev0 = md_list_entry(pending_raid_disks.next,
					 mdk_rdev_t, pending);

		printk(KERN_INFO "md: considering %s ...\n", partition_name(rdev0->dev));
		MD_INIT_LIST_HEAD(&candidates);
		ITERATE_RDEV_PENDING(rdev,tmp) {
			if (uuid_equal(rdev0, rdev)) {
				if (!sb_equal(rdev0->sb, rdev->sb)) {
					printk(KERN_WARNING
					       "md: %s has same UUID as %s, but superblocks differ ...\n",
					       partition_name(rdev->dev), partition_name(rdev0->dev));
					continue;
				}
				printk(KERN_INFO "md: adding %s ...\n", partition_name(rdev->dev));
				md_list_del(&rdev->pending);
				md_list_add(&rdev->pending, &candidates);
			}
		}
		/*
		 * now we have a set of devices, with all of them having
		 * mostly sane superblocks. It's time to allocate the
		 * mddev.
		 */
		md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
		mddev = kdev_to_mddev(md_kdev);
		if (mddev) {
			printk(KERN_WARNING "md: md%d already running, cannot run %s\n",
			       mdidx(mddev), partition_name(rdev0->dev));
			ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
				export_rdev(rdev);
			continue;
		}
		mddev = alloc_mddev(md_kdev);
		if (!mddev) {
			printk(KERN_ERR "md: cannot allocate memory for md drive.\n");
			break;
		}
		if (md_kdev == countdev)
			atomic_inc(&mddev->active);
		printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
		ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
			bind_rdev_to_array(rdev, mddev);
			list_del_init(&rdev->pending);
		}
		autorun_array(mddev);
	}
	printk(KERN_INFO "md: ... autorun DONE.\n");
}

/*
 * import RAID devices based on one partition
 * if possible, the array gets run as well.
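 *
 * autostart_array() below reads the superblock of the given partition
 * and queues every device mentioned in it onto the pending list, so a
 * single known partition is enough to pull in the whole set.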
 */

#define BAD_VERSION KERN_ERR \
"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"

#define OUT_OF_MEM KERN_ALERT \
"md: out of memory.\n"

#define NO_DEVICE KERN_ERR \
"md: disabled device %s\n"

#define AUTOADD_FAILED KERN_ERR \
"md: auto-adding devices to md%d FAILED (error %d).\n"

#define AUTOADD_FAILED_USED KERN_ERR \
"md: cannot auto-add device %s to md%d, already used.\n"

#define AUTORUN_FAILED KERN_ERR \
"md: auto-running md%d FAILED (error %d).\n"

#define MDDEV_BUSY KERN_ERR \
"md: cannot auto-add to md%d, already running.\n"

#define AUTOADDING KERN_INFO \
"md: auto-adding devices to md%d, based on %s's superblock.\n"

#define AUTORUNNING KERN_INFO \
"md: auto-running md%d.\n"

static int autostart_array(kdev_t startdev, kdev_t countdev)
{
	int err = -EINVAL, i;
	mdp_super_t *sb = NULL;
	mdk_rdev_t *start_rdev = NULL, *rdev;

	if (md_import_device(startdev, 1)) {
		printk(KERN_WARNING "md: could not import %s!\n", partition_name(startdev));
		goto abort;
	}

	start_rdev = find_rdev_all(startdev);
	if (!start_rdev) {
		MD_BUG();
		goto abort;
	}
	if (start_rdev->faulty) {
		printk(KERN_WARNING "md: can not autostart based on faulty %s!\n",
						partition_name(startdev));
		goto abort;
	}
	md_list_add(&start_rdev->pending, &pending_raid_disks);

	sb = start_rdev->sb;

	err = detect_old_array(sb);
	if (err) {
		printk(KERN_WARNING "md: array version is too old to be autostarted, "
		       "use raidtools 0.90 mkraid --upgrade to upgrade the array "
		       "without data loss!\n");
		goto abort;
	}

	for (i = 0; i < MD_SB_DISKS; i++) {
		mdp_disk_t *desc;
		kdev_t dev;

		desc = sb->disks + i;
		dev = MKDEV(desc->major, desc->minor);

		if (dev == MKDEV(0,0))
			continue;
		if (dev == startdev)
			continue;
		if (md_import_device(dev, 1)) {
			printk(KERN_WARNING "md: could not import %s, trying to run array nevertheless.\n",
			       partition_name(dev));
			continue;
		}
		rdev = find_rdev_all(dev);
		if (!rdev) {
			MD_BUG();
			goto abort;
		}
		md_list_add(&rdev->pending, &pending_raid_disks);
	}

	/*
	 * possibly return codes
	 */
	autorun_devices(countdev);
	return 0;

abort:
	if (start_rdev)
		export_rdev(start_rdev);
	return err;
}

#undef BAD_VERSION
#undef OUT_OF_MEM
#undef NO_DEVICE
#undef AUTOADD_FAILED_USED
#undef AUTOADD_FAILED
#undef AUTORUN_FAILED
#undef AUTOADDING
#undef AUTORUNNING


static int get_version(void * arg)
{
	mdu_version_t ver;

	ver.major = MD_MAJOR_VERSION;
	ver.minor = MD_MINOR_VERSION;
	ver.patchlevel = MD_PATCHLEVEL_VERSION;

	if (md_copy_to_user(arg, &ver, sizeof(ver)))
		return -EFAULT;

	return 0;
}

#define SET_FROM_SB(x) info.x = mddev->sb->x
static int get_array_info(mddev_t * mddev, void * arg)
{
	mdu_array_info_t info;

	if (!mddev->sb) {
		MD_BUG();
		return -EINVAL;
	}

	SET_FROM_SB(major_version);
	SET_FROM_SB(minor_version);
	SET_FROM_SB(patch_version);
	SET_FROM_SB(ctime);
	SET_FROM_SB(level);
	SET_FROM_SB(size);
	SET_FROM_SB(nr_disks);
	SET_FROM_SB(raid_disks);
	SET_FROM_SB(md_minor);
	SET_FROM_SB(not_persistent);

	SET_FROM_SB(utime);
	SET_FROM_SB(state);
	SET_FROM_SB(active_disks);
	SET_FROM_SB(working_disks);
	SET_FROM_SB(failed_disks);
	SET_FROM_SB(spare_disks);

	SET_FROM_SB(layout);
	SET_FROM_SB(chunk_size);

	if (md_copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}
#undef SET_FROM_SB

#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
static int get_disk_info(mddev_t * mddev, void * arg)
{
	mdu_disk_info_t info;
	unsigned int nr;

	if (!mddev->sb)
		return -EINVAL;

	if (md_copy_from_user(&info, arg, sizeof(info)))
		return -EFAULT;

	nr = info.number;
	if (nr >= MD_SB_DISKS)
		return -EINVAL;

	SET_FROM_SB(major);
	SET_FROM_SB(minor);
	SET_FROM_SB(raid_disk);
	SET_FROM_SB(state);

	if (md_copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}
#undef SET_FROM_SB

#define SET_SB(x) mddev->sb->disks[nr].x = info->x

static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
{
	int err, size, persistent;
	mdk_rdev_t *rdev;
	unsigned int nr;
	kdev_t dev;
	dev = MKDEV(info->major,info->minor);

	if (find_rdev_all(dev)) {
		printk(KERN_WARNING "md: device %s already used in a RAID array!\n",
		       partition_name(dev));
		return -EBUSY;
	}
	if (!mddev->sb) {
		/* expecting a device which has a superblock */
		err = md_import_device(dev, 1);
		if (err) {
			printk(KERN_WARNING "md: md_import_device returned %d\n", err);
			return -EINVAL;
		}
		rdev = find_rdev_all(dev);
		if (!rdev) {
			MD_BUG();
			return -EINVAL;
		}
		if (!list_empty(&mddev->disks)) {
			mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next,
							mdk_rdev_t, same_set);
			if (!uuid_equal(rdev0, rdev)) {
				printk(KERN_WARNING "md: %s has different UUID to %s\n",
				       partition_name(rdev->dev), partition_name(rdev0->dev));
				export_rdev(rdev);
				return -EINVAL;
			}
			if (!sb_equal(rdev0->sb, rdev->sb)) {
				printk(KERN_WARNING "md: %s has same UUID but different superblock to %s\n",
				       partition_name(rdev->dev), partition_name(rdev0->dev));
				export_rdev(rdev);
				return -EINVAL;
			}
		}
		bind_rdev_to_array(rdev, mddev);
		return 0;
	}

	nr = info->number;
	if (nr >= mddev->sb->nr_disks) {
		MD_BUG();
		return -EINVAL;
	}


	SET_SB(number);
	SET_SB(major);
	SET_SB(minor);
	SET_SB(raid_disk);
	SET_SB(state);

	if ((info->state & (1<<MD_DISK_FAULTY))==0) {
		err = md_import_device (dev, 0);
		if (err) {
			printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err);
			return -EINVAL;
		}
		rdev = find_rdev_all(dev);
		if (!rdev) {
			MD_BUG();
			return -EINVAL;
		}

		rdev->old_dev = dev;
		rdev->desc_nr = info->number;

		bind_rdev_to_array(rdev, mddev);

		persistent = !mddev->sb->not_persistent;
		if (!persistent)
			printk(KERN_INFO "md: nonpersistent superblock ...\n");

		size = calc_dev_size(dev, mddev, persistent);
		rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);

		if (!mddev->sb->size || (mddev->sb->size > size))
			mddev->sb->size = size;
	}

	/*
	 * sync all other superblocks with the main superblock
	 */
	sync_sbs(mddev);

	return 0;
}
#undef SET_SB
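/*
 * add_new_disk() above runs in two modes: if the array has no
 * superblock yet it imports a device that already carries one
 * (assembly), otherwise it fills in a fresh descriptor from the
 * user-supplied mdu_disk_info_t and, for non-faulty disks, imports
 * and binds the device.
 */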

static int hot_generate_error(mddev_t * mddev, kdev_t dev)
{
	struct request_queue *q;
	mdk_rdev_t *rdev;
	mdp_disk_t *disk;

	if (!mddev->pers)
		return -ENODEV;

	printk(KERN_INFO "md: trying to generate %s error in md%d ... \n",
		partition_name(dev), mdidx(mddev));

	rdev = find_rdev(mddev, dev);
	if (!rdev) {
		MD_BUG();
		return -ENXIO;
	}

	if (rdev->desc_nr == -1) {
		MD_BUG();
		return -EINVAL;
	}
	disk = &mddev->sb->disks[rdev->desc_nr];
	if (!disk_active(disk))
		return -ENODEV;

	q = blk_get_queue(rdev->dev);
	if (!q) {
		MD_BUG();
		return -ENODEV;
	}
	printk(KERN_INFO "md: okay, generating error!\n");
//	q->oneshot_error = 1; // disabled for now

	return 0;
}

static int hot_remove_disk(mddev_t * mddev, kdev_t dev)
{
	int err;
	mdk_rdev_t *rdev;
	mdp_disk_t *disk;

	if (!mddev->pers)
		return -ENODEV;

	printk(KERN_INFO "md: trying to remove %s from md%d ... \n",
		partition_name(dev), mdidx(mddev));

	if (!mddev->pers->diskop) {
		printk(KERN_WARNING "md%d: personality does not support diskops!\n",
		       mdidx(mddev));
		return -EINVAL;
	}

	rdev = find_rdev(mddev, dev);
	if (!rdev)
		return -ENXIO;

	if (rdev->desc_nr == -1) {
		MD_BUG();
		return -EINVAL;
	}
	disk = &mddev->sb->disks[rdev->desc_nr];
	if (disk_active(disk))
		goto busy;

	if (disk_removed(disk))
		return -EINVAL;

	err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
	if (err == -EBUSY)
		goto busy;

	if (err) {
		MD_BUG();
		return -EINVAL;
	}

	remove_descriptor(disk, mddev->sb);
	kick_rdev_from_array(rdev);
	mddev->sb_dirty = 1;
	md_update_sb(mddev);

	return 0;
busy:
	printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n",
		partition_name(dev), mdidx(mddev));
	return -EBUSY;
}

static int hot_add_disk(mddev_t * mddev, kdev_t dev)
{
	int i, err, persistent;
	unsigned int size;
	mdk_rdev_t *rdev;
	mdp_disk_t *disk;

	if (!mddev->pers)
		return -ENODEV;

	printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n",
		partition_name(dev), mdidx(mddev));

	if (!mddev->pers->diskop) {
		printk(KERN_WARNING "md%d: personality does not support diskops!\n",
		       mdidx(mddev));
		return -EINVAL;
	}

	persistent = !mddev->sb->not_persistent;

	rdev = find_rdev(mddev, dev);
	if (rdev)
		return -EBUSY;

	err = md_import_device (dev, 0);
	if (err) {
		printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err);
		return -EINVAL;
	}
	rdev = find_rdev_all(dev);
	if (!rdev) {
		MD_BUG();
		return -EINVAL;
	}
	if (rdev->faulty) {
		printk(KERN_WARNING "md: can not hot-add faulty %s disk to md%d!\n",
		       partition_name(dev), mdidx(mddev));
		err = -EINVAL;
		goto abort_export;
	}
	size = calc_dev_size(dev, mddev, persistent);

	if (size < mddev->sb->size) {
		printk(KERN_WARNING "md%d: disk size %d blocks < array size %d\n",
		       mdidx(mddev), size, mddev->sb->size);
		err = -ENOSPC;
		goto abort_export;
	}
	bind_rdev_to_array(rdev, mddev);

	/*
	 * The rest should better be atomic, we can have disk failures
	 * noticed in interrupt contexts ...
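	 * (the descriptor fields and the spare/working counters below
	 * are updated as a group; a failure noticed halfway through
	 * would otherwise see a half-initialized spare)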

static int hot_add_disk(mddev_t * mddev, kdev_t dev)
{
	int i, err, persistent;
	unsigned int size;
	mdk_rdev_t *rdev;
	mdp_disk_t *disk;

	if (!mddev->pers)
		return -ENODEV;

	printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n",
		partition_name(dev), mdidx(mddev));

	if (!mddev->pers->diskop) {
		printk(KERN_WARNING "md%d: personality does not support diskops!\n",
		       mdidx(mddev));
		return -EINVAL;
	}

	persistent = !mddev->sb->not_persistent;

	rdev = find_rdev(mddev, dev);
	if (rdev)
		return -EBUSY;

	err = md_import_device (dev, 0);
	if (err) {
		printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err);
		return -EINVAL;
	}
	rdev = find_rdev_all(dev);
	if (!rdev) {
		MD_BUG();
		return -EINVAL;
	}
	if (rdev->faulty) {
		printk(KERN_WARNING "md: cannot hot-add faulty %s disk to md%d!\n",
		       partition_name(dev), mdidx(mddev));
		err = -EINVAL;
		goto abort_export;
	}
	size = calc_dev_size(dev, mddev, persistent);

	if (size < mddev->sb->size) {
		printk(KERN_WARNING "md%d: disk size %d blocks < array size %d\n",
		       mdidx(mddev), size, mddev->sb->size);
		err = -ENOSPC;
		goto abort_export;
	}
	bind_rdev_to_array(rdev, mddev);

	/*
	 * The rest should better be atomic, we can have disk failures
	 * noticed in interrupt contexts ...
	 */
	rdev->old_dev = dev;
	rdev->size = size;
	rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);

	disk = mddev->sb->disks + mddev->sb->raid_disks;
	for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
		disk = mddev->sb->disks + i;

		if (!disk->major && !disk->minor)
			break;
		if (disk_removed(disk))
			break;
	}
	if (i == MD_SB_DISKS) {
		printk(KERN_WARNING "md%d: cannot hot-add to full array!\n",
		       mdidx(mddev));
		err = -EBUSY;
		goto abort_unbind_export;
	}

	if (disk_removed(disk)) {
		/*
		 * reuse slot
		 */
		if (disk->number != i) {
			MD_BUG();
			err = -EINVAL;
			goto abort_unbind_export;
		}
	} else {
		disk->number = i;
	}

	disk->raid_disk = disk->number;
	disk->major = MAJOR(dev);
	disk->minor = MINOR(dev);

	if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) {
		MD_BUG();
		err = -EINVAL;
		goto abort_unbind_export;
	}

	mark_disk_spare(disk);
	mddev->sb->nr_disks++;
	mddev->sb->spare_disks++;
	mddev->sb->working_disks++;

	mddev->sb_dirty = 1;
	md_update_sb(mddev);

	/*
	 * Kick recovery, maybe this spare has to be added to the
	 * array immediately.
	 */
	md_recover_arrays();

	return 0;

abort_unbind_export:
	unbind_rdev_from_array(rdev);

abort_export:
	export_rdev(rdev);
	return err;
}

#define SET_SB(x) mddev->sb->x = info->x
static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
{

	if (alloc_array_sb(mddev))
		return -ENOMEM;

	mddev->sb->major_version = MD_MAJOR_VERSION;
	mddev->sb->minor_version = MD_MINOR_VERSION;
	mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
	mddev->sb->ctime = CURRENT_TIME;

	SET_SB(level);
	SET_SB(size);
	SET_SB(nr_disks);
	SET_SB(raid_disks);
	SET_SB(md_minor);
	SET_SB(not_persistent);

	SET_SB(state);
	SET_SB(active_disks);
	SET_SB(working_disks);
	SET_SB(failed_disks);
	SET_SB(spare_disks);

	SET_SB(layout);
	SET_SB(chunk_size);

	mddev->sb->md_magic = MD_SB_MAGIC;

	/*
	 * Generate a 128 bit UUID
	 */
	get_random_bytes(&mddev->sb->set_uuid0, 4);
	get_random_bytes(&mddev->sb->set_uuid1, 4);
	get_random_bytes(&mddev->sb->set_uuid2, 4);
	get_random_bytes(&mddev->sb->set_uuid3, 4);

	return 0;
}
#undef SET_SB

static int set_disk_faulty(mddev_t *mddev, kdev_t dev)
{
	int ret;

	ret = md_error(mddev, dev);
	return ret;
}
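
/*
 * Sketch of the ioctl-level array creation sequence that md_ioctl()
 * below implements (an illustrative assumption about the caller, in
 * the style of the raidtools):
 *
 *	int fd = open("/dev/md0", O_RDWR);
 *	mdu_array_info_t ainfo = {0};
 *	ainfo.level = 1;			(e.g. RAID1)
 *	ainfo.raid_disks = 2;
 *	...
 *	ioctl(fd, SET_ARRAY_INFO, &ainfo);	(allocates the superblock)
 *	ioctl(fd, ADD_NEW_DISK, &dinfo);	(once per component)
 *	ioctl(fd, RUN_ARRAY, 0);		(starts the personality)
 */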

static int md_ioctl(struct inode *inode, struct file *file,
			unsigned int cmd, unsigned long arg)
{
	unsigned int minor;
	int err = 0;
	struct hd_geometry *loc = (struct hd_geometry *) arg;
	mddev_t *mddev = NULL;
	kdev_t dev;

	if (!md_capable_admin())
		return -EACCES;

	dev = inode->i_rdev;
	minor = MINOR(dev);
	if (minor >= MAX_MD_DEVS) {
		MD_BUG();
		return -EINVAL;
	}

	/*
	 * Commands dealing with the RAID driver but not any
	 * particular array:
	 */
	switch (cmd)
	{
		case RAID_VERSION:
			err = get_version((void *)arg);
			goto done;

		case PRINT_RAID_DEBUG:
			err = 0;
			md_print_devices();
			goto done_unlock;

#ifndef MODULE
		case RAID_AUTORUN:
			err = 0;
			autostart_arrays();
			goto done;
#endif

		case BLKGETSIZE:
		case BLKGETSIZE64:
		case BLKRAGET:
		case BLKRASET:
		case BLKFLSBUF:
		case BLKBSZGET:
		case BLKBSZSET:
			err = blk_ioctl (dev, cmd, arg);
			goto abort;

		default:;
	}

	/*
	 * Commands creating/starting a new array:
	 */

	mddev = kdev_to_mddev(dev);

	switch (cmd)
	{
		case SET_ARRAY_INFO:
		case START_ARRAY:
			if (mddev) {
				printk(KERN_WARNING "md: array md%d already exists!\n",
				       mdidx(mddev));
				err = -EEXIST;
				goto abort;
			}
		default:;
	}
	switch (cmd)
	{
		case SET_ARRAY_INFO:
			mddev = alloc_mddev(dev);
			if (!mddev) {
				err = -ENOMEM;
				goto abort;
			}
			atomic_inc(&mddev->active);

			/*
			 * alloc_mddev() should possibly self-lock.
			 */
			err = lock_mddev(mddev);
			if (err) {
				printk(KERN_WARNING "md: ioctl, reason %d, cmd %d\n",
				       err, cmd);
				goto abort;
			}

			if (mddev->sb) {
				printk(KERN_WARNING "md: array md%d already has a superblock!\n",
					mdidx(mddev));
				err = -EBUSY;
				goto abort_unlock;
			}
			if (arg) {
				mdu_array_info_t info;
				if (md_copy_from_user(&info, (void*)arg, sizeof(info))) {
					err = -EFAULT;
					goto abort_unlock;
				}
				err = set_array_info(mddev, &info);
				if (err) {
					printk(KERN_WARNING "md: couldn't set array info. %d\n", err);
					goto abort_unlock;
				}
			}
			goto done_unlock;

		case START_ARRAY:
			/*
			 * possibly make it lock the array ...
			 */
			err = autostart_array((kdev_t)arg, dev);
			if (err) {
				printk(KERN_WARNING "md: autostart %s failed!\n",
					partition_name((kdev_t)arg));
				goto abort;
			}
			goto done;

		default:;
	}

	/*
	 * Commands querying/configuring an existing array:
	 */

	if (!mddev) {
		err = -ENODEV;
		goto abort;
	}
	err = lock_mddev(mddev);
	if (err) {
		printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n",err, cmd);
		goto abort;
	}
	/* if we don't have a superblock yet, only ADD_NEW_DISK, STOP_ARRAY or RUN_ARRAY is allowed */
	if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
		err = -ENODEV;
		goto abort_unlock;
	}

	/*
	 * Commands even a read-only array can execute:
	 */
	switch (cmd)
	{
		case GET_ARRAY_INFO:
			err = get_array_info(mddev, (void *)arg);
			goto done_unlock;

		case GET_DISK_INFO:
			err = get_disk_info(mddev, (void *)arg);
			goto done_unlock;

		case RESTART_ARRAY_RW:
			err = restart_array(mddev);
			goto done_unlock;

		case STOP_ARRAY:
			if (!(err = do_md_stop (mddev, 0)))
				mddev = NULL;
			goto done_unlock;

		case STOP_ARRAY_RO:
			err = do_md_stop (mddev, 1);
			goto done_unlock;

	/*
	 * We have a problem here : there is no easy way to give a CHS
	 * virtual geometry. We currently pretend that we have a 2 heads
	 * 4 sectors (with a BIG number of cylinders...). This drives
	 * dosfs just mad... ;-)
	 */
		case HDIO_GETGEO:
			if (!loc) {
				err = -EINVAL;
				goto abort_unlock;
			}
			err = md_put_user (2, (char *) &loc->heads);
			if (err)
				goto abort_unlock;
			err = md_put_user (4, (char *) &loc->sectors);
			if (err)
				goto abort_unlock;
			err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8,
						(short *) &loc->cylinders);
			if (err)
				goto abort_unlock;
			err = md_put_user (md_hd_struct[minor].start_sect,
						(long *) &loc->start);
			goto done_unlock;
	}

	/*
	 * The remaining ioctls are changing the state of the
	 * superblock, so we do not allow read-only arrays
	 * here:
	 */
	if (mddev->ro) {
		err = -EROFS;
		goto abort_unlock;
	}

	switch (cmd)
	{
		case ADD_NEW_DISK:
		{
			mdu_disk_info_t info;
			if (md_copy_from_user(&info, (void*)arg, sizeof(info)))
				err = -EFAULT;
			else
				err = add_new_disk(mddev, &info);
			goto done_unlock;
		}
		case HOT_GENERATE_ERROR:
			err = hot_generate_error(mddev, (kdev_t)arg);
			goto done_unlock;
		case HOT_REMOVE_DISK:
			err = hot_remove_disk(mddev, (kdev_t)arg);
			goto done_unlock;

		case HOT_ADD_DISK:
			err = hot_add_disk(mddev, (kdev_t)arg);
			goto done_unlock;

		case SET_DISK_FAULTY:
			err = set_disk_faulty(mddev, (kdev_t)arg);
			goto done_unlock;

		case RUN_ARRAY:
		{
			err = do_md_run (mddev);
			/*
			 * we have to clean up the mess if
			 * the array cannot be run for some
			 * reason ...
			 */
			if (err) {
				mddev->sb_dirty = 0;
				if (!do_md_stop (mddev, 0))
					mddev = NULL;
			}
			goto done_unlock;
		}

		default:
			printk(KERN_WARNING "md: %s(pid %d) used obsolete MD ioctl, "
			       "upgrade your software to use new ioctls.\n",
			       current->comm, current->pid);
			err = -EINVAL;
			goto abort_unlock;
	}

done_unlock:
abort_unlock:
	if (mddev)
		unlock_mddev(mddev);

	return err;
done:
	if (err)
		MD_BUG();
abort:
	return err;
}

static int md_open(struct inode *inode, struct file *file)
{
	/*
	 * Always succeed, but increment the usage count
	 */
	mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
	if (mddev)
		atomic_inc(&mddev->active);
	return (0);
}

static int md_release(struct inode *inode, struct file * file)
{
	mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
	if (mddev)
		atomic_dec(&mddev->active);
	return 0;
}

static struct block_device_operations md_fops=
{
	owner:		THIS_MODULE,
	open:		md_open,
	release:	md_release,
	ioctl:		md_ioctl,
};
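
/*
 * Sketch of the thread helper API implemented below (an illustrative
 * assumption about a caller, e.g. a RAID personality; the names
 * myraid_daemon/myraidd/conf are made up):
 *
 *	static void myraid_daemon(void *data) { ... one unit of work ... }
 *
 *	mdk_thread_t *t = md_register_thread(myraid_daemon, conf, "myraidd");
 *	...
 *	md_wakeup_thread(t);		(run myraid_daemon once)
 *	...
 *	md_unregister_thread(t);	(synchronously kill the thread)
 */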

int md_thread(void * arg)
{
	mdk_thread_t *thread = arg;

	md_lock_kernel();

	/*
	 * Detach thread
	 */

	daemonize();

	/* thread->name is not a format string */
	sprintf(current->comm, "%s", thread->name);
	md_init_signals();
	md_flush_signals();
	thread->tsk = current;

	/*
	 * md_thread is a 'system-thread', its priority should be very
	 * high. We avoid resource deadlocks individually in each
	 * raid personality. (RAID5 does preallocation) We also use RR and
	 * the very same RT priority as kswapd, thus we will never get
	 * into a priority inversion deadlock.
	 *
	 * we definitely have to have equal or higher priority than
	 * bdflush, otherwise bdflush will deadlock if there are too
	 * many dirty RAID5 blocks.
	 */
	current->policy = SCHED_OTHER;
	current->nice = -20;
	md_unlock_kernel();

	complete(thread->event);
	while (thread->run) {
		void (*run)(void *data);

		wait_event_interruptible(thread->wqueue,
					 test_bit(THREAD_WAKEUP, &thread->flags));

		clear_bit(THREAD_WAKEUP, &thread->flags);

		run = thread->run;
		if (run) {
			run(thread->data);
			run_task_queue(&tq_disk);
		}
		if (md_signal_pending(current))
			md_flush_signals();
	}
	complete(thread->event);
	return 0;
}

void md_wakeup_thread(mdk_thread_t *thread)
{
	dprintk("md: waking up MD thread %p.\n", thread);
	set_bit(THREAD_WAKEUP, &thread->flags);
	wake_up(&thread->wqueue);
}

mdk_thread_t *md_register_thread(void (*run) (void *),
				 void *data, const char *name)
{
	mdk_thread_t *thread;
	int ret;
	struct completion event;

	thread = (mdk_thread_t *) kmalloc
				(sizeof(mdk_thread_t), GFP_KERNEL);
	if (!thread)
		return NULL;

	memset(thread, 0, sizeof(mdk_thread_t));
	md_init_waitqueue_head(&thread->wqueue);

	init_completion(&event);
	thread->event = &event;
	thread->run = run;
	thread->data = data;
	thread->name = name;
	ret = kernel_thread(md_thread, thread, 0);
	if (ret < 0) {
		kfree(thread);
		return NULL;
	}
	wait_for_completion(&event);
	return thread;
}

void md_interrupt_thread(mdk_thread_t *thread)
{
	if (!thread->tsk) {
		MD_BUG();
		return;
	}
	dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
	send_sig(SIGKILL, thread->tsk, 1);
}

void md_unregister_thread(mdk_thread_t *thread)
{
	struct completion event;

	init_completion(&event);

	thread->event = &event;
	thread->run = NULL;
	thread->name = NULL;
	md_interrupt_thread(thread);
	wait_for_completion(&event);
	kfree(thread);
}

void md_recover_arrays(void)
{
	if (!md_recovery_thread) {
		MD_BUG();
		return;
	}
	md_wakeup_thread(md_recovery_thread);
}


int md_error(mddev_t *mddev, kdev_t rdev)
{
	mdk_rdev_t * rrdev;

	dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
		MD_MAJOR,mdidx(mddev),MAJOR(rdev),MINOR(rdev),
		__builtin_return_address(0),__builtin_return_address(1),
		__builtin_return_address(2),__builtin_return_address(3));

	if (!mddev) {
		MD_BUG();
		return 0;
	}
	rrdev = find_rdev(mddev, rdev);
	if (!rrdev || rrdev->faulty)
		return 0;
	if (!mddev->pers->error_handler
			|| mddev->pers->error_handler(mddev,rdev) <= 0) {
		rrdev->faulty = 1;
	} else
		return 1;
	/*
	 * if recovery was running, stop it now.
	 */
	if (mddev->pers->stop_resync)
		mddev->pers->stop_resync(mddev);
	if (mddev->recovery_running)
		md_interrupt_thread(md_recovery_thread);
	md_recover_arrays();

	return 0;
}

static void status_unused(struct seq_file *seq)
{
	int i = 0;
	mdk_rdev_t *rdev;
	struct md_list_head *tmp;

	seq_printf(seq, "unused devices: ");

	ITERATE_RDEV_ALL(rdev,tmp) {
		if (list_empty(&rdev->same_set)) {
			/*
			 * The device is not yet used by any array.
			 */
			i++;
			seq_printf(seq, "%s ",
				partition_name(rdev->dev));
		}
	}
	if (!i)
		seq_printf(seq, "<none>");

	seq_printf(seq, "\n");
}


static void status_resync(struct seq_file *seq, mddev_t * mddev)
{
	unsigned long max_blocks, resync, res, dt, db, rt;

	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
	max_blocks = mddev->sb->size;

	/*
	 * Should not happen.
	 */
	if (!max_blocks)
		MD_BUG();

	res = (resync/1024)*1000/(max_blocks/1024 + 1);
	{
		int i, x = res/50, y = 20-x;
		seq_printf(seq, "[");
		for (i = 0; i < x; i++)
			seq_printf(seq, "=");
		seq_printf(seq, ">");
		for (i = 0; i < y; i++)
			seq_printf(seq, ".");
		seq_printf(seq, "] ");
	}
	if (!mddev->recovery_running)
		/*
		 * true resync
		 */
		seq_printf(seq, " resync =%3lu.%lu%% (%lu/%lu)",
				res/10, res % 10, resync, max_blocks);
	else
		/*
		 * recovery ...
		 */
		seq_printf(seq, " recovery =%3lu.%lu%% (%lu/%lu)",
				res/10, res % 10, resync, max_blocks);

	/*
	 * We do not want to overflow, so the order of operands and
	 * the * 100 / 100 trick are important. We do a +1 to be
	 * safe against division by zero. We only estimate anyway.
	 *
	 * dt: time from mark until now
	 * db: blocks written from mark until now
	 * rt: remaining time
	 */
	dt = ((jiffies - mddev->resync_mark) / HZ);
	if (!dt) dt++;
	db = resync - (mddev->resync_mark_cnt/2);
	rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;

	seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);

	seq_printf(seq, " speed=%ldK/sec", db/dt);

}
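
/*
 * Worked example for the estimate above (made-up numbers): with
 * max_blocks = 1000000, resync = 400000, and db = 3000 blocks written
 * over dt = 10 seconds, speed is 300K/sec and
 * rt = (10 * (600000 / 31)) / 100 = 1935 seconds, printed as
 * "finish=32.2min".  Splitting the *100/100 across the expression
 * keeps the intermediate products inside an unsigned long.
 */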

static void *md_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct list_head *tmp;
	loff_t l = *pos;
	mddev_t *mddev;

	if (l > 0x10000)
		return NULL;
	if (!l--)
		/* header */
		return (void*)1;

	list_for_each(tmp,&all_mddevs)
		if (!l--) {
			mddev = list_entry(tmp, mddev_t, all_mddevs);
			return mddev;
		}
	return (void*)2; /* tail */
}

static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct list_head *tmp;
	mddev_t *next_mddev, *mddev = v;

	++*pos;
	if (v == (void*)2)
		return NULL;

	if (v == (void*)1)
		tmp = all_mddevs.next;
	else
		tmp = mddev->all_mddevs.next;
	if (tmp != &all_mddevs)
		next_mddev = list_entry(tmp,mddev_t,all_mddevs);
	else {
		next_mddev = (void*)2;
		*pos = 0x10000;
	}

	return next_mddev;

}

static void md_seq_stop(struct seq_file *seq, void *v)
{

}

static int md_seq_show(struct seq_file *seq, void *v)
{
	int j, size;
	struct md_list_head *tmp2;
	mdk_rdev_t *rdev;
	mddev_t *mddev = v;

	if (v == (void*)1) {
		seq_printf(seq, "Personalities : ");
		for (j = 0; j < MAX_PERSONALITY; j++)
			if (pers[j])
				seq_printf(seq, "[%s] ", pers[j]->name);

		seq_printf(seq, "\n");
		seq_printf(seq, "read_ahead ");
		if (read_ahead[MD_MAJOR] == INT_MAX)
			seq_printf(seq, "not set\n");
		else
			seq_printf(seq, "%d sectors\n", read_ahead[MD_MAJOR]);
		return 0;
	}
	if (v == (void*)2) {
		status_unused(seq);
		return 0;
	}

	seq_printf(seq, "md%d : %sactive", mdidx(mddev),
			mddev->pers ? "" : "in");
	if (mddev->pers) {
		if (mddev->ro)
			seq_printf(seq, " (read-only)");
		seq_printf(seq, " %s", mddev->pers->name);
	}

	size = 0;
	ITERATE_RDEV(mddev,rdev,tmp2) {
		seq_printf(seq, " %s[%d]",
			partition_name(rdev->dev), rdev->desc_nr);
		if (rdev->faulty) {
			seq_printf(seq, "(F)");
			continue;
		}
		size += rdev->size;
	}

	if (!list_empty(&mddev->disks)) {
		if (mddev->pers)
			seq_printf(seq, "\n      %d blocks",
					md_size[mdidx(mddev)]);
		else
			seq_printf(seq, "\n      %d blocks", size);
	}

	if (mddev->pers) {

		mddev->pers->status (seq, mddev);

		seq_printf(seq, "\n      ");
		if (mddev->curr_resync) {
			status_resync (seq, mddev);
		} else {
			if (sem_getcount(&mddev->resync_sem) != 1)
				seq_printf(seq, " resync=DELAYED");
		}
	}
	seq_printf(seq, "\n");

	return 0;
}


static struct seq_operations md_seq_ops = {
	.start = md_seq_start,
	.next = md_seq_next,
	.stop = md_seq_stop,
	.show = md_seq_show,
};

static int md_seq_open(struct inode *inode, struct file *file)
{
	int error;

	error = seq_open(file, &md_seq_ops);
	return error;
}

static struct file_operations md_seq_fops = {
	.open = md_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release,
};


int register_md_personality(int pnum, mdk_personality_t *p)
{
	if (pnum >= MAX_PERSONALITY) {
		MD_BUG();
		return -EINVAL;
	}

	if (pers[pnum]) {
		MD_BUG();
		return -EBUSY;
	}

	pers[pnum] = p;
	printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
	return 0;
}

int unregister_md_personality(int pnum)
{
	if (pnum >= MAX_PERSONALITY) {
		MD_BUG();
		return -EINVAL;
	}

	printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
	pers[pnum] = NULL;
	return 0;
}

mdp_disk_t *get_spare(mddev_t *mddev)
{
	mdp_super_t *sb = mddev->sb;
	mdp_disk_t *disk;
	mdk_rdev_t *rdev;
	struct md_list_head *tmp;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty)
			continue;
		if (!rdev->sb) {
			MD_BUG();
			continue;
		}
		disk = &sb->disks[rdev->desc_nr];
		if (disk_faulty(disk)) {
			MD_BUG();
			continue;
		}
		if (disk_active(disk))
			continue;
		return disk;
	}
	return NULL;
}

static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK];
void md_sync_acct(kdev_t dev, unsigned long nr_sectors)
{
	unsigned int major = MAJOR(dev);
	unsigned int index;

	index = disk_index(dev);
	if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
		return;

	sync_io[major][index] += nr_sectors;
}

static int is_mddev_idle(mddev_t *mddev)
{
	mdk_rdev_t * rdev;
	struct md_list_head *tmp;
	int idle;
	unsigned long curr_events;

	idle = 1;
	ITERATE_RDEV(mddev,rdev,tmp) {
		int major = MAJOR(rdev->dev);
		int idx = disk_index(rdev->dev);

		if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
			continue;

		curr_events = kstat.dk_drive_rblk[major][idx] +
						kstat.dk_drive_wblk[major][idx] ;
		curr_events -= sync_io[major][idx];
		if ((curr_events - rdev->last_events) > 32) {
			rdev->last_events = curr_events;
			idle = 0;
		}
	}
	return idle;
}

MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait);

void md_done_sync(mddev_t *mddev, int blocks, int ok)
{
	/* another "blocks" (512byte) blocks have been synced */
	atomic_sub(blocks, &mddev->recovery_active);
	wake_up(&mddev->recovery_wait);
	if (!ok) {
		// stop recovery, signal do_sync ....
		if (mddev->pers->stop_resync)
			mddev->pers->stop_resync(mddev);
		if (mddev->recovery_running)
			md_interrupt_thread(md_recovery_thread);
	}
}

#define SYNC_MARKS	10
#define	SYNC_MARK_STEP	(3*HZ)
int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
{
	mddev_t *mddev2;
	unsigned int max_sectors, currspeed,
		j, window, err, serialize;
	unsigned long mark[SYNC_MARKS];
	unsigned long mark_cnt[SYNC_MARKS];
	int last_mark,m;
	struct md_list_head *tmp;
	unsigned long last_check;


	err = down_interruptible(&mddev->resync_sem);
	if (err)
		goto out_nolock;

recheck:
	serialize = 0;
	ITERATE_MDDEV(mddev2,tmp) {
		if (mddev2 == mddev)
			continue;
		if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
			printk(KERN_INFO "md: delaying resync of md%d until md%d "
			       "has finished resync (they share one or more physical units)\n",
			       mdidx(mddev), mdidx(mddev2));
			serialize = 1;
			break;
		}
	}
	if (serialize) {
		interruptible_sleep_on(&resync_wait);
		if (md_signal_pending(current)) {
			md_flush_signals();
			err = -EINTR;
			goto out;
		}
		goto recheck;
	}

	mddev->curr_resync = 1;

	max_sectors = mddev->sb->size<<1;

	printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
	printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
		sysctl_speed_limit_min);
	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
	       "(but not more than %d KB/sec) for reconstruction.\n",
	       sysctl_speed_limit_max);

	/*
	 * Resync has low priority.
	 */
	current->nice = 19;

	is_mddev_idle(mddev); /* this also initializes IO event counters */
	for (m = 0; m < SYNC_MARKS; m++) {
		mark[m] = jiffies;
		mark_cnt[m] = 0;
	}
	last_mark = 0;
	mddev->resync_mark = mark[last_mark];
	mddev->resync_mark_cnt = mark_cnt[last_mark];

	/*
	 * Tune reconstruction:
	 */
	window = vm_max_readahead*(PAGE_SIZE/512);
	printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",
	       window/2, max_sectors/2);

	atomic_set(&mddev->recovery_active, 0);
	init_waitqueue_head(&mddev->recovery_wait);
	last_check = 0;
	for (j = 0; j < max_sectors;) {
		int sectors;

		sectors = mddev->pers->sync_request(mddev, j);

		if (sectors < 0) {
			err = sectors;
			goto out;
		}
		atomic_add(sectors, &mddev->recovery_active);
		j += sectors;
		mddev->curr_resync = j;

		if (last_check + window > j)
			continue;

		last_check = j;

		run_task_queue(&tq_disk);

	repeat:
		if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
			/* step marks */
			int next = (last_mark+1) % SYNC_MARKS;

			mddev->resync_mark = mark[next];
			mddev->resync_mark_cnt = mark_cnt[next];
			mark[next] = jiffies;
			mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
			last_mark = next;
		}


		if (md_signal_pending(current)) {
			/*
			 * got a signal, exit.
			 */
			mddev->curr_resync = 0;
			printk(KERN_INFO "md: md_do_sync() got signal ... exiting\n");
			md_flush_signals();
			err = -EINTR;
			goto out;
		}

		/*
		 * this loop exits only when we are either slower than
		 * the 'hard' speed limit, or the system was IO-idle for
		 * a jiffy.
		 * the system might be non-idle CPU-wise, but we only care
		 * about not overloading the IO subsystem. (things like an
		 * e2fsck being done on the RAID array should execute fast)
		 */
		if (md_need_resched(current))
			schedule();

		currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;

		if (currspeed > sysctl_speed_limit_min) {
			current->nice = 19;

			if ((currspeed > sysctl_speed_limit_max) ||
					!is_mddev_idle(mddev)) {
				current->state = TASK_INTERRUPTIBLE;
				md_schedule_timeout(HZ/4);
				goto repeat;
			}
		} else
			current->nice = -20;
	}
	printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
	err = 0;
	/*
	 * this also signals 'finished resyncing' to md_stop
	 */
out:
	wait_disk_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0);
	up(&mddev->resync_sem);
out_nolock:
	mddev->curr_resync = 0;
	wake_up(&resync_wait);
	return err;
}
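
/*
 * Worked example of the throttling above (numbers are made up): if the
 * current mark window shows 51200 sectors synced with
 * (jiffies - resync_mark)/HZ = 2, then
 * currspeed = 51200/2/(2+1) + 1 = 8534 KB/sec.  With the default
 * limits (100 and 100000 KB/sec) this is above speed_limit_min, so
 * resync stays at nice 19, and it is below speed_limit_max, so it only
 * backs off (the HZ/4 sleep) when the array's disks see other IO.
 */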

/*
 * This is a kernel thread which syncs a spare disk with the active array
 *
 * the amount of foolproofing might seem to be a tad excessive, but an
 * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
 * of my root partition with the first 0.5 gigs of my /home partition ... so
 * i'm a bit nervous ;)
 */
void md_do_recovery(void *data)
{
	int err;
	mddev_t *mddev;
	mdp_super_t *sb;
	mdp_disk_t *spare;
	struct md_list_head *tmp;

	printk(KERN_INFO "md: recovery thread got woken up ...\n");
restart:
	ITERATE_MDDEV(mddev,tmp) {
		sb = mddev->sb;
		if (!sb)
			continue;
		if (mddev->recovery_running)
			continue;
		if (sb->active_disks == sb->raid_disks)
			continue;
		if (mddev->sb_dirty)
			md_update_sb(mddev);
		if (!sb->spare_disks) {
			printk(KERN_ERR "md%d: no spare disk to reconstruct array! "
			       "-- continuing in degraded mode\n", mdidx(mddev));
			continue;
		}
		/*
		 * now here we get the spare and resync it.
		 */
		spare = get_spare(mddev);
		if (!spare)
			continue;
		printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n",
		       mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
		if (!mddev->pers->diskop)
			continue;
		if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
			continue;
		down(&mddev->recovery_sem);
		mddev->recovery_running = 1;
		err = md_do_sync(mddev, spare);
		if (err == -EIO) {
			printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n",
			       mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
			if (!disk_faulty(spare)) {
				mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
				mark_disk_faulty(spare);
				mark_disk_nonsync(spare);
				mark_disk_inactive(spare);
				sb->spare_disks--;
				sb->working_disks--;
				sb->failed_disks++;
			}
		} else
			if (disk_faulty(spare))
				mddev->pers->diskop(mddev, &spare,
						DISKOP_SPARE_INACTIVE);
		if (err == -EINTR || err == -ENOMEM) {
			/*
			 * Recovery got interrupted, or ran out of mem ...
			 * signal back that we have finished using the array.
			 */
			mddev->pers->diskop(mddev, &spare,
					DISKOP_SPARE_INACTIVE);
			up(&mddev->recovery_sem);
			mddev->recovery_running = 0;
			continue;
		} else {
			mddev->recovery_running = 0;
			up(&mddev->recovery_sem);
		}
		if (!disk_faulty(spare)) {
			/*
			 * the SPARE_ACTIVE diskop possibly changes the
			 * pointer too
			 */
			mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
			mark_disk_sync(spare);
			mark_disk_active(spare);
			sb->active_disks++;
			sb->spare_disks--;
		}
		mddev->sb_dirty = 1;
		md_update_sb(mddev);
		goto restart;
	}
	printk(KERN_INFO "md: recovery thread finished ...\n");

}

int md_notify_reboot(struct notifier_block *this,
					unsigned long code, void *x)
{
	struct md_list_head *tmp;
	mddev_t *mddev;

	if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
					|| (code == MD_SYS_POWER_OFF)) {

		printk(KERN_INFO "md: stopping all md devices.\n");

		ITERATE_MDDEV(mddev,tmp)
			do_md_stop (mddev, 1);
		/*
		 * certain more exotic SCSI devices are known to be
		 * volatile wrt too early system reboots. While the
		 * right place to handle this issue is the given
		 * driver, we do want to have a safe RAID driver ...
		 */
		md_mdelay(1000*1);
	}
	return NOTIFY_DONE;
}

struct notifier_block md_notifier = {
	notifier_call:	md_notify_reboot,
	next:		NULL,
	priority:	INT_MAX, /* before any real devices */
};
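
/*
 * Note on the INT_MAX priority above: notifier chains are run in
 * descending priority order, so md gets to quiesce and write out its
 * superblocks before the underlying disk drivers see the reboot event.
 */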

static void md_geninit(void)
{
	struct proc_dir_entry *p;
	int i;

	for(i = 0; i < MAX_MD_DEVS; i++) {
		md_blocksizes[i] = 1024;
		md_size[i] = 0;
		md_hardsect_sizes[i] = 512;
	}
	blksize_size[MAJOR_NR] = md_blocksizes;
	blk_size[MAJOR_NR] = md_size;
	max_readahead[MAJOR_NR] = md_maxreadahead;
	hardsect_size[MAJOR_NR] = md_hardsect_sizes;

	dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));

#ifdef CONFIG_PROC_FS
	p = create_proc_entry("mdstat", S_IRUGO, NULL);
	if (p)
		p->proc_fops = &md_seq_fops;
#endif
}

request_queue_t * md_queue_proc(kdev_t dev)
{
	mddev_t *mddev = kdev_to_mddev(dev);
	if (mddev == NULL)
		return BLK_DEFAULT_QUEUE(MAJOR_NR);
	else
		return &mddev->queue;
}

int md__init md_init(void)
{
	static char * name = "mdrecoveryd";
	int minor;

	printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d, MD_SB_DISKS=%d\n",
			MD_MAJOR_VERSION, MD_MINOR_VERSION,
			MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);

	if (devfs_register_blkdev (MAJOR_NR, "md", &md_fops))
	{
		printk(KERN_ALERT "md: Unable to get major %d for md\n", MAJOR_NR);
		return (-1);
	}
	devfs_handle = devfs_mk_dir (NULL, "md", NULL);
	/* we don't use devfs_register_series because we want to fill md_hd_struct */
	for (minor=0; minor < MAX_MD_DEVS; ++minor) {
		char devname[128];
		sprintf (devname, "%u", minor);
		md_hd_struct[minor].de = devfs_register (devfs_handle,
			devname, DEVFS_FL_DEFAULT, MAJOR_NR, minor,
			S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL);
	}

	/* all requests on an uninitialised device get failed... */
	blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_fail_request);
	blk_dev[MAJOR_NR].queue = md_queue_proc;


	read_ahead[MAJOR_NR] = INT_MAX;

	add_gendisk(&md_gendisk);

	md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
	if (!md_recovery_thread)
		printk(KERN_ALERT "md: bug: couldn't allocate md_recovery_thread\n");

	md_register_reboot_notifier(&md_notifier);
	raid_table_header = register_sysctl_table(raid_root_table, 1);

	md_geninit();
	return (0);
}


#ifndef MODULE

/*
 * When md (and any required personalities) are compiled into the kernel
 * (not as a module), arrays can be assembled at boot time using AUTODETECT,
 * where specially marked partitions are registered with md_autodetect_dev(),
 * and with MD_BOOT, where the devices to be collected are given on the boot
 * line with md=.....
 * The code for that is here.
 */

struct {
	int set;
	int noautodetect;
} raid_setup_args md__initdata;

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */
static kdev_t detected_devices[128];
static int dev_cnt;

void md_autodetect_dev(kdev_t dev)
{
	if (dev_cnt >= 0 && dev_cnt < 127)
		detected_devices[dev_cnt++] = dev;
}


static void autostart_arrays(void)
{
	mdk_rdev_t *rdev;
	int i;

	printk(KERN_INFO "md: Autodetecting RAID arrays.\n");

	for (i = 0; i < dev_cnt; i++) {
		kdev_t dev = detected_devices[i];

		if (md_import_device(dev,1)) {
			printk(KERN_ALERT "md: could not import %s!\n",
				partition_name(dev));
			continue;
		}
		/*
		 * Sanity checks:
		 */
		rdev = find_rdev_all(dev);
		if (!rdev) {
			MD_BUG();
			continue;
		}
		if (rdev->faulty) {
			MD_BUG();
			continue;
		}
		md_list_add(&rdev->pending, &pending_raid_disks);
	}
	dev_cnt = 0;

	autorun_devices(-1);
}
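
/*
 * Autodetection note (context, not new behaviour): the partition-table
 * code calls md_autodetect_dev() above for partitions whose type is
 * 0xfd ("Linux raid autodetect"); RAID_AUTORUN or md_run_setup() then
 * imports them via autostart_arrays().
 */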

static struct {
	char device_set [MAX_MD_DEVS];
	int pers[MAX_MD_DEVS];
	int chunk[MAX_MD_DEVS];
	char *device_names[MAX_MD_DEVS];
} md_setup_args md__initdata;

/*
 * Parse the command-line parameters given our kernel, but do not
 * actually try to invoke the MD device now; that is handled by
 * md_setup_drive after the low-level disk drivers have initialised.
 *
 * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
 *             assigns the task of parsing integer arguments to the
 *             invoked program now).  Added ability to initialise all
 *             the MD devices (by specifying multiple "md=" lines)
 *             instead of just one.  -- KTK
 * 18May2000: Added support for persistent-superblock arrays:
 *    md=n,0,factor,fault,device-list   uses RAID0 for device n
 *    md=n,-1,factor,fault,device-list  uses LINEAR for device n
 *    md=n,device-list      reads a RAID superblock from the devices
 *    elements in device-list are read by name_to_kdev_t so can be
 *    a hex number or something like /dev/hda1 /dev/sdb
 * 2001-06-03: Dave Cinege <dcinege@psychosis.com>
 *	Shifted name_to_kdev_t() and related operations to md_set_drive()
 *	for later execution. Rewrote section to make devfs compatible.
 */
static int md__init md_setup(char *str)
{
	int minor, level, factor, fault;
	char *pername = "";
	char *str1 = str;

	if (get_option(&str, &minor) != 2) {	/* MD Number */
		printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
		return 0;
	}
	if (minor >= MAX_MD_DEVS) {
		printk(KERN_WARNING "md: md=%d, Minor device number too high.\n", minor);
		return 0;
	} else if (md_setup_args.device_names[minor]) {
		printk(KERN_WARNING "md: md=%d, Specified more than once. "
		       "Replacing previous definition.\n", minor);
	}
	switch (get_option(&str, &level)) {	/* RAID Personality */
	case 2: /* could be 0 or -1.. */
		if (level == 0 || level == -1) {
			if (get_option(&str, &factor) != 2 ||	/* Chunk Size */
					get_option(&str, &fault) != 2) {
				printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
				return 0;
			}
			md_setup_args.pers[minor] = level;
			md_setup_args.chunk[minor] = 1 << (factor+12);
			switch(level) {
			case -1:
				level = LINEAR;
				pername = "linear";
				break;
			case 0:
				level = RAID0;
				pername = "raid0";
				break;
			default:
				printk(KERN_WARNING
				       "md: The kernel has not been configured for raid%d support!\n",
				       level);
				return 0;
			}
			md_setup_args.pers[minor] = level;
			break;
		}
		/* FALL THROUGH */
	case 1: /* the first device is numeric */
		str = str1;
		/* FALL THROUGH */
	case 0:
		md_setup_args.pers[minor] = 0;
		pername="super-block";
	}

	printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n",
		minor, pername, str);
	md_setup_args.device_names[minor] = str;

	return 1;
}
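
/*
 * Example boot lines for the parser above (device names are
 * illustrative):
 *
 *	md=0,/dev/sda1,/dev/sdb1	 assemble md0 from superblocks
 *	md=1,0,4,0,/dev/sdc1,/dev/sdd1	 md1 as RAID0, 64k chunks
 *					 (chunk = 1 << (4+12) bytes)
 *	md=2,-1,4,0,/dev/sde1,/dev/sdf1	 md2 as LINEAR
 */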

extern kdev_t name_to_kdev_t(char *line) md__init;
void md__init md_setup_drive(void)
{
	int minor, i;
	kdev_t dev;
	mddev_t *mddev;
	kdev_t devices[MD_SB_DISKS+1];

	for (minor = 0; minor < MAX_MD_DEVS; minor++) {
		int err = 0;
		char *devname;
		mdu_disk_info_t dinfo;

		if ((devname = md_setup_args.device_names[minor]) == 0)
			continue;

		for (i = 0; i < MD_SB_DISKS && devname != 0; i++) {

			char *p;
			void *handle;

			p = strchr(devname, ',');
			if (p)
				*p++ = 0;

			dev = name_to_kdev_t(devname);
			handle = devfs_find_handle(NULL, devname, MAJOR (dev), MINOR (dev),
							DEVFS_SPECIAL_BLK, 1);
			if (handle != 0) {
				unsigned major, minor;
				devfs_get_maj_min(handle, &major, &minor);
				dev = MKDEV(major, minor);
			}
			if (dev == 0) {
				printk(KERN_WARNING "md: Unknown device name: %s\n", devname);
				break;
			}

			devices[i] = dev;
			md_setup_args.device_set[minor] = 1;

			devname = p;
		}
		devices[i] = 0;

		if (md_setup_args.device_set[minor] == 0)
			continue;

		if (mddev_map[minor]) {
			printk(KERN_WARNING
			       "md: Ignoring md=%d, already autodetected. (Use raid=noautodetect)\n",
			       minor);
			continue;
		}
		printk(KERN_INFO "md: Loading md%d: %s\n", minor, md_setup_args.device_names[minor]);

		mddev = alloc_mddev(MKDEV(MD_MAJOR,minor));
		if (!mddev) {
			printk(KERN_ERR "md: kmalloc failed - cannot start array %d\n", minor);
			continue;
		}
		if (md_setup_args.pers[minor]) {
			/* non-persistent */
			mdu_array_info_t ainfo;
			ainfo.level = pers_to_level(md_setup_args.pers[minor]);
			ainfo.size = 0;
			ainfo.nr_disks = 0;
			ainfo.raid_disks = 0;
			ainfo.md_minor = minor;
			ainfo.not_persistent = 1;

			ainfo.state = (1 << MD_SB_CLEAN);
			ainfo.active_disks = 0;
			ainfo.working_disks = 0;
			ainfo.failed_disks = 0;
			ainfo.spare_disks = 0;
			ainfo.layout = 0;
			ainfo.chunk_size = md_setup_args.chunk[minor];
			err = set_array_info(mddev, &ainfo);
			for (i = 0; !err && (dev = devices[i]); i++) {
				dinfo.number = i;
				dinfo.raid_disk = i;
				dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC);
				dinfo.major = MAJOR(dev);
				dinfo.minor = MINOR(dev);
				mddev->sb->nr_disks++;
				mddev->sb->raid_disks++;
				mddev->sb->active_disks++;
				mddev->sb->working_disks++;
				err = add_new_disk (mddev, &dinfo);
			}
		} else {
			/* persistent */
			for (i = 0; (dev = devices[i]); i++) {
				dinfo.major = MAJOR(dev);
				dinfo.minor = MINOR(dev);
				add_new_disk (mddev, &dinfo);
			}
		}
		if (!err)
			err = do_md_run(mddev);
		if (err) {
			mddev->sb_dirty = 0;
			do_md_stop(mddev, 0);
			printk(KERN_WARNING "md: starting md%d failed\n", minor);
		}
	}
}

static int md__init raid_setup(char *str)
{
	int len, pos;

	len = strlen(str) + 1;
	pos = 0;

	while (pos < len) {
		char *comma = strchr(str+pos, ',');
		int wlen;
		if (comma)
			wlen = (comma-str)-pos;
		else
			wlen = (len-1)-pos;

		/* compare the current word, not the start of the string */
		if (strncmp(str+pos, "noautodetect", wlen) == 0)
			raid_setup_args.noautodetect = 1;
		pos += wlen+1;
	}
	raid_setup_args.set = 1;
	return 1;
}

int md__init md_run_setup(void)
{
	if (raid_setup_args.noautodetect)
		printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=noautodetect)\n");
	else
		autostart_arrays();
	md_setup_drive();
	return 0;
}

__setup("raid=", raid_setup);
__setup("md=", md_setup);

__initcall(md_init);
__initcall(md_run_setup);

#else /* It is a MODULE */

int init_module(void)
{
	return md_init();
}

static void free_device_names(void)
{
	while (!list_empty(&device_names)) {
		dev_name_t *tmp = list_entry(device_names.next,
						dev_name_t, list);
		list_del(&tmp->list);
		kfree(tmp);
	}
}


void cleanup_module(void)
{
	md_unregister_thread(md_recovery_thread);
	devfs_unregister(devfs_handle);

	devfs_unregister_blkdev(MAJOR_NR,"md");
	unregister_reboot_notifier(&md_notifier);
	unregister_sysctl_table(raid_table_header);
#ifdef CONFIG_PROC_FS
	remove_proc_entry("mdstat", NULL);
#endif

	del_gendisk(&md_gendisk);

	blk_dev[MAJOR_NR].queue = NULL;
	blksize_size[MAJOR_NR] = NULL;
	blk_size[MAJOR_NR] = NULL;
	max_readahead[MAJOR_NR] = NULL;
	hardsect_size[MAJOR_NR] = NULL;

	free_device_names();

}
#endif

MD_EXPORT_SYMBOL(md_size);
MD_EXPORT_SYMBOL(register_md_personality);
MD_EXPORT_SYMBOL(unregister_md_personality);
MD_EXPORT_SYMBOL(partition_name);
MD_EXPORT_SYMBOL(md_error);
MD_EXPORT_SYMBOL(md_do_sync);
MD_EXPORT_SYMBOL(md_sync_acct);
MD_EXPORT_SYMBOL(md_done_sync);
MD_EXPORT_SYMBOL(md_recover_arrays);
MD_EXPORT_SYMBOL(md_register_thread);
MD_EXPORT_SYMBOL(md_unregister_thread);
MD_EXPORT_SYMBOL(md_update_sb);
MD_EXPORT_SYMBOL(md_wakeup_thread);
MD_EXPORT_SYMBOL(md_print_devices);
MD_EXPORT_SYMBOL(find_rdev_nr);
MD_EXPORT_SYMBOL(md_interrupt_thread);
MD_EXPORT_SYMBOL(mddev_map);
MODULE_LICENSE("GPL");