1@@ -1,3674 +1,101 @@ 2-/* 3- md.c : Multiple Devices driver for Linux 4- Copyright (C) 1998, 1999, 2000 Ingo Molnar 5- 6- completely rewritten, based on the MD driver code from Marc Zyngier 7- 8- Changes: 9- 10- - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar 11- - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> 12- - kerneld support by Boris Tobotras <boris@xtalk.msk.su> 13- - kmod support by: Cyrus Durgin 14- - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> 15- - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> 16- 17- - lots of fixes and improvements to the RAID1/RAID5 and generic 18- RAID code (such as request based resynchronization): 19- 20- Neil Brown <neilb@cse.unsw.edu.au>. 21- 22- This program is free software; you can redistribute it and/or modify 23- it under the terms of the GNU General Public License as published by 24- the Free Software Foundation; either version 2, or (at your option) 25- any later version. 26- 27- You should have received a copy of the GNU General Public License 28- (for example /usr/src/linux/COPYING); if not, write to the Free 29- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 30-*/ 31- 32-#include <linux/module.h> 33-#include <linux/config.h> 34-#include <linux/linkage.h> 35-#include <linux/raid/md.h> 36-#include <linux/sysctl.h> 37-#include <linux/bio.h> 38-#include <linux/devfs_fs_kernel.h> 39-#include <linux/buffer_head.h> /* for invalidate_bdev */ 40-#include <linux/suspend.h> 41- 42-#include <linux/init.h> 43- 44-#ifdef CONFIG_KMOD 45-#include <linux/kmod.h> 46-#endif 47- 48-#define __KERNEL_SYSCALLS__ 49-#include <linux/unistd.h> 50- 51-#include <asm/unaligned.h> 52- 53-#define MAJOR_NR MD_MAJOR 54-#define MD_DRIVER 55-#define DEVICE_NR(device) (minor(device)) 56- 57-#include <linux/blk.h> 58- 59-#define DEBUG 0 60-#define dprintk(x...) ((void)(DEBUG && printk(x))) 61- 62- 63-#ifndef MODULE 64-static void autostart_arrays (void); 65-#endif 66- 67-static mdk_personality_t *pers[MAX_PERSONALITY]; 68-static spinlock_t pers_lock = SPIN_LOCK_UNLOCKED; 69- 70-/* 71- * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 72- * is 1000 KB/sec, so the extra system load does not show up that much. 73- * Increase it if you want to have more _guaranteed_ speed. Note that 74- * the RAID driver will use the maximum available bandwidth if the IO 75- * subsystem is idle. There is also an 'absolute maximum' reconstruction 76- * speed limit - in case reconstruction slows down your system despite 77- * idle IO detection. 78- * 79- * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
80- */ 81- 82-static int sysctl_speed_limit_min = 1000; 83-static int sysctl_speed_limit_max = 200000; 84- 85-static struct ctl_table_header *raid_table_header; 86- 87-static ctl_table raid_table[] = { 88- { 89- .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, 90- .procname = "speed_limit_min", 91- .data = &sysctl_speed_limit_min, 92- .maxlen = sizeof(int), 93- .mode = 0644, 94- .proc_handler = &proc_dointvec, 95- }, 96- { 97- .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, 98- .procname = "speed_limit_max", 99- .data = &sysctl_speed_limit_max, 100- .maxlen = sizeof(int), 101- .mode = 0644, 102- .proc_handler = &proc_dointvec, 103- }, 104- { .ctl_name = 0 } 105-}; 106- 107-static ctl_table raid_dir_table[] = { 108- { 109- .ctl_name = DEV_RAID, 110- .procname = "raid", 111- .maxlen = 0, 112- .mode = 0555, 113- .child = raid_table, 114- }, 115- { .ctl_name = 0 } 116-}; 117- 118-static ctl_table raid_root_table[] = { 119- { 120- .ctl_name = CTL_DEV, 121- .procname = "dev", 122- .maxlen = 0, 123- .mode = 0555, 124- .child = raid_dir_table, 125- }, 126- { .ctl_name = 0 } 127-}; 128- 129-static struct block_device_operations md_fops; 130- 131-static struct gendisk *disks[MAX_MD_DEVS]; 132- 133-/* 134- * Enables iteration over all existing md arrays. 135- * all_mddevs_lock protects this list as well as mddev_map. 136- */ 137-static LIST_HEAD(all_mddevs); 138-static spinlock_t all_mddevs_lock = SPIN_LOCK_UNLOCKED; 139- 140- 141-/* 142- * iterates through all used mddevs in the system. 143- * We take care to grab the all_mddevs_lock whenever navigating 144- * the list, and to always hold a refcount when unlocked. 145- * Any code which breaks out of this loop while owning 146- * a reference to the current mddev must mddev_put it. 147- */ 148-#define ITERATE_MDDEV(mddev,tmp) \ 149- \ 150- for (({ spin_lock(&all_mddevs_lock); \ 151- tmp = all_mddevs.next; \ 152- mddev = NULL;}); \ 153- ({ if (tmp != &all_mddevs) \ 154- mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ 155- spin_unlock(&all_mddevs_lock); \ 156- if (mddev) mddev_put(mddev); \ 157- mddev = list_entry(tmp, mddev_t, all_mddevs); \ 158- tmp != &all_mddevs;}); \ 159- ({ spin_lock(&all_mddevs_lock); \ 160- tmp = tmp->next;}) \ 161- ) 162- 163-static mddev_t *mddev_map[MAX_MD_DEVS]; 164- 165-static int md_fail_request (request_queue_t *q, struct bio *bio) 166-{ 167- bio_io_error(bio, bio->bi_size); 168- return 0; 169-} 170- 171-static inline mddev_t *mddev_get(mddev_t *mddev) 172-{ 173- atomic_inc(&mddev->active); 174- return mddev; 175-} 176- 177-static void mddev_put(mddev_t *mddev) 178-{ 179- if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 180- return; 181- if (!mddev->raid_disks && list_empty(&mddev->disks)) { 182- list_del(&mddev->all_mddevs); 183- mddev_map[mdidx(mddev)] = NULL; 184- kfree(mddev); 185- MOD_DEC_USE_COUNT; 186- } 187- spin_unlock(&all_mddevs_lock); 188-} 189- 190-static mddev_t * mddev_find(int unit) 191-{ 192- mddev_t *mddev, *new = NULL; 193- 194- retry: 195- spin_lock(&all_mddevs_lock); 196- if (mddev_map[unit]) { 197- mddev = mddev_get(mddev_map[unit]); 198- spin_unlock(&all_mddevs_lock); 199- if (new) 200- kfree(new); 201- return mddev; 202- } 203- if (new) { 204- mddev_map[unit] = new; 205- list_add(&new->all_mddevs, &all_mddevs); 206- spin_unlock(&all_mddevs_lock); 207- MOD_INC_USE_COUNT; 208- return new; 209- } 210- spin_unlock(&all_mddevs_lock); 211- 212- new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL); 213- if (!new) 214- return NULL; 215- 216- memset(new, 0, sizeof(*new)); 217- 218- new->__minor = unit; 219-
init_MUTEX(&new->reconfig_sem); 220- INIT_LIST_HEAD(&new->disks); 221- INIT_LIST_HEAD(&new->all_mddevs); 222- init_timer(&new->safemode_timer); 223- atomic_set(&new->active, 1); 224- blk_queue_make_request(&new->queue, md_fail_request); 225- 226- goto retry; 227-} 228- 229-static inline int mddev_lock(mddev_t * mddev) 230-{ 231- return down_interruptible(&mddev->reconfig_sem); 232-} 233- 234-static inline void mddev_lock_uninterruptible(mddev_t * mddev) 235-{ 236- down(&mddev->reconfig_sem); 237-} 238- 239-static inline int mddev_trylock(mddev_t * mddev) 240-{ 241- return down_trylock(&mddev->reconfig_sem); 242-} 243- 244-static inline void mddev_unlock(mddev_t * mddev) 245-{ 246- up(&mddev->reconfig_sem); 247-} 248- 249-mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 250-{ 251- mdk_rdev_t * rdev; 252- struct list_head *tmp; 253- 254- ITERATE_RDEV(mddev,rdev,tmp) { 255- if (rdev->desc_nr == nr) 256- return rdev; 257- } 258- return NULL; 259-} 260- 261-static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 262-{ 263- struct list_head *tmp; 264- mdk_rdev_t *rdev; 265- 266- ITERATE_RDEV(mddev,rdev,tmp) { 267- if (rdev->bdev->bd_dev == dev) 268- return rdev; 269- } 270- return NULL; 271-} 272- 273-inline static sector_t calc_dev_sboffset(struct block_device *bdev) 274-{ 275- sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 276- return MD_NEW_SIZE_BLOCKS(size); 277-} 278- 279-static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) 280-{ 281- sector_t size; 282- 283- size = rdev->sb_offset; 284- 285- if (chunk_size) 286- size &= ~((sector_t)chunk_size/1024 - 1); 287- return size; 288-} 289- 290-static int alloc_disk_sb(mdk_rdev_t * rdev) 291-{ 292- if (rdev->sb_page) 293- MD_BUG(); 294- 295- rdev->sb_page = alloc_page(GFP_KERNEL); 296- if (!rdev->sb_page) { 297- printk(KERN_ALERT "md: out of memory.\n"); 298- return -EINVAL; 299- } 300- 301- return 0; 302-} 303- 304-static void free_disk_sb(mdk_rdev_t * rdev) 305-{ 306- if (rdev->sb_page) { 307- page_cache_release(rdev->sb_page); 308- rdev->sb_loaded = 0; 309- rdev->sb_page = NULL; 310- rdev->sb_offset = 0; 311- rdev->size = 0; 312- } 313-} 314- 315- 316-static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) 317-{ 318- if (bio->bi_size) 319- return 1; 320- 321- complete((struct completion*)bio->bi_private); 322- return 0; 323-} 324- 325-static int sync_page_io(struct block_device *bdev, sector_t sector, int size, 326- struct page *page, int rw) 327-{ 328- struct bio bio; 329- struct bio_vec vec; 330- struct completion event; 331- 332- bio_init(&bio); 333- bio.bi_io_vec = &vec; 334- vec.bv_page = page; 335- vec.bv_len = size; 336- vec.bv_offset = 0; 337- bio.bi_vcnt = 1; 338- bio.bi_idx = 0; 339- bio.bi_size = size; 340- bio.bi_bdev = bdev; 341- bio.bi_sector = sector; 342- init_completion(&event); 343- bio.bi_private = &event; 344- bio.bi_end_io = bi_complete; 345- submit_bio(rw, &bio); 346- blk_run_queues(); 347- wait_for_completion(&event); 348- 349- return test_bit(BIO_UPTODATE, &bio.bi_flags); 350-} 351- 352-static int read_disk_sb(mdk_rdev_t * rdev) 353-{ 354- 355- if (!rdev->sb_page) { 356- MD_BUG(); 357- return -EINVAL; 358- } 359- if (rdev->sb_loaded) 360- return 0; 361- 362- 363- if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) 364- goto fail; 365- rdev->sb_loaded = 1; 366- return 0; 367- 368-fail: 369- printk(KERN_ERR "md: disabled device %s, could not read superblock.\n", 370- bdev_partition_name(rdev->bdev)); 371- return -EINVAL; 372-} 373- 
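The sync_page_io() helper above is the driver's idiom for doing synchronous I/O over an asynchronous bio interface: the caller builds a one-segment bio on the stack, points bi_private at a completion, and parks itself until the end_io hook fires. A minimal userspace model of that pattern, with hypothetical stand-in types (fake_bio, submit) rather than the kernel API:

#include <stdio.h>

struct completion { int done; };

struct fake_bio {
        void *private;
        void (*end_io)(struct fake_bio *);
};

static void complete(struct completion *c) { c->done = 1; }
static void wait_for(struct completion *c) { while (!c->done) ; }

/* plays the role of bi_complete(): signal whoever is waiting */
static void bi_complete(struct fake_bio *bio)
{
        complete(bio->private);
}

/* stand-in for submit_bio(); here the "I/O" finishes immediately */
static void submit(struct fake_bio *bio) { bio->end_io(bio); }

int main(void)
{
        struct completion event = { 0 };
        struct fake_bio bio = { .private = &event, .end_io = bi_complete };

        submit(&bio);       /* asynchronous in the real driver ... */
        wait_for(&event);   /* ... made synchronous by waiting here */
        printf("I/O complete\n");
        return 0;
}

The same shape is what read_disk_sb() above relies on: by the time sync_page_io() returns, BIO_UPTODATE can be tested directly on the stack bio.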
374-static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 375-{ 376- if ( (sb1->set_uuid0 == sb2->set_uuid0) && 377- (sb1->set_uuid1 == sb2->set_uuid1) && 378- (sb1->set_uuid2 == sb2->set_uuid2) && 379- (sb1->set_uuid3 == sb2->set_uuid3)) 380- 381- return 1; 382- 383- return 0; 384-} 385- 386- 387-static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 388-{ 389- int ret; 390- mdp_super_t *tmp1, *tmp2; 391- 392- tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 393- tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 394- 395- if (!tmp1 || !tmp2) { 396- ret = 0; 397- printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); 398- goto abort; 399- } 400- 401- *tmp1 = *sb1; 402- *tmp2 = *sb2; 403- 404- /* 405- * nr_disks is not constant 406- */ 407- tmp1->nr_disks = 0; 408- tmp2->nr_disks = 0; 409- 410- if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) 411- ret = 0; 412- else 413- ret = 1; 414- 415-abort: 416- if (tmp1) 417- kfree(tmp1); 418- if (tmp2) 419- kfree(tmp2); 420- 421- return ret; 422-} 423- 424-static unsigned int calc_sb_csum(mdp_super_t * sb) 425-{ 426- unsigned int disk_csum, csum; 427- 428- disk_csum = sb->sb_csum; 429- sb->sb_csum = 0; 430- csum = csum_partial((void *)sb, MD_SB_BYTES, 0); 431- sb->sb_csum = disk_csum; 432- return csum; 433-} 434- 435-/* 436- * Handle superblock details. 437- * We want to be able to handle multiple superblock formats 438- * so we have a common interface to them all, and an array of 439- * different handlers. 440- * We rely on user-space to write the initial superblock, and support 441- * reading and updating of superblocks. 442- * Interface methods are: 443- * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) 444- * loads and validates a superblock on dev. 445- * if refdev != NULL, compare superblocks on both devices 446- * Return: 447- * 0 - dev has a superblock that is compatible with refdev 448- * 1 - dev has a superblock that is compatible and newer than refdev 449- * so dev should be used as the refdev in future 450- * -EINVAL superblock incompatible or invalid 451- * -othererror e.g. -EIO 452- * 453- * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) 454- * Verify that dev is acceptable into mddev. 455- * The first time, mddev->raid_disks will be 0, and data from 456- * dev should be merged in. Subsequent calls check that dev 457- * is new enough. Return 0 or -EINVAL 458- * 459- * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) 460- * Update the superblock for rdev with data in mddev 461- * This does not write to disc. 462- * 463- */ 464- 465-struct super_type { 466- char *name; 467- struct module *owner; 468- int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); 469- int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 470- void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 471-}; 472- 473-/* 474- * load_super for 0.90.0 475- */ 476-static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 477-{ 478- mdp_super_t *sb; 479- int ret; 480- sector_t sb_offset; 481- 482- /* 483- * Calculate the position of the superblock, 484- * it's at the end of the disk. 485- * 486- * It also happens to be a multiple of 4Kb. 
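The handler interface documented above keeps all per-format superblock logic behind three function pointers, so callers such as analyze_sbs() never test the version number explicitly. A compilable sketch of that dispatch shape, using stand-in types (struct rdev and the two loader stubs are hypothetical, not the driver's):

#include <stdio.h>

struct rdev;   /* opaque for the sketch */

struct super_type {
        const char *name;
        int (*load_super)(struct rdev *rdev, struct rdev *refdev, int minor);
};

/* 1 => "newest so far, use as refdev", 0 => compatible, <0 => reject */
static int load_090(struct rdev *r, struct rdev *ref, int minor)
{ printf("0.90.0 loader\n"); return ref ? 0 : 1; }
static int load_1(struct rdev *r, struct rdev *ref, int minor)
{ printf("md-1 loader\n"); return ref ? 0 : 1; }

static struct super_type super_types[] = {
        [0] = { .name = "0.90.0", .load_super = load_090 },
        [1] = { .name = "md-1",   .load_super = load_1 },
};

int main(void)
{
        int major_version = 0;   /* selects the format handler */
        int ret = super_types[major_version].load_super(NULL, NULL, 90);
        printf("%s returned %d\n", super_types[major_version].name, ret);
        return 0;
}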
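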
487- */ 488- sb_offset = calc_dev_sboffset(rdev->bdev); 489- rdev->sb_offset = sb_offset; 490- 491- ret = read_disk_sb(rdev); 492- if (ret) return ret; 493- 494- ret = -EINVAL; 495- 496- sb = (mdp_super_t*)page_address(rdev->sb_page); 497- 498- if (sb->md_magic != MD_SB_MAGIC) { 499- printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 500- bdev_partition_name(rdev->bdev)); 501- goto abort; 502- } 503- 504- if (sb->major_version != 0 || 505- sb->minor_version != 90) { 506- printk(KERN_WARNING "Bad version number %d.%d on %s\n", 507- sb->major_version, sb->minor_version, 508- bdev_partition_name(rdev->bdev)); 509- goto abort; 510- } 511- 512- if (sb->md_minor >= MAX_MD_DEVS) { 513- printk(KERN_ERR "md: %s: invalid raid minor (%x)\n", 514- bdev_partition_name(rdev->bdev), sb->md_minor); 515- goto abort; 516- } 517- if (sb->raid_disks <= 0) 518- goto abort; 519- 520- if (calc_sb_csum(sb) != sb->sb_csum) { 521- printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 522- bdev_partition_name(rdev->bdev)); 523- goto abort; 524- } 525- 526- rdev->preferred_minor = sb->md_minor; 527- rdev->data_offset = 0; 528- 529- if (sb->level == MULTIPATH) 530- rdev->desc_nr = -1; 531- else 532- rdev->desc_nr = sb->this_disk.number; 533- 534- if (refdev == 0) 535- ret = 1; 536- else { 537- __u64 ev1, ev2; 538- mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 539- if (!uuid_equal(refsb, sb)) { 540- printk(KERN_WARNING "md: %s has different UUID to %s\n", 541- bdev_partition_name(rdev->bdev), 542- bdev_partition_name(refdev->bdev)); 543- goto abort; 544- } 545- if (!sb_equal(refsb, sb)) { 546- printk(KERN_WARNING "md: %s has same UUID" 547- " but different superblock to %s\n", 548- bdev_partition_name(rdev->bdev), 549- bdev_partition_name(refdev->bdev)); 550- goto abort; 551- } 552- ev1 = md_event(sb); 553- ev2 = md_event(refsb); 554- if (ev1 > ev2) 555- ret = 1; 556- else 557- ret = 0; 558- } 559- rdev->size = calc_dev_size(rdev, sb->chunk_size); 560- 561- abort: 562- return ret; 563-} 564- 565-/* 566- * validate_super for 0.90.0 567- */ 568-static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 569-{ 570- mdp_disk_t *desc; 571- mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 572- 573- if (mddev->raid_disks == 0) { 574- mddev->major_version = 0; 575- mddev->minor_version = sb->minor_version; 576- mddev->patch_version = sb->patch_version; 577- mddev->persistent = ! 
sb->not_persistent; 578- mddev->chunk_size = sb->chunk_size; 579- mddev->ctime = sb->ctime; 580- mddev->utime = sb->utime; 581- mddev->level = sb->level; 582- mddev->layout = sb->layout; 583- mddev->raid_disks = sb->raid_disks; 584- mddev->size = sb->size; 585- mddev->events = md_event(sb); 586- 587- if (sb->state & (1<<MD_SB_CLEAN)) 588- mddev->recovery_cp = MaxSector; 589- else { 590- if (sb->events_hi == sb->cp_events_hi && 591- sb->events_lo == sb->cp_events_lo) { 592- mddev->recovery_cp = sb->recovery_cp; 593- } else 594- mddev->recovery_cp = 0; 595- } 596- 597- memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 598- memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 599- memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 600- memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 601- 602- mddev->max_disks = MD_SB_DISKS; 603- } else { 604- __u64 ev1; 605- ev1 = md_event(sb); 606- ++ev1; 607- if (ev1 < mddev->events) 608- return -EINVAL; 609- } 610- if (mddev->level != LEVEL_MULTIPATH) { 611- rdev->raid_disk = -1; 612- rdev->in_sync = rdev->faulty = 0; 613- desc = sb->disks + rdev->desc_nr; 614- 615- if (desc->state & (1<<MD_DISK_FAULTY)) 616- rdev->faulty = 1; 617- else if (desc->state & (1<<MD_DISK_SYNC) && 618- desc->raid_disk < mddev->raid_disks) { 619- rdev->in_sync = 1; 620- rdev->raid_disk = desc->raid_disk; 621- } 622- } 623- return 0; 624-} 625- 626-/* 627- * sync_super for 0.90.0 628- */ 629-static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) 630-{ 631- mdp_super_t *sb; 632- struct list_head *tmp; 633- mdk_rdev_t *rdev2; 634- int next_spare = mddev->raid_disks; 635- 636- /* make rdev->sb match mddev data.. 637- * 638- * 1/ zero out disks 639- * 2/ Add info for each disk, keeping track of highest desc_nr 640- * 3/ any empty disks < highest become removed 641- * 642- * disks[0] gets initialised to REMOVED because 643- * we cannot be sure from other fields if it has 644- * been initialised or not. 
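The event-count test in super_90_validate() above encodes the freshness rule: a member whose superblock is more than one event behind the array is stale and gets kicked, while a device that merely missed the final update is still accepted. The rule in isolation, as a standalone model:

#include <stdio.h>

typedef unsigned long long u64;

/* mirrors: ev1 = md_event(sb); ++ev1; if (ev1 < mddev->events) reject */
static int device_is_fresh(u64 dev_events, u64 array_events)
{
        return dev_events + 1 >= array_events;
}

int main(void)
{
        printf("%d\n", device_is_fresh(100, 100)); /* 1: fully in sync */
        printf("%d\n", device_is_fresh(99, 100));  /* 1: one behind, ok */
        printf("%d\n", device_is_fresh(42, 100));  /* 0: stale, kicked */
        return 0;
}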
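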
645- */ 646- int highest = 0; 647- int i; 648- int active=0, working=0,failed=0,spare=0,nr_disks=0; 649- 650- sb = (mdp_super_t*)page_address(rdev->sb_page); 651- 652- memset(sb, 0, sizeof(*sb)); 653- 654- sb->md_magic = MD_SB_MAGIC; 655- sb->major_version = mddev->major_version; 656- sb->minor_version = mddev->minor_version; 657- sb->patch_version = mddev->patch_version; 658- sb->gvalid_words = 0; /* ignored */ 659- memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 660- memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 661- memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 662- memcpy(&sb->set_uuid3, mddev->uuid+12,4); 663- 664- sb->ctime = mddev->ctime; 665- sb->level = mddev->level; 666- sb->size = mddev->size; 667- sb->raid_disks = mddev->raid_disks; 668- sb->md_minor = mddev->__minor; 669- sb->not_persistent = !mddev->persistent; 670- sb->utime = mddev->utime; 671- sb->state = 0; 672- sb->events_hi = (mddev->events>>32); 673- sb->events_lo = (u32)mddev->events; 674- 675- if (mddev->in_sync) 676- { 677- sb->recovery_cp = mddev->recovery_cp; 678- sb->cp_events_hi = (mddev->events>>32); 679- sb->cp_events_lo = (u32)mddev->events; 680- if (mddev->recovery_cp == MaxSector) 681- sb->state = (1<< MD_SB_CLEAN); 682- } else 683- sb->recovery_cp = 0; 684- 685- sb->layout = mddev->layout; 686- sb->chunk_size = mddev->chunk_size; 687- 688- sb->disks[0].state = (1<<MD_DISK_REMOVED); 689- ITERATE_RDEV(mddev,rdev2,tmp) { 690- mdp_disk_t *d; 691- if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty) 692- rdev2->desc_nr = rdev2->raid_disk; 693- else 694- rdev2->desc_nr = next_spare++; 695- d = &sb->disks[rdev2->desc_nr]; 696- nr_disks++; 697- d->number = rdev2->desc_nr; 698- d->major = MAJOR(rdev2->bdev->bd_dev); 699- d->minor = MINOR(rdev2->bdev->bd_dev); 700- if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty) 701- d->raid_disk = rdev2->raid_disk; 702- else 703- d->raid_disk = rdev2->desc_nr; /* compatibility */ 704- if (rdev2->faulty) { 705- d->state = (1<<MD_DISK_FAULTY); 706- failed++; 707- } else if (rdev2->in_sync) { 708- d->state = (1<<MD_DISK_ACTIVE); 709- d->state |= (1<<MD_DISK_SYNC); 710- active++; 711- working++; 712- } else { 713- d->state = 0; 714- spare++; 715- working++; 716- } 717- if (rdev2->desc_nr > highest) 718- highest = rdev2->desc_nr; 719- } 720- 721- /* now set the "removed" bit on any non-trailing holes */ 722- for (i=0; i<highest; i++) { 723- mdp_disk_t *d = &sb->disks[i]; 724- if (d->state == 0 && d->number == 0) { 725- d->number = i; 726- d->raid_disk = i; 727- d->state = (1<<MD_DISK_REMOVED); 728- } 729- } 730- sb->nr_disks = nr_disks; 731- sb->active_disks = active; 732- sb->working_disks = working; 733- sb->failed_disks = failed; 734- sb->spare_disks = spare; 735- 736- sb->this_disk = sb->disks[rdev->desc_nr]; 737- sb->sb_csum = calc_sb_csum(sb); 738-} 739- 740-/* 741- * version 1 superblock 742- */ 743- 744-static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) 745-{ 746- unsigned int disk_csum, csum; 747- int size = 256 + sb->max_dev*2; 748- 749- disk_csum = sb->sb_csum; 750- sb->sb_csum = 0; 751- csum = csum_partial((void *)sb, size, 0); 752- sb->sb_csum = disk_csum; 753- return csum; 754-} 755- 756-static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 757-{ 758- struct mdp_superblock_1 *sb; 759- int ret; 760- sector_t sb_offset; 761- 762- /* 763- * Calculate the position of the superblock.
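The comment resuming below describes where a version-1 superblock lives; for minor_version 0 the arithmetic backs off 8K from the end of the device and rounds down to a 4K boundary. A standalone model of just that calculation (sector_t here is a plain typedef for the sketch, not the kernel's):

#include <stdio.h>

typedef unsigned long long sector_t;

static sector_t v1_sb_offset_minor0(sector_t dev_sectors)
{
        sector_t sb_offset = dev_sectors;
        sb_offset -= 8 * 2;                   /* back off 8K (16 sectors) */
        sb_offset &= ~(sector_t)(4 * 2 - 1);  /* round down to 4K boundary */
        return sb_offset / 2;                 /* convert sectors to K */
}

int main(void)
{
        /* a 10000-sector device: superblock lands 8K..12K from the end */
        sector_t dev = 10000;
        printf("sb at %lluK (device end at %lluK)\n",
               v1_sb_offset_minor0(dev), dev / 2);
        return 0;
}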
764- * It is always aligned to a 4K boundary and 765- * depending on minor_version, it can be: 766- * 0: At least 8K, but less than 12K, from end of device 767- * 1: At start of device 768- * 2: 4K from start of device. 769- */ 770- switch(minor_version) { 771- case 0: 772- sb_offset = rdev->bdev->bd_inode->i_size >> 9; 773- sb_offset -= 8*2; 774- sb_offset &= ~(4*2-1); 775- /* convert from sectors to K */ 776- sb_offset /= 2; 777- break; 778- case 1: 779- sb_offset = 0; 780- break; 781- case 2: 782- sb_offset = 4; 783- break; 784- default: 785- return -EINVAL; 786- } 787- rdev->sb_offset = sb_offset; 788- 789- ret = read_disk_sb(rdev); 790- if (ret) return ret; 791- 792- 793- sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 794- 795- if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 796- sb->major_version != cpu_to_le32(1) || 797- le32_to_cpu(sb->max_dev) > (4096-256)/2 || 798- le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || 799- sb->feature_map != 0) 800- return -EINVAL; 801- 802- if (calc_sb_1_csum(sb) != sb->sb_csum) { 803- printk("md: invalid superblock checksum on %s\n", 804- bdev_partition_name(rdev->bdev)); 805- return -EINVAL; 806- } 807- rdev->preferred_minor = 0xffff; 808- rdev->data_offset = le64_to_cpu(sb->data_offset); 809- 810- if (refdev == 0) 811- return 1; 812- else { 813- __u64 ev1, ev2; 814- struct mdp_superblock_1 *refsb = 815- (struct mdp_superblock_1*)page_address(refdev->sb_page); 816- 817- if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 818- sb->level != refsb->level || 819- sb->layout != refsb->layout || 820- sb->chunksize != refsb->chunksize) { 821- printk(KERN_WARNING "md: %s has strangely different" 822- " superblock to %s\n", 823- bdev_partition_name(rdev->bdev), 824- bdev_partition_name(refdev->bdev)); 825- return -EINVAL; 826- } 827- ev1 = le64_to_cpu(sb->events); 828- ev2 = le64_to_cpu(refsb->events); 829- 830- if (ev1 > ev2) 831- return 1; 832- } 833- if (minor_version) 834- rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; 835- else 836- rdev->size = rdev->sb_offset; 837- if (rdev->size < le64_to_cpu(sb->data_size)/2) 838- return -EINVAL; 839- rdev->size = le64_to_cpu(sb->data_size)/2; 840- if (le32_to_cpu(sb->chunksize)) 841- rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); 842- return 0; 843-} 844- 845-static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 846-{ 847- struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 848- 849- if (mddev->raid_disks == 0) { 850- mddev->major_version = 1; 851- mddev->minor_version = 0; 852- mddev->patch_version = 0; 853- mddev->persistent = 1; 854- mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; 855- mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 856- mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); 857- mddev->level = le32_to_cpu(sb->level); 858- mddev->layout = le32_to_cpu(sb->layout); 859- mddev->raid_disks = le32_to_cpu(sb->raid_disks); 860- mddev->size = (u32)le64_to_cpu(sb->size); 861- mddev->events = le64_to_cpu(sb->events); 862- 863- mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 864- memcpy(mddev->uuid, sb->set_uuid, 16); 865- 866- mddev->max_disks = (4096-256)/2; 867- } else { 868- __u64 ev1; 869- ev1 = le64_to_cpu(sb->events); 870- ++ev1; 871- if (ev1 < mddev->events) 872- return -EINVAL; 873- } 874- 875- if (mddev->level != LEVEL_MULTIPATH) { 876- int role; 877- rdev->desc_nr = le32_to_cpu(sb->dev_number); 878- role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 879-
switch(role) { 880- case 0xffff: /* spare */ 881- rdev->in_sync = 0; 882- rdev->faulty = 0; 883- rdev->raid_disk = -1; 884- break; 885- case 0xfffe: /* faulty */ 886- rdev->in_sync = 0; 887- rdev->faulty = 1; 888- rdev->raid_disk = -1; 889- break; 890- default: 891- rdev->in_sync = 1; 892- rdev->faulty = 0; 893- rdev->raid_disk = role; 894- break; 895- } 896- } 897- return 0; 898-} 899- 900-static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) 901-{ 902- struct mdp_superblock_1 *sb; 903- struct list_head *tmp; 904- mdk_rdev_t *rdev2; 905- int max_dev, i; 906- /* make rdev->sb match mddev and rdev data. */ 907- 908- sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 909- 910- sb->feature_map = 0; 911- sb->pad0 = 0; 912- memset(sb->pad1, 0, sizeof(sb->pad1)); 913- memset(sb->pad2, 0, sizeof(sb->pad2)); 914- memset(sb->pad3, 0, sizeof(sb->pad3)); 915- 916- sb->utime = cpu_to_le64((__u64)mddev->utime); 917- sb->events = cpu_to_le64(mddev->events); 918- if (mddev->in_sync) 919- sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 920- else 921- sb->resync_offset = cpu_to_le64(0); 922- 923- max_dev = 0; 924- ITERATE_RDEV(mddev,rdev2,tmp) 925- if (rdev2->desc_nr > max_dev) 926- max_dev = rdev2->desc_nr; 927- 928- sb->max_dev = max_dev; 929- for (i=0; i<max_dev;i++) 930- sb->dev_roles[i] = cpu_to_le16(0xfffe); 931- 932- ITERATE_RDEV(mddev,rdev2,tmp) { 933- i = rdev2->desc_nr; 934- if (rdev2->faulty) 935- sb->dev_roles[i] = cpu_to_le16(0xfffe); 936- else if (rdev2->in_sync) 937- sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 938- else 939- sb->dev_roles[i] = cpu_to_le16(0xffff); 940- } 941- 942- sb->recovery_offset = cpu_to_le64(0); /* not supported yet */ 943-} 944- 945- 946-struct super_type super_types[] = { 947- [0] = { 948- .name = "0.90.0", 949- .owner = THIS_MODULE, 950- .load_super = super_90_load, 951- .validate_super = super_90_validate, 952- .sync_super = super_90_sync, 953- }, 954- [1] = { 955- .name = "md-1", 956- .owner = THIS_MODULE, 957- .load_super = super_1_load, 958- .validate_super = super_1_validate, 959- .sync_super = super_1_sync, 960- }, 961-}; 962- 963-static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev) 964-{ 965- struct list_head *tmp; 966- mdk_rdev_t *rdev; 967- 968- ITERATE_RDEV(mddev,rdev,tmp) 969- if (rdev->bdev->bd_contains == dev->bdev->bd_contains) 970- return rdev; 971- 972- return NULL; 973-} 974- 975-static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 976-{ 977- struct list_head *tmp; 978- mdk_rdev_t *rdev; 979- 980- ITERATE_RDEV(mddev1,rdev,tmp) 981- if (match_dev_unit(mddev2, rdev)) 982- return 1; 983- 984- return 0; 985-} 986- 987-static LIST_HEAD(pending_raid_disks); 988- 989-static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 990-{ 991- mdk_rdev_t *same_pdev; 992- 993- if (rdev->mddev) { 994- MD_BUG(); 995- return -EINVAL; 996- } 997- same_pdev = match_dev_unit(mddev, rdev); 998- if (same_pdev) 999- printk(KERN_WARNING 1000- "md%d: WARNING: %s appears to be on the same physical" 1001- " disk as %s. True\n protection against single-disk" 1002- " failure might be compromised.\n", 1003- mdidx(mddev), bdev_partition_name(rdev->bdev), 1004- bdev_partition_name(same_pdev->bdev)); 1005- 1006- /* Verify rdev->desc_nr is unique.
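The switch at the top of this hunk decodes the version-1 dev_roles[] convention that super_1_sync() later writes back: two reserved 16-bit values mark spares and faulty members, and any other value is the device's raid-disk slot. A self-contained model of the decode (the ROLE_* names are invented for the sketch):

#include <stdio.h>

#define ROLE_SPARE  0xffff
#define ROLE_FAULTY 0xfffe

static void decode_role(unsigned role, int *in_sync, int *faulty, int *slot)
{
        switch (role) {
        case ROLE_SPARE:  *in_sync = 0; *faulty = 0; *slot = -1; break;
        case ROLE_FAULTY: *in_sync = 0; *faulty = 1; *slot = -1; break;
        default:          *in_sync = 1; *faulty = 0; *slot = role; break;
        }
}

int main(void)
{
        unsigned roles[] = { 0, 1, ROLE_SPARE, ROLE_FAULTY };
        int i;
        for (i = 0; i < 4; i++) {
                int in_sync, faulty, slot;
                decode_role(roles[i], &in_sync, &faulty, &slot);
                printf("role %#x -> in_sync=%d faulty=%d raid_disk=%d\n",
                       roles[i], in_sync, faulty, slot);
        }
        return 0;
}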
1007- * If it is -1, assign a free number, else 1008- * check number is not in use 1009- */ 1010- if (rdev->desc_nr < 0) { 1011- int choice = 0; 1012- if (mddev->pers) choice = mddev->raid_disks; 1013- while (find_rdev_nr(mddev, choice)) 1014- choice++; 1015- rdev->desc_nr = choice; 1016- } else { 1017- if (find_rdev_nr(mddev, rdev->desc_nr)) 1018- return -EBUSY; 1019- } 1020- 1021- list_add(&rdev->same_set, &mddev->disks); 1022- rdev->mddev = mddev; 1023- printk(KERN_INFO "md: bind<%s>\n", bdev_partition_name(rdev->bdev)); 1024- return 0; 1025-} 1026- 1027-static void unbind_rdev_from_array(mdk_rdev_t * rdev) 1028-{ 1029- if (!rdev->mddev) { 1030- MD_BUG(); 1031- return; 1032- } 1033- list_del_init(&rdev->same_set); 1034- printk(KERN_INFO "md: unbind<%s>\n", bdev_partition_name(rdev->bdev)); 1035- rdev->mddev = NULL; 1036-} 1037- 1038-/* 1039- * prevent the device from being mounted, repartitioned or 1040- * otherwise reused by a RAID array (or any other kernel 1041- * subsystem), by opening the device. [simply getting an 1042- * inode is not enough, the SCSI module usage code needs 1043- * an explicit open() on the device] 1044- */ 1045-static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) 1046-{ 1047- int err = 0; 1048- struct block_device *bdev; 1049- 1050- bdev = bdget(dev); 1051- if (!bdev) 1052- return -ENOMEM; 1053- err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW); 1054- if (err) 1055- return err; 1056- err = bd_claim(bdev, rdev); 1057- if (err) { 1058- blkdev_put(bdev, BDEV_RAW); 1059- return err; 1060- } 1061- rdev->bdev = bdev; 1062- return err; 1063-} 1064- 1065-static void unlock_rdev(mdk_rdev_t *rdev) 1066-{ 1067- struct block_device *bdev = rdev->bdev; 1068- rdev->bdev = NULL; 1069- if (!bdev) 1070- MD_BUG(); 1071- bd_release(bdev); 1072- blkdev_put(bdev, BDEV_RAW); 1073-} 1074- 1075-void md_autodetect_dev(dev_t dev); 1076- 1077-static void export_rdev(mdk_rdev_t * rdev) 1078-{ 1079- printk(KERN_INFO "md: export_rdev(%s)\n", 1080- bdev_partition_name(rdev->bdev)); 1081- if (rdev->mddev) 1082- MD_BUG(); 1083- free_disk_sb(rdev); 1084- list_del_init(&rdev->same_set); 1085-#ifndef MODULE 1086- md_autodetect_dev(rdev->bdev->bd_dev); 1087-#endif 1088- unlock_rdev(rdev); 1089- kfree(rdev); 1090-} 1091- 1092-static void kick_rdev_from_array(mdk_rdev_t * rdev) 1093-{ 1094- unbind_rdev_from_array(rdev); 1095- export_rdev(rdev); 1096-} 1097- 1098-static void export_array(mddev_t *mddev) 1099-{ 1100- struct list_head *tmp; 1101- mdk_rdev_t *rdev; 1102- 1103- ITERATE_RDEV(mddev,rdev,tmp) { 1104- if (!rdev->mddev) { 1105- MD_BUG(); 1106- continue; 1107- } 1108- kick_rdev_from_array(rdev); 1109- } 1110- if (!list_empty(&mddev->disks)) 1111- MD_BUG(); 1112- mddev->raid_disks = 0; 1113- mddev->major_version = 0; 1114-} 1115- 1116-static void print_desc(mdp_disk_t *desc) 1117-{ 1118- printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number, 1119- partition_name(MKDEV(desc->major,desc->minor)), 1120- desc->major,desc->minor,desc->raid_disk,desc->state); 1121-} 1122- 1123-static void print_sb(mdp_super_t *sb) 1124-{ 1125- int i; 1126- 1127- printk(KERN_INFO 1128- "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 1129- sb->major_version, sb->minor_version, sb->patch_version, 1130- sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 1131- sb->ctime); 1132- printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 1133- sb->level, sb->size, sb->nr_disks, sb->raid_disks, 1134- sb->md_minor, sb->layout, sb->chunk_size); 1135- printk(KERN_INFO "md: UT:%08x ST:%d AD:%d 
WD:%d" 1136- " FD:%d SD:%d CSUM:%08x E:%08lx\n", 1137- sb->utime, sb->state, sb->active_disks, sb->working_disks, 1138- sb->failed_disks, sb->spare_disks, 1139- sb->sb_csum, (unsigned long)sb->events_lo); 1140- 1141- printk(KERN_INFO); 1142- for (i = 0; i < MD_SB_DISKS; i++) { 1143- mdp_disk_t *desc; 1144- 1145- desc = sb->disks + i; 1146- if (desc->number || desc->major || desc->minor || 1147- desc->raid_disk || (desc->state && (desc->state != 4))) { 1148- printk(" D %2d: ", i); 1149- print_desc(desc); 1150- } 1151- } 1152- printk(KERN_INFO "md: THIS: "); 1153- print_desc(&sb->this_disk); 1154- 1155-} 1156- 1157-static void print_rdev(mdk_rdev_t *rdev) 1158-{ 1159- printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%d ", 1160- bdev_partition_name(rdev->bdev), (unsigned long long)rdev->size, 1161- rdev->faulty, rdev->in_sync, rdev->desc_nr); 1162- if (rdev->sb_loaded) { 1163- printk(KERN_INFO "md: rdev superblock:\n"); 1164- print_sb((mdp_super_t*)page_address(rdev->sb_page)); 1165- } else 1166- printk(KERN_INFO "md: no rdev superblock!\n"); 1167-} 1168- 1169-void md_print_devices(void) 1170-{ 1171- struct list_head *tmp, *tmp2; 1172- mdk_rdev_t *rdev; 1173- mddev_t *mddev; 1174- 1175- printk("\n"); 1176- printk("md: **********************************\n"); 1177- printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1178- printk("md: **********************************\n"); 1179- ITERATE_MDDEV(mddev,tmp) { 1180- printk("md%d: ", mdidx(mddev)); 1181- 1182- ITERATE_RDEV(mddev,rdev,tmp2) 1183- printk("<%s>", bdev_partition_name(rdev->bdev)); 1184- 1185- ITERATE_RDEV(mddev,rdev,tmp2) 1186- print_rdev(rdev); 1187- } 1188- printk("md: **********************************\n"); 1189- printk("\n"); 1190-} 1191- 1192- 1193-static int write_disk_sb(mdk_rdev_t * rdev) 1194-{ 1195- 1196- if (!rdev->sb_loaded) { 1197- MD_BUG(); 1198- return 1; 1199- } 1200- if (rdev->faulty) { 1201- MD_BUG(); 1202- return 1; 1203- } 1204- 1205- dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", 1206- bdev_partition_name(rdev->bdev), 1207- (unsigned long long)rdev->sb_offset); 1208- 1209- if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) 1210- return 0; 1211- 1212- printk("md: write_disk_sb failed for device %s\n", 1213- bdev_partition_name(rdev->bdev)); 1214- return 1; 1215-} 1216- 1217-static void sync_sbs(mddev_t * mddev) 1218-{ 1219- mdk_rdev_t *rdev; 1220- struct list_head *tmp; 1221- 1222- ITERATE_RDEV(mddev,rdev,tmp) { 1223- super_types[mddev->major_version]. 1224- sync_super(mddev, rdev); 1225- rdev->sb_loaded = 1; 1226- } 1227-} 1228- 1229-static void md_update_sb(mddev_t * mddev) 1230-{ 1231- int err, count = 100; 1232- struct list_head *tmp; 1233- mdk_rdev_t *rdev; 1234- 1235- mddev->sb_dirty = 0; 1236-repeat: 1237- mddev->utime = get_seconds(); 1238- mddev->events ++; 1239- 1240- if (!mddev->events) { 1241- /* 1242- * oops, this 64-bit counter should never wrap. 
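md_update_sb(), which this comment sits in, always bumps the 64-bit event counter before regenerating and writing every member's superblock, and retries a bounded number of times if any write fails. A compressed standalone model of that control flow (write_all() is a hypothetical stand-in that fails twice before succeeding):

#include <stdio.h>

static int attempts;
static int write_all(void) { return ++attempts < 3 ? 1 : 0; }

int main(void)
{
        unsigned long long events = 41;
        int count = 100;        /* same bound the driver uses */

        for (;;) {
                events++;       /* every update gets a new event number */
                if (!write_all())
                        break;  /* all superblocks written cleanly */
                if (--count == 0) {
                        printf("giving up\n");
                        return 1;
                }
                printf("errors during superblock update, repeating\n");
        }
        printf("clean update at event %llu\n", events);
        return 0;
}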
1243- * Either we are in around ~1 trillion A.C., assuming 1244- * 1 reboot per second, or we have a bug: 1245- */ 1246- MD_BUG(); 1247- mddev->events --; 1248- } 1249- sync_sbs(mddev); 1250- 1251- /* 1252- * do not write anything to disk if using 1253- * nonpersistent superblocks 1254- */ 1255- if (!mddev->persistent) 1256- return; 1257- 1258- dprintk(KERN_INFO 1259- "md: updating md%d RAID superblock on device (in sync %d)\n", 1260- mdidx(mddev),mddev->in_sync); 1261- 1262- err = 0; 1263- ITERATE_RDEV(mddev,rdev,tmp) { 1264- dprintk(KERN_INFO "md: "); 1265- if (rdev->faulty) 1266- dprintk("(skipping faulty "); 1267- 1268- dprintk("%s ", bdev_partition_name(rdev->bdev)); 1269- if (!rdev->faulty) { 1270- err += write_disk_sb(rdev); 1271- } else 1272- dprintk(")\n"); 1273- if (!err && mddev->level == LEVEL_MULTIPATH) 1274- /* only need to write one superblock... */ 1275- break; 1276- } 1277- if (err) { 1278- if (--count) { 1279- printk(KERN_ERR "md: errors occurred during superblock" 1280- " update, repeating\n"); 1281- goto repeat; 1282- } 1283- printk(KERN_ERR \ 1284- "md: excessive errors occurred during superblock update, exiting\n"); 1285- } 1286-} 1287- 1288-/* 1289- * Import a device. If 'super_format' >= 0, then sanity check the superblock 1290- * 1291- * mark the device faulty if: 1292- * 1293- * - the device is nonexistent (zero size) 1294- * - the device has no valid superblock 1295- * 1296- * a faulty rdev _never_ has rdev->sb set. 1297- */ 1298-static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 1299-{ 1300- int err; 1301- mdk_rdev_t *rdev; 1302- sector_t size; 1303- 1304- rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); 1305- if (!rdev) { 1306- printk(KERN_ERR "md: could not alloc mem for %s!\n", 1307- partition_name(newdev)); 1308- return ERR_PTR(-ENOMEM); 1309- } 1310- memset(rdev, 0, sizeof(*rdev)); 1311- 1312- if ((err = alloc_disk_sb(rdev))) 1313- goto abort_free; 1314- 1315- err = lock_rdev(rdev, newdev); 1316- if (err) { 1317- printk(KERN_ERR "md: could not lock %s.\n", 1318- partition_name(newdev)); 1319- goto abort_free; 1320- } 1321- rdev->desc_nr = -1; 1322- rdev->faulty = 0; 1323- rdev->in_sync = 0; 1324- rdev->data_offset = 0; 1325- atomic_set(&rdev->nr_pending, 0); 1326- 1327- size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 1328- if (!size) { 1329- printk(KERN_WARNING 1330- "md: %s has zero or unknown size, marking faulty!\n", 1331- bdev_partition_name(rdev->bdev)); 1332- err = -EINVAL; 1333- goto abort_free; 1334- } 1335- 1336- if (super_format >= 0) { 1337- err = super_types[super_format]. 
1338- load_super(rdev, NULL, super_minor); 1339- if (err == -EINVAL) { 1340- printk(KERN_WARNING 1341- "md: %s has invalid sb, not importing!\n", 1342- bdev_partition_name(rdev->bdev)); 1343- goto abort_free; 1344- } 1345- if (err < 0) { 1346- printk(KERN_WARNING 1347- "md: could not read %s's sb, not importing!\n", 1348- bdev_partition_name(rdev->bdev)); 1349- goto abort_free; 1350- } 1351- } 1352- INIT_LIST_HEAD(&rdev->same_set); 1353- 1354- return rdev; 1355- 1356-abort_free: 1357- if (rdev->sb_page) { 1358- if (rdev->bdev) 1359- unlock_rdev(rdev); 1360- free_disk_sb(rdev); 1361- } 1362- kfree(rdev); 1363- return ERR_PTR(err); 1364-} 1365- 1366-/* 1367- * Check a full RAID array for plausibility 1368- */ 1369- 1370- 1371-static int analyze_sbs(mddev_t * mddev) 1372-{ 1373- int i; 1374- struct list_head *tmp; 1375- mdk_rdev_t *rdev, *freshest; 1376- 1377- freshest = NULL; 1378- ITERATE_RDEV(mddev,rdev,tmp) 1379- switch (super_types[mddev->major_version]. 1380- load_super(rdev, freshest, mddev->minor_version)) { 1381- case 1: 1382- freshest = rdev; 1383- break; 1384- case 0: 1385- break; 1386- default: 1387- printk( KERN_ERR \ 1388- "md: fatal superblock inconsistency in %s" 1389- " -- removing from array\n", 1390- bdev_partition_name(rdev->bdev)); 1391- kick_rdev_from_array(rdev); 1392- } 1393- 1394- 1395- super_types[mddev->major_version]. 1396- validate_super(mddev, freshest); 1397- 1398- i = 0; 1399- ITERATE_RDEV(mddev,rdev,tmp) { 1400- if (rdev != freshest) 1401- if (super_types[mddev->major_version]. 1402- validate_super(mddev, rdev)) { 1403- printk(KERN_WARNING "md: kicking non-fresh %s" 1404- " from array!\n", 1405- bdev_partition_name(rdev->bdev)); 1406- kick_rdev_from_array(rdev); 1407- continue; 1408- } 1409- if (mddev->level == LEVEL_MULTIPATH) { 1410- rdev->desc_nr = i++; 1411- rdev->raid_disk = rdev->desc_nr; 1412- rdev->in_sync = 1; 1413- } 1414- } 1415- 1416- 1417- /* 1418- * Check if we can support this RAID array 1419- */ 1420- if (mddev->major_version != MD_MAJOR_VERSION || 1421- mddev->minor_version > MD_MINOR_VERSION) { 1422- printk(KERN_ALERT 1423- "md: md%d: unsupported raid array version %d.%d.%d\n", 1424- mdidx(mddev), mddev->major_version, 1425- mddev->minor_version, mddev->patch_version); 1426- goto abort; 1427- } 1428- 1429- if ((mddev->recovery_cp != MaxSector) && ((mddev->level == 1) || 1430- (mddev->level == 4) || (mddev->level == 5))) 1431- printk(KERN_ERR "md: md%d: raid array is not clean" 1432- " -- starting background reconstruction\n", 1433- mdidx(mddev)); 1434- 1435- return 0; 1436-abort: 1438 return 1; 1439 } 1440 1441+#undef OLD_LEVEL 1442+ 1443 static int device_size_calculation(mddev_t * mddev) 1444 { 1445 int data_disks = 0; 1446 unsigned int readahead; 1447 struct list_head *tmp; 1448 mdk_rdev_t *rdev; 1449 1450 /* 1451 * Do device size calculation. Bail out if too small.
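The level switch in device_size_calculation() below boils down to one mapping: usable capacity is the per-device size times the number of data disks the level leaves after redundancy. A standalone sketch of just that arithmetic (levels 2 and 3 are skipped, as the driver does not handle them here):

#include <stdio.h>

static int data_disks(int level, int raid_disks)
{
        switch (level) {
        case 0:  return raid_disks;       /* striping: all disks hold data */
        case 1:  return 1;                /* mirroring: one copy of the data */
        case 4:
        case 5:  return raid_disks - 1;   /* one disk's worth of parity */
        default: return 1;                /* linear/multipath-style levels */
        }
}

int main(void)
{
        int raid_disks = 4;
        unsigned long long dev_size_kb = 1024 * 1024; /* 1 GiB per member */
        int level;

        for (level = 0; level <= 5; level++)
                if (level != 2 && level != 3)
                        printf("level %d: %llu KB\n", level,
                               dev_size_kb * data_disks(level, raid_disks));
        return 0;
}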
1452 * (we have to do this after having validated chunk_size, 1453 * because device size has to be modulo chunk_size) 1454 */ 1455 1456 ITERATE_RDEV(mddev,rdev,tmp) { 1457 if (rdev->faulty) 1458 continue; 1459 if (rdev->size < mddev->chunk_size / 1024) { 1460 printk(KERN_WARNING 1461 "md: Dev %s smaller than chunk_size:" 1462 " %lluk < %dk\n", 1463 bdev_partition_name(rdev->bdev), 1464 (unsigned long long)rdev->size, 1465 mddev->chunk_size / 1024); 1466 return -EINVAL; 1467 } 1468 } 1469 1470 switch (mddev->level) { 1471 case LEVEL_MULTIPATH: 1472 data_disks = 1; 1473 break; 1474 case -3: 1475 data_disks = 1; 1476 break; 1477 case -2: 1478 data_disks = 1; 1479 break; 1480 case LEVEL_LINEAR: 1481 zoned_raid_size(mddev); 1482 data_disks = 1; 1483 break; 1484 case 0: 1485 zoned_raid_size(mddev); 1486 data_disks = mddev->raid_disks; 1487 break; 1488 case 1: 1489 data_disks = 1; 1490 break; 1491 case 4: 1492 case 5: 1493 data_disks = mddev->raid_disks-1; 1494 break; 1495 default: 1496 printk(KERN_ERR "md: md%d: unsupported raid level %d\n", 1497 mdidx(mddev), mddev->level); 1498 goto abort; 1499 } 1500 if (!md_size[mdidx(mddev)]) 1501 md_size[mdidx(mddev)] = mddev->size * data_disks; 1502 1503 readahead = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; 1504 if (!mddev->level || (mddev->level == 4) || (mddev->level == 5)) { 1505 readahead = (mddev->chunk_size>>PAGE_SHIFT) * 4 * data_disks; 1506 if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2) 1507 readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2; 1508 } else { 1509 // (no multipath branch - it uses the default setting) 1510 if (mddev->level == -3) 1511 readahead = 0; 1512 } 1513 1514 printk(KERN_INFO "md%d: max total readahead window set to %ldk\n", 1515 mdidx(mddev), readahead*(PAGE_SIZE/1024)); 1516 1517 printk(KERN_INFO 1518 "md%d: %d data-disks, max readahead per data-disk: %ldk\n", 1519 mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024)); 1520 return 0; 1521 abort: 1522 return 1; 1523 } 1524 1525 static struct gendisk *md_probe(dev_t dev, int *part, void *data) 1526 { 1527 static DECLARE_MUTEX(disks_sem); 1528- int unit = MINOR(dev); 1529- mddev_t *mddev = mddev_find(unit); 1530- struct gendisk *disk; 1531- 1532- if (!mddev) 1533- return NULL; 1534- 1535- down(&disks_sem); 1536- if (disks[unit]) { 1537- up(&disks_sem); 1538- mddev_put(mddev); 1539- return NULL; 1540- } 1541- disk = alloc_disk(1); 1542- if (!disk) { 1543- up(&disks_sem); 1544- mddev_put(mddev); 1545- return NULL; 1546- } 1547- disk->major = MD_MAJOR; 1548- disk->first_minor = mdidx(mddev); 1549- sprintf(disk->disk_name, "md%d", mdidx(mddev)); 1550- disk->fops = &md_fops; 1551- disk->private_data = mddev; 1552- disk->queue = &mddev->queue; 1553- add_disk(disk); 1554- disks[mdidx(mddev)] = disk; 1555- up(&disks_sem); 1556- return NULL; 1557-} 1558- 1559-void md_wakeup_thread(mdk_thread_t *thread); 1560- 1561-static void md_safemode_timeout(unsigned long data) 1562-{ 1563- mddev_t *mddev = (mddev_t *) data; 1564- 1565- mddev->safemode = 1; 1566- md_wakeup_thread(mddev->thread); 1567-} 1568- 1569- 1570-static int do_md_run(mddev_t * mddev) 1571-{ 1572- int pnum, err; 1573- int chunk_size; 1574- struct list_head *tmp; 1575- mdk_rdev_t *rdev; 1576- struct gendisk *disk; 1577- 1578- if (list_empty(&mddev->disks)) { 1579- MD_BUG(); 1580- return -EINVAL; 1581- } 1582- 1583- if (mddev->pers) 1584- return -EBUSY; 1585- 1586- /* 1587- * Analyze all RAID superblock(s) 1588- */ 1589- if (!mddev->raid_disks && analyze_sbs(mddev)) { 1590- MD_BUG(); 1591- return 
-EINVAL; 1592- } 1593- 1594- chunk_size = mddev->chunk_size; 1595- pnum = level_to_pers(mddev->level); 1596- 1597- if ((pnum != MULTIPATH) && (pnum != RAID1)) { 1598- if (!chunk_size) { 1599- /* 1600- * 'default chunksize' in the old md code used to 1601- * be PAGE_SIZE, baaad. 1602- * we abort here to be on the safe side. We don't 1603- * want to continue the bad practice. 1604- */ 1605- printk(KERN_ERR 1606- "no chunksize specified, see 'man raidtab'\n"); 1607- return -EINVAL; 1608- } 1609- if (chunk_size > MAX_CHUNK_SIZE) { 1610- printk(KERN_ERR "too big chunk_size: %d > %d\n", 1611- chunk_size, MAX_CHUNK_SIZE); 1612- return -EINVAL; 1613- } 1614- /* 1615- * chunk-size has to be a power of 2 and multiples of PAGE_SIZE 1616- */ 1617- if ( (1 << ffz(~chunk_size)) != chunk_size) { 1618- MD_BUG(); 1619- return -EINVAL; 1620- } 1621- if (chunk_size < PAGE_SIZE) { 1622- printk(KERN_ERR "too small chunk_size: %d < %ld\n", 1623- chunk_size, PAGE_SIZE); 1624- return -EINVAL; 1625- } 1626- 1627- /* devices must have minimum size of one chunk */ 1628- ITERATE_RDEV(mddev,rdev,tmp) { 1629- if (rdev->faulty) 1630- continue; 1631- if (rdev->size < chunk_size / 1024) { 1632- printk(KERN_WARNING 1633- "md: Dev %s smaller than chunk_size:" 1634- " %lluk < %dk\n", 1635- bdev_partition_name(rdev->bdev), 1636- (unsigned long long)rdev->size, 1637- chunk_size / 1024); 1638- return -EINVAL; 1639- } 1640- } 1641- } 1642- if (pnum >= MAX_PERSONALITY) { 1643- MD_BUG(); 1644- return -EINVAL; 1645- } 1646- 1647-#ifdef CONFIG_KMOD 1648- if (!pers[pnum]) 1649- { 1650- char module_name[80]; 1651- sprintf (module_name, "md-personality-%d", pnum); 1652- request_module (module_name); 1654+ } 1655 } 1656-#endif 1657 1658 if (device_size_calculation(mddev)) 1659 return -EINVAL; 1660 1661 /* 1662 * Drop all container device buffers, from now on 1663 * the only valid external interface is through the md 1664- * device.
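The chunk-size test earlier in do_md_run() leans on a bit trick: ffz(~x) is the index of the lowest set bit of x, so (1 << ffz(~x)) == x exactly when x is a power of two. A standalone model with a portable ffz() (the kernel's is an arch-specific primitive):

#include <stdio.h>

static int ffz(unsigned v)          /* index of the first zero bit */
{
        int i = 0;
        while (v & 1) { v >>= 1; i++; }
        return i;
}

static int valid_chunk(unsigned chunk_size)
{
        /* first zero bit of ~x is the first set bit of x */
        return chunk_size && (1u << ffz(~chunk_size)) == chunk_size;
}

int main(void)
{
        printf("%d\n", valid_chunk(64 * 1024)); /* 1: power of two */
        printf("%d\n", valid_chunk(96 * 1024)); /* 0: rejected */
        return 0;
}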
1665- * Also find largest hardsector size 1666- */ 1667- ITERATE_RDEV(mddev,rdev,tmp) { 1668- if (rdev->faulty) 1669- continue; 1670- sync_blockdev(rdev->bdev); 1671- invalidate_bdev(rdev->bdev, 0); 1672- } 1673- 1674- md_probe(mdidx(mddev), NULL, NULL); 1675- disk = disks[mdidx(mddev)]; 1676- if (!disk) 1677- return -ENOMEM; 1678- 1679- spin_lock(&pers_lock); 1680- if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) { 1681- spin_unlock(&pers_lock); 1682- printk(KERN_ERR "md: personality %d is not loaded!\n", 1683- pnum); 1684- return -EINVAL; 1685- } 1686- 1687- mddev->pers = pers[pnum]; 1688- spin_unlock(&pers_lock); 1689- 1690- blk_queue_make_request(&mddev->queue, mddev->pers->make_request); 1691- printk("%s: setting max_sectors to %d, segment boundary to %d\n", 1692- disk->disk_name, 1693- chunk_size >> 9, 1694- (chunk_size>>1)-1); 1695- blk_queue_max_sectors(&mddev->queue, chunk_size >> 9); 1696- blk_queue_segment_boundary(&mddev->queue, (chunk_size>>1) - 1); 1697- mddev->queue.queuedata = mddev; 1698- 1699- err = mddev->pers->run(mddev); 1700- if (err) { 1701- printk(KERN_ERR "md: pers->run() failed ...\n"); 1702- module_put(mddev->pers->owner); 1703- mddev->pers = NULL; 1704- return -EINVAL; 1705- } 1706- atomic_set(&mddev->writes_pending,0); 1707- mddev->safemode = 0; 1708- mddev->safemode_timer.function = md_safemode_timeout; 1709- mddev->safemode_timer.data = (unsigned long) mddev; 1710- mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */ 1711- mddev->in_sync = 1; 1712- 1713- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 1714- md_wakeup_thread(mddev->thread); 1715- set_capacity(disk, mddev->array_size<<1); 1716- return 0; 1717-} 1718- 1719-static int restart_array(mddev_t *mddev) 1720-{ 1721- struct gendisk *disk = disks[mdidx(mddev)]; 1722- int err; 1723- 1724- /* 1725- * Complain if it has no devices 1726- */ 1727- err = -ENXIO; 1728- if (list_empty(&mddev->disks)) 1729- goto out; 1730- 1731- if (mddev->pers) { 1732- err = -EBUSY; 1733- if (!mddev->ro) 1734- goto out; 1735- 1736- mddev->safemode = 0; 1737- mddev->ro = 0; 1738- set_disk_ro(disk, 0); 1739- 1740- printk(KERN_INFO "md: md%d switched to read-write mode.\n", 1741- mdidx(mddev)); 1742- /* 1743- * Kick recovery or resync if necessary 1744- */ 1745- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 1746- md_wakeup_thread(mddev->thread); 1747- err = 0; 1748- } else { 1749- printk(KERN_ERR "md: md%d has no personality assigned.\n", 1750- mdidx(mddev)); 1751- err = -EINVAL; 1752- } 1753- 1754-out: 1755- return err; 1756-} 1757- 1758-static int do_md_stop(mddev_t * mddev, int ro) 1759-{ 1760- int err = 0; 1761- struct gendisk *disk = disks[mdidx(mddev)]; 1762- 1763- if (atomic_read(&mddev->active)>2) { 1764- printk("md: md%d still in use.\n",mdidx(mddev)); 1765- err = -EBUSY; 1766- goto out; 1767- } 1768- 1769- if (mddev->pers) { 1770- if (mddev->sync_thread) { 1771- set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1772- md_unregister_thread(mddev->sync_thread); 1773- mddev->sync_thread = NULL; 1774- } 1775- 1776- del_timer_sync(&mddev->safemode_timer); 1777- 1778- invalidate_device(mk_kdev(disk->major, disk->first_minor), 1); 1779- 1780- if (ro) { 1781- err = -ENXIO; 1782- if (mddev->ro) 1783- goto out; 1784- mddev->ro = 1; 1785- } else { 1786- if (mddev->ro) 1787- set_disk_ro(disk, 0); 1788- if (mddev->pers->stop(mddev)) { 1789- err = -EBUSY; 1790- if (mddev->ro) 1791- set_disk_ro(disk, 1); 1792- goto out; 1793- } 1794- module_put(mddev->pers->owner); 1795- mddev->pers = NULL; 1796- if (mddev->ro) 1797- mddev->ro 
= 0; 1798- } 1799- if (mddev->raid_disks) { 1800- /* mark array as shutdown cleanly */ 1801- mddev->in_sync = 1; 1802- md_update_sb(mddev); 1803- } 1804- if (ro) 1805- set_disk_ro(disk, 1); 1806- } 1807- /* 1808- * Free resources if final stop 1809- */ 1810- if (!ro) { 1811- struct gendisk *disk; 1812- printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev)); 1813- 1814- export_array(mddev); 1815- 1816- mddev->array_size = 0; 1817- disk = disks[mdidx(mddev)]; 1818- if (disk) 1819- set_capacity(disk, 0); 1820- } else 1821- printk(KERN_INFO "md: md%d switched to read-only mode.\n", 1822- mdidx(mddev)); 1823- err = 0; 1824-out: 1825- return err; 1826-} 1827- 1828-static void autorun_array(mddev_t *mddev) 1829-{ 1830- mdk_rdev_t *rdev; 1831- struct list_head *tmp; 1832- int err; 1833- 1834- if (list_empty(&mddev->disks)) { 1835- MD_BUG(); 1836- return; 1837- } 1838- 1839- printk(KERN_INFO "md: running: "); 1840- 1841- ITERATE_RDEV(mddev,rdev,tmp) { 1842- printk("<%s>", bdev_partition_name(rdev->bdev)); 1843- } 1844- printk("\n"); 1845- 1846- err = do_md_run (mddev); 1847- if (err) { 1848- printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 1849- do_md_stop (mddev, 0); 1850- } 1851-} 1852- 1853-/* 1854- * let's try to run arrays based on all disks that have arrived 1855- * until now. (those are in pending_raid_disks) 1856- * 1857- * the method: pick the first pending disk, collect all disks with 1858- * the same UUID, remove all from the pending list and put them into 1859- * the 'same_array' list. Then order this list based on superblock 1860- * update time (freshest comes first), kick out 'old' disks and 1861- * compare superblocks. If everything's fine then run it. 1862- * 1863- * If "unit" is allocated, then bump its reference count 1864- */ 1865-static void autorun_devices(void) 1866-{ 1867- struct list_head candidates; 1868- struct list_head *tmp; 1869- mdk_rdev_t *rdev0, *rdev; 1870- mddev_t *mddev; 1871- 1872- printk(KERN_INFO "md: autorun ...\n"); 1873- while (!list_empty(&pending_raid_disks)) { 1874- rdev0 = list_entry(pending_raid_disks.next, 1875- mdk_rdev_t, same_set); 1876- 1877- printk(KERN_INFO "md: considering %s ...\n", 1878- bdev_partition_name(rdev0->bdev)); 1879- INIT_LIST_HEAD(&candidates); 1880- ITERATE_RDEV_PENDING(rdev,tmp) 1881- if (super_90_load(rdev, rdev0, 0) >= 0) { 1882- printk(KERN_INFO "md: adding %s ...\n", 1883- bdev_partition_name(rdev->bdev)); 1884- list_move(&rdev->same_set, &candidates); 1885- } 1886- /* 1887- * now we have a set of devices, with all of them having 1888- * mostly sane superblocks. It's time to allocate the 1889- * mddev. 1890- */ 1891- 1892- mddev = mddev_find(rdev0->preferred_minor); 1893- if (!mddev) { 1894- printk(KERN_ERR 1895- "md: cannot allocate memory for md drive.\n"); 1896- break; 1897- } 1898- if (mddev_lock(mddev)) 1899- printk(KERN_WARNING "md: md%d locked, cannot run\n", 1900- mdidx(mddev)); 1901- else if (mddev->raid_disks || mddev->major_version 1902- || !list_empty(&mddev->disks)) { 1903- printk(KERN_WARNING 1904- "md: md%d already running, cannot run %s\n", 1905- mdidx(mddev), bdev_partition_name(rdev0->bdev)); 1906- mddev_unlock(mddev); 1907- } else { 1908- printk(KERN_INFO "md: created md%d\n", mdidx(mddev)); 1909- ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { 1910- list_del_init(&rdev->same_set); 1911- if (bind_rdev_to_array(rdev, mddev)) 1912- export_rdev(rdev); 1913- } 1914- autorun_array(mddev); 1915- mddev_unlock(mddev); 1916- } 1917- /* on success, candidates will be empty, on error 1918- * it won't...
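The loop in autorun_devices() above reduces to a grouping pass: take the first pending disk, pull every other pending disk whose superblock matches it into one candidate set, and try to run that set as an array. The grouping step modelled standalone, with plain ints standing in for UUIDs:

#include <stdio.h>

int main(void)
{
        int pending[] = { 7, 3, 7, 7, 3 };   /* stand-in UUIDs */
        int n = 5, used[5] = { 0 }, i, j;

        for (i = 0; i < n; i++) {
                if (used[i])
                        continue;            /* already claimed by a set */
                printf("array for uuid %d:", pending[i]);
                for (j = i; j < n; j++)
                        if (!used[j] && pending[j] == pending[i]) {
                                used[j] = 1; /* move to the candidate set */
                                printf(" member%d", j);
                        }
                printf("\n");
        }
        return 0;
}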
1919- */ 1920- ITERATE_RDEV_GENERIC(candidates,rdev,tmp) 1921- export_rdev(rdev); 1922- mddev_put(mddev); 1923- } 1924- printk(KERN_INFO "md: ... autorun DONE.\n"); 1925-} 1926- 1927-/* 1928- * import RAID devices based on one partition 1929- * if possible, the array gets run as well. 1930- */ 1931- 1932-static int autostart_array(dev_t startdev) 1933-{ 1934- int err = -EINVAL, i; 1935- mdp_super_t *sb = NULL; 1936- mdk_rdev_t *start_rdev = NULL, *rdev; 1937- 1938- start_rdev = md_import_device(startdev, 0, 0); 1939- if (IS_ERR(start_rdev)) { 1940- printk(KERN_WARNING "md: could not import %s!\n", 1941- partition_name(startdev)); 1942- return err; 1943- } 1944- 1945- /* NOTE: this can only work for 0.90.0 superblocks */ 1946- sb = (mdp_super_t*)page_address(start_rdev->sb_page); 1947- if (sb->major_version != 0 || 1948- sb->minor_version != 90 ) { 1949- printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n"); 1950- export_rdev(start_rdev); 1951- return err; 1952- } 1953- 1954- if (start_rdev->faulty) { 1955- printk(KERN_WARNING 1956- "md: can not autostart based on faulty %s!\n", 1957- bdev_partition_name(start_rdev->bdev)); 1958- export_rdev(start_rdev); 1959- return err; 1960- } 1961- list_add(&start_rdev->same_set, &pending_raid_disks); 1962- 1963- for (i = 0; i < MD_SB_DISKS; i++) { 1964- mdp_disk_t *desc; 1965- dev_t dev; 1966- 1967- desc = sb->disks + i; 1968- dev = MKDEV(desc->major, desc->minor); 1969- 1970- if (!dev) 1971- continue; 1972- if (dev == startdev) 1973- continue; 1974- rdev = md_import_device(dev, 0, 0); 1975- if (IS_ERR(rdev)) { 1976- printk(KERN_WARNING "md: could not import %s," 1977- " trying to run array nevertheless.\n", 1978- partition_name(dev)); 1979- continue; 1980- } 1981- list_add(&rdev->same_set, &pending_raid_disks); 1982- } 1983- 1984- /* 1985- * possibly return codes 1986- */ 1987- autorun_devices(); 1988- return 0; 1989- 1990-} 1991- 1992- 1993-static int get_version(void * arg) 1994-{ 1995- mdu_version_t ver; 1996- 1997- ver.major = MD_MAJOR_VERSION; 1998- ver.minor = MD_MINOR_VERSION; 1999- ver.patchlevel = MD_PATCHLEVEL_VERSION; 2000- 2001- if (copy_to_user(arg, &ver, sizeof(ver))) 2002- return -EFAULT; 2003- 2004- return 0; 2005-} 2006- 2007-static int get_array_info(mddev_t * mddev, void * arg) 2008-{ 2009- mdu_array_info_t info; 2010- int nr,working,active,failed,spare; 2011- mdk_rdev_t *rdev; 2012- struct list_head *tmp; 2013- 2014- nr=working=active=failed=spare=0; 2015- ITERATE_RDEV(mddev,rdev,tmp) { 2016- nr++; 2017- if (rdev->faulty) 2018- failed++; 2019- else { 2020- working++; 2021- if (rdev->in_sync) 2022- active++; 2023- else 2024- spare++; 2025- } 2026- } 2027- 2028- info.major_version = mddev->major_version; 2029- info.minor_version = mddev->minor_version; 2030- info.patch_version = 1; 2031- info.ctime = mddev->ctime; 2032- info.level = mddev->level; 2033- info.size = mddev->size; 2034- info.nr_disks = nr; 2035- info.raid_disks = mddev->raid_disks; 2036- info.md_minor = mddev->__minor; 2037- info.not_persistent= !mddev->persistent; 2038- 2039- info.utime = mddev->utime; 2040- info.state = 0; 2041- if (mddev->in_sync) 2042- info.state = (1<<MD_SB_CLEAN); 2043- info.active_disks = active; 2044- info.working_disks = working; 2045- info.failed_disks = failed; 2046- info.spare_disks = spare; 2047- 2048- info.layout = mddev->layout; 2049- info.chunk_size = mddev->chunk_size; 2050- 2051- if (copy_to_user(arg, &info, sizeof(info))) 2052- return -EFAULT; 2053- 2054- return 0; 2055-} 2056- 2057-static int get_disk_info(mddev_t * mddev, 
void * arg) 2058-{ 2059- mdu_disk_info_t info; 2060- unsigned int nr; 2061- mdk_rdev_t *rdev; 2062- 2063- if (copy_from_user(&info, arg, sizeof(info))) 2064- return -EFAULT; 2065- 2066- nr = info.number; 2067- 2068- rdev = find_rdev_nr(mddev, nr); 2069- if (rdev) { 2070- info.major = MAJOR(rdev->bdev->bd_dev); 2071- info.minor = MINOR(rdev->bdev->bd_dev); 2072- info.raid_disk = rdev->raid_disk; 2073- info.state = 0; 2074- if (rdev->faulty) 2075- info.state |= (1<<MD_DISK_FAULTY); 2076- else if (rdev->in_sync) { 2077- info.state |= (1<<MD_DISK_ACTIVE); 2078- info.state |= (1<<MD_DISK_SYNC); 2079- } 2080- } else { 2081- info.major = info.minor = 0; 2082- info.raid_disk = -1; 2083- info.state = (1<<MD_DISK_REMOVED); 2084- } 2085- 2086- if (copy_to_user(arg, &info, sizeof(info))) 2087- return -EFAULT; 2088- 2089- return 0; 2090-} 2091- 2092-static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 2093-{ 2094- mdk_rdev_t *rdev; 2095- dev_t dev; 2096- dev = MKDEV(info->major,info->minor); 2097- if (!mddev->raid_disks) { 2098- int err; 2099- /* expecting a device which has a superblock */ 2100- rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 2101- if (IS_ERR(rdev)) { 2102- printk(KERN_WARNING 2103- "md: md_import_device returned %ld\n", 2104- PTR_ERR(rdev)); 2105- return PTR_ERR(rdev); 2106- } 2107- if (!list_empty(&mddev->disks)) { 2108- mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 2109- mdk_rdev_t, same_set); 2110- int err = super_types[mddev->major_version] 2111- .load_super(rdev, rdev0, mddev->minor_version); 2112- if (err < 0) { 2113- printk(KERN_WARNING 2114- "md: %s has different UUID to %s\n", 2115- bdev_partition_name(rdev->bdev), 2116- bdev_partition_name(rdev0->bdev)); 2117- export_rdev(rdev); 2118- return -EINVAL; 2119- } 2120- } 2121- err = bind_rdev_to_array(rdev, mddev); 2122- if (err) 2123- export_rdev(rdev); 2124- return err; 2125- } 2126- 2127- /* 2128- * add_new_disk can be used once the array is assembled 2129- * to add "hot spares". 
They must already have a superblock 2130- * written 2131- */ 2132- if (mddev->pers) { 2133- int err; 2134- if (!mddev->pers->hot_add_disk) { 2135- printk(KERN_WARNING 2136- "md%d: personality does not support diskops!\n", 2137- mdidx(mddev)); 2138- return -EINVAL; 2139- } 2140- rdev = md_import_device(dev, mddev->major_version, 2141- mddev->minor_version); 2142- if (IS_ERR(rdev)) { 2143- printk(KERN_WARNING 2144- "md: md_import_device returned %ld\n", 2145- PTR_ERR(rdev)); 2146- return PTR_ERR(rdev); 2147- } 2148- rdev->in_sync = 0; /* just to be sure */ 2149- rdev->raid_disk = -1; 2150- err = bind_rdev_to_array(rdev, mddev); 2151- if (err) 2152- export_rdev(rdev); 2153- if (mddev->thread) 2154- md_wakeup_thread(mddev->thread); 2155- return err; 2156- } 2157- 2158- /* otherwise, add_new_disk is only allowed 2159- * for major_version==0 superblocks 2160- */ 2161- if (mddev->major_version != 0) { 2162- printk(KERN_WARNING "md%d: ADD_NEW_DISK not supported\n", 2163- mdidx(mddev)); 2164- return -EINVAL; 2165- } 2166- 2167- if (!(info->state & (1<<MD_DISK_FAULTY))) { 2168- int err; 2169- rdev = md_import_device (dev, -1, 0); 2170- if (IS_ERR(rdev)) { 2171- printk(KERN_WARNING 2172- "md: error, md_import_device() returned %ld\n", 2173- PTR_ERR(rdev)); 2174- return PTR_ERR(rdev); 2175- } 2176- rdev->desc_nr = info->number; 2177- if (info->raid_disk < mddev->raid_disks) 2178- rdev->raid_disk = info->raid_disk; 2179- else 2180- rdev->raid_disk = -1; 2181- 2182- rdev->faulty = 0; 2183- if (rdev->raid_disk < mddev->raid_disks) 2184- rdev->in_sync = (info->state & (1<<MD_DISK_SYNC)); 2185- else 2186- rdev->in_sync = 0; 2187- 2188- err = bind_rdev_to_array(rdev, mddev); 2189- if (err) { 2190- export_rdev(rdev); 2191- return err; 2192- } 2193- 2194- if (!mddev->persistent) { 2195- printk(KERN_INFO "md: nonpersistent superblock ...\n"); 2196- rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2197- } else 2198- rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 2199- rdev->size = calc_dev_size(rdev, mddev->chunk_size); 2200- 2201- if (!mddev->size || (mddev->size > rdev->size)) 2202- mddev->size = rdev->size; 2203- } 2204- 2205- return 0; 2206-} 2207- 2208-static int hot_generate_error(mddev_t * mddev, dev_t dev) 2209-{ 2210- struct request_queue *q; 2211- mdk_rdev_t *rdev; 2212- 2213- if (!mddev->pers) 2214- return -ENODEV; 2215- 2216- printk(KERN_INFO "md: trying to generate %s error in md%d ... \n", 2217- partition_name(dev), mdidx(mddev)); 2218- 2219- rdev = find_rdev(mddev, dev); 2220- if (!rdev) { 2221- MD_BUG(); 2222- return -ENXIO; 2223- } 2224- 2225- if (rdev->desc_nr == -1) { 2226- MD_BUG(); 2227- return -EINVAL; 2228- } 2229- if (!rdev->in_sync) 2230- return -ENODEV; 2231- 2232- q = bdev_get_queue(rdev->bdev); 2233- if (!q) { 2234- MD_BUG(); 2235- return -ENODEV; 2236- } 2237- printk(KERN_INFO "md: okay, generating error!\n"); 2238-// q->oneshot_error = 1; // disabled for now 2239- 2240- return 0; 2241-} 2242- 2243-static int hot_remove_disk(mddev_t * mddev, dev_t dev) 2244-{ 2245- mdk_rdev_t *rdev; 2246- 2247- if (!mddev->pers) 2248- return -ENODEV; 2249- 2250- printk(KERN_INFO "md: trying to remove %s from md%d ... \n", 2251- partition_name(dev), mdidx(mddev)); 2252- 2253- rdev = find_rdev(mddev, dev); 2254- if (!rdev) 2255- return -ENXIO; 2256- 2257- if (rdev->raid_disk >= 0) 2258- goto busy; 2259- 2260- kick_rdev_from_array(rdev); 2261- md_update_sb(mddev); 2262- 2263- return 0; 2264-busy: 2265- printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... 
\n", 2266- bdev_partition_name(rdev->bdev), mdidx(mddev)); 2267- return -EBUSY; 2268-} 2269- 2270-static int hot_add_disk(mddev_t * mddev, dev_t dev) 2271-{ 2272- int err; 2273- unsigned int size; 2274- mdk_rdev_t *rdev; 2275- 2276- if (!mddev->pers) 2277- return -ENODEV; 2278- 2279- printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n", 2280- partition_name(dev), mdidx(mddev)); 2281- 2282- if (mddev->major_version != 0) { 2283- printk(KERN_WARNING "md%d: HOT_ADD may only be used with" 2284- " version-0 superblocks.\n", 2285- mdidx(mddev)); 2286- return -EINVAL; 2287- } 2288- if (!mddev->pers->hot_add_disk) { 2289- printk(KERN_WARNING 2290- "md%d: personality does not support diskops!\n", 2291- mdidx(mddev)); 2292- return -EINVAL; 2293- } 2294- 2295- rdev = md_import_device (dev, -1, 0); 2296- if (IS_ERR(rdev)) { 2297- printk(KERN_WARNING 2298- "md: error, md_import_device() returned %ld\n", 2299- PTR_ERR(rdev)); 2300- return -EINVAL; 2301- } 2302- 2303- rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 2304- size = calc_dev_size(rdev, mddev->chunk_size); 2305- rdev->size = size; 2306- 2307- if (size < mddev->size) { 2308- printk(KERN_WARNING 2309- "md%d: disk size %llu blocks < array size %llu\n", 2310- mdidx(mddev), (unsigned long long)size, 2311- (unsigned long long)mddev->size); 2312- err = -ENOSPC; 2313- goto abort_export; 2314- } 2315- 2316- if (rdev->faulty) { 2317- printk(KERN_WARNING 2318- "md: can not hot-add faulty %s disk to md%d!\n", 2319- bdev_partition_name(rdev->bdev), mdidx(mddev)); 2320- err = -EINVAL; 2321- goto abort_export; 2322- } 2323- rdev->in_sync = 0; 2324- rdev->desc_nr = -1; 2325- bind_rdev_to_array(rdev, mddev); 2326- 2327- /* 2328- * The rest should better be atomic, we can have disk failures 2329- * noticed in interrupt contexts ... 2330- */ 2331- 2332- if (rdev->desc_nr == mddev->max_disks) { 2333- printk(KERN_WARNING "md%d: can not hot-add to full array!\n", 2334- mdidx(mddev)); 2335- err = -EBUSY; 2336- goto abort_unbind_export; 2337- } 2338- 2339- rdev->raid_disk = -1; 2340- 2341- md_update_sb(mddev); 2342- 2343- /* 2344- * Kick recovery, maybe this spare has to be added to the 2345- * array immediately. 2346- */ 2347- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2348- md_wakeup_thread(mddev->thread); 2349- 2350- return 0; 2351- 2352-abort_unbind_export: 2353- unbind_rdev_from_array(rdev); 2354- 2355-abort_export: 2356- export_rdev(rdev); 2357- return err; 2358-} 2359- 2360-/* 2361- * set_array_info is used two different ways 2362- * The original usage is when creating a new array. 2363- * In this usage, raid_disks is > = and it together with 2364- * level, size, not_persistent,layout,chunksize determine the 2365- * shape of the array. 2366- * This will always create an array with a type-0.90.0 superblock. 2367- * The newer usage is when assembling an array. 2368- * In this case raid_disks will be 0, and the major_version field is 2369- * use to determine which style super-blocks are to be found on the devices. 2370- * The minor and patch _version numbers are also kept incase the 2371- * super_block handler wishes to interpret them. 2372- */ 2373-static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) 2374-{ 2375- 2376- if (info->raid_disks == 0) { 2377- /* just setting version number for superblock loading */ 2378- if (info->major_version < 0 || 2379- info->major_version >= sizeof(super_types)/sizeof(super_types[0]) || 2380- super_types[info->major_version].name == NULL) { 2381- /* maybe try to auto-load a module? 
*/ 2382- printk(KERN_INFO 2383- "md: superblock version %d not known\n", 2384- info->major_version); 2385- return -EINVAL; 2386- } 2387- mddev->major_version = info->major_version; 2388- mddev->minor_version = info->minor_version; 2389- mddev->patch_version = info->patch_version; 2390- return 0; 2391- } 2392- mddev->major_version = MD_MAJOR_VERSION; 2393- mddev->minor_version = MD_MINOR_VERSION; 2394- mddev->patch_version = MD_PATCHLEVEL_VERSION; 2395- mddev->ctime = get_seconds(); 2396- 2397- mddev->level = info->level; 2398- mddev->size = info->size; 2399- mddev->raid_disks = info->raid_disks; 2400- /* don't set __minor, it is determined by which /dev/md* was 2401- * opened 2402- */ 2403- if (info->state & (1<<MD_SB_CLEAN)) 2404- mddev->recovery_cp = MaxSector; 2405- else 2406- mddev->recovery_cp = 0; 2407- mddev->persistent = ! info->not_persistent; 2408- 2409- mddev->layout = info->layout; 2410- mddev->chunk_size = info->chunk_size; 2411- 2412- mddev->max_disks = MD_SB_DISKS; 2413- 2414- 2415- /* 2416- * Generate a 128 bit UUID 2417- */ 2418- get_random_bytes(mddev->uuid, 16); 2419- 2420- return 0; 2421-} 2422- 2423-static int set_disk_faulty(mddev_t *mddev, dev_t dev) 2424-{ 2425- mdk_rdev_t *rdev; 2426- 2427- rdev = find_rdev(mddev, dev); 2428- if (!rdev) 2429- return 0; 2430- 2431- md_error(mddev, rdev); 2432- return 1; 2433-} 2434- 2435-static int md_ioctl(struct inode *inode, struct file *file, 2436- unsigned int cmd, unsigned long arg) 2437-{ 2438- unsigned int minor; 2439- int err = 0; 2440- struct hd_geometry *loc = (struct hd_geometry *) arg; 2441- mddev_t *mddev = NULL; 2442- kdev_t dev; 2443- 2444- if (!capable(CAP_SYS_ADMIN)) 2445- return -EACCES; 2446- 2447- dev = inode->i_rdev; 2448- minor = minor(dev); 2449- if (minor >= MAX_MD_DEVS) { 2450- MD_BUG(); 2451- return -EINVAL; 2452- } 2453- 2454- /* 2455- * Commands dealing with the RAID driver but not any 2456- * particular array: 2457- */ 2458- switch (cmd) 2459- { 2460- case RAID_VERSION: 2461- err = get_version((void *)arg); 2462- goto done; 2463- 2464- case PRINT_RAID_DEBUG: 2465- err = 0; 2466- md_print_devices(); 2467- goto done; 2468- 2469-#ifndef MODULE 2470- case RAID_AUTORUN: 2471- err = 0; 2472- autostart_arrays(); 2473- goto done; 2474-#endif 2475- default:; 2476- } 2477- 2478- /* 2479- * Commands creating/starting a new array: 2480- */ 2481- 2482- mddev = inode->i_bdev->bd_inode->u.generic_ip; 2483- 2484- if (!mddev) { 2485- BUG(); 2486- goto abort; 2487- } 2488- 2489- 2490- if (cmd == START_ARRAY) { 2491- /* START_ARRAY doesn't need to lock the array as autostart_array 2492- * does the locking, and it could even be a different array 2493- */ 2494- err = autostart_array(arg); 2495- if (err) { 2496- printk(KERN_WARNING "md: autostart %s failed!\n", 2497- partition_name(arg)); 2498- goto abort; 2499- } 2500- goto done; 2501- } 2502- 2503- err = mddev_lock(mddev); 2504- if (err) { 2505- printk(KERN_INFO 2506- "md: ioctl lock interrupted, reason %d, cmd %d\n", 2507- err, cmd); 2508- goto abort; 2509- } 2510- 2511- switch (cmd) 2512- { 2513- case SET_ARRAY_INFO: 2514- 2515- if (!list_empty(&mddev->disks)) { 2516- printk(KERN_WARNING 2517- "md: array md%d already has disks!\n", 2518- mdidx(mddev)); 2519- err = -EBUSY; 2520- goto abort_unlock; 2521- } 2522- if (mddev->raid_disks) { 2523- printk(KERN_WARNING 2524- "md: array md%d already initialised!\n", 2525- mdidx(mddev)); 2526- err = -EBUSY; 2527- goto abort_unlock; 2528- } 2529- { 2530- mdu_array_info_t info; 2531- if (!arg) 2532- memset(&info, 0, sizeof(info));
2533- else if (copy_from_user(&info, (void*)arg, sizeof(info))) { 2534- err = -EFAULT; 2535- goto abort_unlock; 2536- } 2537- err = set_array_info(mddev, &info); 2538- if (err) { 2539- printk(KERN_WARNING "md: couldn't set" 2540- " array info. %d\n", err); 2541- goto abort_unlock; 2542- } 2543- } 2544- goto done_unlock; 2545- 2546- default:; 2547- } 2548- 2549- /* 2550- * Commands querying/configuring an existing array: 2551- */ 2552- /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY or RUN_ARRAY is allowed */ 2553- if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { 2554- err = -ENODEV; 2555- goto abort_unlock; 2556- } 2557- 2558- /* 2559- * Commands even a read-only array can execute: 2560- */ 2561- switch (cmd) 2562- { 2563- case GET_ARRAY_INFO: 2564- err = get_array_info(mddev, (void *)arg); 2565- goto done_unlock; 2566- 2567- case GET_DISK_INFO: 2568- err = get_disk_info(mddev, (void *)arg); 2569- goto done_unlock; 2570- 2571- case RESTART_ARRAY_RW: 2572- err = restart_array(mddev); 2573- goto done_unlock; 2574- 2575- case STOP_ARRAY: 2576- err = do_md_stop (mddev, 0); 2577- goto done_unlock; 2578- 2579- case STOP_ARRAY_RO: 2580- err = do_md_stop (mddev, 1); 2581- goto done_unlock; 2582- 2583- /* 2584- * We have a problem here: there is no easy way to give a CHS 2585- * virtual geometry. We currently pretend that we have 2 heads, 2586- * 4 sectors (with a BIG number of cylinders...). This drives 2587- * dosfs just mad... ;-) 2588- */ 2589- case HDIO_GETGEO: 2590- if (!loc) { 2591- err = -EINVAL; 2592- goto abort_unlock; 2593- } 2594- err = put_user (2, (char *) &loc->heads); 2595- if (err) 2596- goto abort_unlock; 2597- err = put_user (4, (char *) &loc->sectors); 2598- if (err) 2599- goto abort_unlock; 2600- err = put_user(get_capacity(disks[mdidx(mddev)])/8, 2601- (short *) &loc->cylinders); 2602- if (err) 2603- goto abort_unlock; 2604- err = put_user (get_start_sect(inode->i_bdev), 2605- (long *) &loc->start); 2606- goto done_unlock; 2607- } 2608- 2609- /* 2610- * The remaining ioctls are changing the state of the 2611- * superblock, so we do not allow read-only arrays 2612- * here: 2613- */ 2614- if (mddev->ro) { 2615- err = -EROFS; 2616- goto abort_unlock; 2617- } 2618- 2619- switch (cmd) 2620- { 2621- case ADD_NEW_DISK: 2622- { 2623- mdu_disk_info_t info; 2624- if (copy_from_user(&info, (void*)arg, sizeof(info))) 2625- err = -EFAULT; 2626- else 2627- err = add_new_disk(mddev, &info); 2628- goto done_unlock; 2629- } 2630- case HOT_GENERATE_ERROR: 2631- err = hot_generate_error(mddev, arg); 2632- goto done_unlock; 2633- case HOT_REMOVE_DISK: 2634- err = hot_remove_disk(mddev, arg); 2635- goto done_unlock; 2636- 2637- case HOT_ADD_DISK: 2638- err = hot_add_disk(mddev, arg); 2639- goto done_unlock; 2640- 2641- case SET_DISK_FAULTY: 2642- err = set_disk_faulty(mddev, arg); 2643- goto done_unlock; 2644- 2645- case RUN_ARRAY: 2646- { 2647- err = do_md_run (mddev); 2648- /* 2649- * we have to clean up the mess if 2650- * the array cannot be run for some 2651- * reason ... 2652- * ->pers will not be set, so the superblock will 2653- * not be updated.
2654- */ 2655- if (err) 2656- do_md_stop (mddev, 0); 2657- goto done_unlock; 2658- } 2659- 2660- default: 2661- if (_IOC_TYPE(cmd) == MD_MAJOR) 2662- printk(KERN_WARNING "md: %s(pid %d) used" 2663- " obsolete MD ioctl, upgrade your" 2664- " software to use new ictls.\n", 2665- current->comm, current->pid); 2666- err = -EINVAL; 2667- goto abort_unlock; 2668- } 2669- 2670-done_unlock: 2671-abort_unlock: 2672- mddev_unlock(mddev); 2673- 2674- return err; 2675-done: 2676- if (err) 2677- MD_BUG(); 2678-abort: 2679- return err; 2680-} 2681- 2682-static int md_open(struct inode *inode, struct file *file) 2683-{ 2684- /* 2685- * Succeed if we can find or allocate a mddev structure. 2686- */ 2687- mddev_t *mddev = mddev_find(minor(inode->i_rdev)); 2688- int err = -ENOMEM; 2689- 2690- if (!mddev) 2691- goto out; 2692- 2693- if ((err = mddev_lock(mddev))) 2694- goto put; 2695- 2696- err = 0; 2697- mddev_unlock(mddev); 2698- inode->i_bdev->bd_inode->u.generic_ip = mddev_get(mddev); 2699- put: 2700- mddev_put(mddev); 2701- out: 2702- return err; 2703-} 2704- 2705-static int md_release(struct inode *inode, struct file * file) 2706-{ 2707- mddev_t *mddev = inode->i_bdev->bd_inode->u.generic_ip; 2708- 2709- if (!mddev) 2710- BUG(); 2711- mddev_put(mddev); 2712- 2713- return 0; 2714-} 2715- 2716-static struct block_device_operations md_fops = 2717-{ 2718- .owner = THIS_MODULE, 2719- .open = md_open, 2720- .release = md_release, 2721- .ioctl = md_ioctl, 2722-}; 2723- 2724-int md_thread(void * arg) 2725-{ 2726- mdk_thread_t *thread = arg; 2727- 2728- lock_kernel(); 2729- 2730- /* 2731- * Detach thread 2732- */ 2733- 2734- daemonize(thread->name, mdidx(thread->mddev)); 2735- 2736- current->exit_signal = SIGCHLD; 2737- allow_signal(SIGKILL); 2738- thread->tsk = current; 2739- 2740- /* 2741- * md_thread is a 'system-thread', it's priority should be very 2742- * high. We avoid resource deadlocks individually in each 2743- * raid personality. (RAID5 does preallocation) We also use RR and 2744- * the very same RT priority as kswapd, thus we will never get 2745- * into a priority inversion deadlock. 2746- * 2747- * we definitely have to have equal or higher priority than 2748- * bdflush, otherwise bdflush will deadlock if there are too 2749- * many dirty RAID5 blocks. 
2750- */ 2751- unlock_kernel(); 2752- 2753- complete(thread->event); 2754- while (thread->run) { 2755- void (*run)(mddev_t *); 2756- 2757- wait_event_interruptible(thread->wqueue, 2758- test_bit(THREAD_WAKEUP, &thread->flags)); 2759- if (current->flags & PF_FREEZE) 2760- refrigerator(PF_IOTHREAD); 2761- 2762- clear_bit(THREAD_WAKEUP, &thread->flags); 2763- 2764- run = thread->run; 2765- if (run) { 2766- run(thread->mddev); 2767- blk_run_queues(); 2768- } 2769- if (signal_pending(current)) 2770- flush_signals(current); 2771- } 2772- complete(thread->event); 2773- return 0; 2774-} 2775- 2776-void md_wakeup_thread(mdk_thread_t *thread) 2777-{ 2778- if (thread) { 2779- dprintk("md: waking up MD thread %p.\n", thread); 2780- set_bit(THREAD_WAKEUP, &thread->flags); 2781- wake_up(&thread->wqueue); 2782- } 2783-} 2784- 2785-mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 2786- const char *name) 2787-{ 2788- mdk_thread_t *thread; 2789- int ret; 2790- struct completion event; 2791- 2792- thread = (mdk_thread_t *) kmalloc 2793- (sizeof(mdk_thread_t), GFP_KERNEL); 2794- if (!thread) 2795- return NULL; 2796- 2797- memset(thread, 0, sizeof(mdk_thread_t)); 2798- init_waitqueue_head(&thread->wqueue); 2799- 2800- init_completion(&event); 2801- thread->event = &event; 2802- thread->run = run; 2803- thread->mddev = mddev; 2804- thread->name = name; 2805- ret = kernel_thread(md_thread, thread, 0); 2806- if (ret < 0) { 2807- kfree(thread); 2808- return NULL; 2809- } 2810- wait_for_completion(&event); 2811- return thread; 2812-} 2813- 2814-void md_interrupt_thread(mdk_thread_t *thread) 2815-{ 2816- if (!thread->tsk) { 2817- MD_BUG(); 2818- return; 2819- } 2820- dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); 2821- send_sig(SIGKILL, thread->tsk, 1); 2822-} 2823- 2824-void md_unregister_thread(mdk_thread_t *thread) 2825-{ 2826- struct completion event; 2827- 2828- init_completion(&event); 2829- 2830- thread->event = &event; 2831- thread->run = NULL; 2832- thread->name = NULL; 2833- md_interrupt_thread(thread); 2834- wait_for_completion(&event); 2835- kfree(thread); 2836-} 2837- 2838-void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 2839-{ 2840- dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 2841- MD_MAJOR,mdidx(mddev), 2842- MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 2843- __builtin_return_address(0),__builtin_return_address(1), 2844- __builtin_return_address(2),__builtin_return_address(3)); 2845- 2846- if (!mddev) { 2847- MD_BUG(); 2848- return; 2849- } 2850- 2851- if (!rdev || rdev->faulty) 2852- return; 2853- if (!mddev->pers->error_handler) 2854- return; 2855- mddev->pers->error_handler(mddev,rdev); 2856- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2857- md_wakeup_thread(mddev->thread); 2858-} 2859- 2860-/* seq_file implementation /proc/mdstat */ 2861- 2862-static void status_unused(struct seq_file *seq) 2863-{ 2864- int i = 0; 2865- mdk_rdev_t *rdev; 2866- struct list_head *tmp; 2867- 2868- seq_printf(seq, "unused devices: "); 2869- 2870- ITERATE_RDEV_PENDING(rdev,tmp) { 2871- i++; 2872- seq_printf(seq, "%s ", 2873- bdev_partition_name(rdev->bdev)); 2874- } 2875- if (!i) 2876- seq_printf(seq, "<none>"); 2877- 2878- seq_printf(seq, "\n"); 2879-} 2880- 2881- 2882-static void status_resync(struct seq_file *seq, mddev_t * mddev) 2883-{ 2884- unsigned long max_blocks, resync, res, dt, db, rt; 2885- 2886- resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; 2887- max_blocks = mddev->size; 2888- 2889- /* 2890- * Should 
not happen. 2891- */ 2892- if (!max_blocks) { 2893- MD_BUG(); 2894- return; 2895- } 2896- res = (resync/1024)*1000/(max_blocks/1024 + 1); 2897- { 2898- int i, x = res/50, y = 20-x; 2899- seq_printf(seq, "["); 2900- for (i = 0; i < x; i++) 2901- seq_printf(seq, "="); 2902- seq_printf(seq, ">"); 2903- for (i = 0; i < y; i++) 2904- seq_printf(seq, "."); 2905- seq_printf(seq, "] "); 2906- } 2907- seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)", 2908- (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 2909- "resync" : "recovery"), 2910- res/10, res % 10, resync, max_blocks); 2911- 2912- /* 2913- * We do not want to overflow, so the order of operands and 2914- * the * 100 / 100 trick are important. We do a +1 to be 2915- * safe against division by zero. We only estimate anyway. 2916- * 2917- * dt: time from mark until now 2918- * db: blocks written from mark until now 2919- * rt: remaining time 2920- */ 2921- dt = ((jiffies - mddev->resync_mark) / HZ); 2922- if (!dt) dt++; 2923- db = resync - (mddev->resync_mark_cnt/2); 2924- rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; 2925- 2926- seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); 2927- 2928- seq_printf(seq, " speed=%ldK/sec", db/dt); 2929-} 2930- 2931-static void *md_seq_start(struct seq_file *seq, loff_t *pos) 2932-{ 2933- struct list_head *tmp; 2934- loff_t l = *pos; 2935- mddev_t *mddev; 2936- 2937- if (l > 0x10000) 2938- return NULL; 2939- if (!l--) 2940- /* header */ 2941- return (void*)1; 2942- 2943- spin_lock(&all_mddevs_lock); 2944- list_for_each(tmp,&all_mddevs) 2945- if (!l--) { 2946- mddev = list_entry(tmp, mddev_t, all_mddevs); 2947- mddev_get(mddev); 2948- spin_unlock(&all_mddevs_lock); 2949- return mddev; 2950- } 2951- spin_unlock(&all_mddevs_lock); 2952- return (void*)2;/* tail */ 2953-} 2954- 2955-static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2956-{ 2957- struct list_head *tmp; 2958- mddev_t *next_mddev, *mddev = v; 2959- 2960- ++*pos; 2961- if (v == (void*)2) 2962- return NULL; 2963- 2964- spin_lock(&all_mddevs_lock); 2965- if (v == (void*)1) 2966- tmp = all_mddevs.next; 2967- else 2968- tmp = mddev->all_mddevs.next; 2969- if (tmp != &all_mddevs) 2970- next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 2971- else { 2972- next_mddev = (void*)2; 2973- *pos = 0x10000; 2974- } 2975- spin_unlock(&all_mddevs_lock); 2976- 2977- if (v != (void*)1) 2978- mddev_put(mddev); 2979- return next_mddev; 2980- 2981-} 2982- 2983-static void md_seq_stop(struct seq_file *seq, void *v) 2984-{ 2985- mddev_t *mddev = v; 2986- 2987- if (mddev && v != (void*)1 && v != (void*)2) 2988- mddev_put(mddev); 2989-} 2990- 2991-static int md_seq_show(struct seq_file *seq, void *v) 2992-{ 2993- mddev_t *mddev = v; 2994- sector_t size; 2995- struct list_head *tmp2; 2996- mdk_rdev_t *rdev; 2997- int i; 2998- 2999- if (v == (void*)1) { 3000- seq_printf(seq, "Personalities : "); 3001- spin_lock(&pers_lock); 3002- for (i = 0; i < MAX_PERSONALITY; i++) 3003- if (pers[i]) 3004- seq_printf(seq, "[%s] ", pers[i]->name); 3005- 3006- spin_unlock(&pers_lock); 3007- seq_printf(seq, "\n"); 3008- return 0; 3009- } 3010- if (v == (void*)2) { 3011- status_unused(seq); 3012- return 0; 3013- } 3014- 3015- if (mddev_lock(mddev)!=0) 3016- return -EINTR; 3017- if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 3018- seq_printf(seq, "md%d : %sactive", mdidx(mddev), 3019- mddev->pers ? 
"" : "in"); 3020- if (mddev->pers) { 3021- if (mddev->ro) 3022- seq_printf(seq, " (read-only)"); 3023- seq_printf(seq, " %s", mddev->pers->name); 3024- } 3025- 3026- size = 0; 3027- ITERATE_RDEV(mddev,rdev,tmp2) { 3028- seq_printf(seq, " %s[%d]", 3029- bdev_partition_name(rdev->bdev), rdev->desc_nr); 3030- if (rdev->faulty) { 3031- seq_printf(seq, "(F)"); 3032- continue; 3033- } 3034- size += rdev->size; 3035- } 3036- 3037- if (!list_empty(&mddev->disks)) { 3038- if (mddev->pers) 3039- seq_printf(seq, "\n %llu blocks", 3040- (unsigned long long)mddev->array_size); 3041- else 3042- seq_printf(seq, "\n %llu blocks", 3043- (unsigned long long)size); 3044- } 3045- 3046- if (mddev->pers) { 3047- mddev->pers->status (seq, mddev); 3048- seq_printf(seq, "\n "); 3049- if (mddev->curr_resync > 2) 3050- status_resync (seq, mddev); 3051- else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 3052- seq_printf(seq, " resync=DELAYED"); 3053- } 3054- 3055- seq_printf(seq, "\n"); 3056- } 3057- mddev_unlock(mddev); 3058- 3059- return 0; 3060-} 3061- 3062-static struct seq_operations md_seq_ops = { 3063- .start = md_seq_start, 3064- .next = md_seq_next, 3065- .stop = md_seq_stop, 3066- .show = md_seq_show, 3067-}; 3068- 3069-static int md_seq_open(struct inode *inode, struct file *file) 3070-{ 3071- int error; 3072- 3073- error = seq_open(file, &md_seq_ops); 3074- return error; 3075-} 3076- 3077-static struct file_operations md_seq_fops = { 3078- .open = md_seq_open, 3079- .read = seq_read, 3080- .llseek = seq_lseek, 3081- .release = seq_release, 3082-}; 3083- 3084-int register_md_personality(int pnum, mdk_personality_t *p) 3085-{ 3086- if (pnum >= MAX_PERSONALITY) { 3087- MD_BUG(); 3088- return -EINVAL; 3089- } 3090- 3091- spin_lock(&pers_lock); 3092- if (pers[pnum]) { 3093- spin_unlock(&pers_lock); 3094- MD_BUG(); 3095- return -EBUSY; 3096- } 3097- 3098- pers[pnum] = p; 3099- printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum); 3100- spin_unlock(&pers_lock); 3101- return 0; 3102-} 3103- 3104-int unregister_md_personality(int pnum) 3105-{ 3106- if (pnum >= MAX_PERSONALITY) { 3107- MD_BUG(); 3108- return -EINVAL; 3109- } 3110- 3111- printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name); 3112- spin_lock(&pers_lock); 3113- pers[pnum] = NULL; 3114- spin_unlock(&pers_lock); 3115- return 0; 3116-} 3117- 3118-void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors) 3119-{ 3120- rdev->bdev->bd_contains->bd_disk->sync_io += nr_sectors; 3121-} 3122- 3123-static int is_mddev_idle(mddev_t *mddev) 3124-{ 3125- mdk_rdev_t * rdev; 3126- struct list_head *tmp; 3127- int idle; 3128- unsigned long curr_events; 3129- 3130- idle = 1; 3131- ITERATE_RDEV(mddev,rdev,tmp) { 3132- struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 3133- curr_events = disk_stat_read(disk, read_sectors) + 3134- disk_stat_read(disk, write_sectors) - 3135- disk->sync_io; 3136- if ((curr_events - rdev->last_events) > 32) { 3137- rdev->last_events = curr_events; 3138- idle = 0; 3139- } 3140- } 3141- return idle; 3142-} 3143- 3144-void md_done_sync(mddev_t *mddev, int blocks, int ok) 3145-{ 3146- /* another "blocks" (512byte) blocks have been synced */ 3147- atomic_sub(blocks, &mddev->recovery_active); 3148- wake_up(&mddev->recovery_wait); 3149- if (!ok) { 3150- set_bit(MD_RECOVERY_ERR, &mddev->recovery); 3151- md_wakeup_thread(mddev->thread); 3152- // stop recovery, signal do_sync .... 
3153- } 3154-} 3155- 3156- 3157-void md_write_start(mddev_t *mddev) 3158-{ 3159- if (!atomic_read(&mddev->writes_pending)) { 3160- mddev_lock_uninterruptible(mddev); 3161- if (mddev->in_sync) { 3162- mddev->in_sync = 0; 3163- del_timer(&mddev->safemode_timer); 3164- md_update_sb(mddev); 3165- } 3166- atomic_inc(&mddev->writes_pending); 3167- mddev_unlock(mddev); 3168- } else 3169- atomic_inc(&mddev->writes_pending); 3170-} 3171- 3172-void md_write_end(mddev_t *mddev) 3173-{ 3174- if (atomic_dec_and_test(&mddev->writes_pending)) { 3175- if (mddev->safemode == 2) 3176- md_wakeup_thread(mddev->thread); 3177- else 3178- mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 3179- } 3180-} 3181- 3182-static inline void md_enter_safemode(mddev_t *mddev) 3183-{ 3184- mddev_lock_uninterruptible(mddev); 3185- if (mddev->safemode && !atomic_read(&mddev->writes_pending) && 3186- !mddev->in_sync && mddev->recovery_cp == MaxSector) { 3187- mddev->in_sync = 1; 3188- md_update_sb(mddev); 3189- } 3190- mddev_unlock(mddev); 3191- 3192- if (mddev->safemode == 1) 3193- mddev->safemode = 0; 3194-} 3195- 3196-void md_handle_safemode(mddev_t *mddev) 3197-{ 3198- if (signal_pending(current)) { 3199- printk(KERN_INFO "md: md%d in immediate safe mode\n", 3200- mdidx(mddev)); 3201- mddev->safemode = 2; 3202- flush_signals(current); 3203- } 3204- if (mddev->safemode) 3205- md_enter_safemode(mddev); 3206-} 3207- 3208- 3209-DECLARE_WAIT_QUEUE_HEAD(resync_wait); 3210- 3211-#define SYNC_MARKS 10 3212-#define SYNC_MARK_STEP (3*HZ) 3213-static void md_do_sync(mddev_t *mddev) 3214-{ 3215- mddev_t *mddev2; 3216- unsigned int max_sectors, currspeed = 0, 3217- j, window; 3218- unsigned long mark[SYNC_MARKS]; 3219- unsigned long mark_cnt[SYNC_MARKS]; 3220- int last_mark,m; 3221- struct list_head *tmp; 3222- unsigned long last_check; 3223- 3224- /* just in case thread restarts... */ 3225- if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 3226- return; 3227- 3228- /* we overload curr_resync somewhat here.
3229- * 0 == not engaged in resync at all 3230- * 2 == checking that there is no conflict with another sync 3231- * 1 == like 2, but have yielded to allow conflicting resync to 3232- * commence 3233- * other == active in resync - this many blocks 3234- */ 3235- do { 3236- mddev->curr_resync = 2; 3237- 3238- ITERATE_MDDEV(mddev2,tmp) { 3239- if (mddev2 == mddev) 3240- continue; 3241- if (mddev2->curr_resync && 3242- match_mddev_units(mddev,mddev2)) { 3243- printk(KERN_INFO "md: delaying resync of md%d" 3244- " until md%d has finished resync (they" 3245- " share one or more physical units)\n", 3246- mdidx(mddev), mdidx(mddev2)); 3247- if (mddev < mddev2) {/* arbitrarily yield */ 3248- mddev->curr_resync = 1; 3249- wake_up(&resync_wait); 3250- } 3251- if (wait_event_interruptible(resync_wait, 3252- mddev2->curr_resync < mddev->curr_resync)) { 3253- flush_signals(current); 3254- mddev_put(mddev2); 3255- goto skip; 3256- } 3257- } 3258- if (mddev->curr_resync == 1) { 3259- mddev_put(mddev2); 3260- break; 3261- } 3262- } 3263- } while (mddev->curr_resync < 2); 3264- 3265- max_sectors = mddev->size << 1; 3266- 3267- printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev)); 3268- printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" 3269- " %d KB/sec/disc.\n", sysctl_speed_limit_min); 3270- printk(KERN_INFO "md: using maximum available idle IO bandwidth " 3271- "(but not more than %d KB/sec) for reconstruction.\n", 3272- sysctl_speed_limit_max); 3273- 3274- is_mddev_idle(mddev); /* this also initializes IO event counters */ 3275- if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3276- j = mddev->recovery_cp; 3277- else 3278- j = 0; 3279- for (m = 0; m < SYNC_MARKS; m++) { 3280- mark[m] = jiffies; 3281- mark_cnt[m] = j; 3282- } 3283- last_mark = 0; 3284- mddev->resync_mark = mark[last_mark]; 3285- mddev->resync_mark_cnt = mark_cnt[last_mark]; 3286- 3287- /* 3288- * Tune reconstruction: 3289- */ 3290- window = 32*(PAGE_SIZE/512); 3291- printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n", 3292- window/2,max_sectors/2); 3293- 3294- atomic_set(&mddev->recovery_active, 0); 3295- init_waitqueue_head(&mddev->recovery_wait); 3296- last_check = 0; 3297- 3298- if (j) 3299- printk(KERN_INFO 3300- "md: resuming recovery of md%d from checkpoint.\n", 3301- mdidx(mddev)); 3302- 3303- while (j < max_sectors) { 3304- int sectors; 3305- 3306- sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min); 3307- if (sectors < 0) { 3308- set_bit(MD_RECOVERY_ERR, &mddev->recovery); 3309- goto out; 3310- } 3311- atomic_add(sectors, &mddev->recovery_active); 3312- j += sectors; 3313- if (j>1) mddev->curr_resync = j; 3314- 3315- if (last_check + window > j) 3316- continue; 3317- 3318- last_check = j; 3319- 3320- if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || 3321- test_bit(MD_RECOVERY_ERR, &mddev->recovery)) 3322- break; 3323- 3324- blk_run_queues(); 3325- 3326- repeat: 3327- if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) { 3328- /* step marks */ 3329- int next = (last_mark+1) % SYNC_MARKS; 3330- 3331- mddev->resync_mark = mark[next]; 3332- mddev->resync_mark_cnt = mark_cnt[next]; 3333- mark[next] = jiffies; 3334- mark_cnt[next] = j - atomic_read(&mddev->recovery_active); 3335- last_mark = next; 3336- } 3337- 3338- 3339- if (signal_pending(current)) { 3340- /* 3341- * got a signal, exit. 3342- */ 3343- printk(KERN_INFO 3344- "md: md_do_sync() got signal ...
exiting\n"); 3345- flush_signals(current); 3346- set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3347- goto out; 3348- } 3349- 3350- /* 3351- * this loop exits only if either when we are slower than 3352- * the 'hard' speed limit, or the system was IO-idle for 3353- * a jiffy. 3354- * the system might be non-idle CPU-wise, but we only care 3355- * about not overloading the IO subsystem. (things like an 3356- * e2fsck being done on the RAID array should execute fast) 3357- */ 3358- cond_resched(); 3359- 3360- currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1; 3361- 3362- if (currspeed > sysctl_speed_limit_min) { 3363- if ((currspeed > sysctl_speed_limit_max) || 3364- !is_mddev_idle(mddev)) { 3365- current->state = TASK_INTERRUPTIBLE; 3366- schedule_timeout(HZ/4); 3367- goto repeat; 3368- } 3369- } 3370- } 3371- printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev)); 3372- /* 3373- * this also signals 'finished resyncing' to md_stop 3374- */ 3375- out: 3376- wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 3377- 3378- /* tell personality that we are finished */ 3379- mddev->pers->sync_request(mddev, max_sectors, 1); 3380- 3381- if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 3382- mddev->curr_resync > 2 && 3383- mddev->curr_resync > mddev->recovery_cp) { 3384- if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 3385- printk(KERN_INFO 3386- "md: checkpointing recovery of md%d.\n", 3387- mdidx(mddev)); 3388- mddev->recovery_cp = mddev->curr_resync; 3389- } else 3390- mddev->recovery_cp = MaxSector; 3391- } 3392- 3393- if (mddev->safemode) 3394- md_enter_safemode(mddev); 3395- skip: 3396- mddev->curr_resync = 0; 3397- set_bit(MD_RECOVERY_DONE, &mddev->recovery); 3398- md_wakeup_thread(mddev->thread); 3399-} 3400- 3401- 3402-/* 3403- * This routine is regularly called by all per-raid-array threads to 3404- * deal with generic issues like resync and super-block update. 3405- * Raid personalities that don't have a thread (linear/raid0) do not 3406- * need this as they never do any recovery or update the superblock. 3407- * 3408- * It does not do any resync itself, but rather "forks" off other threads 3409- * to do that as needed. 3410- * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 3411- * "->recovery" and create a thread at ->sync_thread. 3412- * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR) 3413- * and wakeups up this thread which will reap the thread and finish up. 3414- * This thread also removes any faulty devices (with nr_pending == 0). 3415- * 3416- * The overall approach is: 3417- * 1/ if the superblock needs updating, update it. 3418- * 2/ If a recovery thread is running, don't do anything else. 3419- * 3/ If recovery has finished, clean up, possibly marking spares active. 3420- * 4/ If there are any faulty devices, remove them. 3421- * 5/ If array is degraded, try to add spares devices 3422- * 6/ If array has spares or is not in-sync, start a resync thread. 3423- */ 3424-void md_check_recovery(mddev_t *mddev) 3425-{ 3426- mdk_rdev_t *rdev; 3427- struct list_head *rtmp; 3428- 3429- 3430- dprintk(KERN_INFO "md: recovery thread got woken up ...\n"); 3431- 3432- if (mddev->ro) 3433- return; 3434- if ( ! 
( 3435- mddev->sb_dirty || 3436- test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 3437- test_bit(MD_RECOVERY_DONE, &mddev->recovery) 3438- )) 3439- return; 3440- if (mddev_trylock(mddev)==0) { 3441- int spares =0; 3442- if (mddev->sb_dirty) 3443- md_update_sb(mddev); 3444- if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 3445- !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 3446- /* resync/recovery still happening */ 3447- goto unlock; 3448- if (mddev->sync_thread) { 3449- /* resync has finished, collect result */ 3450- md_unregister_thread(mddev->sync_thread); 3451- mddev->sync_thread = NULL; 3452- if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery)) { 3453- /* success...*/ 3454- /* activate any spares */ 3455- mddev->pers->spare_active(mddev); 3456- } 3457- md_update_sb(mddev); 3458- mddev->recovery = 0; 3459- wake_up(&resync_wait); 3460- goto unlock; 3461- } 3462- if (mddev->recovery) { 3463- /* that's odd.. */ 3464- mddev->recovery = 0; 3465- wake_up(&resync_wait); 3466- } 3467- 3468- /* no recovery is running. 3469- * remove any failed drives, then 3470- * add spares if possible 3471- */ 3472- ITERATE_RDEV(mddev,rdev,rtmp) { 3473- if (rdev->raid_disk >= 0 && 3474- rdev->faulty && 3475- atomic_read(&rdev->nr_pending)==0) { 3476- mddev->pers->hot_remove_disk(mddev, rdev->raid_disk); 3477- rdev->raid_disk = -1; 3478- } 3479- if (!rdev->faulty && rdev->raid_disk >= 0 && !rdev->in_sync) 3480- spares++; 3481- } 3482- if (mddev->degraded) { 3483- ITERATE_RDEV(mddev,rdev,rtmp) 3484- if (rdev->raid_disk < 0 3485- && !rdev->faulty) { 3486- if (mddev->pers->hot_add_disk(mddev,rdev)) 3487- spares++; 3488- else 3489- break; 3490- } 3491- } 3492- 3493- if (!spares && (mddev->recovery_cp == MaxSector )) { 3494- /* nothing we can do ... */ 3495- goto unlock; 3496- } 3497- if (mddev->pers->sync_request) { 3498- set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 3499- if (!spares) 3500- set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 3501- mddev->sync_thread = md_register_thread(md_do_sync, 3502- mddev, 3503- "md%d_resync"); 3504- if (!mddev->sync_thread) { 3505- printk(KERN_ERR "md%d: could not start resync" 3506- " thread...\n", 3507- mdidx(mddev)); 3508- /* leave the spares where they are, it shouldn't hurt */ 3509- mddev->recovery = 0; 3510- } else { 3511- md_wakeup_thread(mddev->sync_thread); 3512- } 3513- } 3514- unlock: 3515- mddev_unlock(mddev); 3516- } 3517-} 3518- 3519-int md_notify_reboot(struct notifier_block *this, 3520- unsigned long code, void *x) 3521-{ 3522- struct list_head *tmp; 3523- mddev_t *mddev; 3524- 3525- if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { 3526- 3527- printk(KERN_INFO "md: stopping all md devices.\n"); 3528- 3529- ITERATE_MDDEV(mddev,tmp) 3530- if (mddev_trylock(mddev)==0) 3531- do_md_stop (mddev, 1); 3532- /* 3533- * certain more exotic SCSI devices are known to be 3534- * volatile wrt too early system reboots. While the 3535- * right place to handle this issue is the given 3536- * driver, we do want to have a safe RAID driver ... 
3537- */ 3538- mdelay(1000*1); 3539- } 3540- return NOTIFY_DONE; 3541-} 3542- 3543-struct notifier_block md_notifier = { 3544- .notifier_call = md_notify_reboot, 3545- .next = NULL, 3546- .priority = INT_MAX, /* before any real devices */ 3547-}; 3548- 3549-static void md_geninit(void) 3550-{ 3551- struct proc_dir_entry *p; 3552- 3553- dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 3554- 3555-#ifdef CONFIG_PROC_FS 3556- p = create_proc_entry("mdstat", S_IRUGO, NULL); 3557- if (p) 3558- p->proc_fops = &md_seq_fops; 3559-#endif 3560-} 3561- 3562-int __init md_init(void) 3563-{ 3564- int minor; 3565- 3566- printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," 3567- " MD_SB_DISKS=%d\n", 3568- MD_MAJOR_VERSION, MD_MINOR_VERSION, 3569- MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); 3570- 3571- if (register_blkdev(MAJOR_NR, "md")) 3572- return -1; 3573- 3574- devfs_mk_dir("md"); 3575- blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, 3576- md_probe, NULL, NULL); 3577- for (minor=0; minor < MAX_MD_DEVS; ++minor) { 3578- char name[16]; 3579- sprintf(name, "md/%d", minor); 3580- devfs_register(NULL, name, DEVFS_FL_DEFAULT, MAJOR_NR, minor, 3581- S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL); 3582- } 3583- 3584- register_reboot_notifier(&md_notifier); 3585- raid_table_header = register_sysctl_table(raid_root_table, 1); 3586- 3587- md_geninit(); 3588- return (0); 3589-} 3590- 3591- 3592-#ifndef MODULE 3593- 3594-/* 3595- * Searches all registered partitions for autorun RAID arrays 3596- * at boot time. 3597- */ 3598-static dev_t detected_devices[128]; 3599-static int dev_cnt; 3600- 3601-void md_autodetect_dev(dev_t dev) 3602-{ 3603- if (dev_cnt >= 0 && dev_cnt < 127) 3604- detected_devices[dev_cnt++] = dev; 3605-} 3606- 3607- 3608-static void autostart_arrays(void) 3609-{ 3610- mdk_rdev_t *rdev; 3611- int i; 3612- 3613- printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 3614- 3615- for (i = 0; i < dev_cnt; i++) { 3616- dev_t dev = detected_devices[i]; 3617- 3618- rdev = md_import_device(dev,0, 0); 3619- if (IS_ERR(rdev)) { 3620- printk(KERN_ALERT "md: could not import %s!\n", 3621- partition_name(dev)); 3622- continue; 3623- } 3624- if (rdev->faulty) { 3625- MD_BUG(); 3626- continue; 3627- } 3628- list_add(&rdev->same_set, &pending_raid_disks); 3629- } 3630- dev_cnt = 0; 3631- 3632- autorun_devices(); 3633-} 3634- 3635-#endif 3636- 3637-static __exit void md_exit(void) 3638-{ 3639- int i; 3640- blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); 3641- for (i=0; i < MAX_MD_DEVS; i++) 3642- devfs_remove("md/%d", i); 3643- devfs_remove("md"); 3644- 3645- unregister_blkdev(MAJOR_NR,"md"); 3646- unregister_reboot_notifier(&md_notifier); 3647- unregister_sysctl_table(raid_table_header); 3648-#ifdef CONFIG_PROC_FS 3649- remove_proc_entry("mdstat", NULL); 3650-#endif 3651- for (i = 0; i < MAX_MD_DEVS; i++) { 3652- struct gendisk *disk = disks[i]; 3653- mddev_t *mddev; 3654- if (!disks[i]) 3655- continue; 3656- mddev = disk->private_data; 3657- del_gendisk(disk); 3658- put_disk(disk); 3659- mddev_put(mddev); 3660- } 3661-} 3662- 3663-module_init(md_init) 3664-module_exit(md_exit) 3665- 3666-EXPORT_SYMBOL(register_md_personality); 3667-EXPORT_SYMBOL(unregister_md_personality); 3668-EXPORT_SYMBOL(md_error); 3669-EXPORT_SYMBOL(md_sync_acct); 3670-EXPORT_SYMBOL(md_done_sync); 3671-EXPORT_SYMBOL(md_write_start); 3672-EXPORT_SYMBOL(md_write_end); 3673-EXPORT_SYMBOL(md_handle_safemode); 3674-EXPORT_SYMBOL(md_register_thread); 3675-EXPORT_SYMBOL(md_unregister_thread); 
3676-EXPORT_SYMBOL(md_wakeup_thread); 3677-EXPORT_SYMBOL(md_print_devices); 3678-EXPORT_SYMBOL(md_interrupt_thread); 3679-EXPORT_SYMBOL(md_check_recovery); 3680-MODULE_LICENSE("GPL"); 3681
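For context on the exported registration interface removed above, here is a minimal, hypothetical sketch (not part of this patch) of how a RAID personality module would plug into the md core via register_md_personality()/unregister_md_personality(). EXAMPLE_PNUM and example_personality are invented names; a real personality also fills in its make_request/run/stop, status, and error/recovery callbacks that md_check_recovery() and /proc/mdstat rely on.

/*
 * Hypothetical sketch only -- illustrates the md core entry points
 * exported by this driver.  "example" and EXAMPLE_PNUM are invented.
 */
#include <linux/module.h>
#include <linux/raid/md.h>

#define EXAMPLE_PNUM 13 /* hypothetical slot; must be < MAX_PERSONALITY */

static mdk_personality_t example_personality = {
	.name = "example",
	/* .make_request, .run, .stop, .status, .error_handler, ... */
};

static int __init example_init(void)
{
	/* returns -EBUSY if the slot is taken, -EINVAL if out of range */
	return register_md_personality(EXAMPLE_PNUM, &example_personality);
}

static void __exit example_exit(void)
{
	unregister_md_personality(EXAMPLE_PNUM);
}

module_init(example_init)
module_exit(example_exit)
MODULE_LICENSE("GPL");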