/*
 * raid1.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
 *
 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *
 * RAID-1 management functions.
 *
 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
 *
 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <linux/module.h>
#include <linux/config.h>
#include <linux/slab.h>
#include <linux/raid/raid1.h>
#include <asm/atomic.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER
#define MD_PERSONALITY

#define MAX_WORK_PER_DISK 128

#define NR_RESERVED_BUFS 32


/*
 * The following can be used to debug the driver
 */
#define RAID1_DEBUG	0

#if RAID1_DEBUG
#define PRINTK(x...) printk(x)
#define inline
#define __inline__
#else
#define PRINTK(x...)  do { } while (0)
#endif


static mdk_personality_t raid1_personality;
static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail;

static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
{
	/* return a linked list of "cnt" struct buffer_heads.
	 * don't take any off the free list unless we know we can
	 * get all we need, otherwise we could deadlock
	 */
	struct buffer_head *bh=NULL;

	while(cnt) {
		struct buffer_head *t;
		md_spin_lock_irq(&conf->device_lock);
		if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
			while (cnt) {
				t = conf->freebh;
				conf->freebh = t->b_next;
				t->b_next = bh;
				bh = t;
				t->b_state = 0;
				conf->freebh_cnt--;
				cnt--;
			}
		md_spin_unlock_irq(&conf->device_lock);
		if (cnt == 0)
			break;
		t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
		if (t) {
			t->b_next = bh;
			bh = t;
			cnt--;
		} else {
			PRINTK("raid1: waiting for %d bh\n", cnt);
			conf->freebh_blocked = 1;
			wait_disk_event(conf->wait_buffer,
					!conf->freebh_blocked ||
					conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2);
			conf->freebh_blocked = 0;
		}
	}
	return bh;
}

static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
{
	unsigned long flags;
	spin_lock_irqsave(&conf->device_lock, flags);
	while (bh) {
		struct buffer_head *t = bh;
		bh=bh->b_next;
		if (t->b_pprev == NULL)
			kmem_cache_free(bh_cachep, t);
		else {
			t->b_next= conf->freebh;
			conf->freebh = t;
			conf->freebh_cnt++;
		}
	}
	spin_unlock_irqrestore(&conf->device_lock, flags);
	wake_up(&conf->wait_buffer);
}

static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
{
	/* allocate cnt buffer_heads, possibly less if kmalloc fails */
	int i = 0;

	while (i < cnt) {
		struct buffer_head *bh;
		bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
		if (!bh) break;

		md_spin_lock_irq(&conf->device_lock);
		bh->b_pprev = &conf->freebh;
		bh->b_next = conf->freebh;
		conf->freebh = bh;
conf->freebh_cnt++; 133 md_spin_unlock_irq(&conf->device_lock); 134 135 i++; 136 } 137 return i; 138} 139 140static void raid1_shrink_bh(raid1_conf_t *conf) 141{ 142 /* discard all buffer_heads */ 143 144 md_spin_lock_irq(&conf->device_lock); 145 while (conf->freebh) { 146 struct buffer_head *bh = conf->freebh; 147 conf->freebh = bh->b_next; 148 kmem_cache_free(bh_cachep, bh); 149 conf->freebh_cnt--; 150 } 151 md_spin_unlock_irq(&conf->device_lock); 152} 153 154 155static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf) 156{ 157 struct raid1_bh *r1_bh = NULL; 158 159 do { 160 md_spin_lock_irq(&conf->device_lock); 161 if (!conf->freer1_blocked && conf->freer1) { 162 r1_bh = conf->freer1; 163 conf->freer1 = r1_bh->next_r1; 164 conf->freer1_cnt--; 165 r1_bh->next_r1 = NULL; 166 r1_bh->state = (1 << R1BH_PreAlloc); 167 r1_bh->bh_req.b_state = 0; 168 } 169 md_spin_unlock_irq(&conf->device_lock); 170 if (r1_bh) 171 return r1_bh; 172 r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO); 173 if (r1_bh) { 174 memset(r1_bh, 0, sizeof(*r1_bh)); 175 return r1_bh; 176 } 177 conf->freer1_blocked = 1; 178 wait_disk_event(conf->wait_buffer, 179 !conf->freer1_blocked || 180 conf->freer1_cnt > NR_RESERVED_BUFS/2 181 ); 182 conf->freer1_blocked = 0; 183 } while (1); 184} 185 186static inline void raid1_free_r1bh(struct raid1_bh *r1_bh) 187{ 188 struct buffer_head *bh = r1_bh->mirror_bh_list; 189 raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev); 190 191 r1_bh->mirror_bh_list = NULL; 192 193 if (test_bit(R1BH_PreAlloc, &r1_bh->state)) { 194 unsigned long flags; 195 spin_lock_irqsave(&conf->device_lock, flags); 196 r1_bh->next_r1 = conf->freer1; 197 conf->freer1 = r1_bh; 198 conf->freer1_cnt++; 199 spin_unlock_irqrestore(&conf->device_lock, flags); 200 /* don't need to wakeup wait_buffer because 201 * raid1_free_bh below will do that 202 */ 203 } else { 204 kfree(r1_bh); 205 } 206 raid1_free_bh(conf, bh); 207} 208 209static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt) 210{ 211 int i = 0; 212 213 while (i < cnt) { 214 struct raid1_bh *r1_bh; 215 r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL); 216 if (!r1_bh) 217 break; 218 memset(r1_bh, 0, sizeof(*r1_bh)); 219 set_bit(R1BH_PreAlloc, &r1_bh->state); 220 r1_bh->mddev = conf->mddev; 221 222 raid1_free_r1bh(r1_bh); 223 i++; 224 } 225 return i; 226} 227 228static void raid1_shrink_r1bh(raid1_conf_t *conf) 229{ 230 md_spin_lock_irq(&conf->device_lock); 231 while (conf->freer1) { 232 struct raid1_bh *r1_bh = conf->freer1; 233 conf->freer1 = r1_bh->next_r1; 234 conf->freer1_cnt--; 235 kfree(r1_bh); 236 } 237 md_spin_unlock_irq(&conf->device_lock); 238} 239 240 241 242static inline void raid1_free_buf(struct raid1_bh *r1_bh) 243{ 244 unsigned long flags; 245 struct buffer_head *bh = r1_bh->mirror_bh_list; 246 raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev); 247 r1_bh->mirror_bh_list = NULL; 248 249 spin_lock_irqsave(&conf->device_lock, flags); 250 r1_bh->next_r1 = conf->freebuf; 251 conf->freebuf = r1_bh; 252 spin_unlock_irqrestore(&conf->device_lock, flags); 253 raid1_free_bh(conf, bh); 254} 255 256static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf) 257{ 258 struct raid1_bh *r1_bh; 259 260 md_spin_lock_irq(&conf->device_lock); 261 wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock); 262 r1_bh = conf->freebuf; 263 conf->freebuf = r1_bh->next_r1; 264 r1_bh->next_r1= NULL; 265 md_spin_unlock_irq(&conf->device_lock); 266 267 return r1_bh; 268} 269 270static int raid1_grow_buffers (raid1_conf_t *conf, 
int cnt) 271{ 272 int i = 0; 273 struct raid1_bh *head = NULL, **tail; 274 tail = &head; 275 276 while (i < cnt) { 277 struct raid1_bh *r1_bh; 278 struct page *page; 279 280 page = alloc_page(GFP_KERNEL); 281 if (!page) 282 break; 283 284 r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL); 285 if (!r1_bh) { 286 __free_page(page); 287 break; 288 } 289 memset(r1_bh, 0, sizeof(*r1_bh)); 290 r1_bh->bh_req.b_page = page; 291 r1_bh->bh_req.b_data = page_address(page); 292 *tail = r1_bh; 293 r1_bh->next_r1 = NULL; 294 tail = & r1_bh->next_r1; 295 i++; 296 } 297 /* this lock probably isn't needed, as at the time when 298 * we are allocating buffers, nobody else will be touching the 299 * freebuf list. But it doesn't hurt.... 300 */ 301 md_spin_lock_irq(&conf->device_lock); 302 *tail = conf->freebuf; 303 conf->freebuf = head; 304 md_spin_unlock_irq(&conf->device_lock); 305 return i; 306} 307 308static void raid1_shrink_buffers (raid1_conf_t *conf) 309{ 310 struct raid1_bh *head; 311 md_spin_lock_irq(&conf->device_lock); 312 head = conf->freebuf; 313 conf->freebuf = NULL; 314 md_spin_unlock_irq(&conf->device_lock); 315 316 while (head) { 317 struct raid1_bh *r1_bh = head; 318 head = r1_bh->next_r1; 319 __free_page(r1_bh->bh_req.b_page); 320 kfree(r1_bh); 321 } 322} 323 324static int raid1_map (mddev_t *mddev, kdev_t *rdev) 325{ 326 raid1_conf_t *conf = mddev_to_conf(mddev); 327 int i, disks = MD_SB_DISKS; 328 329 /* 330 * Later we do read balancing on the read side 331 * now we use the first available disk. 332 */ 333 334 for (i = 0; i < disks; i++) { 335 if (conf->mirrors[i].operational) { 336 *rdev = conf->mirrors[i].dev; 337 return (0); 338 } 339 } 340 341 printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n"); 342 return (-1); 343} 344 345static void raid1_reschedule_retry (struct raid1_bh *r1_bh) 346{ 347 unsigned long flags; 348 mddev_t *mddev = r1_bh->mddev; 349 raid1_conf_t *conf = mddev_to_conf(mddev); 350 351 md_spin_lock_irqsave(&retry_list_lock, flags); 352 if (raid1_retry_list == NULL) 353 raid1_retry_tail = &raid1_retry_list; 354 *raid1_retry_tail = r1_bh; 355 raid1_retry_tail = &r1_bh->next_r1; 356 r1_bh->next_r1 = NULL; 357 md_spin_unlock_irqrestore(&retry_list_lock, flags); 358 md_wakeup_thread(conf->thread); 359} 360 361 362static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase) 363{ 364 unsigned long flags; 365 spin_lock_irqsave(&conf->segment_lock, flags); 366 if (sector < conf->start_active) 367 conf->cnt_done--; 368 else if (sector >= conf->start_future && conf->phase == phase) 369 conf->cnt_future--; 370 else if (!--conf->cnt_pending) 371 wake_up(&conf->wait_ready); 372 373 spin_unlock_irqrestore(&conf->segment_lock, flags); 374} 375 376static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf) 377{ 378 unsigned long flags; 379 spin_lock_irqsave(&conf->segment_lock, flags); 380 if (sector >= conf->start_ready) 381 --conf->cnt_ready; 382 else if (sector >= conf->start_active) { 383 if (!--conf->cnt_active) { 384 conf->start_active = conf->start_ready; 385 wake_up(&conf->wait_done); 386 } 387 } 388 spin_unlock_irqrestore(&conf->segment_lock, flags); 389} 390 391/* 392 * raid1_end_bh_io() is called when we have finished servicing a mirrored 393 * operation and are ready to return a success/failure code to the buffer 394 * cache layer. 
395 */ 396static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate) 397{ 398 struct buffer_head *bh = r1_bh->master_bh; 399 400 io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev), 401 test_bit(R1BH_SyncPhase, &r1_bh->state)); 402 403 bh->b_end_io(bh, uptodate); 404 raid1_free_r1bh(r1_bh); 405} 406void raid1_end_request (struct buffer_head *bh, int uptodate) 407{ 408 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); 409 410 /* 411 * this branch is our 'one mirror IO has finished' event handler: 412 */ 413 if (!uptodate) 414 md_error (r1_bh->mddev, bh->b_dev); 415 else 416 /* 417 * Set R1BH_Uptodate in our master buffer_head, so that 418 * we will return a good error code for to the higher 419 * levels even if IO on some other mirrored buffer fails. 420 * 421 * The 'master' represents the complex operation to 422 * user-side. So if something waits for IO, then it will 423 * wait for the 'master' buffer_head. 424 */ 425 set_bit (R1BH_Uptodate, &r1_bh->state); 426 427 /* 428 * We split up the read and write side, imho they are 429 * conceptually different. 430 */ 431 432 if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) { 433 /* 434 * we have only one buffer_head on the read side 435 */ 436 437 if (uptodate) { 438 raid1_end_bh_io(r1_bh, uptodate); 439 return; 440 } 441 /* 442 * oops, read error: 443 */ 444 printk(KERN_ERR "raid1: %s: rescheduling block %lu\n", 445 partition_name(bh->b_dev), bh->b_blocknr); 446 raid1_reschedule_retry(r1_bh); 447 return; 448 } 449 450 /* 451 * WRITE: 452 * 453 * Let's see if all mirrored write operations have finished 454 * already. 455 */ 456 457 if (atomic_dec_and_test(&r1_bh->remaining)) 458 raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state)); 459} 460 461/* 462 * This routine returns the disk from which the requested read should 463 * be done. It bookkeeps the last read position for every disk 464 * in array and when new read requests come, the disk which last 465 * position is nearest to the request, is chosen. 466 * 467 * TODO: now if there are 2 mirrors in the same 2 devices, performance 468 * degrades dramatically because position is mirror, not device based. 469 * This should be changed to be device based. Also atomic sequential 470 * reads should be somehow balanced. 471 */ 472 473static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh) 474{ 475 int new_disk = conf->last_used; 476 const int sectors = bh->b_size >> 9; 477 const unsigned long this_sector = bh->b_rsector; 478 int disk = new_disk; 479 unsigned long new_distance; 480 unsigned long current_distance; 481 482 /* 483 * Check if it is sane at all to balance 484 */ 485 486 if (!conf->mddev->in_sync) 487 goto rb_out; 488 489 490 /* make sure that disk is operational */ 491 while( !conf->mirrors[new_disk].operational) { 492 if (new_disk <= 0) new_disk = conf->raid_disks; 493 new_disk--; 494 if (new_disk == disk) { 495 /* 496 * This means no working disk was found 497 * Nothing much to do, lets not change anything 498 * and hope for the best... 499 */ 500 501 new_disk = conf->last_used; 502 503 goto rb_out; 504 } 505 } 506 disk = new_disk; 507 /* now disk == new_disk == starting point for search */ 508 509 /* 510 * Don't touch anything for sequential reads. 511 */ 512 513 if (this_sector == conf->mirrors[new_disk].head_position) 514 goto rb_out; 515 516 /* 517 * If reads have been done only on a single disk 518 * for a time, lets give another disk a change. 
519 * This is for kicking those idling disks so that 520 * they would find work near some hotspot. 521 */ 522 523 if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) { 524 conf->sect_count = 0; 525 526#if defined(CONFIG_SPARC64) && (__GNUC__ == 2) && (__GNUC_MINOR__ == 92) 527 /* Work around a compiler bug in egcs-2.92.11 19980921 */ 528 new_disk = *(volatile int *)&new_disk; 529#endif 530 do { 531 if (new_disk<=0) 532 new_disk = conf->raid_disks; 533 new_disk--; 534 if (new_disk == disk) 535 break; 536 } while ((conf->mirrors[new_disk].write_only) || 537 (!conf->mirrors[new_disk].operational)); 538 539 goto rb_out; 540 } 541 542 current_distance = abs(this_sector - 543 conf->mirrors[disk].head_position); 544 545 /* Find the disk which is closest */ 546 547 do { 548 if (disk <= 0) 549 disk = conf->raid_disks; 550 disk--; 551 552 if ((conf->mirrors[disk].write_only) || 553 (!conf->mirrors[disk].operational)) 554 continue; 555 556 new_distance = abs(this_sector - 557 conf->mirrors[disk].head_position); 558 559 if (new_distance < current_distance) { 560 conf->sect_count = 0; 561 current_distance = new_distance; 562 new_disk = disk; 563 } 564 } while (disk != conf->last_used); 565 566rb_out: 567 conf->mirrors[new_disk].head_position = this_sector + sectors; 568 569 conf->last_used = new_disk; 570 conf->sect_count += sectors; 571 572 return new_disk; 573} 574 575static int raid1_make_request (request_queue_t *q, 576 struct buffer_head * bh) 577{ 578 mddev_t *mddev = q->queuedata; 579 raid1_conf_t *conf = mddev_to_conf(mddev); 580 struct buffer_head *bh_req, *bhl; 581 struct raid1_bh * r1_bh; 582 int disks = MD_SB_DISKS; 583 int i, sum_bhs = 0; 584 struct mirror_info *mirror; 585 586 if (!buffer_locked(bh)) 587 BUG(); 588 589/* 590 * make_request() can abort the operation when READA is being 591 * used and no empty request is available. 592 * 593 * Currently, just replace the command with READ/WRITE. 594 */ 595 r1_bh = raid1_alloc_r1bh (conf); 596 597 spin_lock_irq(&conf->segment_lock); 598 wait_event_lock_irq(conf->wait_done, 599 bh->b_rsector < conf->start_active || 600 bh->b_rsector >= conf->start_future, 601 conf->segment_lock); 602 if (bh->b_rsector < conf->start_active) 603 conf->cnt_done++; 604 else { 605 conf->cnt_future++; 606 if (conf->phase) 607 set_bit(R1BH_SyncPhase, &r1_bh->state); 608 } 609 spin_unlock_irq(&conf->segment_lock); 610 611 /* 612 * i think the read and write branch should be separated completely, 613 * since we want to do read balancing on the read side for example. 614 * Alternative implementations? 
:) --mingo 615 */ 616 617 r1_bh->master_bh = bh; 618 r1_bh->mddev = mddev; 619 r1_bh->cmd = rw; 620 621 if (rw == READ) { 622 /* 623 * read balancing logic: 624 */ 625 mirror = conf->mirrors + raid1_read_balance(conf, bh); 626 627 bh_req = &r1_bh->bh_req; 628 memcpy(bh_req, bh, sizeof(*bh)); 629 bh_req->b_blocknr = bh->b_rsector; 630 bh_req->b_dev = mirror->dev; 631 bh_req->b_rdev = mirror->dev; 632 /* bh_req->b_rsector = bh->n_rsector; */ 633 bh_req->b_end_io = raid1_end_request; 634 bh_req->b_private = r1_bh; 635 generic_make_request (rw, bh_req); 636 return 0; 637 } 638 639 /* 640 * WRITE: 641 */ 642 643 bhl = raid1_alloc_bh(conf, conf->raid_disks); 644 for (i = 0; i < disks; i++) { 645 struct buffer_head *mbh; 646 if (!conf->mirrors[i].operational) 647 continue; 648 649 /* 650 * We should use a private pool (size depending on NR_REQUEST), 651 * to avoid writes filling up the memory with bhs 652 * 653 * Such pools are much faster than kmalloc anyways (so we waste 654 * almost nothing by not using the master bh when writing and 655 * win alot of cleanness) but for now we are cool enough. --mingo 656 * 657 * It's safe to sleep here, buffer heads cannot be used in a shared 658 * manner in the write branch. Look how we lock the buffer at the 659 * beginning of this function to grok the difference ;) 660 */ 661 mbh = bhl; 662 if (mbh == NULL) { 663 MD_BUG(); 664 break; 665 } 666 bhl = mbh->b_next; 667 mbh->b_next = NULL; 668 mbh->b_this_page = (struct buffer_head *)1; 669 670 /* 671 * prepare mirrored mbh (fields ordered for max mem throughput): 672 */ 673 mbh->b_blocknr = bh->b_rsector; 674 mbh->b_dev = conf->mirrors[i].dev; 675 mbh->b_rdev = conf->mirrors[i].dev; 676 mbh->b_rsector = bh->b_rsector; 677 mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) | 678 (1<<BH_Mapped) | (1<<BH_Lock); 679 680 atomic_set(&mbh->b_count, 1); 681 mbh->b_size = bh->b_size; 682 mbh->b_page = bh->b_page; 683 mbh->b_data = bh->b_data; 684 mbh->b_list = BUF_LOCKED; 685 mbh->b_end_io = raid1_end_request; 686 mbh->b_private = r1_bh; 687 688 mbh->b_next = r1_bh->mirror_bh_list; 689 r1_bh->mirror_bh_list = mbh; 690 sum_bhs++; 691 } 692 if (bhl) raid1_free_bh(conf,bhl); 693 if (!sum_bhs) { 694 /* Gag - all mirrors non-operational.. */ 695 raid1_end_bh_io(r1_bh, 0); 696 return 0; 697 } 698 md_atomic_set(&r1_bh->remaining, sum_bhs); 699 700 /* 701 * We have to be a bit careful about the semaphore above, thats 702 * why we start the requests separately. Since kmalloc() could 703 * fail, sleep and make_request() can sleep too, this is the 704 * safer solution. Imagine, end_request decreasing the semaphore 705 * before we could have set it up ... We could play tricks with 706 * the semaphore (presetting it and correcting at the end if 707 * sum_bhs is not 'n' but we have to do end_request by hand if 708 * all requests finish until we had a chance to set up the 709 * semaphore correctly ... lots of races). 710 */ 711 bh = r1_bh->mirror_bh_list; 712 while(bh) { 713 struct buffer_head *bh2 = bh; 714 bh = bh->b_next; 715 generic_make_request(rw, bh2); 716 } 717 return (0); 718} 719 720static void raid1_status(struct seq_file *seq, mddev_t *mddev) 721{ 722 raid1_conf_t *conf = mddev_to_conf(mddev); 723 int i; 724 725 seq_printf(seq, " [%d/%d] [", conf->raid_disks, 726 conf->working_disks); 727 for (i = 0; i < conf->raid_disks; i++) 728 seq_printf(seq, "%s", 729 conf->mirrors[i].operational ? 
"U" : "_"); 730 seq_printf(seq, "]"); 731} 732 733#define LAST_DISK KERN_ALERT \ 734"raid1: only one disk left and IO error.\n" 735 736#define NO_SPARE_DISK KERN_ALERT \ 737"raid1: no spare disk left, degrading mirror level by one.\n" 738 739#define DISK_FAILED KERN_ALERT \ 740"raid1: Disk failure on %s, disabling device. \n" \ 741" Operation continuing on %d devices\n" 742 743#define START_SYNCING KERN_ALERT \ 744"raid1: start syncing spare disk.\n" 745 746#define ALREADY_SYNCING KERN_INFO \ 747"raid1: syncing already in progress.\n" 748 749static void mark_disk_bad (mddev_t *mddev, int failed) 750{ 751 raid1_conf_t *conf = mddev_to_conf(mddev); 752 struct mirror_info *mirror = conf->mirrors+failed; 753 mdp_super_t *sb = mddev->sb; 754 755 mirror->operational = 0; 756 mark_disk_faulty(sb->disks+mirror->number); 757 mark_disk_nonsync(sb->disks+mirror->number); 758 mark_disk_inactive(sb->disks+mirror->number); 759 if (!mirror->write_only) 760 sb->active_disks--; 761 sb->working_disks--; 762 sb->failed_disks++; 763 mddev->sb_dirty = 1; 764 md_wakeup_thread(conf->thread); 765 if (!mirror->write_only) 766 conf->working_disks--; 767 printk (DISK_FAILED, partition_name (mirror->dev), 768 conf->working_disks); 769} 770 771static int raid1_error (mddev_t *mddev, kdev_t dev) 772{ 773 raid1_conf_t *conf = mddev_to_conf(mddev); 774 struct mirror_info * mirrors = conf->mirrors; 775 int disks = MD_SB_DISKS; 776 int i; 777 778 /* Find the drive. 779 * If it is not operational, then we have already marked it as dead 780 * else if it is the last working disks, ignore the error, let the 781 * next level up know. 782 * else mark the drive as failed 783 */ 784 785 for (i = 0; i < disks; i++) 786 if (mirrors[i].dev==dev && mirrors[i].operational) 787 break; 788 if (i == disks) 789 return 0; 790 791 if (i < conf->raid_disks && conf->working_disks == 1) { 792 /* Don't fail the drive, act as though we were just a 793 * normal single drive 794 */ 795 796 return 1; 797 } 798 mark_disk_bad(mddev, i); 799 return 0; 800} 801 802#undef LAST_DISK 803#undef NO_SPARE_DISK 804#undef DISK_FAILED 805#undef START_SYNCING 806 807 808static void print_raid1_conf (raid1_conf_t *conf) 809{ 810 int i; 811 struct mirror_info *tmp; 812 813 printk("RAID1 conf printout:\n"); 814 if (!conf) { 815 printk("(conf==NULL)\n"); 816 return; 817 } 818 printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks, 819 conf->raid_disks, conf->nr_disks); 820 821 for (i = 0; i < MD_SB_DISKS; i++) { 822 tmp = conf->mirrors + i; 823 printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", 824 i, tmp->spare,tmp->operational, 825 tmp->number,tmp->raid_disk,tmp->used_slot, 826 partition_name(tmp->dev)); 827 } 828} 829 830static void close_sync(raid1_conf_t *conf) 831{ 832 mddev_t *mddev = conf->mddev; 833 /* If reconstruction was interrupted, we need to close the "active" and "pending" 834 * holes. 835 * we know that there are no active rebuild requests, os cnt_active == cnt_ready ==0 836 */ 837 /* this is really needed when recovery stops too... 
*/ 838 spin_lock_irq(&conf->segment_lock); 839 conf->start_active = conf->start_pending; 840 conf->start_ready = conf->start_pending; 841 wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); 842 conf->start_active =conf->start_ready = conf->start_pending = conf->start_future; 843 conf->start_future = (mddev->sb->size<<1)+1; 844 conf->cnt_pending = conf->cnt_future; 845 conf->cnt_future = 0; 846 conf->phase = conf->phase ^1; 847 wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); 848 conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0; 849 conf->phase = 0; 850 conf->cnt_future = conf->cnt_done;; 851 conf->cnt_done = 0; 852 spin_unlock_irq(&conf->segment_lock); 853 wake_up(&conf->wait_done); 854 855 mempool_destroy(conf->r1buf_pool); 856 conf->r1buf_pool = NULL; 857} 858 859static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state) 860{ 861 int err = 0; 862 int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1; 863 raid1_conf_t *conf = mddev->private; 864 struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk; 865 mdp_super_t *sb = mddev->sb; 866 mdp_disk_t *failed_desc, *spare_desc, *added_desc; 867 mdk_rdev_t *spare_rdev, *failed_rdev; 868 869 print_raid1_conf(conf); 870 871 switch (state) { 872 case DISKOP_SPARE_ACTIVE: 873 case DISKOP_SPARE_INACTIVE: 874 /* need to wait for pending sync io before locking device */ 875 close_sync(conf); 876 } 877 878 md_spin_lock_irq(&conf->device_lock); 879 /* 880 * find the disk ... 881 */ 882 switch (state) { 883 884 case DISKOP_SPARE_ACTIVE: 885 886 /* 887 * Find the failed disk within the RAID1 configuration ... 888 * (this can only be in the first conf->working_disks part) 889 */ 890 for (i = 0; i < conf->raid_disks; i++) { 891 tmp = conf->mirrors + i; 892 if ((!tmp->operational && !tmp->spare) || 893 !tmp->used_slot) { 894 failed_disk = i; 895 break; 896 } 897 } 898 /* 899 * When we activate a spare disk we _must_ have a disk in 900 * the lower (active) part of the array to replace. 901 */ 902 if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { 903 MD_BUG(); 904 err = 1; 905 goto abort; 906 } 907 /* fall through */ 908 909 case DISKOP_SPARE_WRITE: 910 case DISKOP_SPARE_INACTIVE: 911 912 /* 913 * Find the spare disk ... 
(can only be in the 'high'
	 * area of the array)
	 */
		for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
			tmp = conf->mirrors + i;
			if (tmp->spare && tmp->number == (*d)->number) {
				spare_disk = i;
				break;
			}
		}
		if (spare_disk == -1) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		break;

	case DISKOP_HOT_REMOVE_DISK:

		for (i = 0; i < MD_SB_DISKS; i++) {
			tmp = conf->mirrors + i;
			if (tmp->used_slot && (tmp->number == (*d)->number)) {
				if (tmp->operational) {
					err = -EBUSY;
					goto abort;
				}
				removed_disk = i;
				break;
			}
		}
		if (removed_disk == -1) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		break;

	case DISKOP_HOT_ADD_DISK:

		for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
			tmp = conf->mirrors + i;
			if (!tmp->used_slot) {
				added_disk = i;
				break;
			}
		}
		if (added_disk == -1) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		break;
	}

	switch (state) {
	/*
	 * Switch the spare disk to write-only mode:
	 */
	case DISKOP_SPARE_WRITE:
		sdisk = conf->mirrors + spare_disk;
		sdisk->operational = 1;
		sdisk->write_only = 1;
		break;
	/*
	 * Deactivate a spare disk:
	 */
	case DISKOP_SPARE_INACTIVE:
		sdisk = conf->mirrors + spare_disk;
		sdisk->operational = 0;
		sdisk->write_only = 0;
		break;
	/*
	 * Activate (mark read-write) the (now sync) spare disk,
	 * which means we switch its 'raid position' (->raid_disk)
	 * with the failed disk. (only the first 'conf->nr_disks'
	 * slots are used for 'real' disks and we must preserve this
	 * property)
	 */
	case DISKOP_SPARE_ACTIVE:
		sdisk = conf->mirrors + spare_disk;
		fdisk = conf->mirrors + failed_disk;

		spare_desc = &sb->disks[sdisk->number];
		failed_desc = &sb->disks[fdisk->number];

		if (spare_desc != *d) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		if (spare_desc->raid_disk != sdisk->raid_disk) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		if (sdisk->raid_disk != spare_disk) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		if (failed_desc->raid_disk != fdisk->raid_disk) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		if (fdisk->raid_disk != failed_disk) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		/*
		 * do the switch finally
		 */
		spare_rdev = find_rdev_nr(mddev, spare_desc->number);
		failed_rdev = find_rdev_nr(mddev, failed_desc->number);

		/* There must be a spare_rdev, but there may not be a
		 * failed_rdev. That slot might be empty...
		 */
		spare_rdev->desc_nr = failed_desc->number;
		if (failed_rdev)
			failed_rdev->desc_nr = spare_desc->number;

		xchg_values(*spare_desc, *failed_desc);
		xchg_values(*fdisk, *sdisk);

		/*
		 * (careful, 'failed' and 'spare' are switched from now on)
		 *
		 * we want to preserve linear numbering and we want to
		 * give the proper raid_disk number to the now activated
		 * disk. (this means we switch back these values)
		 */

		xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
		xchg_values(sdisk->raid_disk, fdisk->raid_disk);
		xchg_values(spare_desc->number, failed_desc->number);
		xchg_values(sdisk->number, fdisk->number);

		*d = failed_desc;

		if (sdisk->dev == MKDEV(0,0))
			sdisk->used_slot = 0;
		/*
		 * this really activates the spare.
		 */
		fdisk->spare = 0;
		fdisk->write_only = 0;

		/*
		 * if we activate a spare, we definitely replace a
		 * non-operational disk slot in the 'low' area of
		 * the disk array.
		 */

		conf->working_disks++;

		break;

	case DISKOP_HOT_REMOVE_DISK:
		rdisk = conf->mirrors + removed_disk;

		if (rdisk->spare && (removed_disk < conf->raid_disks)) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		rdisk->dev = MKDEV(0,0);
		rdisk->used_slot = 0;
		conf->nr_disks--;
		break;

	case DISKOP_HOT_ADD_DISK:
		adisk = conf->mirrors + added_disk;
		added_desc = *d;

		if (added_disk != added_desc->number) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		adisk->number = added_desc->number;
		adisk->raid_disk = added_desc->raid_disk;
		adisk->dev = MKDEV(added_desc->major, added_desc->minor);

		adisk->operational = 0;
		adisk->write_only = 0;
		adisk->spare = 1;
		adisk->used_slot = 1;
		adisk->head_position = 0;
		conf->nr_disks++;

		break;

	default:
		MD_BUG();
		err = 1;
		goto abort;
	}
abort:
	md_spin_unlock_irq(&conf->device_lock);
	if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
		/* should move to "END_REBUILD" when such exists */
		raid1_shrink_buffers(conf);

	print_raid1_conf(conf);
	return err;
}


#define IO_ERROR KERN_ALERT \
"raid1: %s: unrecoverable I/O read error for block %lu\n"

#define REDIRECT_SECTOR KERN_ERR \
"raid1: %s: redirecting sector %lu to another mirror\n"

/*
 * This is a kernel thread which:
 *
 *	1.	Retries failed read operations on working mirrors.
 *	2.	Updates the raid superblock when problems are encountered.
 *	3.	Performs writes following reads for array synchronising.
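 *
 * Failed requests get here via raid1_reschedule_retry(), which chains
 * them on raid1_retry_list and wakes this thread.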
1173 */ 1174static void end_sync_write(struct buffer_head *bh, int uptodate); 1175static void end_sync_read(struct buffer_head *bh, int uptodate); 1176 1177static void raid1d (void *data) 1178{ 1179 struct raid1_bh *r1_bh; 1180 struct buffer_head *bh; 1181 unsigned long flags; 1182 raid1_conf_t *conf = data; 1183 mddev_t *mddev = conf->mddev; 1184 kdev_t dev; 1185 1186 if (mddev->sb_dirty) 1187 md_update_sb(mddev); 1188 1189 for (;;) { 1190 md_spin_lock_irqsave(&retry_list_lock, flags); 1191 r1_bh = raid1_retry_list; 1192 if (!r1_bh) 1193 break; 1194 raid1_retry_list = r1_bh->next_r1; 1195 md_spin_unlock_irqrestore(&retry_list_lock, flags); 1196 1197 mddev = r1_bh->mddev; 1198 bh = &r1_bh->bh_req; 1199 switch(r1_bh->cmd) { 1200 case SPECIAL: 1201 /* have to allocate lots of bh structures and 1202 * schedule writes 1203 */ 1204 if (test_bit(R1BH_Uptodate, &r1_bh->state)) { 1205 int i, sum_bhs = 0; 1206 int disks = MD_SB_DISKS; 1207 struct buffer_head *bhl, *mbh; 1208 1209 conf = mddev_to_conf(mddev); 1210 bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */ 1211 for (i = 0; i < disks ; i++) { 1212 if (!conf->mirrors[i].operational) 1213 continue; 1214 if (i==conf->last_used) 1215 /* we read from here, no need to write */ 1216 continue; 1217 if (i < conf->raid_disks 1218 && mddev->in_sync) 1219 /* don't need to write this, 1220 * we are just rebuilding */ 1221 continue; 1222 mbh = bhl; 1223 if (!mbh) { 1224 MD_BUG(); 1225 break; 1226 } 1227 bhl = mbh->b_next; 1228 mbh->b_this_page = (struct buffer_head *)1; 1229 1230 1231 /* 1232 * prepare mirrored bh (fields ordered for max mem throughput): 1233 */ 1234 mbh->b_blocknr = bh->b_blocknr; 1235 mbh->b_dev = conf->mirrors[i].dev; 1236 mbh->b_rdev = conf->mirrors[i].dev; 1237 mbh->b_rsector = bh->b_blocknr; 1238 mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) | 1239 (1<<BH_Mapped) | (1<<BH_Lock); 1240 atomic_set(&mbh->b_count, 1); 1241 mbh->b_size = bh->b_size; 1242 mbh->b_page = bh->b_page; 1243 mbh->b_data = bh->b_data; 1244 mbh->b_list = BUF_LOCKED; 1245 mbh->b_end_io = end_sync_write; 1246 mbh->b_private = r1_bh; 1247 1248 mbh->b_next = r1_bh->mirror_bh_list; 1249 r1_bh->mirror_bh_list = mbh; 1250 1251 sum_bhs++; 1252 } 1253 md_atomic_set(&r1_bh->remaining, sum_bhs); 1254 if (bhl) raid1_free_bh(conf, bhl); 1255 mbh = r1_bh->mirror_bh_list; 1256 1257 if (!sum_bhs) { 1258 /* nowhere to write this too... 
I guess we
					 * must be done
					 */
					sync_request_done(bh->b_blocknr, conf);
					md_done_sync(mddev, bh->b_size>>9, 0);
					raid1_free_buf(r1_bh);
				} else
					while (mbh) {
						struct buffer_head *bh1 = mbh;
						mbh = mbh->b_next;
						generic_make_request(WRITE, bh1);
						md_sync_acct(bh1->b_dev, bh1->b_size/512);
					}
			} else {
				/* There is no point trying a read-for-reconstruct
				 * as reconstruct is about to be aborted
				 */

				printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
				md_done_sync(mddev, bh->b_size>>9, 0);
			}

			break;
		case READ:
		case READA:
			dev = bh->b_dev;
			raid1_map (mddev, &bh->b_dev);
			if (bh->b_dev == dev) {
				printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
				raid1_end_bh_io(r1_bh, 0);
			} else {
				printk (REDIRECT_SECTOR,
					partition_name(bh->b_dev), bh->b_blocknr);
				bh->b_rdev = bh->b_dev;
				bh->b_rsector = bh->b_blocknr;
				generic_make_request (r1_bh->cmd, bh);
			}
			break;
		}
	}
	md_spin_unlock_irqrestore(&retry_list_lock, flags);
}
#undef IO_ERROR
#undef REDIRECT_SECTOR


/*
 * perform a "sync" on one "block"
 *
 * We need to make sure that no normal I/O request - particularly write
 * requests - conflict with active sync requests.
 * This is achieved by conceptually dividing the device space into a
 * number of sections:
 *  DONE: 0 .. a-1       These blocks are in-sync
 *  ACTIVE: a .. b-1     These blocks may have active sync requests, but
 *                       no normal IO requests
 *  READY: b .. c-1      These blocks have no normal IO requests - sync
 *                       request may be happening
 *  PENDING: c .. d-1    These blocks may have IO requests, but no new
 *                       ones will be added
 *  FUTURE: d .. end     These blocks are not to be considered yet. IO may
 *                       be happening, but not sync
 *
 * We keep a
 *  phase which flips (0 or 1) each time d moves and
 * a count of:
 *  z = active io requests in FUTURE since d moved - marked with
 *      current phase
 *  y = active io requests in FUTURE before d moved, or PENDING -
 *      marked with previous phase
 *  x = active sync requests in READY
 *  w = active sync requests in ACTIVE
 *  v = active io requests in DONE
 *
 * Normally, a=b=c=d=0 and z = active io requests
 *   or a=b=c=d=END and v = active io requests
 * Allowed changes to a,b,c,d:
 *  A: c==d && y==0 -> d+=window, y=z, z=0, phase=!phase
 *  B: y==0 -> c=d
 *  C: b=c, w+=x, x=0
 *  D: w==0 -> a=b
 *  E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
 *
 * At start of sync we apply A.
 * When y reaches 0, we apply B then A then begin sync requests
 * When sync point reaches c-1, we wait for y==0 and w==0, and
 * then apply B then A then D then C.
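 *
 * (For illustration: raid1_sync_request() sets the window up to be
 * roughly 128 sectors.  At the start a=b=c=d=0 and every in-flight
 * normal request is counted in z.  The first pass applies A, so d
 * becomes 128 and the phase flips; once the old requests drain (y==0)
 * the next pass applies B then A again, leaving [0,128) READY and
 * [128,256) PENDING, so sync reads may then be issued against [0,128).)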
 * Finally, we apply E
 *
 * The sync request simply issues a "read" against a working drive
 * This is marked so that on completion the raid1d thread is woken to
 * issue suitable write requests
 */

static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
{
	raid1_conf_t *conf = mddev_to_conf(mddev);
	struct mirror_info *mirror;
	struct raid1_bh *r1_bh;
	struct buffer_head *bh;
	int bsize;
	int disk;
	int block_nr;
	int buffs;

	if (!sector_nr) {
		/* we want enough buffers to hold twice the window of 128 */
		buffs = 128 * 2 / (PAGE_SIZE>>9);
		buffs = raid1_grow_buffers(conf, buffs);
		if (buffs < 2)
			goto nomem;
		conf->window = buffs*(PAGE_SIZE>>9)/2;
	}
	spin_lock_irq(&conf->segment_lock);
	if (!sector_nr) {
		/* initialize ... */
		conf->start_active = 0;
		conf->start_ready = 0;
		conf->start_pending = 0;
		conf->start_future = 0;
		conf->phase = 0;

		conf->cnt_future += conf->cnt_done+conf->cnt_pending;
		conf->cnt_done = conf->cnt_pending = 0;
		if (conf->cnt_ready || conf->cnt_active)
			MD_BUG();
	}
	while (sector_nr >= conf->start_pending) {
		PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
			sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
			conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
		wait_event_lock_irq(conf->wait_done,
					!conf->cnt_active,
					conf->segment_lock);
		wait_event_lock_irq(conf->wait_ready,
					!conf->cnt_pending,
					conf->segment_lock);
		conf->start_active = conf->start_ready;
		conf->start_ready = conf->start_pending;
		conf->start_pending = conf->start_future;
		conf->start_future = conf->start_future+conf->window;
		// Note: falling off the end is not a problem
		conf->phase = conf->phase ^1;
		conf->cnt_active = conf->cnt_ready;
		conf->cnt_ready = 0;
		conf->cnt_pending = conf->cnt_future;
		conf->cnt_future = 0;
		wake_up(&conf->wait_done);
	}
	conf->cnt_ready++;
	spin_unlock_irq(&conf->segment_lock);


	/* If reconstructing, and >1 working disc,
	 * could dedicate one to rebuild and others to
	 * service read requests ..
	 */
	disk = conf->last_used;
	/* make sure disk is operational */
	while (!conf->mirrors[disk].operational) {
		if (disk <= 0) disk = conf->raid_disks;
		disk--;
		if (disk == conf->last_used)
			break;
	}
	conf->last_used = disk;

	mirror = conf->mirrors+conf->last_used;

	r1_bh = raid1_alloc_buf (conf);
	r1_bh->master_bh = NULL;
	r1_bh->mddev = mddev;
	r1_bh->cmd = SPECIAL;
	bh = &r1_bh->bh_req;

	block_nr = sector_nr;
	bsize = 512;
	while (!(block_nr & 1) && bsize < PAGE_SIZE
			&& (block_nr+2)*(bsize>>9) < (mddev->sb->size *2)) {
		block_nr >>= 1;
		bsize <<= 1;
	}
	bh->b_size = bsize;
	bh->b_list = BUF_LOCKED;
	bh->b_dev = mirror->dev;
	bh->b_rdev = mirror->dev;
	bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
	if (!bh->b_page)
		BUG();
	if (!bh->b_data)
		BUG();
	if (bh->b_data != page_address(bh->b_page))
		BUG();
	bh->b_end_io = end_sync_read;
	bh->b_private = r1_bh;
	bh->b_blocknr = sector_nr;
	bh->b_rsector = sector_nr;
	init_waitqueue_head(&bh->b_wait);

	generic_make_request(READ, bh);
	md_sync_acct(bh->b_dev, bh->b_size/512);

	return (bsize >> 9);

nomem:
	raid1_shrink_buffers(conf);
	return -ENOMEM;
}

static void end_sync_read(struct buffer_head *bh, int uptodate)
{
	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);

	/* we have read a block, now it needs to be re-written,
	 * or re-read if the read failed.
	 * We don't do much here, just schedule handling by raid1d
	 */
	if (!uptodate)
		md_error (r1_bh->mddev, bh->b_dev);
	else
		set_bit(R1BH_Uptodate, &r1_bh->state);
	raid1_reschedule_retry(r1_bh);
}

static void end_sync_write(struct buffer_head *bh, int uptodate)
{
	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);

	if (!uptodate)
		md_error (r1_bh->mddev, bh->b_dev);
	if (atomic_dec_and_test(&r1_bh->remaining)) {
		mddev_t *mddev = r1_bh->mddev;
		unsigned long sect = bh->b_blocknr;
		int size = bh->b_size;
		raid1_free_buf(r1_bh);
		sync_request_done(sect, mddev_to_conf(mddev));
		md_done_sync(mddev, size>>9, uptodate);
	}
}

#define INVALID_LEVEL KERN_WARNING \
"raid1: md%d: raid level not set to mirroring (%d)\n"

#define NO_SB KERN_ERR \
"raid1: disabled mirror %s (couldn't access raid superblock)\n"

#define ERRORS KERN_ERR \
"raid1: disabled mirror %s (errors detected)\n"

#define NOT_IN_SYNC KERN_ERR \
"raid1: disabled mirror %s (not in sync)\n"

#define INCONSISTENT KERN_ERR \
"raid1: disabled mirror %s (inconsistent descriptor)\n"

#define ALREADY_RUNNING KERN_ERR \
"raid1: disabled mirror %s (mirror %d already operational)\n"

#define OPERATIONAL KERN_INFO \
"raid1: device %s operational as mirror %d\n"

#define MEM_ERROR KERN_ERR \
"raid1: couldn't allocate memory for md%d\n"

#define SPARE KERN_INFO \
"raid1: spare disk %s\n"

#define NONE_OPERATIONAL KERN_ERR \
"raid1: no operational mirrors for md%d\n"

#define ARRAY_IS_ACTIVE KERN_INFO \
"raid1: raid set md%d active with %d out of %d mirrors\n"

#define THREAD_ERROR KERN_ERR \
"raid1: couldn't allocate thread for md%d\n"

#define START_RESYNC KERN_WARNING \
"raid1: raid set md%d not clean; reconstructing mirrors\n"

static int raid1_run (mddev_t *mddev)
{
	raid1_conf_t *conf;
	int i, j, disk_idx;
	struct mirror_info *disk;
	mdp_super_t *sb = mddev->sb;
	mdp_disk_t *descriptor;
	mdk_rdev_t *rdev;
	struct md_list_head *tmp;

	MOD_INC_USE_COUNT;

	if (sb->level != 1) {
		printk(INVALID_LEVEL, mdidx(mddev), sb->level);
		goto out;
	}
	/*
	 * copy the already verified devices into our private RAID1
	 * bookkeeping area.
[whatever we allocate in raid1_run(), 1603 * should be freed in raid1_stop()] 1604 */ 1605 1606 conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL); 1607 mddev->private = conf; 1608 if (!conf) { 1609 printk(MEM_ERROR, mdidx(mddev)); 1610 goto out; 1611 } 1612 memset(conf, 0, sizeof(*conf)); 1613 1614 ITERATE_RDEV(mddev,rdev,tmp) { 1615 if (rdev->faulty) { 1616 printk(ERRORS, partition_name(rdev->dev)); 1617 } else { 1618 if (!rdev->sb) { 1619 MD_BUG(); 1620 continue; 1621 } 1622 } 1623 if (rdev->desc_nr == -1) { 1624 MD_BUG(); 1625 continue; 1626 } 1627 descriptor = &sb->disks[rdev->desc_nr]; 1628 disk_idx = descriptor->raid_disk; 1629 disk = conf->mirrors + disk_idx; 1630 1631 if (disk_faulty(descriptor)) { 1632 disk->number = descriptor->number; 1633 disk->raid_disk = disk_idx; 1634 disk->dev = rdev->dev; 1635 disk->sect_limit = MAX_WORK_PER_DISK; 1636 disk->operational = 0; 1637 disk->write_only = 0; 1638 disk->spare = 0; 1639 disk->used_slot = 1; 1640 disk->head_position = 0; 1641 continue; 1642 } 1643 if (disk_active(descriptor)) { 1644 if (!disk_sync(descriptor)) { 1645 printk(NOT_IN_SYNC, 1646 partition_name(rdev->dev)); 1647 continue; 1648 } 1649 if ((descriptor->number > MD_SB_DISKS) || 1650 (disk_idx > sb->raid_disks)) { 1651 1652 printk(INCONSISTENT, 1653 partition_name(rdev->dev)); 1654 continue; 1655 } 1656 if (disk->operational) { 1657 printk(ALREADY_RUNNING, 1658 partition_name(rdev->dev), 1659 disk_idx); 1660 continue; 1661 } 1662 printk(OPERATIONAL, partition_name(rdev->dev), 1663 disk_idx); 1664 disk->number = descriptor->number; 1665 disk->raid_disk = disk_idx; 1666 disk->dev = rdev->dev; 1667 disk->sect_limit = MAX_WORK_PER_DISK; 1668 disk->operational = 1; 1669 disk->write_only = 0; 1670 disk->spare = 0; 1671 disk->used_slot = 1; 1672 disk->head_position = 0; 1673 conf->working_disks++; 1674 } else { 1675 /* 1676 * Must be a spare disk .. 1677 */ 1678 printk(SPARE, partition_name(rdev->dev)); 1679 disk->number = descriptor->number; 1680 disk->raid_disk = disk_idx; 1681 disk->dev = rdev->dev; 1682 disk->sect_limit = MAX_WORK_PER_DISK; 1683 disk->operational = 0; 1684 disk->write_only = 0; 1685 disk->spare = 1; 1686 disk->used_slot = 1; 1687 disk->head_position = 0; 1688 } 1689 } 1690 conf->raid_disks = sb->raid_disks; 1691 conf->nr_disks = sb->nr_disks; 1692 conf->mddev = mddev; 1693 conf->device_lock = MD_SPIN_LOCK_UNLOCKED; 1694 1695 conf->segment_lock = MD_SPIN_LOCK_UNLOCKED; 1696 init_waitqueue_head(&conf->wait_buffer); 1697 init_waitqueue_head(&conf->wait_done); 1698 init_waitqueue_head(&conf->wait_ready); 1699 1700 if (!conf->working_disks) { 1701 printk(NONE_OPERATIONAL, mdidx(mddev)); 1702 goto out_free_conf; 1703 } 1704 1705 1706 /* pre-allocate some buffer_head structures. 1707 * As a minimum, 1 r1bh and raid_disks buffer_heads 1708 * would probably get us by in tight memory situations, 1709 * but a few more is probably a good idea. 
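	 * (The free lists are only a reserve: raid1_alloc_bh() and
	 * raid1_alloc_r1bh() take from them when they can, fall back to
	 * kmem_cache_alloc()/kmalloc() with GFP_NOIO, and only sleep on
	 * wait_buffer when that fails as well.)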
For now, try NR_RESERVED_BUFS r1bh and
	 * NR_RESERVED_BUFS*raid_disks bufferheads
	 * This will allow at least NR_RESERVED_BUFS concurrent
	 * reads or writes even if kmalloc starts failing
	 */
	if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS ||
	    raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks)
	                  < NR_RESERVED_BUFS*conf->raid_disks) {
		printk(MEM_ERROR, mdidx(mddev));
		goto out_free_conf;
	}

	for (i = 0; i < MD_SB_DISKS; i++) {

		descriptor = sb->disks+i;
		disk_idx = descriptor->raid_disk;
		disk = conf->mirrors + disk_idx;

		if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
				!disk->used_slot) {

			disk->number = descriptor->number;
			disk->raid_disk = disk_idx;
			disk->dev = MKDEV(0,0);

			disk->operational = 0;
			disk->write_only = 0;
			disk->spare = 0;
			disk->used_slot = 1;
			disk->head_position = 0;
		}
	}

	/*
	 * find the first working one and use it as a starting point
	 * for read balancing.
	 */
	for (j = 0; j < MD_SB_DISKS && !conf->mirrors[j].operational; j++)
		/* nothing */;
	conf->last_used = j;


	{
		const char * name = "raid1d";

		conf->thread = md_register_thread(raid1d, conf, name);
		if (!conf->thread) {
			printk(THREAD_ERROR, mdidx(mddev));
			goto out_free_conf;
		}
	}

	/*
	 * Regenerate the "device is in sync with the raid set" bit for
	 * each device.
	 */
	for (i = 0; i < MD_SB_DISKS; i++) {
		mark_disk_nonsync(sb->disks+i);
		for (j = 0; j < sb->raid_disks; j++) {
			if (!conf->mirrors[j].operational)
				continue;
			if (sb->disks[i].number == conf->mirrors[j].number)
				mark_disk_sync(sb->disks+i);
		}
	}
	sb->active_disks = conf->working_disks;

	printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
	/*
	 * Ok, everything is just fine now
	 */
	return 0;

out_free_conf:
	raid1_shrink_r1bh(conf);
	raid1_shrink_bh(conf);
	raid1_shrink_buffers(conf);
	kfree(conf);
	mddev->private = NULL;
out:
	MOD_DEC_USE_COUNT;
	return -EIO;
}

#undef INVALID_LEVEL
#undef NO_SB
#undef ERRORS
#undef NOT_IN_SYNC
#undef INCONSISTENT
#undef ALREADY_RUNNING
#undef OPERATIONAL
#undef SPARE
#undef NONE_OPERATIONAL
#undef ARRAY_IS_ACTIVE

static int raid1_stop (mddev_t *mddev)
{
	raid1_conf_t *conf = mddev_to_conf(mddev);

	md_unregister_thread(conf->thread);
	raid1_shrink_r1bh(conf);
	raid1_shrink_bh(conf);
	raid1_shrink_buffers(conf);
	kfree(conf);
	mddev->private = NULL;
	MOD_DEC_USE_COUNT;
	return 0;
}

static mdk_personality_t raid1_personality=
{
	name:		"raid1",
	make_request:	raid1_make_request,
	run:		raid1_run,
	stop:		raid1_stop,
	status:		raid1_status,
	error_handler:	raid1_error,
	diskop:		raid1_diskop,
	sync_request:	raid1_sync_request
};

static int md__init raid1_init (void)
{
	return register_md_personality (RAID1, &raid1_personality);
}

static void raid1_exit (void)
{
	unregister_md_personality (RAID1);
}

module_init(raid1_init);
module_exit(raid1_exit);
MODULE_LICENSE("GPL");