1/*
2 * raid1.c : Multiple Devices driver for Linux
3 *
4 * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
5 *
6 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
7 *
8 * RAID-1 management functions.
9 *
10 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
11 *
 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
13 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
14 *
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2, or (at your option)
18 * any later version.
19 *
20 * You should have received a copy of the GNU General Public License
21 * (for example /usr/src/linux/COPYING); if not, write to the Free
22 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25#include <linux/module.h>
26#include <linux/config.h>
27#include <linux/slab.h>
28#include <linux/raid/raid1.h>
29#include <asm/atomic.h>
30
31#define MAJOR_NR MD_MAJOR
32#define MD_DRIVER
33#define MD_PERSONALITY
34
35#define MAX_WORK_PER_DISK 128
36
37#define	NR_RESERVED_BUFS	32
38
39
40/*
41 * The following can be used to debug the driver
42 */
43#define RAID1_DEBUG	0
44
45#if RAID1_DEBUG
46#define PRINTK(x...)   printk(x)
47#define inline
48#define __inline__
49#else
50#define PRINTK(x...)  do { } while (0)
51#endif
52
53
54static mdk_personality_t raid1_personality;
55static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
56struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail;
57
58static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
59{
60	/* return a linked list of "cnt" struct buffer_heads.
61	 * don't take any off the free list unless we know we can
62	 * get all we need, otherwise we could deadlock
63	 */
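	/* (Illustrative scenario, not from the original comment: with only
	 * four reserved heads left, two writers on a 3-way mirror could
	 * each grab two and then sleep forever waiting for a third if the
	 * slab allocator is also exhausted; only taking heads when all
	 * "cnt" are available avoids that.)
	 */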
64	struct buffer_head *bh=NULL;
65
66	while(cnt) {
67		struct buffer_head *t;
68		md_spin_lock_irq(&conf->device_lock);
69		if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
70			while (cnt) {
71				t = conf->freebh;
72				conf->freebh = t->b_next;
73				t->b_next = bh;
74				bh = t;
75				t->b_state = 0;
76				conf->freebh_cnt--;
77				cnt--;
78			}
79		md_spin_unlock_irq(&conf->device_lock);
80		if (cnt == 0)
81			break;
82		t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
83		if (t) {
84			t->b_next = bh;
85			bh = t;
86			cnt--;
87		} else {
88			PRINTK("raid1: waiting for %d bh\n", cnt);
89			conf->freebh_blocked = 1;
90			wait_disk_event(conf->wait_buffer,
91					!conf->freebh_blocked ||
92					conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2);
93			conf->freebh_blocked = 0;
94		}
95	}
96	return bh;
97}
98
99static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
100{
101	unsigned long flags;
102	spin_lock_irqsave(&conf->device_lock, flags);
103	while (bh) {
104		struct buffer_head *t = bh;
105		bh=bh->b_next;
106		if (t->b_pprev == NULL)
107			kmem_cache_free(bh_cachep, t);
108		else {
109			t->b_next= conf->freebh;
110			conf->freebh = t;
111			conf->freebh_cnt++;
112		}
113	}
114	spin_unlock_irqrestore(&conf->device_lock, flags);
115	wake_up(&conf->wait_buffer);
116}
117
118static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
119{
120	/* allocate cnt buffer_heads, possibly less if kmalloc fails */
121	int i = 0;
122
123	while (i < cnt) {
124		struct buffer_head *bh;
125		bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
126		if (!bh) break;
127
128		md_spin_lock_irq(&conf->device_lock);
129		bh->b_pprev = &conf->freebh;
130		bh->b_next = conf->freebh;
131		conf->freebh = bh;
132		conf->freebh_cnt++;
133		md_spin_unlock_irq(&conf->device_lock);
134
135		i++;
136	}
137	return i;
138}
139
140static void raid1_shrink_bh(raid1_conf_t *conf)
141{
142	/* discard all buffer_heads */
143
144	md_spin_lock_irq(&conf->device_lock);
145	while (conf->freebh) {
146		struct buffer_head *bh = conf->freebh;
147		conf->freebh = bh->b_next;
148		kmem_cache_free(bh_cachep, bh);
149		conf->freebh_cnt--;
150	}
151	md_spin_unlock_irq(&conf->device_lock);
152}
153
154
155static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
156{
157	struct raid1_bh *r1_bh = NULL;
158
159	do {
160		md_spin_lock_irq(&conf->device_lock);
161		if (!conf->freer1_blocked && conf->freer1) {
162			r1_bh = conf->freer1;
163			conf->freer1 = r1_bh->next_r1;
164			conf->freer1_cnt--;
165			r1_bh->next_r1 = NULL;
166			r1_bh->state = (1 << R1BH_PreAlloc);
167			r1_bh->bh_req.b_state = 0;
168		}
169		md_spin_unlock_irq(&conf->device_lock);
170		if (r1_bh)
171			return r1_bh;
172		r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
173		if (r1_bh) {
174			memset(r1_bh, 0, sizeof(*r1_bh));
175			return r1_bh;
176		}
177		conf->freer1_blocked = 1;
178		wait_disk_event(conf->wait_buffer,
179				!conf->freer1_blocked ||
180				conf->freer1_cnt > NR_RESERVED_BUFS/2
181			);
182		conf->freer1_blocked = 0;
183	} while (1);
184}
185
186static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
187{
188	struct buffer_head *bh = r1_bh->mirror_bh_list;
189	raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
190
191	r1_bh->mirror_bh_list = NULL;
192
193	if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
194		unsigned long flags;
195		spin_lock_irqsave(&conf->device_lock, flags);
196		r1_bh->next_r1 = conf->freer1;
197		conf->freer1 = r1_bh;
198		conf->freer1_cnt++;
199		spin_unlock_irqrestore(&conf->device_lock, flags);
200		/* don't need to wakeup wait_buffer because
201		 *  raid1_free_bh below will do that
202		 */
203	} else {
204		kfree(r1_bh);
205	}
206	raid1_free_bh(conf, bh);
207}
208
209static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
210{
211	int i = 0;
212
213	while (i < cnt) {
214		struct raid1_bh *r1_bh;
215		r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
216		if (!r1_bh)
217			break;
218		memset(r1_bh, 0, sizeof(*r1_bh));
219		set_bit(R1BH_PreAlloc, &r1_bh->state);
220		r1_bh->mddev = conf->mddev;
221
222		raid1_free_r1bh(r1_bh);
223		i++;
224	}
225	return i;
226}
227
228static void raid1_shrink_r1bh(raid1_conf_t *conf)
229{
230	md_spin_lock_irq(&conf->device_lock);
231	while (conf->freer1) {
232		struct raid1_bh *r1_bh = conf->freer1;
233		conf->freer1 = r1_bh->next_r1;
234		conf->freer1_cnt--;
235		kfree(r1_bh);
236	}
237	md_spin_unlock_irq(&conf->device_lock);
238}
239
240
241
242static inline void raid1_free_buf(struct raid1_bh *r1_bh)
243{
244	unsigned long flags;
245	struct buffer_head *bh = r1_bh->mirror_bh_list;
246	raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
247	r1_bh->mirror_bh_list = NULL;
248
249	spin_lock_irqsave(&conf->device_lock, flags);
250	r1_bh->next_r1 = conf->freebuf;
251	conf->freebuf = r1_bh;
252	spin_unlock_irqrestore(&conf->device_lock, flags);
253	raid1_free_bh(conf, bh);
254}
255
256static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
257{
258	struct raid1_bh *r1_bh;
259
260	md_spin_lock_irq(&conf->device_lock);
261	wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
262	r1_bh = conf->freebuf;
263	conf->freebuf = r1_bh->next_r1;
264	r1_bh->next_r1= NULL;
265	md_spin_unlock_irq(&conf->device_lock);
266
267	return r1_bh;
268}
269
270static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
271{
272	int i = 0;
273	struct raid1_bh *head = NULL, **tail;
274	tail = &head;
275
276	while (i < cnt) {
277		struct raid1_bh *r1_bh;
278		struct page *page;
279
280		page = alloc_page(GFP_KERNEL);
281		if (!page)
282			break;
283
284		r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
285		if (!r1_bh) {
286			__free_page(page);
287			break;
288		}
289		memset(r1_bh, 0, sizeof(*r1_bh));
290		r1_bh->bh_req.b_page = page;
291		r1_bh->bh_req.b_data = page_address(page);
292		*tail = r1_bh;
293		r1_bh->next_r1 = NULL;
294		tail = & r1_bh->next_r1;
295		i++;
296	}
297	/* this lock probably isn't needed, as at the time when
298	 * we are allocating buffers, nobody else will be touching the
299	 * freebuf list.  But it doesn't hurt....
300	 */
301	md_spin_lock_irq(&conf->device_lock);
302	*tail = conf->freebuf;
303	conf->freebuf = head;
304	md_spin_unlock_irq(&conf->device_lock);
305	return i;
306}
307
308static void raid1_shrink_buffers (raid1_conf_t *conf)
309{
310	struct raid1_bh *head;
311	md_spin_lock_irq(&conf->device_lock);
312	head = conf->freebuf;
313	conf->freebuf = NULL;
314	md_spin_unlock_irq(&conf->device_lock);
315
316	while (head) {
317		struct raid1_bh *r1_bh = head;
318		head = r1_bh->next_r1;
319		__free_page(r1_bh->bh_req.b_page);
320		kfree(r1_bh);
321	}
322}
323
324static int raid1_map (mddev_t *mddev, kdev_t *rdev)
325{
326	raid1_conf_t *conf = mddev_to_conf(mddev);
327	int i, disks = MD_SB_DISKS;
328
329	/*
330	 * Later we do read balancing on the read side
331	 * now we use the first available disk.
332	 */
333
334	for (i = 0; i < disks; i++) {
335		if (conf->mirrors[i].operational) {
336			*rdev = conf->mirrors[i].dev;
337			return (0);
338		}
339	}
340
341	printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n");
342	return (-1);
343}
344
345static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
346{
347	unsigned long flags;
348	mddev_t *mddev = r1_bh->mddev;
349	raid1_conf_t *conf = mddev_to_conf(mddev);
350
351	md_spin_lock_irqsave(&retry_list_lock, flags);
352	if (raid1_retry_list == NULL)
353		raid1_retry_tail = &raid1_retry_list;
354	*raid1_retry_tail = r1_bh;
355	raid1_retry_tail = &r1_bh->next_r1;
356	r1_bh->next_r1 = NULL;
357	md_spin_unlock_irqrestore(&retry_list_lock, flags);
358	md_wakeup_thread(conf->thread);
359}
360
361
362static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
363{
364	unsigned long flags;
365	spin_lock_irqsave(&conf->segment_lock, flags);
366	if (sector < conf->start_active)
367		conf->cnt_done--;
368	else if (sector >= conf->start_future && conf->phase == phase)
369		conf->cnt_future--;
370	else if (!--conf->cnt_pending)
371		wake_up(&conf->wait_ready);
372
373	spin_unlock_irqrestore(&conf->segment_lock, flags);
374}
375
376static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
377{
378	unsigned long flags;
379	spin_lock_irqsave(&conf->segment_lock, flags);
380	if (sector >= conf->start_ready)
381		--conf->cnt_ready;
382	else if (sector >= conf->start_active) {
383		if (!--conf->cnt_active) {
384			conf->start_active = conf->start_ready;
385			wake_up(&conf->wait_done);
386		}
387	}
388	spin_unlock_irqrestore(&conf->segment_lock, flags);
389}
390
391/*
392 * raid1_end_bh_io() is called when we have finished servicing a mirrored
393 * operation and are ready to return a success/failure code to the buffer
394 * cache layer.
395 */
396static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
397{
398	struct buffer_head *bh = r1_bh->master_bh;
399
400	io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
401			test_bit(R1BH_SyncPhase, &r1_bh->state));
402
403	bh->b_end_io(bh, uptodate);
404	raid1_free_r1bh(r1_bh);
405}
406void raid1_end_request (struct buffer_head *bh, int uptodate)
407{
408	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
409
410	/*
411	 * this branch is our 'one mirror IO has finished' event handler:
412	 */
413	if (!uptodate)
414		md_error (r1_bh->mddev, bh->b_dev);
415	else
416		/*
417		 * Set R1BH_Uptodate in our master buffer_head, so that
		 * we will return a good error code to the higher
419		 * levels even if IO on some other mirrored buffer fails.
420		 *
421		 * The 'master' represents the complex operation to
422		 * user-side. So if something waits for IO, then it will
423		 * wait for the 'master' buffer_head.
424		 */
425		set_bit (R1BH_Uptodate, &r1_bh->state);
426
427	/*
428	 * We split up the read and write side, imho they are
429	 * conceptually different.
430	 */
431
432	if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
433		/*
434		 * we have only one buffer_head on the read side
435		 */
436
437		if (uptodate) {
438			raid1_end_bh_io(r1_bh, uptodate);
439			return;
440		}
441		/*
442		 * oops, read error:
443		 */
444		printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
445			 partition_name(bh->b_dev), bh->b_blocknr);
446		raid1_reschedule_retry(r1_bh);
447		return;
448	}
449
450	/*
451	 * WRITE:
452	 *
453	 * Let's see if all mirrored write operations have finished
454	 * already.
455	 */
456
457	if (atomic_dec_and_test(&r1_bh->remaining))
458		raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
459}
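
/*
 * A concrete illustration of the write accounting above (numbers are
 * only an example): on a two-disk mirror, raid1_make_request() sets
 * r1_bh->remaining to 2 before submitting either write; whichever
 * mirror write completes last ends the master bh, and the result is
 * "uptodate" as long as at least one mirror write succeeded, failed
 * mirrors having already been reported through md_error().
 */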
460
461/*
462 * This routine returns the disk from which the requested read should
 * be done. It keeps track of the last read position for every disk
 * in the array and, when a new read request comes in, chooses the
 * disk whose last position is nearest to the requested sector.
466 *
467 * TODO: now if there are 2 mirrors in the same 2 devices, performance
468 * degrades dramatically because position is mirror, not device based.
469 * This should be changed to be device based. Also atomic sequential
470 * reads should be somehow balanced.
471 */
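
/*
 * Illustrative example (made-up numbers): with two operational mirrors
 * whose recorded head_position values are 1000 and 5000, a read at
 * sector 4900 goes to the second mirror (distance 100 vs. 3900),
 * unless sect_count has exceeded sect_limit, in which case the next
 * operational, non-write-only disk is picked to spread the load.
 */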
472
473static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
474{
475	int new_disk = conf->last_used;
476	const int sectors = bh->b_size >> 9;
477	const unsigned long this_sector = bh->b_rsector;
478	int disk = new_disk;
479	unsigned long new_distance;
480	unsigned long current_distance;
481
482	/*
483	 * Check if it is sane at all to balance
484	 */
485
486	if (!conf->mddev->in_sync)
487		goto rb_out;
488
489
490	/* make sure that disk is operational */
491	while( !conf->mirrors[new_disk].operational) {
492		if (new_disk <= 0) new_disk = conf->raid_disks;
493		new_disk--;
494		if (new_disk == disk) {
495			/*
496			 * This means no working disk was found
497			 * Nothing much to do, lets not change anything
498			 * and hope for the best...
499			 */
500
501			new_disk = conf->last_used;
502
503			goto rb_out;
504		}
505	}
506	disk = new_disk;
507	/* now disk == new_disk == starting point for search */
508
509	/*
510	 * Don't touch anything for sequential reads.
511	 */
512
513	if (this_sector == conf->mirrors[new_disk].head_position)
514		goto rb_out;
515
516	/*
	 * If reads have been done only on a single disk
	 * for a while, let's give another disk a chance.
	 * This kicks the idling disks so that they
	 * find work near some hotspot.
521	 */
522
523	if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
524		conf->sect_count = 0;
525
526#if defined(CONFIG_SPARC64) && (__GNUC__ == 2) && (__GNUC_MINOR__ == 92)
527		/* Work around a compiler bug in egcs-2.92.11 19980921 */
528		new_disk = *(volatile int *)&new_disk;
529#endif
530		do {
531			if (new_disk<=0)
532				new_disk = conf->raid_disks;
533			new_disk--;
534			if (new_disk == disk)
535				break;
536		} while ((conf->mirrors[new_disk].write_only) ||
537			 (!conf->mirrors[new_disk].operational));
538
539		goto rb_out;
540	}
541
542	current_distance = abs(this_sector -
543				conf->mirrors[disk].head_position);
544
545	/* Find the disk which is closest */
546
547	do {
548		if (disk <= 0)
549			disk = conf->raid_disks;
550		disk--;
551
552		if ((conf->mirrors[disk].write_only) ||
553				(!conf->mirrors[disk].operational))
554			continue;
555
556		new_distance = abs(this_sector -
557					conf->mirrors[disk].head_position);
558
559		if (new_distance < current_distance) {
560			conf->sect_count = 0;
561			current_distance = new_distance;
562			new_disk = disk;
563		}
564	} while (disk != conf->last_used);
565
566rb_out:
567	conf->mirrors[new_disk].head_position = this_sector + sectors;
568
569	conf->last_used = new_disk;
570	conf->sect_count += sectors;
571
572	return new_disk;
573}
574
static int raid1_make_request (request_queue_t *q, int rw,
			       struct buffer_head * bh)
577{
578	mddev_t *mddev = q->queuedata;
579	raid1_conf_t *conf = mddev_to_conf(mddev);
580	struct buffer_head *bh_req, *bhl;
581	struct raid1_bh * r1_bh;
582	int disks = MD_SB_DISKS;
583	int i, sum_bhs = 0;
584	struct mirror_info *mirror;
585
586	if (!buffer_locked(bh))
587		BUG();
588
589/*
590 * make_request() can abort the operation when READA is being
591 * used and no empty request is available.
592 *
593 * Currently, just replace the command with READ/WRITE.
594 */
	if (rw == READA)
		rw = READ;

	r1_bh = raid1_alloc_r1bh (conf);
596
597	spin_lock_irq(&conf->segment_lock);
598	wait_event_lock_irq(conf->wait_done,
599			bh->b_rsector < conf->start_active ||
600			bh->b_rsector >= conf->start_future,
601			conf->segment_lock);
602	if (bh->b_rsector < conf->start_active)
603		conf->cnt_done++;
604	else {
605		conf->cnt_future++;
606		if (conf->phase)
607			set_bit(R1BH_SyncPhase, &r1_bh->state);
608	}
609	spin_unlock_irq(&conf->segment_lock);
610
611	/*
	 * I think the read and write branches should be separated completely,
613	 * since we want to do read balancing on the read side for example.
614	 * Alternative implementations? :) --mingo
615	 */
616
617	r1_bh->master_bh = bh;
618	r1_bh->mddev = mddev;
619	r1_bh->cmd = rw;
620
621	if (rw == READ) {
622		/*
623		 * read balancing logic:
624		 */
625		mirror = conf->mirrors + raid1_read_balance(conf, bh);
626
627		bh_req = &r1_bh->bh_req;
628		memcpy(bh_req, bh, sizeof(*bh));
629		bh_req->b_blocknr = bh->b_rsector;
630		bh_req->b_dev = mirror->dev;
631		bh_req->b_rdev = mirror->dev;
632	/*	bh_req->b_rsector = bh->n_rsector; */
633		bh_req->b_end_io = raid1_end_request;
634		bh_req->b_private = r1_bh;
635		generic_make_request (rw, bh_req);
636		return 0;
637	}
638
639	/*
640	 * WRITE:
641	 */
642
643	bhl = raid1_alloc_bh(conf, conf->raid_disks);
644	for (i = 0; i < disks; i++) {
645		struct buffer_head *mbh;
646		if (!conf->mirrors[i].operational)
647			continue;
648
649	/*
650	 * We should use a private pool (size depending on NR_REQUEST),
651	 * to avoid writes filling up the memory with bhs
652	 *
653 	 * Such pools are much faster than kmalloc anyways (so we waste
654 	 * almost nothing by not using the master bh when writing and
 	 * win a lot of cleanness) but for now we are cool enough. --mingo
656 	 *
657	 * It's safe to sleep here, buffer heads cannot be used in a shared
658 	 * manner in the write branch. Look how we lock the buffer at the
659 	 * beginning of this function to grok the difference ;)
660	 */
661 		mbh = bhl;
662		if (mbh == NULL) {
663			MD_BUG();
664			break;
665		}
666		bhl = mbh->b_next;
667		mbh->b_next = NULL;
668		mbh->b_this_page = (struct buffer_head *)1;
669
670 	/*
671 	 * prepare mirrored mbh (fields ordered for max mem throughput):
672 	 */
673		mbh->b_blocknr    = bh->b_rsector;
674		mbh->b_dev        = conf->mirrors[i].dev;
675		mbh->b_rdev	  = conf->mirrors[i].dev;
676		mbh->b_rsector	  = bh->b_rsector;
677		mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
678						(1<<BH_Mapped) | (1<<BH_Lock);
679
680		atomic_set(&mbh->b_count, 1);
681 		mbh->b_size       = bh->b_size;
682 		mbh->b_page	  = bh->b_page;
683 		mbh->b_data	  = bh->b_data;
684 		mbh->b_list       = BUF_LOCKED;
685 		mbh->b_end_io     = raid1_end_request;
686 		mbh->b_private    = r1_bh;
687
688		mbh->b_next = r1_bh->mirror_bh_list;
689		r1_bh->mirror_bh_list = mbh;
690		sum_bhs++;
691	}
692	if (bhl) raid1_free_bh(conf,bhl);
693	if (!sum_bhs) {
694		/* Gag - all mirrors non-operational.. */
695		raid1_end_bh_io(r1_bh, 0);
696		return 0;
697	}
698	md_atomic_set(&r1_bh->remaining, sum_bhs);
699
700	/*
	 * We have to be a bit careful about the semaphore above, that's
702	 * why we start the requests separately. Since kmalloc() could
703	 * fail, sleep and make_request() can sleep too, this is the
704	 * safer solution. Imagine, end_request decreasing the semaphore
705	 * before we could have set it up ... We could play tricks with
706	 * the semaphore (presetting it and correcting at the end if
707	 * sum_bhs is not 'n' but we have to do end_request by hand if
	 * all requests finish before we have had a chance to set up the
709	 * semaphore correctly ... lots of races).
710	 */
711	bh = r1_bh->mirror_bh_list;
712	while(bh) {
713		struct buffer_head *bh2 = bh;
714		bh = bh->b_next;
715		generic_make_request(rw, bh2);
716	}
717	return (0);
718}
719
720static void raid1_status(struct seq_file *seq, mddev_t *mddev)
721{
722	raid1_conf_t *conf = mddev_to_conf(mddev);
723	int i;
724
725	seq_printf(seq, " [%d/%d] [", conf->raid_disks,
726						 conf->working_disks);
727	for (i = 0; i < conf->raid_disks; i++)
728		seq_printf(seq, "%s",
729			conf->mirrors[i].operational ? "U" : "_");
730	seq_printf(seq, "]");
731}
732
733#define LAST_DISK KERN_ALERT \
734"raid1: only one disk left and IO error.\n"
735
736#define NO_SPARE_DISK KERN_ALERT \
737"raid1: no spare disk left, degrading mirror level by one.\n"
738
739#define DISK_FAILED KERN_ALERT \
740"raid1: Disk failure on %s, disabling device. \n" \
741"	Operation continuing on %d devices\n"
742
743#define START_SYNCING KERN_ALERT \
744"raid1: start syncing spare disk.\n"
745
746#define ALREADY_SYNCING KERN_INFO \
747"raid1: syncing already in progress.\n"
748
749static void mark_disk_bad (mddev_t *mddev, int failed)
750{
751	raid1_conf_t *conf = mddev_to_conf(mddev);
752	struct mirror_info *mirror = conf->mirrors+failed;
753	mdp_super_t *sb = mddev->sb;
754
755	mirror->operational = 0;
756	mark_disk_faulty(sb->disks+mirror->number);
757	mark_disk_nonsync(sb->disks+mirror->number);
758	mark_disk_inactive(sb->disks+mirror->number);
759	if (!mirror->write_only)
760		sb->active_disks--;
761	sb->working_disks--;
762	sb->failed_disks++;
763	mddev->sb_dirty = 1;
764	md_wakeup_thread(conf->thread);
765	if (!mirror->write_only)
766		conf->working_disks--;
767	printk (DISK_FAILED, partition_name (mirror->dev),
768				 conf->working_disks);
769}
770
771static int raid1_error (mddev_t *mddev, kdev_t dev)
772{
773	raid1_conf_t *conf = mddev_to_conf(mddev);
774	struct mirror_info * mirrors = conf->mirrors;
775	int disks = MD_SB_DISKS;
776	int i;
777
778	/* Find the drive.
779	 * If it is not operational, then we have already marked it as dead
780	 * else if it is the last working disks, ignore the error, let the
781	 * next level up know.
782	 * else mark the drive as failed
783	 */
784
785	for (i = 0; i < disks; i++)
786		if (mirrors[i].dev==dev && mirrors[i].operational)
787			break;
788	if (i == disks)
789		return 0;
790
791	if (i < conf->raid_disks && conf->working_disks == 1) {
792		/* Don't fail the drive, act as though we were just a
793		 * normal single drive
794		 */
795
796		return 1;
797	}
798	mark_disk_bad(mddev, i);
799	return 0;
800}
801
802#undef LAST_DISK
803#undef NO_SPARE_DISK
804#undef DISK_FAILED
805#undef START_SYNCING
806
807
808static void print_raid1_conf (raid1_conf_t *conf)
809{
810	int i;
811	struct mirror_info *tmp;
812
813	printk("RAID1 conf printout:\n");
814	if (!conf) {
815		printk("(conf==NULL)\n");
816		return;
817	}
818	printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
819			 conf->raid_disks, conf->nr_disks);
820
821	for (i = 0; i < MD_SB_DISKS; i++) {
822		tmp = conf->mirrors + i;
823		printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
824			i, tmp->spare,tmp->operational,
825			tmp->number,tmp->raid_disk,tmp->used_slot,
826			partition_name(tmp->dev));
827	}
828}
829
830static void close_sync(raid1_conf_t *conf)
831{
832	mddev_t *mddev = conf->mddev;
833	/* If reconstruction was interrupted, we need to close the "active" and "pending"
834	 * holes.
	 * we know that there are no active rebuild requests, so cnt_active == cnt_ready == 0
836	 */
837	/* this is really needed when recovery stops too... */
838	spin_lock_irq(&conf->segment_lock);
839	conf->start_active = conf->start_pending;
840	conf->start_ready = conf->start_pending;
841	wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
842	conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
843	conf->start_future = (mddev->sb->size<<1)+1;
844	conf->cnt_pending = conf->cnt_future;
845	conf->cnt_future = 0;
846	conf->phase = conf->phase ^1;
847	wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
848	conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
849	conf->phase = 0;
	conf->cnt_future = conf->cnt_done;
851	conf->cnt_done = 0;
852	spin_unlock_irq(&conf->segment_lock);
853	wake_up(&conf->wait_done);
854
855	mempool_destroy(conf->r1buf_pool);
856	conf->r1buf_pool = NULL;
857}
858
859static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
860{
861	int err = 0;
862	int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
863	raid1_conf_t *conf = mddev->private;
864	struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
865	mdp_super_t *sb = mddev->sb;
866	mdp_disk_t *failed_desc, *spare_desc, *added_desc;
867	mdk_rdev_t *spare_rdev, *failed_rdev;
868
869	print_raid1_conf(conf);
870
871	switch (state) {
872	case DISKOP_SPARE_ACTIVE:
873	case DISKOP_SPARE_INACTIVE:
874		/* need to wait for pending sync io before locking device */
875		close_sync(conf);
876	}
877
878	md_spin_lock_irq(&conf->device_lock);
879	/*
880	 * find the disk ...
881	 */
882	switch (state) {
883
884	case DISKOP_SPARE_ACTIVE:
885
886		/*
887		 * Find the failed disk within the RAID1 configuration ...
888		 * (this can only be in the first conf->working_disks part)
889		 */
890		for (i = 0; i < conf->raid_disks; i++) {
891			tmp = conf->mirrors + i;
892			if ((!tmp->operational && !tmp->spare) ||
893					!tmp->used_slot) {
894				failed_disk = i;
895				break;
896			}
897		}
898		/*
899		 * When we activate a spare disk we _must_ have a disk in
900		 * the lower (active) part of the array to replace.
901		 */
902		if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
903			MD_BUG();
904			err = 1;
905			goto abort;
906		}
907		/* fall through */
908
909	case DISKOP_SPARE_WRITE:
910	case DISKOP_SPARE_INACTIVE:
911
912		/*
913		 * Find the spare disk ... (can only be in the 'high'
914		 * area of the array)
915		 */
916		for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
917			tmp = conf->mirrors + i;
918			if (tmp->spare && tmp->number == (*d)->number) {
919				spare_disk = i;
920				break;
921			}
922		}
923		if (spare_disk == -1) {
924			MD_BUG();
925			err = 1;
926			goto abort;
927		}
928		break;
929
930	case DISKOP_HOT_REMOVE_DISK:
931
932		for (i = 0; i < MD_SB_DISKS; i++) {
933			tmp = conf->mirrors + i;
934			if (tmp->used_slot && (tmp->number == (*d)->number)) {
935				if (tmp->operational) {
936					err = -EBUSY;
937					goto abort;
938				}
939				removed_disk = i;
940				break;
941			}
942		}
943		if (removed_disk == -1) {
944			MD_BUG();
945			err = 1;
946			goto abort;
947		}
948		break;
949
950	case DISKOP_HOT_ADD_DISK:
951
952		for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
953			tmp = conf->mirrors + i;
954			if (!tmp->used_slot) {
955				added_disk = i;
956				break;
957			}
958		}
959		if (added_disk == -1) {
960			MD_BUG();
961			err = 1;
962			goto abort;
963		}
964		break;
965	}
966
967	switch (state) {
968	/*
969	 * Switch the spare disk to write-only mode:
970	 */
971	case DISKOP_SPARE_WRITE:
972		sdisk = conf->mirrors + spare_disk;
973		sdisk->operational = 1;
974		sdisk->write_only = 1;
975		break;
976	/*
977	 * Deactivate a spare disk:
978	 */
979	case DISKOP_SPARE_INACTIVE:
		if (conf->start_future > 0) {
			MD_BUG();
			err = -EBUSY;
			break;
		}
990		sdisk = conf->mirrors + spare_disk;
991		sdisk->operational = 0;
992		sdisk->write_only = 0;
993		break;
994	/*
995	 * Activate (mark read-write) the (now sync) spare disk,
	 * which means we switch its 'raid position' (->raid_disk)
997	 * with the failed disk. (only the first 'conf->nr_disks'
998	 * slots are used for 'real' disks and we must preserve this
999	 * property)
1000	 */
1001	case DISKOP_SPARE_ACTIVE:
		if (conf->start_future > 0) {
			MD_BUG();
			err = -EBUSY;
			break;
		}
1012		sdisk = conf->mirrors + spare_disk;
1013		fdisk = conf->mirrors + failed_disk;
1014
1015		spare_desc = &sb->disks[sdisk->number];
1016		failed_desc = &sb->disks[fdisk->number];
1017
1018		if (spare_desc != *d) {
1019			MD_BUG();
1020			err = 1;
1021			goto abort;
1022		}
1023
1024		if (spare_desc->raid_disk != sdisk->raid_disk) {
1025			MD_BUG();
1026			err = 1;
1027			goto abort;
1028		}
1029
1030		if (sdisk->raid_disk != spare_disk) {
1031			MD_BUG();
1032			err = 1;
1033			goto abort;
1034		}
1035
1036		if (failed_desc->raid_disk != fdisk->raid_disk) {
1037			MD_BUG();
1038			err = 1;
1039			goto abort;
1040		}
1041
1042		if (fdisk->raid_disk != failed_disk) {
1043			MD_BUG();
1044			err = 1;
1045			goto abort;
1046		}
1047
1048		/*
1049		 * do the switch finally
1050		 */
1051		spare_rdev = find_rdev_nr(mddev, spare_desc->number);
1052		failed_rdev = find_rdev_nr(mddev, failed_desc->number);
1053
1054		/* There must be a spare_rdev, but there may not be a
1055		 * failed_rdev.  That slot might be empty...
1056		 */
1057		spare_rdev->desc_nr = failed_desc->number;
1058		if (failed_rdev)
1059			failed_rdev->desc_nr = spare_desc->number;
1060
1061		xchg_values(*spare_desc, *failed_desc);
1062		xchg_values(*fdisk, *sdisk);
1063
1064		/*
1065		 * (careful, 'failed' and 'spare' are switched from now on)
1066		 *
1067		 * we want to preserve linear numbering and we want to
1068		 * give the proper raid_disk number to the now activated
1069		 * disk. (this means we switch back these values)
1070		 */
1071
1072		xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
1073		xchg_values(sdisk->raid_disk, fdisk->raid_disk);
1074		xchg_values(spare_desc->number, failed_desc->number);
1075		xchg_values(sdisk->number, fdisk->number);
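		/*
		 * Net effect of the swaps plus the swap-back of raid_disk
		 * and number: the device information changes slots while
		 * each slot keeps its place in the linear numbering.  For
		 * example (slot numbers purely illustrative), a spare in
		 * slot 3 taking over for a failed disk in slot 1 ends up
		 * described by mirrors[1] and sb->disks[1], which is why
		 * the code below activates "fdisk" rather than "sdisk".
		 */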
1076
1077		*d = failed_desc;
1078
1079		if (sdisk->dev == MKDEV(0,0))
1080			sdisk->used_slot = 0;
1081		/*
1082		 * this really activates the spare.
1083		 */
1084		fdisk->spare = 0;
1085		fdisk->write_only = 0;
1086
1087		/*
1088		 * if we activate a spare, we definitely replace a
1089		 * non-operational disk slot in the 'low' area of
1090		 * the disk array.
1091		 */
1092
1093		conf->working_disks++;
1094
1095		break;
1096
1097	case DISKOP_HOT_REMOVE_DISK:
1098		rdisk = conf->mirrors + removed_disk;
1099
1100		if (rdisk->spare && (removed_disk < conf->raid_disks)) {
1101			MD_BUG();
1102			err = 1;
1103			goto abort;
1104		}
1105		rdisk->dev = MKDEV(0,0);
1106		rdisk->used_slot = 0;
1107		conf->nr_disks--;
1108		break;
1109
1110	case DISKOP_HOT_ADD_DISK:
1111		adisk = conf->mirrors + added_disk;
1112		added_desc = *d;
1113
1114		if (added_disk != added_desc->number) {
1115			MD_BUG();
1116			err = 1;
1117			goto abort;
1118		}
1119
1120		adisk->number = added_desc->number;
1121		adisk->raid_disk = added_desc->raid_disk;
1122		adisk->dev = MKDEV(added_desc->major,added_desc->minor);
1123
1124		adisk->operational = 0;
1125		adisk->write_only = 0;
1126		adisk->spare = 1;
1127		adisk->used_slot = 1;
1128		adisk->head_position = 0;
1129		conf->nr_disks++;
1130
1131		break;
1132
1133	default:
1134		MD_BUG();
1135		err = 1;
1136		goto abort;
1137	}
1138abort:
1139	md_spin_unlock_irq(&conf->device_lock);
	if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
		/* should move to "END_REBUILD" when such exists */
		raid1_shrink_buffers(conf);

	print_raid1_conf(conf);
1157	return err;
1158}
1159
1160
1161#define IO_ERROR KERN_ALERT \
1162"raid1: %s: unrecoverable I/O read error for block %lu\n"
1163
1164#define REDIRECT_SECTOR KERN_ERR \
1165"raid1: %s: redirecting sector %lu to another mirror\n"
1166
1167/*
1168 * This is a kernel thread which:
1169 *
1170 *	1.	Retries failed read operations on working mirrors.
 *	2.	Updates the raid superblock when problems are encountered.
 *	3.	Performs writes following reads for array synchronising.
1173 */
1174static void end_sync_write(struct buffer_head *bh, int uptodate);
1175static void end_sync_read(struct buffer_head *bh, int uptodate);
1176
1177static void raid1d (void *data)
1178{
1179	struct raid1_bh *r1_bh;
1180	struct buffer_head *bh;
1181	unsigned long flags;
1182	raid1_conf_t *conf = data;
1183	mddev_t *mddev = conf->mddev;
1184	kdev_t dev;
1185
1186	if (mddev->sb_dirty)
1187		md_update_sb(mddev);
1188
1189	for (;;) {
1190		md_spin_lock_irqsave(&retry_list_lock, flags);
1191		r1_bh = raid1_retry_list;
1192		if (!r1_bh)
1193			break;
1194		raid1_retry_list = r1_bh->next_r1;
1195		md_spin_unlock_irqrestore(&retry_list_lock, flags);
1196
1197		mddev = r1_bh->mddev;
1198		bh = &r1_bh->bh_req;
1199		switch(r1_bh->cmd) {
1200		case SPECIAL:
1201			/* have to allocate lots of bh structures and
1202			 * schedule writes
1203			 */
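			/* For instance, while rebuilding a spare on an
			 * otherwise in-sync array, the block just read is
			 * written only to the write-only spare(s): the
			 * "i < conf->raid_disks && mddev->in_sync" test
			 * below skips the disks that are already in sync,
			 * and the disk we read from is skipped as well.
			 */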
1204			if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
1205				int i, sum_bhs = 0;
1206				int disks = MD_SB_DISKS;
1207				struct buffer_head *bhl, *mbh;
1208
1209				conf = mddev_to_conf(mddev);
1210				bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
1211				for (i = 0; i < disks ; i++) {
1212					if (!conf->mirrors[i].operational)
1213						continue;
1214					if (i==conf->last_used)
1215						/* we read from here, no need to write */
1216						continue;
1217					if (i < conf->raid_disks
1218					    && mddev->in_sync)
1219						/* don't need to write this,
1220						 * we are just rebuilding */
1221						continue;
1222					mbh = bhl;
1223					if (!mbh) {
1224						MD_BUG();
1225						break;
1226					}
1227					bhl = mbh->b_next;
1228					mbh->b_this_page = (struct buffer_head *)1;
1229
1230
1231				/*
1232				 * prepare mirrored bh (fields ordered for max mem throughput):
1233				 */
1234					mbh->b_blocknr    = bh->b_blocknr;
1235					mbh->b_dev        = conf->mirrors[i].dev;
1236					mbh->b_rdev	  = conf->mirrors[i].dev;
1237					mbh->b_rsector	  = bh->b_blocknr;
1238					mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
1239						(1<<BH_Mapped) | (1<<BH_Lock);
1240					atomic_set(&mbh->b_count, 1);
1241					mbh->b_size       = bh->b_size;
1242					mbh->b_page	  = bh->b_page;
1243					mbh->b_data	  = bh->b_data;
1244					mbh->b_list       = BUF_LOCKED;
1245					mbh->b_end_io     = end_sync_write;
1246					mbh->b_private    = r1_bh;
1247
1248					mbh->b_next = r1_bh->mirror_bh_list;
1249					r1_bh->mirror_bh_list = mbh;
1250
1251					sum_bhs++;
1252				}
1253				md_atomic_set(&r1_bh->remaining, sum_bhs);
1254				if (bhl) raid1_free_bh(conf, bhl);
1255				mbh = r1_bh->mirror_bh_list;
1256
1257				if (!sum_bhs) {
					/* nowhere to write this to... I guess we
1259					 * must be done
1260					 */
1261					sync_request_done(bh->b_blocknr, conf);
1262					md_done_sync(mddev, bh->b_size>>9, 0);
1263					raid1_free_buf(r1_bh);
1264				} else
1265				while (mbh) {
1266					struct buffer_head *bh1 = mbh;
1267					mbh = mbh->b_next;
1268					generic_make_request(WRITE, bh1);
1269					md_sync_acct(bh1->b_dev, bh1->b_size/512);
1270				}
1271			} else {
1272				/* There is no point trying a read-for-reconstruct
1273				 * as reconstruct is about to be aborted
1274				 */
1275
1276				printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1277				md_done_sync(mddev, bh->b_size>>9, 0);
1278			}
1279
1280			break;
1281		case READ:
1282		case READA:
1283			dev = bh->b_dev;
1284			raid1_map (mddev, &bh->b_dev);
1285			if (bh->b_dev == dev) {
1286				printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1287				raid1_end_bh_io(r1_bh, 0);
1288			} else {
1289				printk (REDIRECT_SECTOR,
1290					partition_name(bh->b_dev), bh->b_blocknr);
1291				bh->b_rdev = bh->b_dev;
1292				bh->b_rsector = bh->b_blocknr;
1293				generic_make_request (r1_bh->cmd, bh);
1294			}
1295			break;
1296		}
1297	}
1298	md_spin_unlock_irqrestore(&retry_list_lock, flags);
1299}
1300#undef IO_ERROR
1301#undef REDIRECT_SECTOR
1302
1313
1314/*
1315 * perform a "sync" on one "block"
1316 *
1317 * We need to make sure that no normal I/O request - particularly write
1318 * requests - conflict with active sync requests.
1319 * This is achieved by conceptually dividing the device space into a
1320 * number of sections:
1321 *  DONE: 0 .. a-1     These blocks are in-sync
1322 *  ACTIVE: a.. b-1    These blocks may have active sync requests, but
1323 *                     no normal IO requests
1324 *  READY: b .. c-1    These blocks have no normal IO requests - sync
1325 *                     request may be happening
1326 *  PENDING: c .. d-1  These blocks may have IO requests, but no new
1327 *                     ones will be added
1328 *  FUTURE:  d .. end  These blocks are not to be considered yet. IO may
1329 *                     be happening, but not sync
1330 *
1331 * We keep a
1332 *   phase    which flips (0 or 1) each time d moves and
1333 * a count of:
1334 *   z =  active io requests in FUTURE since d moved - marked with
1335 *        current phase
1336 *   y =  active io requests in FUTURE before d moved, or PENDING -
1337 *        marked with previous phase
1338 *   x =  active sync requests in READY
1339 *   w =  active sync requests in ACTIVE
1340 *   v =  active io requests in DONE
1341 *
1342 * Normally, a=b=c=d=0 and z= active io requests
1343 *   or a=b=c=d=END and v= active io requests
1344 * Allowed changes to a,b,c,d:
1345 * A:  c==d &&  y==0 -> d+=window, y=z, z=0, phase=!phase
1346 * B:  y==0 -> c=d
1347 * C:   b=c, w+=x, x=0
1348 * D:  w==0 -> a=b
1349 * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
1350 *
 * At start of sync we apply A.
 * When y reaches 0, we apply B then A then begin sync requests.
 * When the sync point reaches c-1, we wait for y==0 and w==0, and
 * then apply B then A then D then C.
 * Finally, we apply E.
1356 *
1357 * The sync request simply issues a "read" against a working drive
1358 * This is marked so that on completion the raid1d thread is woken to
1359 * issue suitable write requests
1360 */
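
/*
 * A worked example with made-up numbers, assuming a 4K PAGE_SIZE so
 * that conf->window ends up as 128 sectors: the first call to
 * raid1_sync_request() leaves start_active = start_ready = 0,
 * start_pending = 128 and start_future = 256, i.e. READY covers
 * sectors 0-127 and PENDING 128-255.  A normal write to sector 300
 * then falls in FUTURE and only bumps cnt_future, while a write to
 * sector 10 blocks in raid1_make_request() until start_active has
 * advanced past it.
 */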
1361
1362static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
1363{
1364	raid1_conf_t *conf = mddev_to_conf(mddev);
1365	struct mirror_info *mirror;
1366	struct raid1_bh *r1_bh;
1367	struct buffer_head *bh;
1368	int bsize;
1369	int disk;
1370	int block_nr;
1371	int buffs;
1372
1373	if (!sector_nr) {
1374		/* we want enough buffers to hold twice the window of 128*/
1375		buffs = 128 *2 / (PAGE_SIZE>>9);
1376		buffs = raid1_grow_buffers(conf, buffs);
1377		if (buffs < 2)
1378			goto nomem;
1379		conf->window = buffs*(PAGE_SIZE>>9)/2;
1380	}
1381	spin_lock_irq(&conf->segment_lock);
1382	if (!sector_nr) {
1383		/* initialize ...*/
1384		conf->start_active = 0;
1385		conf->start_ready = 0;
1386		conf->start_pending = 0;
1387		conf->start_future = 0;
1388		conf->phase = 0;
1389
1390		conf->cnt_future += conf->cnt_done+conf->cnt_pending;
1391		conf->cnt_done = conf->cnt_pending = 0;
1392		if (conf->cnt_ready || conf->cnt_active)
1393			MD_BUG();
1394	}
1395	while (sector_nr >= conf->start_pending) {
1396		PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
1397			sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
1398			conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
1399		wait_event_lock_irq(conf->wait_done,
1400					!conf->cnt_active,
1401					conf->segment_lock);
1402		wait_event_lock_irq(conf->wait_ready,
1403					!conf->cnt_pending,
1404					conf->segment_lock);
1405		conf->start_active = conf->start_ready;
1406		conf->start_ready = conf->start_pending;
1407		conf->start_pending = conf->start_future;
1408		conf->start_future = conf->start_future+conf->window;
1409		// Note: falling off the end is not a problem
1410		conf->phase = conf->phase ^1;
1411		conf->cnt_active = conf->cnt_ready;
1412		conf->cnt_ready = 0;
1413		conf->cnt_pending = conf->cnt_future;
1414		conf->cnt_future = 0;
1415		wake_up(&conf->wait_done);
1416	}
1417	conf->cnt_ready++;
1418	spin_unlock_irq(&conf->segment_lock);
1419
1420
1421	/* If reconstructing, and >1 working disc,
1422	 * could dedicate one to rebuild and others to
1423	 * service read requests ..
1424	 */
1425	disk = conf->last_used;
1426	/* make sure disk is operational */
1427	while (!conf->mirrors[disk].operational) {
1428		if (disk <= 0) disk = conf->raid_disks;
1429		disk--;
1430		if (disk == conf->last_used)
1431			break;
1432	}
1433	conf->last_used = disk;
1434
1435	mirror = conf->mirrors+conf->last_used;
1436
1437	r1_bh = raid1_alloc_buf (conf);
1438	r1_bh->master_bh = NULL;
1439	r1_bh->mddev = mddev;
1440	r1_bh->cmd = SPECIAL;
1441	bh = &r1_bh->bh_req;
1442
1443	block_nr = sector_nr;
1444	bsize = 512;
1445	while (!(block_nr & 1) && bsize < PAGE_SIZE
1446			&& (block_nr+2)*(bsize>>9) < (mddev->sb->size *2)) {
1447		block_nr >>= 1;
1448		bsize <<= 1;
1449	}
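	/*
	 * e.g. (assuming a 4K PAGE_SIZE and a large enough device) an
	 * aligned sector_nr such as 64 grows bsize 512->1024->2048->4096,
	 * so a whole page is resynced per request, while an odd sector_nr
	 * stays at a single 512-byte block.
	 */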
1450	bh->b_size = bsize;
1451	bh->b_list = BUF_LOCKED;
1452	bh->b_dev = mirror->dev;
1453	bh->b_rdev = mirror->dev;
1454	bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
1455	if (!bh->b_page)
1456		BUG();
1457	if (!bh->b_data)
1458		BUG();
1459	if (bh->b_data != page_address(bh->b_page))
1460		BUG();
1461	bh->b_end_io = end_sync_read;
1462	bh->b_private = r1_bh;
1463	bh->b_blocknr = sector_nr;
1464	bh->b_rsector = sector_nr;
1465	init_waitqueue_head(&bh->b_wait);
1466
1467	generic_make_request(READ, bh);
1468	md_sync_acct(bh->b_dev, bh->b_size/512);
1469
1470	return (bsize >> 9);
1471
1472nomem:
	raid1_shrink_buffers(conf);
	return -ENOMEM;
}

static void end_sync_read(struct buffer_head *bh, int uptodate)
1501{
1502	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
1503
1504	/* we have read a block, now it needs to be re-written,
1505	 * or re-read if the read failed.
1506	 * We don't do much here, just schedule handling by raid1d
1507	 */
1508	if (!uptodate)
1509		md_error (r1_bh->mddev, bh->b_dev);
1510	else
1511		set_bit(R1BH_Uptodate, &r1_bh->state);
1512	raid1_reschedule_retry(r1_bh);
1513}
1514
1515static void end_sync_write(struct buffer_head *bh, int uptodate)
1516{
1517 	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
1518
1519	if (!uptodate)
1520 		md_error (r1_bh->mddev, bh->b_dev);
1521	if (atomic_dec_and_test(&r1_bh->remaining)) {
1522		mddev_t *mddev = r1_bh->mddev;
		unsigned long sect = bh->b_blocknr;
		int size = bh->b_size;
		raid1_free_buf(r1_bh);
		sync_request_done(sect, mddev_to_conf(mddev));
		md_done_sync(mddev, size>>9, uptodate);
1542	}
1543}
1544
1545#define INVALID_LEVEL KERN_WARNING \
1546"raid1: md%d: raid level not set to mirroring (%d)\n"
1547
1548#define NO_SB KERN_ERR \
1549"raid1: disabled mirror %s (couldn't access raid superblock)\n"
1550
1551#define ERRORS KERN_ERR \
1552"raid1: disabled mirror %s (errors detected)\n"
1553
1554#define NOT_IN_SYNC KERN_ERR \
1555"raid1: disabled mirror %s (not in sync)\n"
1556
1557#define INCONSISTENT KERN_ERR \
1558"raid1: disabled mirror %s (inconsistent descriptor)\n"
1559
1560#define ALREADY_RUNNING KERN_ERR \
1561"raid1: disabled mirror %s (mirror %d already operational)\n"
1562
1563#define OPERATIONAL KERN_INFO \
1564"raid1: device %s operational as mirror %d\n"
1565
1566#define MEM_ERROR KERN_ERR \
1567"raid1: couldn't allocate memory for md%d\n"
1568
1569#define SPARE KERN_INFO \
1570"raid1: spare disk %s\n"
1571
1572#define NONE_OPERATIONAL KERN_ERR \
1573"raid1: no operational mirrors for md%d\n"
1574
1575#define ARRAY_IS_ACTIVE KERN_INFO \
1576"raid1: raid set md%d active with %d out of %d mirrors\n"
1577
1578#define THREAD_ERROR KERN_ERR \
1579"raid1: couldn't allocate thread for md%d\n"
1580
1581#define START_RESYNC KERN_WARNING \
1582"raid1: raid set md%d not clean; reconstructing mirrors\n"
1583
1584static int raid1_run (mddev_t *mddev)
1585{
1586	raid1_conf_t *conf;
1587	int i, j, disk_idx;
1588	struct mirror_info *disk;
1589	mdp_super_t *sb = mddev->sb;
1590	mdp_disk_t *descriptor;
1591	mdk_rdev_t *rdev;
1592	struct md_list_head *tmp;
1593
1594	MOD_INC_USE_COUNT;
1595
1596	if (sb->level != 1) {
1597		printk(INVALID_LEVEL, mdidx(mddev), sb->level);
1598		goto out;
1599	}
1600	/*
1601	 * copy the already verified devices into our private RAID1
1602	 * bookkeeping area. [whatever we allocate in raid1_run(),
1603	 * should be freed in raid1_stop()]
1604	 */
1605
1606	conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
1607	mddev->private = conf;
1608	if (!conf) {
1609		printk(MEM_ERROR, mdidx(mddev));
1610		goto out;
1611	}
1612	memset(conf, 0, sizeof(*conf));
1613
1614	ITERATE_RDEV(mddev,rdev,tmp) {
1615		if (rdev->faulty) {
1616			printk(ERRORS, partition_name(rdev->dev));
1617		} else {
1618			if (!rdev->sb) {
1619				MD_BUG();
1620				continue;
1621			}
1622		}
1623		if (rdev->desc_nr == -1) {
1624			MD_BUG();
1625			continue;
1626		}
1627		descriptor = &sb->disks[rdev->desc_nr];
1628		disk_idx = descriptor->raid_disk;
1629		disk = conf->mirrors + disk_idx;
1630
1631		if (disk_faulty(descriptor)) {
1632			disk->number = descriptor->number;
1633			disk->raid_disk = disk_idx;
1634			disk->dev = rdev->dev;
1635			disk->sect_limit = MAX_WORK_PER_DISK;
1636			disk->operational = 0;
1637			disk->write_only = 0;
1638			disk->spare = 0;
1639			disk->used_slot = 1;
1640			disk->head_position = 0;
1641			continue;
1642		}
1643		if (disk_active(descriptor)) {
1644			if (!disk_sync(descriptor)) {
1645				printk(NOT_IN_SYNC,
1646					partition_name(rdev->dev));
1647				continue;
1648			}
1649			if ((descriptor->number > MD_SB_DISKS) ||
1650					 (disk_idx > sb->raid_disks)) {
1651
1652				printk(INCONSISTENT,
1653					partition_name(rdev->dev));
1654				continue;
1655			}
1656			if (disk->operational) {
1657				printk(ALREADY_RUNNING,
1658					partition_name(rdev->dev),
1659					disk_idx);
1660				continue;
1661			}
1662			printk(OPERATIONAL, partition_name(rdev->dev),
1663 					disk_idx);
1664			disk->number = descriptor->number;
1665			disk->raid_disk = disk_idx;
1666			disk->dev = rdev->dev;
1667			disk->sect_limit = MAX_WORK_PER_DISK;
1668			disk->operational = 1;
1669			disk->write_only = 0;
1670			disk->spare = 0;
1671			disk->used_slot = 1;
1672			disk->head_position = 0;
1673			conf->working_disks++;
1674		} else {
1675		/*
1676		 * Must be a spare disk ..
1677		 */
1678			printk(SPARE, partition_name(rdev->dev));
1679			disk->number = descriptor->number;
1680			disk->raid_disk = disk_idx;
1681			disk->dev = rdev->dev;
1682			disk->sect_limit = MAX_WORK_PER_DISK;
1683			disk->operational = 0;
1684			disk->write_only = 0;
1685			disk->spare = 1;
1686			disk->used_slot = 1;
1687			disk->head_position = 0;
1688		}
1689	}
1690	conf->raid_disks = sb->raid_disks;
1691	conf->nr_disks = sb->nr_disks;
1692	conf->mddev = mddev;
1693	conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
1694
1695	conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
1696	init_waitqueue_head(&conf->wait_buffer);
1697	init_waitqueue_head(&conf->wait_done);
1698	init_waitqueue_head(&conf->wait_ready);
1699
1700	if (!conf->working_disks) {
1701		printk(NONE_OPERATIONAL, mdidx(mddev));
1702		goto out_free_conf;
1703	}
1704
1705
1706	/* pre-allocate some buffer_head structures.
1707	 * As a minimum, 1 r1bh and raid_disks buffer_heads
1708	 * would probably get us by in tight memory situations,
1709	 * but a few more is probably a good idea.
1710	 * For now, try NR_RESERVED_BUFS r1bh and
1711	 * NR_RESERVED_BUFS*raid_disks bufferheads
1712	 * This will allow at least NR_RESERVED_BUFS concurrent
1713	 * reads or writes even if kmalloc starts failing
1714	 */
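	/* E.g. with the default NR_RESERVED_BUFS of 32 and a two-disk
	 * mirror (example configuration) this reserves 32 raid1_bh
	 * structures and 64 buffer_heads, enough for 32 writes in flight
	 * (one bh per mirror each) without touching the allocator.
	 */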
1715	if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS ||
1716	    raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks)
1717	                      < NR_RESERVED_BUFS*conf->raid_disks) {
1718		printk(MEM_ERROR, mdidx(mddev));
1719		goto out_free_conf;
1720	}
1721
1722	for (i = 0; i < MD_SB_DISKS; i++) {
1723
1724		descriptor = sb->disks+i;
1725		disk_idx = descriptor->raid_disk;
1726		disk = conf->mirrors + disk_idx;
1727
1728		if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
1729				!disk->used_slot) {
1730
1731			disk->number = descriptor->number;
1732			disk->raid_disk = disk_idx;
1733			disk->dev = MKDEV(0,0);
1734
1735			disk->operational = 0;
1736			disk->write_only = 0;
1737			disk->spare = 0;
1738			disk->used_slot = 1;
1739			disk->head_position = 0;
1740		}
1741	}
1742
1743	/*
1744	 * find the first working one and use it as a starting point
1745	 * to read balancing.
1746	 */
	for (j = 0; j < MD_SB_DISKS && !conf->mirrors[j].operational; j++)
1748		/* nothing */;
1749	conf->last_used = j;
1750
1751
1752
1753	{
1754		const char * name = "raid1d";
1755
1756		conf->thread = md_register_thread(raid1d, conf, name);
1757		if (!conf->thread) {
1758			printk(THREAD_ERROR, mdidx(mddev));
1759			goto out_free_conf;
1760		}
1761	}
1762
1777
1778	/*
1779	 * Regenerate the "device is in sync with the raid set" bit for
1780	 * each device.
1781	 */
1782	for (i = 0; i < MD_SB_DISKS; i++) {
1783		mark_disk_nonsync(sb->disks+i);
1784		for (j = 0; j < sb->raid_disks; j++) {
1785			if (!conf->mirrors[j].operational)
1786				continue;
1787			if (sb->disks[i].number == conf->mirrors[j].number)
1788				mark_disk_sync(sb->disks+i);
1789		}
1790	}
1791	sb->active_disks = conf->working_disks;
1792
1793	printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
1794	/*
1795	 * Ok, everything is just fine now
1796	 */
1797	return 0;
1798
1799out_free_conf:
1800	raid1_shrink_r1bh(conf);
1801	raid1_shrink_bh(conf);
1802	raid1_shrink_buffers(conf);
1803	kfree(conf);
1804	mddev->private = NULL;
1805out:
1806	MOD_DEC_USE_COUNT;
1807	return -EIO;
1808}
1809
1810#undef INVALID_LEVEL
1811#undef NO_SB
1812#undef ERRORS
1813#undef NOT_IN_SYNC
1814#undef INCONSISTENT
1815#undef ALREADY_RUNNING
1816#undef OPERATIONAL
1817#undef SPARE
1818#undef NONE_OPERATIONAL
1819#undef ARRAY_IS_ACTIVE
1820
1863static int raid1_stop (mddev_t *mddev)
1864{
1865	raid1_conf_t *conf = mddev_to_conf(mddev);
1866
1867	md_unregister_thread(conf->thread);
1868	raid1_shrink_r1bh(conf);
1869	raid1_shrink_bh(conf);
1870	raid1_shrink_buffers(conf);
1871	kfree(conf);
1872	mddev->private = NULL;
1873	MOD_DEC_USE_COUNT;
1874	return 0;
1875}
1876
1877static mdk_personality_t raid1_personality=
1878{
1879	name:		"raid1",
1880	make_request:	raid1_make_request,
1881	run:		raid1_run,
1882	stop:		raid1_stop,
1883	status:		raid1_status,
1884	error_handler:	raid1_error,
1885	diskop:		raid1_diskop,
1894	sync_request:	raid1_sync_request
1895};
1896
1897static int md__init raid1_init (void)
1898{
1899	return register_md_personality (RAID1, &raid1_personality);
1900}
1901
1902static void raid1_exit (void)
1903{
1904	unregister_md_personality (RAID1);
1905}
1906
1907module_init(raid1_init);
1908module_exit(raid1_exit);
1909MODULE_LICENSE("GPL");
1910