xref: /dragonfly/sys/vfs/hammer/hammer_redo.c (revision e98bdfd3)
/*
 * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * HAMMER redo - REDO record support for the UNDO/REDO FIFO.
 *
 * See also hammer_undo.c
 */

#include "hammer.h"

RB_GENERATE2(hammer_redo_rb_tree, hammer_inode, rb_redonode,
	     hammer_redo_rb_compare, hammer_off_t, redo_fifo_start);

/*
 * HAMMER version 4+ REDO support.
 *
 * REDO records are used to improve fsync() performance.  Instead of having
 * to go through a complete double-flush cycle involving at least two disk
 * synchronizations the fsync need only flush UNDO/REDO FIFO buffers through
 * the related REDO records, which is a single synchronization requiring
 * no track seeking.  If a recovery becomes necessary the recovery code
 * will generate logical data writes based on the REDO records encountered.
 * That is, the recovery code will UNDO any partial meta-data/data writes
 * at the raw disk block level and then REDO the data writes at the logical
 * level.
 */
int
hammer_generate_redo(hammer_transaction_t trans, hammer_inode_t ip,
		     hammer_off_t file_off, uint32_t flags,
		     void *base, int len)
{
	hammer_mount_t hmp;
	hammer_volume_t root_volume;
	hammer_blockmap_t undomap;
	hammer_buffer_t buffer = NULL;
	hammer_fifo_redo_t redo;
	hammer_fifo_tail_t tail;
	hammer_off_t next_offset;
	int error;
	int bytes;
	int n;

	/*
	 * Setup
	 */
	hmp = trans->hmp;

	root_volume = trans->rootvol;
	undomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];

	/*
	 * No undo recursion when modifying the root volume
	 */
	hammer_modify_volume_noundo(NULL, root_volume);
	hammer_lock_ex(&hmp->undo_lock);

	/* undo had better not roll over (loose test) */
	if (hammer_undo_space(trans) < len + HAMMER_BUFSIZE*3)
		hpanic("insufficient undo FIFO space for redo!");

	/*
	 * Loop until the undo for the entire range has been laid down.
	 * Loop at least once (len might be 0 as a degenerate case).
	 */
	for (;;) {
		/*
		 * Fetch the layout offset in the UNDO FIFO, wrap it as
		 * necessary.
		 */
		if (undomap->next_offset == undomap->alloc_offset) {
			undomap->next_offset =
				HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
		}
		next_offset = undomap->next_offset;

		/*
		 * This is a tail-chasing FIFO, when we hit the start of a new
		 * buffer we don't have to read it in.
		 */
		if ((next_offset & HAMMER_BUFMASK) == 0) {
			redo = hammer_bnew(hmp, next_offset, &error, &buffer);
			hammer_format_undo(redo, hmp->undo_seqno ^ 0x40000000);
		} else {
			redo = hammer_bread(hmp, next_offset, &error, &buffer);
		}
		if (error)
			break;
		hammer_modify_buffer_noundo(NULL, buffer);

		/*
		 * Calculate how big a media structure fits up to the next
		 * alignment point and how large a data payload we can
		 * accommodate.
		 *
		 * If n calculates to 0 or negative there is no room for
		 * anything but a PAD.
		 */
		bytes = HAMMER_UNDO_ALIGN -
			((int)next_offset & HAMMER_UNDO_MASK);
		n = bytes -
		    (int)sizeof(struct hammer_fifo_redo) -
		    (int)sizeof(struct hammer_fifo_tail);

		/*
		 * If available space is insufficient for any payload
		 * we have to lay down a PAD.
		 *
		 * The minimum PAD is 8 bytes and the head and tail will
		 * overlap each other in that case.  PADs do not have
		 * sequence numbers or CRCs.
		 *
		 * A PAD may not start on a boundary.  That is, every
		 * 512-byte block in the UNDO/REDO FIFO must begin with
		 * a record containing a sequence number.
		 */
		if (n <= 0) {
			KKASSERT(bytes >= sizeof(struct hammer_fifo_tail));
			KKASSERT(((int)next_offset & HAMMER_UNDO_MASK) != 0);
			tail = (void *)((char *)redo + bytes - sizeof(*tail));
			if ((void *)redo != (void *)tail) {
				tail->tail_signature = HAMMER_TAIL_SIGNATURE;
				tail->tail_type = HAMMER_HEAD_TYPE_PAD;
				tail->tail_size = bytes;
			}
			redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
			redo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
			redo->head.hdr_size = bytes;
			/* NO CRC OR SEQ NO */
			undomap->next_offset += bytes;
			hammer_modify_buffer_done(buffer);
			hammer_stats_redo += bytes;
			continue;
		}

		/*
		 * When generating an inode-related REDO record we track
		 * the point in the UNDO/REDO FIFO containing the inode's
		 * earliest REDO record.  See hammer_generate_redo_sync().
		 *
		 * redo_fifo_next is cleared when an inode is staged to
		 * the backend and then used to determine how to reassign
		 * redo_fifo_start after the inode flush completes.
		 */
		if (ip) {
			redo->redo_objid = ip->obj_id;
			redo->redo_localization = ip->obj_localization;
			if ((ip->flags & HAMMER_INODE_RDIRTY) == 0) {
				ip->redo_fifo_start = next_offset;
				if (RB_INSERT(hammer_redo_rb_tree,
					      &hmp->rb_redo_root, ip)) {
					hpanic("cannot insert inode %p on "
					      "redo FIFO", ip);
				}
				ip->flags |= HAMMER_INODE_RDIRTY;
			}
			if (ip->redo_fifo_next == 0)
				ip->redo_fifo_next = next_offset;
		} else {
			redo->redo_objid = 0;
			redo->redo_localization = 0;
		}

		/*
		 * Calculate the actual payload and recalculate the size
		 * of the media structure as necessary.  If no data buffer
		 * is supplied there is no payload.
		 */
		if (base == NULL) {
			n = 0;
		} else if (n > len) {
			n = len;
		}
		bytes = ((n + HAMMER_HEAD_ALIGN_MASK) &
			 ~HAMMER_HEAD_ALIGN_MASK) +
			(int)sizeof(struct hammer_fifo_redo) +
			(int)sizeof(struct hammer_fifo_tail);
		if (hammer_debug_general & 0x0080) {
			hdkprintf("redo %016jx %d %d\n",
				(intmax_t)next_offset, bytes, n);
		}

		redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
		redo->head.hdr_type = HAMMER_HEAD_TYPE_REDO;
		redo->head.hdr_size = bytes;
		redo->head.hdr_seq = hmp->undo_seqno++;
		redo->head.hdr_crc = 0;
		redo->redo_mtime = trans->time;
		redo->redo_offset = file_off;
		redo->redo_flags = flags;

		/*
		 * Incremental payload.  If no payload we throw the entire
		 * len into redo_data_bytes and will not loop.
		 */
		if (base) {
			redo->redo_data_bytes = n;
			bcopy(base, redo + 1, n);
			len -= n;
			base = (char *)base + n;
			file_off += n;
		} else {
			redo->redo_data_bytes = len;
			file_off += len;
			len = 0;
		}

		tail = (void *)((char *)redo + bytes - sizeof(*tail));
		tail->tail_signature = HAMMER_TAIL_SIGNATURE;
		tail->tail_type = HAMMER_HEAD_TYPE_REDO;
		tail->tail_size = bytes;

		KKASSERT(bytes >= sizeof(redo->head));
		redo->head.hdr_crc = crc32(redo, HAMMER_FIFO_HEAD_CRCOFF) ^
			     crc32(&redo->head + 1, bytes - sizeof(redo->head));
		undomap->next_offset += bytes;
		hammer_stats_redo += bytes;

		/*
		 * Before we finish off the buffer we have to deal with any
		 * junk between the end of the media structure we just laid
		 * down and the UNDO alignment boundary.  We do this by laying
		 * down a dummy PAD.  Even though we will probably overwrite
		 * it almost immediately we have to do this so recovery runs
		 * can iterate the UNDO space without having to depend on
		 * the indices in the volume header.
		 *
		 * This dummy PAD will be overwritten on the next undo so
		 * we do not adjust undomap->next_offset.
		 */
		bytes = HAMMER_UNDO_ALIGN -
			((int)undomap->next_offset & HAMMER_UNDO_MASK);
		if (bytes != HAMMER_UNDO_ALIGN) {
			KKASSERT(bytes >= sizeof(struct hammer_fifo_tail));
			redo = (void *)(tail + 1);
			tail = (void *)((char *)redo + bytes - sizeof(*tail));
			if ((void *)redo != (void *)tail) {
				tail->tail_signature = HAMMER_TAIL_SIGNATURE;
				tail->tail_type = HAMMER_HEAD_TYPE_PAD;
				tail->tail_size = bytes;
			}
			redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
			redo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
			redo->head.hdr_size = bytes;
			/* NO CRC OR SEQ NO */
		}
		hammer_modify_buffer_done(buffer);
		if (len == 0)
			break;
	}
	hammer_modify_volume_done(root_volume);
	hammer_unlock(&hmp->undo_lock);

	if (buffer)
		hammer_rel_buffer(buffer, 0);

	/*
	 * Make sure the nominal undo span contains at least one REDO_SYNC,
	 * otherwise the REDO recovery will not be triggered.
	 */
	if ((hmp->flags & HAMMER_MOUNT_REDO_SYNC) == 0 &&
	    flags != HAMMER_REDO_SYNC) {
		hammer_generate_redo_sync(trans);
	}

	return(error);
}
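
/*
 * Example (editor's sketch, not quoted from an actual caller): a logical
 * write path would typically lay down a REDO record covering the user data
 * so that a later fsync() only has to flush the UNDO/REDO FIFO buffers
 * through this point.  The variable names below and the use of
 * HAMMER_REDO_WRITE are illustrative assumptions; consult the real write
 * path callers for authoritative usage.
 *
 *	error = hammer_generate_redo(&trans, ip, write_offset,
 *				     HAMMER_REDO_WRITE,
 *				     write_data, write_bytes);
 *
 * On success the REDO record(s) covering the logical range have been laid
 * down in the UNDO/REDO FIFO.
 */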

/*
 * Generate a REDO SYNC record.  At least one such record must be generated
 * in the nominal recovery span for the recovery code to be able to run
 * REDOs outside of the span.
 *
 * The SYNC record contains the aggregate earliest UNDO/REDO FIFO offset
 * for all inodes with active REDOs.  This changes dynamically as inodes
 * get flushed.
 *
 * During recovery stage2 any new flush cycles must specify the original
 * redo sync offset.  That way a crash will re-run the REDOs, at least
 * up to the point where the UNDO FIFO does not overwrite the area.
 */
void
hammer_generate_redo_sync(hammer_transaction_t trans)
{
	hammer_mount_t hmp = trans->hmp;
	hammer_inode_t ip;
	hammer_off_t redo_fifo_start;

	if (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) {
		ip = NULL;
		redo_fifo_start = hmp->recover_stage2_offset;
	} else {
		ip = RB_FIRST(hammer_redo_rb_tree, &hmp->rb_redo_root);
		if (ip)
			redo_fifo_start = ip->redo_fifo_start;
		else
			redo_fifo_start = 0;
	}
	if (redo_fifo_start) {
		if (hammer_debug_io & 0x0004) {
			hdkprintf("SYNC IP %p %016jx\n",
				ip, (intmax_t)redo_fifo_start);
		}
		hammer_generate_redo(trans, NULL, redo_fifo_start,
				     HAMMER_REDO_SYNC, NULL, 0);
		trans->hmp->flags |= HAMMER_MOUNT_REDO_SYNC;
	}
}
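
/*
 * Worked example (derived from the code above, illustrative only): the
 * SYNC record is produced by the call
 *
 *	hammer_generate_redo(trans, NULL, redo_fifo_start,
 *			     HAMMER_REDO_SYNC, NULL, 0);
 *
 * which hammer_generate_redo() lays down as a normal HAMMER_HEAD_TYPE_REDO
 * record with no payload: redo_objid and redo_localization are 0,
 * redo_data_bytes is 0, redo_offset carries the aggregate earliest
 * redo_fifo_start, and redo_flags is set to HAMMER_REDO_SYNC.
 */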

/*
 * This is called when an inode is queued to the backend.
 */
void
hammer_redo_fifo_start_flush(hammer_inode_t ip)
{
	ip->redo_fifo_next = 0;
}

/*
 * This is called when an inode backend flush is finished.  We have to make
 * sure that RDIRTY is not set unless dirty bufs are present.  Dirty bufs
 * can get destroyed through operations such as truncations and leave
 * us with a stale redo_fifo_next.
 */
void
hammer_redo_fifo_end_flush(hammer_inode_t ip)
{
	hammer_mount_t hmp = ip->hmp;

	if (ip->flags & HAMMER_INODE_RDIRTY) {
		RB_REMOVE(hammer_redo_rb_tree, &hmp->rb_redo_root, ip);
		ip->flags &= ~HAMMER_INODE_RDIRTY;
	}
	if ((ip->flags & HAMMER_INODE_BUFS) == 0)
		ip->redo_fifo_next = 0;
	if (ip->redo_fifo_next) {
		ip->redo_fifo_start = ip->redo_fifo_next;
		if (RB_INSERT(hammer_redo_rb_tree, &hmp->rb_redo_root, ip)) {
			hpanic("cannot reinsert inode %p on redo FIFO", ip);
		}
		ip->flags |= HAMMER_INODE_RDIRTY;
	}
}
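
/*
 * Example (editor's sketch of the pairing implied by the two helpers above;
 * the surrounding flush steps are paraphrased, not quoted): the backend
 * brackets an inode flush with these calls so redo_fifo_start can be
 * advanced once the flush completes.
 *
 *	hammer_redo_fifo_start_flush(ip);	(clears ip->redo_fifo_next)
 *	(... run the backend flush for ip ...)
 *	hammer_redo_fifo_end_flush(ip);		(re-keys ip in the redo
 *						 rb-tree at the new
 *						 redo_fifo_start, if any)
 */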
375