/*
 * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * HAMMER redo - REDO record support for the UNDO/REDO FIFO.
 *
 * See also hammer_undo.c
 */

#include "hammer.h"

RB_GENERATE2(hammer_redo_rb_tree, hammer_inode, rb_redonode,
	     hammer_redo_rb_compare, hammer_off_t, redo_fifo_start);

/*
 * HAMMER version 4+ REDO support.
 *
 * REDO records are used to improve fsync() performance.  Instead of having
 * to go through a complete double-flush cycle involving at least two disk
 * synchronizations, the fsync need only flush UNDO/REDO FIFO buffers through
 * the related REDO records, which is a single synchronization requiring
 * no track seeking.  If a recovery becomes necessary the recovery code
 * will generate logical data writes based on the REDO records encountered.
 * That is, the recovery code will UNDO any partial meta-data/data writes
 * at the raw disk block level and then REDO the data writes at the logical
 * level.
 */
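
/*
 * Illustrative sketch (annotation added for clarity, not part of the
 * original file): a hedged example of how a front-end caller might lay
 * down REDO records ahead of an fsync().  HAMMER_REDO_WRITE marks a
 * logical data write; a truncation would pass HAMMER_REDO_TRUNC with a
 * NULL payload.  The locals write_offset, data, data_len, and
 * trunc_offset are hypothetical, and the transaction setup is assumed:
 *
 *	error = hammer_generate_redo(&trans, ip, write_offset,
 *				     HAMMER_REDO_WRITE, data, data_len);
 *	if (error == 0)
 *		error = hammer_generate_redo(&trans, ip, trunc_offset,
 *					     HAMMER_REDO_TRUNC, NULL, 0);
 */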
int
hammer_generate_redo(hammer_transaction_t trans, hammer_inode_t ip,
		     hammer_off_t file_off, u_int32_t flags,
		     void *base, int len)
{
	hammer_mount_t hmp;
	hammer_volume_t root_volume;
	hammer_blockmap_t undomap;
	hammer_buffer_t buffer = NULL;
	hammer_fifo_redo_t redo;
	hammer_fifo_tail_t tail;
	hammer_off_t next_offset;
	int error;
	int bytes;
	int n;

	/*
	 * Setup
	 */
	hmp = trans->hmp;

	root_volume = trans->rootvol;
	undomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];

	/*
	 * No undo recursion when modifying the root volume
	 */
	hammer_modify_volume(NULL, root_volume, NULL, 0);
	hammer_lock_ex(&hmp->undo_lock);

	/* undo had better not roll over (loose test) */
	if (hammer_undo_space(trans) < len + HAMMER_BUFSIZE*3)
		panic("hammer: insufficient undo FIFO space!");

	/*
	 * Loop until the undo for the entire range has been laid down.
	 * Loop at least once (len might be 0 as a degenerate case).
	 */
	for (;;) {
		/*
		 * Fetch the layout offset in the UNDO FIFO, wrap it as
		 * necessary.
		 */
		if (undomap->next_offset == undomap->alloc_offset) {
			undomap->next_offset =
				HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
		}
		next_offset = undomap->next_offset;

		/*
		 * This is a tail-chasing FIFO, when we hit the start of a new
		 * buffer we don't have to read it in.
		 */
		if ((next_offset & HAMMER_BUFMASK) == 0) {
			redo = hammer_bnew(hmp, next_offset, &error, &buffer);
			hammer_format_undo(redo, hmp->undo_seqno ^ 0x40000000);
		} else {
			redo = hammer_bread(hmp, next_offset, &error, &buffer);
		}
		if (error)
			break;
		hammer_modify_buffer(NULL, buffer, NULL, 0);

		/*
		 * Calculate how big a media structure fits up to the next
		 * alignment point and how large a data payload we can
		 * accommodate.
		 *
		 * If n calculates to 0 or negative there is no room for
		 * anything but a PAD.
		 */
		bytes = HAMMER_UNDO_ALIGN -
			((int)next_offset & HAMMER_UNDO_MASK);
		n = bytes -
		    (int)sizeof(struct hammer_fifo_redo) -
		    (int)sizeof(struct hammer_fifo_tail);

		/*
		 * If available space is insufficient for any payload
		 * we have to lay down a PAD.
		 *
		 * The minimum PAD is 8 bytes and the head and tail will
		 * overlap each other in that case.  PADs do not have
		 * sequence numbers or CRCs.
		 *
		 * A PAD may not start on a boundary.  That is, every
		 * 512-byte block in the UNDO/REDO FIFO must begin with
		 * a record containing a sequence number.
		 */
		if (n <= 0) {
			KKASSERT(bytes >= sizeof(struct hammer_fifo_tail));
			KKASSERT(((int)next_offset & HAMMER_UNDO_MASK) != 0);
			tail = (void *)((char *)redo + bytes - sizeof(*tail));
			if ((void *)redo != (void *)tail) {
				tail->tail_signature = HAMMER_TAIL_SIGNATURE;
				tail->tail_type = HAMMER_HEAD_TYPE_PAD;
				tail->tail_size = bytes;
			}
			redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
			redo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
			redo->head.hdr_size = bytes;
			/* NO CRC OR SEQ NO */
			undomap->next_offset += bytes;
			hammer_modify_buffer_done(buffer);
			hammer_stats_redo += bytes;
			continue;
		}
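
		/*
		 * Worked example (annotation added for clarity, not in the
		 * original source): with 512-byte record alignment, suppose
		 * next_offset sits 8 bytes short of a boundary.  Then
		 * bytes = 8, n comes out negative, and the code above lays
		 * down the minimum 8-byte PAD whose head and tail overlap.
		 * The loop then continues at the alignment boundary, where
		 * a fresh sequence-numbered record can begin.
		 */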

		/*
		 * When generating an inode-related REDO record we track
		 * the point in the UNDO/REDO FIFO containing the inode's
		 * earliest REDO record.  See hammer_generate_redo_sync().
		 *
		 * redo_fifo_next is cleared when an inode is staged to
		 * the backend and then used to determine how to reassign
		 * redo_fifo_start after the inode flush completes.
		 */
		if (ip) {
			redo->redo_objid = ip->obj_id;
			redo->redo_localization = ip->obj_localization;
			if ((ip->flags & HAMMER_INODE_RDIRTY) == 0) {
				ip->redo_fifo_start = next_offset;
				if (RB_INSERT(hammer_redo_rb_tree,
					      &hmp->rb_redo_root, ip)) {
					panic("hammer_generate_redo: "
					      "cannot insert inode %p on "
					      "redo FIFO", ip);
				}
				ip->flags |= HAMMER_INODE_RDIRTY;
			}
			if (ip->redo_fifo_next == 0)
				ip->redo_fifo_next = next_offset;
		} else {
			redo->redo_objid = 0;
			redo->redo_localization = 0;
		}

		/*
		 * Calculate the actual payload and recalculate the size
		 * of the media structure as necessary.  If no data buffer
		 * is supplied there is no payload.
		 */
		if (base == NULL) {
			n = 0;
		} else if (n > len) {
			n = len;
		}
		bytes = ((n + HAMMER_HEAD_ALIGN_MASK) &
			 ~HAMMER_HEAD_ALIGN_MASK) +
			(int)sizeof(struct hammer_fifo_redo) +
			(int)sizeof(struct hammer_fifo_tail);
		if (hammer_debug_general & 0x0080) {
			kprintf("redo %016llx %d %d\n",
				(long long)next_offset, bytes, n);
		}

		redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
		redo->head.hdr_type = HAMMER_HEAD_TYPE_REDO;
		redo->head.hdr_size = bytes;
		redo->head.hdr_seq = hmp->undo_seqno++;
		redo->head.hdr_crc = 0;
		redo->redo_mtime = trans->time;
		redo->redo_offset = file_off;
		redo->redo_flags = flags;

		/*
		 * Incremental payload.  If no payload we throw the entire
		 * len into redo_data_bytes and will not loop.
		 */
		if (base) {
			redo->redo_data_bytes = n;
			bcopy(base, redo + 1, n);
			len -= n;
			base = (char *)base + n;
			file_off += n;
		} else {
			redo->redo_data_bytes = len;
			file_off += len;
			len = 0;
		}

		tail = (void *)((char *)redo + bytes - sizeof(*tail));
		tail->tail_signature = HAMMER_TAIL_SIGNATURE;
		tail->tail_type = HAMMER_HEAD_TYPE_REDO;
		tail->tail_size = bytes;

		KKASSERT(bytes >= sizeof(redo->head));
		redo->head.hdr_crc = crc32(redo, HAMMER_FIFO_HEAD_CRCOFF) ^
			crc32(&redo->head + 1, bytes - sizeof(redo->head));
		undomap->next_offset += bytes;
		hammer_stats_redo += bytes;
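
		/*
		 * Note (annotation added for clarity, not in the original
		 * source): because n is capped by the space remaining
		 * before the next alignment point, a large logical write
		 * is laid down as a chain of REDO records, each carrying
		 * a payload of less than 512 bytes, with redo_offset
		 * advancing through the file range so recovery can replay
		 * the pieces in order.
		 */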

		/*
		 * Before we finish off the buffer we have to deal with any
		 * junk between the end of the media structure we just laid
		 * down and the UNDO alignment boundary.  We do this by laying
		 * down a dummy PAD.  Even though we will probably overwrite
		 * it almost immediately we have to do this so recovery runs
		 * can iterate the UNDO space without having to depend on
		 * the indices in the volume header.
		 *
		 * This dummy PAD will be overwritten on the next undo so
		 * we do not adjust undomap->next_offset.
		 */
		bytes = HAMMER_UNDO_ALIGN -
			((int)undomap->next_offset & HAMMER_UNDO_MASK);
		if (bytes != HAMMER_UNDO_ALIGN) {
			KKASSERT(bytes >= sizeof(struct hammer_fifo_tail));
			redo = (void *)(tail + 1);
			tail = (void *)((char *)redo + bytes - sizeof(*tail));
			if ((void *)redo != (void *)tail) {
				tail->tail_signature = HAMMER_TAIL_SIGNATURE;
				tail->tail_type = HAMMER_HEAD_TYPE_PAD;
				tail->tail_size = bytes;
			}
			redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
			redo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
			redo->head.hdr_size = bytes;
			/* NO CRC OR SEQ NO */
		}
		hammer_modify_buffer_done(buffer);
		if (len == 0)
			break;
	}
	hammer_modify_volume_done(root_volume);
	hammer_unlock(&hmp->undo_lock);

	if (buffer)
		hammer_rel_buffer(buffer, 0);

	/*
	 * Make sure the nominal undo span contains at least one REDO_SYNC,
	 * otherwise the REDO recovery will not be triggered.
	 */
	if ((hmp->flags & HAMMER_MOUNT_REDO_SYNC) == 0 &&
	    flags != HAMMER_REDO_SYNC) {
		hammer_generate_redo_sync(trans);
	}

	return(error);
}

/*
 * Generate a REDO SYNC record.  At least one such record must be generated
 * in the nominal recovery span for the recovery code to be able to run
 * REDOs outside of the span.
 *
 * The SYNC record contains the aggregate earliest UNDO/REDO FIFO offset
 * for all inodes with active REDOs.  This changes dynamically as inodes
 * get flushed.
 *
 * During recovery stage2 any new flush cycles must specify the original
 * redo sync offset.  That way a crash will re-run the REDOs, at least
 * up to the point where the UNDO FIFO does not overwrite the area.
 */
void
hammer_generate_redo_sync(hammer_transaction_t trans)
{
	hammer_mount_t hmp = trans->hmp;
	hammer_inode_t ip;
	hammer_off_t redo_fifo_start;

	if (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) {
		ip = NULL;
		redo_fifo_start = hmp->recover_stage2_offset;
	} else {
		ip = RB_FIRST(hammer_redo_rb_tree, &hmp->rb_redo_root);
		if (ip)
			redo_fifo_start = ip->redo_fifo_start;
		else
			redo_fifo_start = 0;
	}
	if (redo_fifo_start) {
		if (hammer_debug_io & 0x0004) {
			kprintf("SYNC IP %p %016jx\n",
				ip, (intmax_t)redo_fifo_start);
		}
		hammer_generate_redo(trans, NULL, redo_fifo_start,
				     HAMMER_REDO_SYNC, NULL, 0);
		trans->hmp->flags |= HAMMER_MOUNT_REDO_SYNC;
	}
}
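
/*
 * Note (annotation added for clarity, not in the original source): the
 * SYNC record is itself just a REDO record with HAMMER_REDO_SYNC set and
 * no payload, laid down through hammer_generate_redo() above.  Its
 * redo_offset field carries the aggregate earliest FIFO offset rather
 * than a file offset, telling stage2 recovery how far back in the
 * UNDO/REDO FIFO to begin scanning.
 */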

/*
 * This is called when an inode is queued to the backend.
 */
void
hammer_redo_fifo_start_flush(hammer_inode_t ip)
{
	ip->redo_fifo_next = 0;
}

/*
 * This is called when an inode backend flush is finished.  We have to make
 * sure that RDIRTY is not set unless dirty bufs are present.  Dirty bufs
 * can get destroyed through operations such as truncations and leave
 * us with a stale redo_fifo_next.
 */
void
hammer_redo_fifo_end_flush(hammer_inode_t ip)
{
	hammer_mount_t hmp = ip->hmp;

	if (ip->flags & HAMMER_INODE_RDIRTY) {
		RB_REMOVE(hammer_redo_rb_tree, &hmp->rb_redo_root, ip);
		ip->flags &= ~HAMMER_INODE_RDIRTY;
	}
	if ((ip->flags & HAMMER_INODE_BUFS) == 0)
		ip->redo_fifo_next = 0;
	if (ip->redo_fifo_next) {
		ip->redo_fifo_start = ip->redo_fifo_next;
		if (RB_INSERT(hammer_redo_rb_tree, &hmp->rb_redo_root, ip)) {
			panic("hammer_redo_fifo_end_flush: cannot reinsert "
			      "inode %p on redo FIFO", ip);
		}
		ip->flags |= HAMMER_INODE_RDIRTY;
	}
}
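
/*
 * For reference, a sketch (the real comparator lives elsewhere in the
 * HAMMER sources, so treat this as an assumption about its shape): the
 * function named in RB_GENERATE2 at the top of this file orders inodes
 * by redo_fifo_start, which is what lets hammer_generate_redo_sync()
 * find the aggregate earliest REDO offset with RB_FIRST():
 *
 *	static int
 *	hammer_redo_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
 *	{
 *		if (ip1->redo_fifo_start < ip2->redo_fifo_start)
 *			return(-1);
 *		if (ip1->redo_fifo_start > ip2->redo_fifo_start)
 *			return(1);
 *		return(0);
 *	}
 */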