1 /* 2 * Copyright (c) 2004 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * $DragonFly: src/sys/kern/vfs_journal.c,v 1.5 2004/12/31 23:48:08 dillon Exp $ 35 */ 36 /* 37 * Each mount point may have zero or more independantly configured journals 38 * attached to it. 
Each journal is represented by a memory FIFO and worker 39 * thread. Journal events are streamed through the FIFO to the thread, 40 * batched up (typically on one-second intervals), and written out by the 41 * thread. 42 * 43 * Journal vnode ops are executed instead of mnt_vn_norm_ops when one or 44 * more journals have been installed on a mount point. It becomes the 45 * responsibility of the journal op to call the underlying normal op as 46 * appropriate. 47 * 48 * The journaling protocol is intended to evolve into a two-way stream 49 * whereby transaction IDs can be acknowledged by the journaling target 50 * when the data has been committed to hard storage. Both implicit and 51 * explicit acknowledgement schemes will be supported, depending on the 52 * sophistication of the journaling stream, plus resynchronization and 53 * restart when a journaling stream is interrupted. This information will 54 * also be made available to journaling-aware filesystems to allow better 55 * management of their own physical storage synchronization mechanisms as 56 * well as to allow such filesystems to take direct advantage of the kernel's 57 * journaling layer so they don't have to roll their own. 58 * 59 * In addition, the worker thread will have access to much larger 60 * spooling areas than the memory buffer is able to provide by e.g. 61 * reserving swap space, in order to absorb potentially long interruptions 62 * of off-site journaling streams, and to prevent 'slow' off-site linkages 63 * from radically slowing down local filesystem operations. 64 * 65 * Because of the non-trivial algorithms the journaling system will be 66 * required to support, use of a worker thread is mandatory. 
Efficiencies 67 * are maintained by utilitizing the memory FIFO to batch transactions when 68 * possible, reducing the number of gratuitous thread switches and taking 69 * advantage of cpu caches through the use of shorter batched code paths 70 * rather then trying to do everything in the context of the process 71 * originating the filesystem op. In the future the memory FIFO can be 72 * made per-cpu to remove BGL or other locking requirements. 73 */ 74 #include <sys/param.h> 75 #include <sys/systm.h> 76 #include <sys/buf.h> 77 #include <sys/conf.h> 78 #include <sys/kernel.h> 79 #include <sys/queue.h> 80 #include <sys/lock.h> 81 #include <sys/malloc.h> 82 #include <sys/mount.h> 83 #include <sys/unistd.h> 84 #include <sys/vnode.h> 85 #include <sys/poll.h> 86 #include <sys/mountctl.h> 87 #include <sys/file.h> 88 89 #include <machine/limits.h> 90 91 #include <vm/vm.h> 92 #include <vm/vm_object.h> 93 #include <vm/vm_page.h> 94 #include <vm/vm_pager.h> 95 #include <vm/vnode_pager.h> 96 97 #include <sys/file2.h> 98 #include <sys/thread2.h> 99 100 static int journal_attach(struct mount *mp); 101 static void journal_detach(struct mount *mp); 102 static int journal_install_vfs_journal(struct mount *mp, struct file *fp, 103 const struct mountctl_install_journal *info); 104 static int journal_remove_vfs_journal(struct mount *mp, 105 const struct mountctl_remove_journal *info); 106 static int journal_resync_vfs_journal(struct mount *mp, const void *ctl); 107 static void journal_thread(void *info); 108 109 static void *journal_reserve(struct journal *jo, 110 struct journal_rawrecbeg **rawpp, 111 int16_t streamid, int bytes); 112 static void *journal_extend(struct journal *jo, 113 struct journal_rawrecbeg **rawpp, 114 int truncbytes, int bytes, int *newstreamrecp); 115 static void journal_abort(struct journal *jo, 116 struct journal_rawrecbeg **rawpp); 117 static void journal_commit(struct journal *jo, 118 struct journal_rawrecbeg **rawpp, 119 int bytes, int closeout); 120 121 
static void jrecord_init(struct journal *jo, 122 struct jrecord *jrec, int16_t streamid); 123 static struct journal_subrecord *jrecord_push( 124 struct jrecord *jrec, int16_t rectype); 125 static void jrecord_pop(struct jrecord *jrec, struct journal_subrecord *parent); 126 static struct journal_subrecord *jrecord_write(struct jrecord *jrec, 127 int16_t rectype, int bytes); 128 static void jrecord_data(struct jrecord *jrec, const void *buf, int bytes); 129 static void jrecord_done(struct jrecord *jrec, int abortit); 130 131 static void jrecord_write_path(struct jrecord *jrec, 132 int16_t rectype, struct namecache *ncp); 133 static void jrecord_write_vattr(struct jrecord *jrec, struct vattr *vat); 134 135 136 static int journal_setattr(struct vop_setattr_args *ap); 137 static int journal_write(struct vop_write_args *ap); 138 static int journal_fsync(struct vop_fsync_args *ap); 139 static int journal_putpages(struct vop_putpages_args *ap); 140 static int journal_setacl(struct vop_setacl_args *ap); 141 static int journal_setextattr(struct vop_setextattr_args *ap); 142 static int journal_ncreate(struct vop_ncreate_args *ap); 143 static int journal_nmknod(struct vop_nmknod_args *ap); 144 static int journal_nlink(struct vop_nlink_args *ap); 145 static int journal_nsymlink(struct vop_nsymlink_args *ap); 146 static int journal_nwhiteout(struct vop_nwhiteout_args *ap); 147 static int journal_nremove(struct vop_nremove_args *ap); 148 static int journal_nmkdir(struct vop_nmkdir_args *ap); 149 static int journal_nrmdir(struct vop_nrmdir_args *ap); 150 static int journal_nrename(struct vop_nrename_args *ap); 151 152 static struct vnodeopv_entry_desc journal_vnodeop_entries[] = { 153 { &vop_default_desc, vop_journal_operate_ap }, 154 { &vop_mountctl_desc, (void *)journal_mountctl }, 155 { &vop_setattr_desc, (void *)journal_setattr }, 156 { &vop_write_desc, (void *)journal_write }, 157 { &vop_fsync_desc, (void *)journal_fsync }, 158 { &vop_putpages_desc, (void *)journal_putpages 
}, 159 { &vop_setacl_desc, (void *)journal_setacl }, 160 { &vop_setextattr_desc, (void *)journal_setextattr }, 161 { &vop_ncreate_desc, (void *)journal_ncreate }, 162 { &vop_nmknod_desc, (void *)journal_nmknod }, 163 { &vop_nlink_desc, (void *)journal_nlink }, 164 { &vop_nsymlink_desc, (void *)journal_nsymlink }, 165 { &vop_nwhiteout_desc, (void *)journal_nwhiteout }, 166 { &vop_nremove_desc, (void *)journal_nremove }, 167 { &vop_nmkdir_desc, (void *)journal_nmkdir }, 168 { &vop_nrmdir_desc, (void *)journal_nrmdir }, 169 { &vop_nrename_desc, (void *)journal_nrename }, 170 { NULL, NULL } 171 }; 172 173 static MALLOC_DEFINE(M_JOURNAL, "journal", "Journaling structures"); 174 static MALLOC_DEFINE(M_JFIFO, "journal-fifo", "Journal FIFO"); 175 176 int 177 journal_mountctl(struct vop_mountctl_args *ap) 178 { 179 struct mount *mp; 180 int error = 0; 181 182 mp = ap->a_head.a_ops->vv_mount; 183 KKASSERT(mp); 184 185 if (mp->mnt_vn_journal_ops == NULL) { 186 switch(ap->a_op) { 187 case MOUNTCTL_INSTALL_VFS_JOURNAL: 188 error = journal_attach(mp); 189 if (error == 0 && ap->a_ctllen != sizeof(struct mountctl_install_journal)) 190 error = EINVAL; 191 if (error == 0 && ap->a_fp == NULL) 192 error = EBADF; 193 if (error == 0) 194 error = journal_install_vfs_journal(mp, ap->a_fp, ap->a_ctl); 195 if (TAILQ_EMPTY(&mp->mnt_jlist)) 196 journal_detach(mp); 197 break; 198 case MOUNTCTL_REMOVE_VFS_JOURNAL: 199 case MOUNTCTL_RESYNC_VFS_JOURNAL: 200 error = EINVAL; 201 break; 202 default: 203 error = EOPNOTSUPP; 204 break; 205 } 206 } else { 207 switch(ap->a_op) { 208 case MOUNTCTL_INSTALL_VFS_JOURNAL: 209 if (ap->a_ctllen != sizeof(struct mountctl_install_journal)) 210 error = EINVAL; 211 if (error == 0 && ap->a_fp == NULL) 212 error = EBADF; 213 if (error == 0) 214 error = journal_install_vfs_journal(mp, ap->a_fp, ap->a_ctl); 215 break; 216 case MOUNTCTL_REMOVE_VFS_JOURNAL: 217 if (ap->a_ctllen != sizeof(struct mountctl_remove_journal)) 218 error = EINVAL; 219 if (error == 0) 220 error 
= journal_remove_vfs_journal(mp, ap->a_ctl); 221 if (TAILQ_EMPTY(&mp->mnt_jlist)) 222 journal_detach(mp); 223 break; 224 case MOUNTCTL_RESYNC_VFS_JOURNAL: 225 if (ap->a_ctllen != 0) 226 error = EINVAL; 227 error = journal_resync_vfs_journal(mp, ap->a_ctl); 228 break; 229 default: 230 error = EOPNOTSUPP; 231 break; 232 } 233 } 234 return (error); 235 } 236 237 /* 238 * High level mount point setup. When a 239 */ 240 static int 241 journal_attach(struct mount *mp) 242 { 243 vfs_add_vnodeops(mp, &mp->mnt_vn_journal_ops, journal_vnodeop_entries); 244 return(0); 245 } 246 247 static void 248 journal_detach(struct mount *mp) 249 { 250 if (mp->mnt_vn_journal_ops) 251 vfs_rm_vnodeops(&mp->mnt_vn_journal_ops); 252 } 253 254 /* 255 * Install a journal on a mount point. Each journal has an associated worker 256 * thread which is responsible for buffering and spooling the data to the 257 * target. A mount point may have multiple journals attached to it. An 258 * initial start record is generated when the journal is associated. 259 */ 260 static int 261 journal_install_vfs_journal(struct mount *mp, struct file *fp, 262 const struct mountctl_install_journal *info) 263 { 264 struct journal *jo; 265 struct jrecord jrec; 266 int error = 0; 267 int size; 268 269 jo = malloc(sizeof(struct journal), M_JOURNAL, M_WAITOK|M_ZERO); 270 bcopy(info->id, jo->id, sizeof(jo->id)); 271 jo->flags = info->flags & ~(MC_JOURNAL_ACTIVE | MC_JOURNAL_STOP_REQ); 272 273 /* 274 * Memory FIFO size, round to nearest power of 2 275 */ 276 if (info->membufsize) { 277 if (info->membufsize < 65536) 278 size = 65536; 279 else if (info->membufsize > 128 * 1024 * 1024) 280 size = 128 * 1024 * 1024; 281 else 282 size = (int)info->membufsize; 283 } else { 284 size = 1024 * 1024; 285 } 286 jo->fifo.size = 1; 287 while (jo->fifo.size < size) 288 jo->fifo.size <<= 1; 289 290 /* 291 * Other parameters. If not specified the starting transaction id 292 * will be the current date. 
293 */ 294 if (info->transid) { 295 jo->transid = info->transid; 296 } else { 297 struct timespec ts; 298 getnanotime(&ts); 299 jo->transid = ((int64_t)ts.tv_sec << 30) | ts.tv_nsec; 300 } 301 302 jo->fp = fp; 303 304 /* 305 * Allocate the memory FIFO 306 */ 307 jo->fifo.mask = jo->fifo.size - 1; 308 jo->fifo.membase = malloc(jo->fifo.size, M_JFIFO, M_WAITOK|M_ZERO|M_NULLOK); 309 if (jo->fifo.membase == NULL) 310 error = ENOMEM; 311 312 /* 313 * Create the worker thread and generate the association record. 314 */ 315 if (error) { 316 free(jo, M_JOURNAL); 317 } else { 318 fhold(fp); 319 jo->flags |= MC_JOURNAL_ACTIVE; 320 lwkt_create(journal_thread, jo, NULL, &jo->thread, 321 TDF_STOPREQ, -1, "journal %.*s", JIDMAX, jo->id); 322 lwkt_setpri(&jo->thread, TDPRI_KERN_DAEMON); 323 lwkt_schedule(&jo->thread); 324 325 jrecord_init(jo, &jrec, JREC_STREAMID_DISCONT); 326 jrecord_write(&jrec, JTYPE_ASSOCIATE, 0); 327 jrecord_done(&jrec, 0); 328 TAILQ_INSERT_TAIL(&mp->mnt_jlist, jo, jentry); 329 } 330 return(error); 331 } 332 333 /* 334 * Disassociate a journal from a mount point and terminate its worker thread. 335 * A final termination record is written out before the file pointer is 336 * dropped. 
337 */ 338 static int 339 journal_remove_vfs_journal(struct mount *mp, 340 const struct mountctl_remove_journal *info) 341 { 342 struct journal *jo; 343 struct jrecord jrec; 344 int error; 345 346 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 347 if (bcmp(jo->id, info->id, sizeof(jo->id)) == 0) 348 break; 349 } 350 if (jo) { 351 error = 0; 352 TAILQ_REMOVE(&mp->mnt_jlist, jo, jentry); 353 354 jrecord_init(jo, &jrec, JREC_STREAMID_DISCONT); 355 jrecord_write(&jrec, JTYPE_DISASSOCIATE, 0); 356 jrecord_done(&jrec, 0); 357 358 jo->flags |= MC_JOURNAL_STOP_REQ | (info->flags & MC_JOURNAL_STOP_IMM); 359 wakeup(&jo->fifo); 360 while (jo->flags & MC_JOURNAL_ACTIVE) { 361 tsleep(jo, 0, "jwait", 0); 362 } 363 lwkt_free_thread(&jo->thread); /* XXX SMP */ 364 if (jo->fp) 365 fdrop(jo->fp, curthread); 366 if (jo->fifo.membase) 367 free(jo->fifo.membase, M_JFIFO); 368 free(jo, M_JOURNAL); 369 } else { 370 error = EINVAL; 371 } 372 return (error); 373 } 374 375 static int 376 journal_resync_vfs_journal(struct mount *mp, const void *ctl) 377 { 378 return(EINVAL); 379 } 380 381 /* 382 * The per-journal worker thread is responsible for writing out the 383 * journal's FIFO to the target stream. 384 */ 385 static void 386 journal_thread(void *info) 387 { 388 struct journal *jo = info; 389 struct journal_rawrecbeg *rawp; 390 int bytes; 391 int error; 392 int avail; 393 int res; 394 395 for (;;) { 396 /* 397 * Calculate the number of bytes available to write. This buffer 398 * area may contain reserved records so we can't just write it out 399 * without further checks. 400 */ 401 bytes = jo->fifo.windex - jo->fifo.rindex; 402 403 /* 404 * sleep if no bytes are available or if an incomplete record is 405 * encountered (it needs to be filled in before we can write it 406 * out), and skip any pad records that we encounter. 
407 */ 408 if (bytes == 0) { 409 if (jo->flags & MC_JOURNAL_STOP_REQ) 410 break; 411 tsleep(&jo->fifo, 0, "jfifo", hz); 412 continue; 413 } 414 rawp = (void *)(jo->fifo.membase + (jo->fifo.rindex & jo->fifo.mask)); 415 if (rawp->begmagic == JREC_INCOMPLETEMAGIC) { 416 tsleep(&jo->fifo, 0, "jpad", hz); 417 continue; 418 } 419 if (rawp->streamid == JREC_STREAMID_PAD) { 420 jo->fifo.rindex += (rawp->recsize + 15) & ~15; 421 KKASSERT(jo->fifo.windex - jo->fifo.rindex > 0); 422 continue; 423 } 424 425 /* 426 * Figure out how much we can write out, beware the buffer wrap 427 * case. 428 */ 429 res = 0; 430 avail = jo->fifo.size - (jo->fifo.rindex & jo->fifo.mask); 431 while (res < bytes && rawp->begmagic == JREC_BEGMAGIC) { 432 res += (rawp->recsize + 15) & ~15; 433 if (res >= avail) { 434 KKASSERT(res == avail); 435 break; 436 } 437 } 438 439 /* 440 * Issue the write and deal with any errors or other conditions. 441 * For now assume blocking I/O. Since we are record-aware the 442 * code cannot yet handle partial writes. 443 * 444 * XXX EWOULDBLOCK/NBIO 445 * XXX notification on failure 446 * XXX two-way acknowledgement stream in the return direction / xindex 447 */ 448 printf("write @%d,%d\n", jo->fifo.rindex & jo->fifo.mask, bytes); 449 bytes = res; 450 error = fp_write(jo->fp, 451 jo->fifo.membase + (jo->fifo.rindex & jo->fifo.mask), 452 bytes, &res); 453 if (error) { 454 printf("journal_thread(%s) write, error %d\n", jo->id, error); 455 /* XXX */ 456 } else { 457 KKASSERT(res == bytes); 458 printf("journal_thread(%s) write %d\n", jo->id, res); 459 } 460 461 /* 462 * Advance rindex. XXX for now also advance xindex, which will 463 * eventually be advanced when the target acknowledges the sequence 464 * space. 
465 */ 466 jo->fifo.rindex += bytes; 467 jo->fifo.xindex += bytes; 468 if (jo->flags & MC_JOURNAL_WWAIT) { 469 jo->flags &= ~MC_JOURNAL_WWAIT; /* XXX hysteresis */ 470 wakeup(&jo->fifo.windex); 471 } 472 } 473 jo->flags &= ~MC_JOURNAL_ACTIVE; 474 wakeup(jo); 475 wakeup(&jo->fifo.windex); 476 } 477 478 static __inline 479 void 480 journal_build_pad(struct journal_rawrecbeg *rawp, int recsize) 481 { 482 struct journal_rawrecend *rendp; 483 484 KKASSERT((recsize & 15) == 0 && recsize >= 16); 485 486 rawp->begmagic = JREC_BEGMAGIC; 487 rawp->streamid = JREC_STREAMID_PAD; 488 rawp->recsize = recsize; /* must be 16-byte aligned */ 489 rawp->seqno = 0; 490 /* 491 * WARNING, rendp may overlap rawp->seqno. This is necessary to 492 * allow PAD records to fit in 16 bytes. Use cpu_mb1() to 493 * hopefully cause the compiler to not make any assumptions. 494 */ 495 cpu_mb1(); 496 rendp = (void *)((char *)rawp + rawp->recsize - sizeof(*rendp)); 497 rendp->endmagic = JREC_ENDMAGIC; 498 rendp->check = 0; 499 rendp->recsize = rawp->recsize; 500 } 501 502 /* 503 * Wake up the worker thread if the FIFO is more then half full or if 504 * someone is waiting for space to be freed up. Otherwise let the 505 * heartbeat deal with it. Being able to avoid waking up the worker 506 * is the key to the journal's cpu efficiency. 507 */ 508 static __inline 509 void 510 journal_commit_wakeup(struct journal *jo) 511 { 512 int avail; 513 514 avail = jo->fifo.size - (jo->fifo.windex - jo->fifo.xindex); 515 KKASSERT(avail >= 0); 516 if ((avail < (jo->fifo.size >> 1)) || (jo->flags & MC_JOURNAL_WWAIT)) 517 wakeup(&jo->fifo); 518 } 519 520 /* 521 * Create a new BEGIN stream record with the specified streamid and the 522 * specified amount of payload space. *rawpp will be set to point to the 523 * base of the new stream record and a pointer to the base of the payload 524 * space will be returned. *rawpp does not need to be pre-NULLd prior to 525 * making this call. 
526 * 527 * A stream can be extended, aborted, or committed by other API calls 528 * below. This may result in a sequence of potentially disconnected 529 * stream records to be output to the journaling target. The first record 530 * (the one created by this function) will be marked JREC_STREAMCTL_BEGIN, 531 * while the last record on commit or abort will be marked JREC_STREAMCTL_END 532 * (and possibly also JREC_STREAMCTL_ABORTED). The last record could wind 533 * up being the same as the first, in which case the bits are all set in 534 * the first record. 535 * 536 * The stream record is created in an incomplete state by setting the begin 537 * magic to JREC_INCOMPLETEMAGIC. This prevents the worker thread from 538 * flushing the fifo past our record until we have finished populating it. 539 * Other threads can reserve and operate on their own space without stalling 540 * but the stream output will stall until we have completed operations. The 541 * memory FIFO is intended to be large enough to absorb such situations 542 * without stalling out other threads. 543 */ 544 static 545 void * 546 journal_reserve(struct journal *jo, struct journal_rawrecbeg **rawpp, 547 int16_t streamid, int bytes) 548 { 549 struct journal_rawrecbeg *rawp; 550 int avail; 551 int availtoend; 552 int req; 553 554 /* 555 * Add header and trailer overheads to the passed payload. Note that 556 * the passed payload size need not be aligned in any way. 557 */ 558 bytes += sizeof(struct journal_rawrecbeg); 559 bytes += sizeof(struct journal_rawrecend); 560 561 for (;;) { 562 /* 563 * First, check boundary conditions. If the request would wrap around 564 * we have to skip past the ending block and return to the beginning 565 * of the FIFO's buffer. Calculate 'req' which is the actual number 566 * of bytes being reserved, including wrap-around dead space. 
567 * 568 * Note that availtoend is not truncated to avail and so cannot be 569 * used to determine whether the reservation is possible by itself. 570 * Also, since all fifo ops are 16-byte aligned, we can check 571 * the size before calculating the aligned size. 572 */ 573 availtoend = jo->fifo.size - (jo->fifo.windex & jo->fifo.mask); 574 if (bytes > availtoend) 575 req = bytes + availtoend; /* add pad to end */ 576 else 577 req = bytes; 578 579 /* 580 * Next calculate the total available space and see if it is 581 * sufficient. We cannot overwrite previously buffered data 582 * past xindex because otherwise we would not be able to restart 583 * a broken link at the target's last point of commit. 584 */ 585 avail = jo->fifo.size - (jo->fifo.windex - jo->fifo.xindex); 586 KKASSERT(avail >= 0 && (avail & 15) == 0); 587 588 if (avail < req) { 589 /* XXX MC_JOURNAL_STOP_IMM */ 590 jo->flags |= MC_JOURNAL_WWAIT; 591 tsleep(&jo->fifo.windex, 0, "jwrite", 0); 592 continue; 593 } 594 595 /* 596 * Create a pad record for any dead space and create an incomplete 597 * record for the live space, then return a pointer to the 598 * contiguous buffer space that was requested. 599 * 600 * NOTE: The worker thread will not flush past an incomplete 601 * record, so the reserved space can be filled in at-will. The 602 * journaling code must also be aware the reserved sections occuring 603 * after this one will also not be written out even if completed 604 * until this one is completed. 
605 */ 606 rawp = (void *)(jo->fifo.membase + (jo->fifo.windex & jo->fifo.mask)); 607 if (req != bytes) { 608 journal_build_pad(rawp, req - bytes); 609 rawp = (void *)jo->fifo.membase; 610 } 611 rawp->begmagic = JREC_INCOMPLETEMAGIC; /* updated by abort/commit */ 612 rawp->recsize = bytes; /* (unaligned size) */ 613 rawp->streamid = streamid | JREC_STREAMCTL_BEGIN; 614 rawp->seqno = 0; /* set by caller */ 615 616 /* 617 * Issue a memory barrier to guarentee that the record data has been 618 * properly initialized before we advance the write index and return 619 * a pointer to the reserved record. Otherwise the worker thread 620 * could accidently run past us. 621 * 622 * Note that stream records are always 16-byte aligned. 623 */ 624 cpu_mb1(); 625 jo->fifo.windex += (req + 15) & ~15; 626 *rawpp = rawp; 627 return(rawp + 1); 628 } 629 /* not reached */ 630 *rawpp = NULL; 631 return(NULL); 632 } 633 634 /* 635 * Extend a previous reservation by the specified number of payload bytes. 636 * If it is not possible to extend the existing reservation due to either 637 * another thread having reserved space after us or due to a boundary 638 * condition, the current reservation will be committed and possibly 639 * truncated and a new reservation with the specified payload size will 640 * be created. *rawpp is set to the new reservation in this case but the 641 * caller cannot depend on a comparison with the old rawp to determine if 642 * this case occurs because we could end up using the same memory FIFO 643 * offset for the new stream record. 644 * 645 * In either case this function will return a pointer to the base of the 646 * extended payload space. 647 * 648 * If a new stream block is created the caller needs to recalculate payload 649 * byte counts, if the same stream block is used the caller needs to extend 650 * its current notion of the payload byte count. 
651 */ 652 static void * 653 journal_extend(struct journal *jo, struct journal_rawrecbeg **rawpp, 654 int truncbytes, int bytes, int *newstreamrecp) 655 { 656 struct journal_rawrecbeg *rawp; 657 int16_t streamid; 658 int availtoend; 659 int avail; 660 int osize; 661 int nsize; 662 int wbase; 663 void *rptr; 664 665 *newstreamrecp = 0; 666 rawp = *rawpp; 667 osize = (rawp->recsize + 15) & ~15; 668 nsize = (rawp->recsize + bytes + 15) & ~15; 669 wbase = (char *)rawp - jo->fifo.membase; 670 671 /* 672 * If the aligned record size does not change we can trivially extend 673 * the record. 674 */ 675 if (nsize == osize) { 676 rawp->recsize += bytes; 677 return((char *)rawp + rawp->recsize - bytes); 678 } 679 680 /* 681 * If the fifo's write index hasn't been modified since we made the 682 * reservation and we do not hit any boundary conditions, we can 683 * trivially extend the record. 684 */ 685 if ((jo->fifo.windex & jo->fifo.mask) == wbase + osize) { 686 availtoend = jo->fifo.size - wbase; 687 avail = jo->fifo.size - (jo->fifo.windex - jo->fifo.xindex) + osize; 688 KKASSERT((availtoend & 15) == 0); 689 KKASSERT((avail & 15) == 0); 690 if (nsize <= avail && nsize <= availtoend) { 691 jo->fifo.windex += nsize - osize; 692 rawp->recsize += bytes; 693 return((char *)rawp + rawp->recsize - bytes); 694 } 695 } 696 697 /* 698 * It was not possible to extend the buffer. Commit the current 699 * buffer and create a new one. We manually clear the BEGIN mark that 700 * journal_reserve() creates (because this is a continuing record, not 701 * the start of a new stream). 702 */ 703 streamid = rawp->streamid & JREC_STREAMID_MASK; 704 journal_commit(jo, rawpp, truncbytes, 0); 705 rptr = journal_reserve(jo, rawpp, streamid, bytes); 706 rawp = *rawpp; 707 rawp->streamid &= ~JREC_STREAMCTL_BEGIN; 708 *newstreamrecp = 1; 709 return(rptr); 710 } 711 712 /* 713 * Abort a journal record. 
If the transaction record represents a stream 714 * BEGIN and we can reverse the fifo's write index we can simply reverse 715 * index the entire record, as if it were never reserved in the first place. 716 * 717 * Otherwise we set the JREC_STREAMCTL_ABORTED bit and commit the record 718 * with the payload truncated to 0 bytes. 719 */ 720 static void 721 journal_abort(struct journal *jo, struct journal_rawrecbeg **rawpp) 722 { 723 struct journal_rawrecbeg *rawp; 724 int osize; 725 726 rawp = *rawpp; 727 osize = (rawp->recsize + 15) & ~15; 728 729 if ((rawp->streamid & JREC_STREAMCTL_BEGIN) && 730 (jo->fifo.windex & jo->fifo.mask) == 731 (char *)rawp - jo->fifo.membase + osize) 732 { 733 jo->fifo.windex -= osize; 734 *rawpp = NULL; 735 } else { 736 rawp->streamid |= JREC_STREAMCTL_ABORTED; 737 journal_commit(jo, rawpp, 0, 1); 738 } 739 } 740 741 /* 742 * Commit a journal record and potentially truncate it to the specified 743 * number of payload bytes. If you do not want to truncate the record, 744 * simply pass -1 for the bytes parameter. Do not pass rawp->recsize, that 745 * field includes header and trailer and will not be correct. Note that 746 * passing 0 will truncate the entire data payload of the record. 747 * 748 * The logical stream is terminated by this function. 749 * 750 * If truncation occurs, and it is not possible to physically optimize the 751 * memory FIFO due to other threads having reserved space after ours, 752 * the remaining reserved space will be covered by a pad record. 
753 */ 754 static void 755 journal_commit(struct journal *jo, struct journal_rawrecbeg **rawpp, 756 int bytes, int closeout) 757 { 758 struct journal_rawrecbeg *rawp; 759 struct journal_rawrecend *rendp; 760 int osize; 761 int nsize; 762 763 rawp = *rawpp; 764 *rawpp = NULL; 765 766 KKASSERT((char *)rawp >= jo->fifo.membase && 767 (char *)rawp + rawp->recsize <= jo->fifo.membase + jo->fifo.size); 768 KKASSERT(((intptr_t)rawp & 15) == 0); 769 770 /* 771 * Truncate the record if requested. If the FIFO write index as still 772 * at the end of our record we can optimally backindex it. Otherwise 773 * we have to insert a pad record. 774 * 775 * We calculate osize which is the 16-byte-aligned original recsize. 776 * We calculate nsize which is the 16-byte-aligned new recsize. 777 * 778 * Due to alignment issues or in case the passed truncation bytes is 779 * the same as the original payload, windex will be equal to nindex. 780 */ 781 if (bytes >= 0) { 782 KKASSERT(bytes >= 0 && bytes <= rawp->recsize - sizeof(struct journal_rawrecbeg) - sizeof(struct journal_rawrecend)); 783 osize = (rawp->recsize + 15) & ~15; 784 rawp->recsize = bytes + sizeof(struct journal_rawrecbeg) + 785 sizeof(struct journal_rawrecend); 786 nsize = (rawp->recsize + 15) & ~15; 787 if (osize == nsize) { 788 /* do nothing */ 789 } else if ((jo->fifo.windex & jo->fifo.mask) == (char *)rawp - jo->fifo.membase + osize) { 790 /* we are able to backindex the fifo */ 791 jo->fifo.windex -= osize - nsize; 792 } else { 793 /* we cannot backindex the fifo, emplace a pad in the dead space */ 794 journal_build_pad((void *)((char *)rawp + osize), osize - nsize); 795 } 796 } 797 798 /* 799 * Fill in the trailer. Note that unlike pad records, the trailer will 800 * never overlap the header. 
801 */ 802 rendp = (void *)((char *)rawp + 803 ((rawp->recsize + 15) & ~15) - sizeof(*rendp)); 804 rendp->endmagic = JREC_ENDMAGIC; 805 rendp->recsize = rawp->recsize; 806 rendp->check = 0; /* XXX check word, disabled for now */ 807 808 /* 809 * Fill in begmagic last. This will allow the worker thread to proceed. 810 * Use a memory barrier to guarentee write ordering. Mark the stream 811 * as terminated if closeout is set. This is the typical case. 812 */ 813 if (closeout) 814 rawp->streamid |= JREC_STREAMCTL_END; 815 cpu_mb1(); /* memory barrier */ 816 rawp->begmagic = JREC_BEGMAGIC; 817 818 journal_commit_wakeup(jo); 819 } 820 821 /************************************************************************ 822 * TRANSACTION SUPPORT ROUTINES * 823 ************************************************************************ 824 * 825 * JRECORD_*() - routines to create subrecord transactions and embed them 826 * in the logical streams managed by the journal_*() routines. 827 */ 828 829 static int16_t sid = JREC_STREAMID_JMIN; 830 831 /* 832 * Initialize the passed jrecord structure and start a new stream transaction 833 * by reserving an initial build space in the journal's memory FIFO. 834 */ 835 static void 836 jrecord_init(struct journal *jo, struct jrecord *jrec, int16_t streamid) 837 { 838 bzero(jrec, sizeof(*jrec)); 839 jrec->jo = jo; 840 if (streamid < 0) { 841 streamid = sid++; /* XXX need to track stream ids! */ 842 if (sid == JREC_STREAMID_JMAX) 843 sid = JREC_STREAMID_JMIN; 844 } 845 jrec->streamid = streamid; 846 jrec->stream_residual = JREC_DEFAULTSIZE; 847 jrec->stream_reserved = jrec->stream_residual; 848 jrec->stream_ptr = 849 journal_reserve(jo, &jrec->rawp, streamid, jrec->stream_reserved); 850 } 851 852 /* 853 * Push a recursive record type. All pushes should have matching pops. 854 * The old parent is returned and the newly pushed record becomes the 855 * new parent. 
Note that the old parent's pointer may already be invalid 856 * or may become invalid if jrecord_write() had to build a new stream 857 * record, so the caller should not mess with the returned pointer in 858 * any way other then to save it. 859 */ 860 static 861 struct journal_subrecord * 862 jrecord_push(struct jrecord *jrec, int16_t rectype) 863 { 864 struct journal_subrecord *save; 865 866 save = jrec->parent; 867 jrec->parent = jrecord_write(jrec, rectype|JMASK_NESTED, 0); 868 jrec->last = NULL; 869 KKASSERT(jrec->parent != NULL); 870 ++jrec->pushcount; 871 ++jrec->pushptrgood; /* cleared on flush */ 872 return(save); 873 } 874 875 /* 876 * Pop a previously pushed sub-transaction. We must set JMASK_LAST 877 * on the last record written within the subtransaction. If the last 878 * record written is not accessible or if the subtransaction is empty, 879 * we must write out a pad record with JMASK_LAST set before popping. 880 * 881 * When popping a subtransaction the parent record's recsize field 882 * will be properly set. If the parent pointer is no longer valid 883 * (which can occur if the data has already been flushed out to the 884 * stream), the protocol spec allows us to leave it 0. 885 * 886 * The saved parent pointer which we restore may or may not be valid, 887 * and if not valid may or may not be NULL, depending on the value 888 * of pushptrgood. 889 */ 890 static void 891 jrecord_pop(struct jrecord *jrec, struct journal_subrecord *save) 892 { 893 struct journal_subrecord *last; 894 895 KKASSERT(jrec->pushcount > 0); 896 KKASSERT(jrec->residual == 0); 897 898 /* 899 * Set JMASK_LAST on the last record we wrote at the current 900 * level. If last is NULL we either no longer have access to the 901 * record or the subtransaction was empty and we must write out a pad 902 * record. 
903 */ 904 if ((last = jrec->last) == NULL) { 905 jrecord_write(jrec, JLEAF_PAD|JMASK_LAST, 0); 906 last = jrec->last; /* reload after possible flush */ 907 } else { 908 last->rectype |= JMASK_LAST; 909 } 910 911 /* 912 * pushptrgood tells us how many levels of parent record pointers 913 * are valid. The jrec only stores the current parent record pointer 914 * (and it is only valid if pushptrgood != 0). The higher level parent 915 * record pointers are saved by the routines calling jrecord_push() and 916 * jrecord_pop(). These pointers may become stale and we determine 917 * that fact by tracking the count of valid parent pointers with 918 * pushptrgood. Pointers become invalid when their related stream 919 * record gets pushed out. 920 * 921 * [parentA] 922 * [node X] 923 * [parentB] 924 * [node Y] 925 * [node Z] 926 * (pop B) see NOTE B 927 * (pop A) see NOTE A 928 * 929 * NOTE B: This pop sets LAST in node Z if the node is still accessible, 930 * else a PAD record is appended and LAST is set in that. 931 * 932 * This pop sets the record size in parentB if parentB is still 933 * accessible, else the record size is left 0 (the scanner must 934 * deal with that). 935 * 936 * This pop sets the new 'last' record to parentB, the pointer 937 * to which may or may not still be accessible. 938 * 939 * NOTE A: This pop sets LAST in parentB if the node is still accessible, 940 * else a PAD record is appended and LAST is set in that. 941 * 942 * This pop sets the record size in parentA if parentA is still 943 * accessible, else the record size is left 0 (the scanner must 944 * deal with that). 945 * 946 * This pop sets the new 'last' record to parentA, the pointer 947 * to which may or may not still be accessible. 948 * 949 * Also note that the last record in the stream transaction, which in 950 * the above example is parentA, does not currently have the LAST bit 951 * set. 
952 * 953 * The current parent becomes the last record relative to the 954 * saved parent passed into us. It's validity is based on 955 * whether pushptrgood is non-zero prior to decrementing. The saved 956 * parent becomes the new parent, and its validity is based on whether 957 * pushptrgood is non-zero after decrementing. 958 * 959 * The old jrec->parent may be NULL if it is no longer accessible. 960 * If pushptrgood is non-zero, however, it is guarenteed to not 961 * be NULL (since no flush occured). 962 */ 963 jrec->last = jrec->parent; 964 --jrec->pushcount; 965 if (jrec->pushptrgood) { 966 KKASSERT(jrec->last != NULL && last != NULL); 967 if (--jrec->pushptrgood == 0) { 968 jrec->parent = NULL; /* 'save' contains garbage or NULL */ 969 } else { 970 KKASSERT(save != NULL); 971 jrec->parent = save; /* 'save' must not be NULL */ 972 } 973 974 /* 975 * Set the record size in the old parent. 'last' still points to 976 * the original last record in the subtransaction being popped, 977 * jrec->last points to the old parent (which became the last 978 * record relative to the new parent being popped into). 979 */ 980 jrec->last->recsize = (char *)last + last->recsize - (char *)jrec->last; 981 } else { 982 jrec->parent = NULL; 983 KKASSERT(jrec->last == NULL); 984 } 985 } 986 987 /* 988 * Write a leaf record out and return a pointer to its base. The leaf 989 * record may contain potentially megabytes of data which is supplied 990 * in jrecord_data() calls. The exact amount must be specified in this 991 * call. 992 */ 993 static 994 struct journal_subrecord * 995 jrecord_write(struct jrecord *jrec, int16_t rectype, int bytes) 996 { 997 struct journal_subrecord *last; 998 int pusheditout; 999 1000 /* 1001 * Try to catch some obvious errors. Nesting records must specify a 1002 * size of 0, and there should be no left-overs from previous operations 1003 * (such as incomplete data writeouts). 
1004 */ 1005 KKASSERT(bytes == 0 || (rectype & JMASK_NESTED) == 0); 1006 KKASSERT(jrec->residual == 0); 1007 1008 /* 1009 * Check to see if the current stream record has enough room for 1010 * the new subrecord header. If it doesn't we extend the current 1011 * stream record. 1012 * 1013 * This may have the side effect of pushing out the current stream record 1014 * and creating a new one. We must adjust our stream tracking fields 1015 * accordingly. 1016 */ 1017 if (jrec->stream_residual < sizeof(struct journal_subrecord)) { 1018 jrec->stream_ptr = journal_extend(jrec->jo, &jrec->rawp, 1019 jrec->stream_reserved - jrec->stream_residual, 1020 JREC_DEFAULTSIZE, &pusheditout); 1021 if (pusheditout) { 1022 jrec->stream_reserved = JREC_DEFAULTSIZE; 1023 jrec->stream_residual = JREC_DEFAULTSIZE; 1024 jrec->parent = NULL; /* no longer accessible */ 1025 jrec->pushptrgood = 0; /* restored parents in pops no good */ 1026 } else { 1027 jrec->stream_reserved += JREC_DEFAULTSIZE; 1028 jrec->stream_residual += JREC_DEFAULTSIZE; 1029 } 1030 } 1031 last = (void *)jrec->stream_ptr; 1032 last->rectype = rectype; 1033 last->reserved = 0; 1034 last->recsize = sizeof(struct journal_subrecord) + bytes; 1035 jrec->last = last; 1036 jrec->residual = bytes; /* remaining data to be posted */ 1037 jrec->residual_align = -bytes & 7; /* post-data alignment required */ 1038 return(last); 1039 } 1040 1041 /* 1042 * Write out the data associated with a leaf record. Any number of calls 1043 * to this routine may be made as long as the byte count adds up to the 1044 * amount originally specified in jrecord_write(). 1045 * 1046 * The act of writing out the leaf data may result in numerous stream records 1047 * being pushed out. Callers should be aware that even the associated 1048 * subrecord header may become inaccessible due to stream record pushouts. 
1049 */ 1050 static void 1051 jrecord_data(struct jrecord *jrec, const void *buf, int bytes) 1052 { 1053 int pusheditout; 1054 int extsize; 1055 1056 KKASSERT(bytes >= 0 && bytes <= jrec->residual); 1057 1058 /* 1059 * Push out stream records as long as there is insufficient room to hold 1060 * the remaining data. 1061 */ 1062 while (jrec->stream_residual < bytes) { 1063 /* 1064 * Fill in any remaining space in the current stream record. 1065 */ 1066 bcopy(buf, jrec->stream_ptr, jrec->stream_residual); 1067 buf = (const char *)buf + jrec->stream_residual; 1068 bytes -= jrec->stream_residual; 1069 /*jrec->stream_ptr += jrec->stream_residual;*/ 1070 jrec->stream_residual = 0; 1071 jrec->residual -= jrec->stream_residual; 1072 1073 /* 1074 * Try to extend the current stream record, but no more then 1/4 1075 * the size of the FIFO. 1076 */ 1077 extsize = jrec->jo->fifo.size >> 2; 1078 if (extsize > bytes) 1079 extsize = (bytes + 15) & ~15; 1080 1081 jrec->stream_ptr = journal_extend(jrec->jo, &jrec->rawp, 1082 jrec->stream_reserved - jrec->stream_residual, 1083 extsize, &pusheditout); 1084 if (pusheditout) { 1085 jrec->stream_reserved = extsize; 1086 jrec->stream_residual = extsize; 1087 jrec->parent = NULL; /* no longer accessible */ 1088 jrec->last = NULL; /* no longer accessible */ 1089 jrec->pushptrgood = 0; /* restored parents in pops no good */ 1090 } else { 1091 jrec->stream_reserved += extsize; 1092 jrec->stream_residual += extsize; 1093 } 1094 } 1095 1096 /* 1097 * Push out any remaining bytes into the current stream record. 1098 */ 1099 if (bytes) { 1100 bcopy(buf, jrec->stream_ptr, bytes); 1101 jrec->stream_ptr += bytes; 1102 jrec->stream_residual -= bytes; 1103 jrec->residual -= bytes; 1104 } 1105 1106 /* 1107 * Handle data alignment requirements for the subrecord. Because the 1108 * stream record's data space is more strictly aligned, it must already 1109 * have sufficient space to hold any subrecord alignment slop. 
1110 */ 1111 if (jrec->residual == 0 && jrec->residual_align) { 1112 KKASSERT(jrec->residual_align <= jrec->stream_residual); 1113 bzero(jrec->stream_ptr, jrec->residual_align); 1114 jrec->stream_ptr += jrec->residual_align; 1115 jrec->stream_residual -= jrec->residual_align; 1116 jrec->residual_align = 0; 1117 } 1118 } 1119 1120 /* 1121 * We are finished with a transaction. If abortit is not set then we must 1122 * be at the top level with no residual subrecord data left to output. 1123 * If abortit is set then we can be in any state. 1124 * 1125 * The stream record will be committed or aborted as specified and jrecord 1126 * resources will be cleaned up. 1127 */ 1128 static void 1129 jrecord_done(struct jrecord *jrec, int abortit) 1130 { 1131 KKASSERT(jrec->rawp != NULL); 1132 1133 if (abortit) { 1134 journal_abort(jrec->jo, &jrec->rawp); 1135 } else { 1136 KKASSERT(jrec->pushcount == 0 && jrec->residual == 0); 1137 journal_commit(jrec->jo, &jrec->rawp, 1138 jrec->stream_reserved - jrec->stream_residual, 1); 1139 } 1140 1141 /* 1142 * jrec should not be used beyond this point without another init, 1143 * but clean up some fields to ensure that we panic if it is. 1144 * 1145 * Note that jrec->rawp is NULLd out by journal_abort/journal_commit. 1146 */ 1147 jrec->jo = NULL; 1148 jrec->stream_ptr = NULL; 1149 } 1150 1151 /************************************************************************ 1152 * LEAF RECORD SUPPORT ROUTINES * 1153 ************************************************************************ 1154 * 1155 * These routine create leaf subrecords representing common filesystem 1156 * structures. 
1157 */ 1158 1159 static void 1160 jrecord_write_path(struct jrecord *jrec, int16_t rectype, struct namecache *ncp) 1161 { 1162 } 1163 1164 static void 1165 jrecord_write_vattr(struct jrecord *jrec, struct vattr *vat) 1166 { 1167 } 1168 1169 /************************************************************************ 1170 * JOURNAL VNOPS * 1171 ************************************************************************ 1172 * 1173 * These are function shims replacing the normal filesystem ops. We become 1174 * responsible for calling the underlying filesystem ops. We have the choice 1175 * of executing the underlying op first and then generating the journal entry, 1176 * or starting the journal entry, executing the underlying op, and then 1177 * either completing or aborting it. 1178 * 1179 * The journal is supposed to be a high-level entity, which generally means 1180 * identifying files by name rather then by inode. Supplying both allows 1181 * the journal to be used both for inode-number-compatible 'mirrors' and 1182 * for simple filesystem replication. 1183 * 1184 * Writes are particularly difficult to deal with because a single write may 1185 * represent a hundred megabyte buffer or more, and both writes and truncations 1186 * require the 'old' data to be written out as well as the new data if the 1187 * log is reversable. Other issues: 1188 * 1189 * - How to deal with operations on unlinked files (no path available), 1190 * but which may still be filesystem visible due to hard links. 1191 * 1192 * - How to deal with modifications made via a memory map. 1193 * 1194 * - Future cache coherency support will require cache coherency API calls 1195 * both prior to and after the call to the underlying VFS. 1196 * 1197 * ALSO NOTE: We do not have to shim compatibility VOPs like MKDIR which have 1198 * new VFS equivalents (NMKDIR). 
1199 */ 1200 1201 static 1202 int 1203 journal_setattr(struct vop_setattr_args *ap) 1204 { 1205 struct mount *mp; 1206 struct journal *jo; 1207 struct jrecord jrec; 1208 void *save; /* warning, save pointers do not always remain valid */ 1209 int error; 1210 1211 error = vop_journal_operate_ap(&ap->a_head); 1212 mp = ap->a_head.a_ops->vv_mount; 1213 if (error == 0) { 1214 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1215 jrecord_init(jo, &jrec, -1); 1216 save = jrecord_push(&jrec, JTYPE_SETATTR); 1217 jrecord_pop(&jrec, save); 1218 jrecord_done(&jrec, 0); 1219 } 1220 } 1221 return (error); 1222 } 1223 1224 static 1225 int 1226 journal_write(struct vop_write_args *ap) 1227 { 1228 struct mount *mp; 1229 struct journal *jo; 1230 struct jrecord jrec; 1231 void *save; /* warning, save pointers do not always remain valid */ 1232 int error; 1233 1234 error = vop_journal_operate_ap(&ap->a_head); 1235 mp = ap->a_head.a_ops->vv_mount; 1236 if (error == 0) { 1237 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1238 jrecord_init(jo, &jrec, -1); 1239 save = jrecord_push(&jrec, JTYPE_WRITE); 1240 jrecord_pop(&jrec, save); 1241 jrecord_done(&jrec, 0); 1242 } 1243 } 1244 return (error); 1245 } 1246 1247 static 1248 int 1249 journal_fsync(struct vop_fsync_args *ap) 1250 { 1251 struct mount *mp; 1252 struct journal *jo; 1253 int error; 1254 1255 error = vop_journal_operate_ap(&ap->a_head); 1256 mp = ap->a_head.a_ops->vv_mount; 1257 if (error == 0) { 1258 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1259 /* XXX synchronize pending journal records */ 1260 } 1261 } 1262 return (error); 1263 } 1264 1265 static 1266 int 1267 journal_putpages(struct vop_putpages_args *ap) 1268 { 1269 struct mount *mp; 1270 struct journal *jo; 1271 struct jrecord jrec; 1272 void *save; /* warning, save pointers do not always remain valid */ 1273 int error; 1274 1275 error = vop_journal_operate_ap(&ap->a_head); 1276 mp = ap->a_head.a_ops->vv_mount; 1277 if (error == 0) { 1278 TAILQ_FOREACH(jo, &mp->mnt_jlist, 
jentry) { 1279 jrecord_init(jo, &jrec, -1); 1280 save = jrecord_push(&jrec, JTYPE_PUTPAGES); 1281 jrecord_pop(&jrec, save); 1282 jrecord_done(&jrec, 0); 1283 } 1284 } 1285 return (error); 1286 } 1287 1288 static 1289 int 1290 journal_setacl(struct vop_setacl_args *ap) 1291 { 1292 struct mount *mp; 1293 struct journal *jo; 1294 struct jrecord jrec; 1295 void *save; /* warning, save pointers do not always remain valid */ 1296 int error; 1297 1298 error = vop_journal_operate_ap(&ap->a_head); 1299 mp = ap->a_head.a_ops->vv_mount; 1300 if (error == 0) { 1301 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1302 jrecord_init(jo, &jrec, -1); 1303 save = jrecord_push(&jrec, JTYPE_SETACL); 1304 jrecord_pop(&jrec, save); 1305 jrecord_done(&jrec, 0); 1306 } 1307 } 1308 return (error); 1309 } 1310 1311 static 1312 int 1313 journal_setextattr(struct vop_setextattr_args *ap) 1314 { 1315 struct mount *mp; 1316 struct journal *jo; 1317 struct jrecord jrec; 1318 void *save; /* warning, save pointers do not always remain valid */ 1319 int error; 1320 1321 error = vop_journal_operate_ap(&ap->a_head); 1322 mp = ap->a_head.a_ops->vv_mount; 1323 if (error == 0) { 1324 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1325 jrecord_init(jo, &jrec, -1); 1326 save = jrecord_push(&jrec, JTYPE_SETEXTATTR); 1327 jrecord_pop(&jrec, save); 1328 jrecord_done(&jrec, 0); 1329 } 1330 } 1331 return (error); 1332 } 1333 1334 static 1335 int 1336 journal_ncreate(struct vop_ncreate_args *ap) 1337 { 1338 struct mount *mp; 1339 struct journal *jo; 1340 struct jrecord jrec; 1341 void *save; /* warning, save pointers do not always remain valid */ 1342 int error; 1343 1344 error = vop_journal_operate_ap(&ap->a_head); 1345 mp = ap->a_head.a_ops->vv_mount; 1346 if (error == 0) { 1347 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1348 jrecord_init(jo, &jrec, -1); 1349 save = jrecord_push(&jrec, JTYPE_CREATE); 1350 jrecord_pop(&jrec, save); 1351 jrecord_done(&jrec, 0); 1352 } 1353 } 1354 return (error); 1355 } 1356 1357 
static 1358 int 1359 journal_nmknod(struct vop_nmknod_args *ap) 1360 { 1361 struct mount *mp; 1362 struct journal *jo; 1363 struct jrecord jrec; 1364 void *save; /* warning, save pointers do not always remain valid */ 1365 int error; 1366 1367 error = vop_journal_operate_ap(&ap->a_head); 1368 mp = ap->a_head.a_ops->vv_mount; 1369 if (error == 0) { 1370 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1371 jrecord_init(jo, &jrec, -1); 1372 save = jrecord_push(&jrec, JTYPE_MKNOD); 1373 jrecord_pop(&jrec, save); 1374 jrecord_done(&jrec, 0); 1375 } 1376 } 1377 return (error); 1378 } 1379 1380 static 1381 int 1382 journal_nlink(struct vop_nlink_args *ap) 1383 { 1384 struct mount *mp; 1385 struct journal *jo; 1386 struct jrecord jrec; 1387 void *save; /* warning, save pointers do not always remain valid */ 1388 int error; 1389 1390 error = vop_journal_operate_ap(&ap->a_head); 1391 mp = ap->a_head.a_ops->vv_mount; 1392 if (error == 0) { 1393 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1394 jrecord_init(jo, &jrec, -1); 1395 save = jrecord_push(&jrec, JTYPE_LINK); 1396 jrecord_pop(&jrec, save); 1397 jrecord_done(&jrec, 0); 1398 } 1399 } 1400 return (error); 1401 } 1402 1403 static 1404 int 1405 journal_nsymlink(struct vop_nsymlink_args *ap) 1406 { 1407 struct mount *mp; 1408 struct journal *jo; 1409 struct jrecord jrec; 1410 void *save; /* warning, save pointers do not always remain valid */ 1411 int error; 1412 1413 error = vop_journal_operate_ap(&ap->a_head); 1414 mp = ap->a_head.a_ops->vv_mount; 1415 if (error == 0) { 1416 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1417 jrecord_init(jo, &jrec, -1); 1418 save = jrecord_push(&jrec, JTYPE_SYMLINK); 1419 jrecord_pop(&jrec, save); 1420 jrecord_done(&jrec, 0); 1421 } 1422 } 1423 return (error); 1424 } 1425 1426 static 1427 int 1428 journal_nwhiteout(struct vop_nwhiteout_args *ap) 1429 { 1430 struct mount *mp; 1431 struct journal *jo; 1432 struct jrecord jrec; 1433 void *save; /* warning, save pointers do not always remain valid */ 
1434 int error; 1435 1436 error = vop_journal_operate_ap(&ap->a_head); 1437 mp = ap->a_head.a_ops->vv_mount; 1438 if (error == 0) { 1439 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1440 jrecord_init(jo, &jrec, -1); 1441 save = jrecord_push(&jrec, JTYPE_WHITEOUT); 1442 jrecord_pop(&jrec, save); 1443 jrecord_done(&jrec, 0); 1444 } 1445 } 1446 return (error); 1447 } 1448 1449 static 1450 int 1451 journal_nremove(struct vop_nremove_args *ap) 1452 { 1453 struct mount *mp; 1454 struct journal *jo; 1455 struct jrecord jrec; 1456 void *save; /* warning, save pointers do not always remain valid */ 1457 int error; 1458 1459 error = vop_journal_operate_ap(&ap->a_head); 1460 mp = ap->a_head.a_ops->vv_mount; 1461 if (error == 0) { 1462 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1463 jrecord_init(jo, &jrec, -1); 1464 save = jrecord_push(&jrec, JTYPE_REMOVE); 1465 jrecord_pop(&jrec, save); 1466 jrecord_done(&jrec, 0); 1467 } 1468 } 1469 return (error); 1470 } 1471 1472 static 1473 int 1474 journal_nmkdir(struct vop_nmkdir_args *ap) 1475 { 1476 struct mount *mp; 1477 struct journal *jo; 1478 struct jrecord jrec; 1479 void *save; /* warning, save pointers do not always remain valid */ 1480 int error; 1481 1482 error = vop_journal_operate_ap(&ap->a_head); 1483 mp = ap->a_head.a_ops->vv_mount; 1484 if (error == 0) { 1485 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1486 jrecord_init(jo, &jrec, -1); 1487 if (jo->flags & MC_JOURNAL_WANT_REVERSABLE) { 1488 save = jrecord_push(&jrec, JTYPE_UNDO); 1489 /* XXX undo operations */ 1490 jrecord_pop(&jrec, save); 1491 } 1492 #if 0 1493 if (jo->flags & MC_JOURNAL_WANT_AUDIT) { 1494 jrecord_write_audit(&jrec); 1495 } 1496 #endif 1497 save = jrecord_push(&jrec, JTYPE_MKDIR); 1498 jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp); 1499 jrecord_write_vattr(&jrec, ap->a_vap); 1500 jrecord_pop(&jrec, save); 1501 jrecord_done(&jrec, 0); 1502 } 1503 } 1504 return (error); 1505 } 1506 1507 1508 static 1509 int 1510 journal_nrmdir(struct vop_nrmdir_args 
*ap) 1511 { 1512 struct mount *mp; 1513 struct journal *jo; 1514 struct jrecord jrec; 1515 void *save; /* warning, save pointers do not always remain valid */ 1516 int error; 1517 1518 error = vop_journal_operate_ap(&ap->a_head); 1519 mp = ap->a_head.a_ops->vv_mount; 1520 if (error == 0) { 1521 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1522 jrecord_init(jo, &jrec, -1); 1523 save = jrecord_push(&jrec, JTYPE_RMDIR); 1524 jrecord_pop(&jrec, save); 1525 jrecord_done(&jrec, 0); 1526 } 1527 } 1528 return (error); 1529 } 1530 1531 static 1532 int 1533 journal_nrename(struct vop_nrename_args *ap) 1534 { 1535 struct mount *mp; 1536 struct journal *jo; 1537 struct jrecord jrec; 1538 void *save; /* warning, save pointers do not always remain valid */ 1539 int error; 1540 1541 error = vop_journal_operate_ap(&ap->a_head); 1542 mp = ap->a_head.a_ops->vv_mount; 1543 if (error == 0) { 1544 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1545 jrecord_init(jo, &jrec, -1); 1546 save = jrecord_push(&jrec, JTYPE_RENAME); 1547 jrecord_pop(&jrec, save); 1548 jrecord_done(&jrec, 0); 1549 } 1550 } 1551 return (error); 1552 } 1553 1554