16ddb7618SMatthew Dillon /* 26ddb7618SMatthew Dillon * Copyright (c) 2004 The DragonFly Project. All rights reserved. 36ddb7618SMatthew Dillon * 46ddb7618SMatthew Dillon * This code is derived from software contributed to The DragonFly Project 56ddb7618SMatthew Dillon * by Matthew Dillon <dillon@backplane.com> 66ddb7618SMatthew Dillon * 76ddb7618SMatthew Dillon * Redistribution and use in source and binary forms, with or without 86ddb7618SMatthew Dillon * modification, are permitted provided that the following conditions 96ddb7618SMatthew Dillon * are met: 106ddb7618SMatthew Dillon * 116ddb7618SMatthew Dillon * 1. Redistributions of source code must retain the above copyright 126ddb7618SMatthew Dillon * notice, this list of conditions and the following disclaimer. 136ddb7618SMatthew Dillon * 2. Redistributions in binary form must reproduce the above copyright 146ddb7618SMatthew Dillon * notice, this list of conditions and the following disclaimer in 156ddb7618SMatthew Dillon * the documentation and/or other materials provided with the 166ddb7618SMatthew Dillon * distribution. 176ddb7618SMatthew Dillon * 3. Neither the name of The DragonFly Project nor the names of its 186ddb7618SMatthew Dillon * contributors may be used to endorse or promote products derived 196ddb7618SMatthew Dillon * from this software without specific, prior written permission. 206ddb7618SMatthew Dillon * 216ddb7618SMatthew Dillon * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 226ddb7618SMatthew Dillon * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 236ddb7618SMatthew Dillon * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 246ddb7618SMatthew Dillon * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 256ddb7618SMatthew Dillon * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 266ddb7618SMatthew Dillon * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 276ddb7618SMatthew Dillon * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 286ddb7618SMatthew Dillon * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 296ddb7618SMatthew Dillon * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 306ddb7618SMatthew Dillon * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 316ddb7618SMatthew Dillon * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 326ddb7618SMatthew Dillon * SUCH DAMAGE. 336ddb7618SMatthew Dillon * 34*82eaef15SMatthew Dillon * $DragonFly: src/sys/kern/vfs_jops.c,v 1.4 2004/12/30 21:41:04 dillon Exp $ 352281065eSMatthew Dillon */ 362281065eSMatthew Dillon /* 372281065eSMatthew Dillon * Each mount point may have zero or more independantly configured journals 382281065eSMatthew Dillon * attached to it. Each journal is represented by a memory FIFO and worker 392281065eSMatthew Dillon * thread. Journal events are streamed through the FIFO to the thread, 402281065eSMatthew Dillon * batched up (typically on one-second intervals), and written out by the 412281065eSMatthew Dillon * thread. 422281065eSMatthew Dillon * 432281065eSMatthew Dillon * Journal vnode ops are executed instead of mnt_vn_norm_ops when one or 442281065eSMatthew Dillon * more journals have been installed on a mount point. It becomes the 452281065eSMatthew Dillon * responsibility of the journal op to call the underlying normal op as 462281065eSMatthew Dillon * appropriate. 
*
 * The journaling protocol is intended to evolve into a two-way stream
 * whereby transaction IDs can be acknowledged by the journaling target
 * when the data has been committed to hard storage.  Both implicit and
 * explicit acknowledgement schemes will be supported, depending on the
 * sophistication of the journaling stream, plus resynchronization and
 * restart when a journaling stream is interrupted.  This information will
 * also be made available to journaling-aware filesystems to allow better
 * management of their own physical storage synchronization mechanisms as
 * well as to allow such filesystems to take direct advantage of the kernel's
 * journaling layer so they don't have to roll their own.
 *
 * In addition, the worker thread will have access to much larger
 * spooling areas than the memory buffer is able to provide by e.g.
 * reserving swap space, in order to absorb potentially long interruptions
 * of off-site journaling streams, and to prevent 'slow' off-site linkages
 * from radically slowing down local filesystem operations.
 *
 * Because of the non-trivial algorithms the journaling system will be
 * required to support, use of a worker thread is mandatory.
Efficiencies 672281065eSMatthew Dillon * are maintained by utilitizing the memory FIFO to batch transactions when 682281065eSMatthew Dillon * possible, reducing the number of gratuitous thread switches and taking 692281065eSMatthew Dillon * advantage of cpu caches through the use of shorter batched code paths 702281065eSMatthew Dillon * rather then trying to do everything in the context of the process 71*82eaef15SMatthew Dillon * originating the filesystem op. In the future the memory FIFO can be 72*82eaef15SMatthew Dillon * made per-cpu to remove BGL or other locking requirements. 736ddb7618SMatthew Dillon */ 746ddb7618SMatthew Dillon #include <sys/param.h> 756ddb7618SMatthew Dillon #include <sys/systm.h> 766ddb7618SMatthew Dillon #include <sys/buf.h> 776ddb7618SMatthew Dillon #include <sys/conf.h> 786ddb7618SMatthew Dillon #include <sys/kernel.h> 79*82eaef15SMatthew Dillon #include <sys/queue.h> 806ddb7618SMatthew Dillon #include <sys/lock.h> 816ddb7618SMatthew Dillon #include <sys/malloc.h> 826ddb7618SMatthew Dillon #include <sys/mount.h> 836ddb7618SMatthew Dillon #include <sys/unistd.h> 846ddb7618SMatthew Dillon #include <sys/vnode.h> 856ddb7618SMatthew Dillon #include <sys/poll.h> 862281065eSMatthew Dillon #include <sys/mountctl.h> 872281065eSMatthew Dillon #include <sys/file.h> 886ddb7618SMatthew Dillon 896ddb7618SMatthew Dillon #include <machine/limits.h> 906ddb7618SMatthew Dillon 916ddb7618SMatthew Dillon #include <vm/vm.h> 926ddb7618SMatthew Dillon #include <vm/vm_object.h> 936ddb7618SMatthew Dillon #include <vm/vm_page.h> 946ddb7618SMatthew Dillon #include <vm/vm_pager.h> 956ddb7618SMatthew Dillon #include <vm/vnode_pager.h> 966ddb7618SMatthew Dillon 972281065eSMatthew Dillon #include <sys/file2.h> 982281065eSMatthew Dillon #include <sys/thread2.h> 992281065eSMatthew Dillon 1002281065eSMatthew Dillon static int journal_attach(struct mount *mp); 1012281065eSMatthew Dillon static void journal_detach(struct mount *mp); 1022281065eSMatthew Dillon static int 
journal_install_vfs_journal(struct mount *mp, struct file *fp, 1032281065eSMatthew Dillon const struct mountctl_install_journal *info); 1042281065eSMatthew Dillon static int journal_remove_vfs_journal(struct mount *mp, 1052281065eSMatthew Dillon const struct mountctl_remove_journal *info); 1062281065eSMatthew Dillon static int journal_resync_vfs_journal(struct mount *mp, const void *ctl); 1072281065eSMatthew Dillon static void journal_thread(void *info); 108*82eaef15SMatthew Dillon 109*82eaef15SMatthew Dillon static void *journal_reserve(struct journal *jo, 110*82eaef15SMatthew Dillon struct journal_rawrecbeg **rawpp, 111*82eaef15SMatthew Dillon int16_t streamid, int bytes); 112*82eaef15SMatthew Dillon static void *journal_extend(struct journal *jo, 113*82eaef15SMatthew Dillon struct journal_rawrecbeg **rawpp, 114*82eaef15SMatthew Dillon int truncbytes, int bytes, int *newstreamrecp); 115*82eaef15SMatthew Dillon static void journal_abort(struct journal *jo, 116*82eaef15SMatthew Dillon struct journal_rawrecbeg **rawpp); 117*82eaef15SMatthew Dillon static void journal_commit(struct journal *jo, 118*82eaef15SMatthew Dillon struct journal_rawrecbeg **rawpp, 119*82eaef15SMatthew Dillon int bytes, int closeout); 120*82eaef15SMatthew Dillon 121*82eaef15SMatthew Dillon static void jrecord_init(struct journal *jo, 122*82eaef15SMatthew Dillon struct jrecord *jrec, int16_t streamid); 123*82eaef15SMatthew Dillon static struct journal_subrecord *jrecord_push( 124*82eaef15SMatthew Dillon struct jrecord *jrec, int16_t rectype); 125*82eaef15SMatthew Dillon static void jrecord_pop(struct jrecord *jrec, struct journal_subrecord *parent); 126*82eaef15SMatthew Dillon static struct journal_subrecord *jrecord_write(struct jrecord *jrec, 127*82eaef15SMatthew Dillon int16_t rectype, int bytes); 128*82eaef15SMatthew Dillon static void jrecord_data(struct jrecord *jrec, const void *buf, int bytes); 129*82eaef15SMatthew Dillon static void jrecord_done(struct jrecord *jrec, int abortit); 
130*82eaef15SMatthew Dillon 131*82eaef15SMatthew Dillon static void jrecord_write_path(struct jrecord *jrec, 132*82eaef15SMatthew Dillon int16_t rectype, struct namecache *ncp); 133*82eaef15SMatthew Dillon static void jrecord_write_vattr(struct jrecord *jrec, struct vattr *vat); 134*82eaef15SMatthew Dillon 1352281065eSMatthew Dillon 1362281065eSMatthew Dillon static int journal_nmkdir(struct vop_nmkdir_args *ap); 1372281065eSMatthew Dillon 1386ddb7618SMatthew Dillon static struct vnodeopv_entry_desc journal_vnodeop_entries[] = { 1396ddb7618SMatthew Dillon { &vop_default_desc, vop_journal_operate_ap }, 1402281065eSMatthew Dillon { &vop_mountctl_desc, (void *)journal_mountctl }, 1412281065eSMatthew Dillon { &vop_nmkdir_desc, (void *)journal_nmkdir }, 1426ddb7618SMatthew Dillon { NULL, NULL } 1436ddb7618SMatthew Dillon }; 1446ddb7618SMatthew Dillon 145*82eaef15SMatthew Dillon static MALLOC_DEFINE(M_JOURNAL, "journal", "Journaling structures"); 1462281065eSMatthew Dillon static MALLOC_DEFINE(M_JFIFO, "journal-fifo", "Journal FIFO"); 1472281065eSMatthew Dillon 1486ddb7618SMatthew Dillon int 1492281065eSMatthew Dillon journal_mountctl(struct vop_mountctl_args *ap) 1502281065eSMatthew Dillon { 1512281065eSMatthew Dillon struct mount *mp; 1522281065eSMatthew Dillon int error = 0; 1532281065eSMatthew Dillon 1542281065eSMatthew Dillon mp = ap->a_head.a_ops->vv_mount; 1552281065eSMatthew Dillon KKASSERT(mp); 1562281065eSMatthew Dillon 1572281065eSMatthew Dillon if (mp->mnt_vn_journal_ops == NULL) { 1582281065eSMatthew Dillon switch(ap->a_op) { 1592281065eSMatthew Dillon case MOUNTCTL_INSTALL_VFS_JOURNAL: 1602281065eSMatthew Dillon error = journal_attach(mp); 1612281065eSMatthew Dillon if (error == 0 && ap->a_ctllen != sizeof(struct mountctl_install_journal)) 1622281065eSMatthew Dillon error = EINVAL; 1632281065eSMatthew Dillon if (error == 0 && ap->a_fp == NULL) 1642281065eSMatthew Dillon error = EBADF; 1652281065eSMatthew Dillon if (error == 0) 1662281065eSMatthew Dillon 
error = journal_install_vfs_journal(mp, ap->a_fp, ap->a_ctl); 1672281065eSMatthew Dillon if (TAILQ_EMPTY(&mp->mnt_jlist)) 1682281065eSMatthew Dillon journal_detach(mp); 1692281065eSMatthew Dillon break; 1702281065eSMatthew Dillon case MOUNTCTL_REMOVE_VFS_JOURNAL: 1712281065eSMatthew Dillon case MOUNTCTL_RESYNC_VFS_JOURNAL: 1722281065eSMatthew Dillon error = EINVAL; 1732281065eSMatthew Dillon break; 1742281065eSMatthew Dillon default: 1752281065eSMatthew Dillon error = EOPNOTSUPP; 1762281065eSMatthew Dillon break; 1772281065eSMatthew Dillon } 1782281065eSMatthew Dillon } else { 1792281065eSMatthew Dillon switch(ap->a_op) { 1802281065eSMatthew Dillon case MOUNTCTL_INSTALL_VFS_JOURNAL: 1812281065eSMatthew Dillon if (ap->a_ctllen != sizeof(struct mountctl_install_journal)) 1822281065eSMatthew Dillon error = EINVAL; 1832281065eSMatthew Dillon if (error == 0 && ap->a_fp == NULL) 1842281065eSMatthew Dillon error = EBADF; 1852281065eSMatthew Dillon if (error == 0) 1862281065eSMatthew Dillon error = journal_install_vfs_journal(mp, ap->a_fp, ap->a_ctl); 1872281065eSMatthew Dillon break; 1882281065eSMatthew Dillon case MOUNTCTL_REMOVE_VFS_JOURNAL: 1892281065eSMatthew Dillon if (ap->a_ctllen != sizeof(struct mountctl_remove_journal)) 1902281065eSMatthew Dillon error = EINVAL; 1912281065eSMatthew Dillon if (error == 0) 1922281065eSMatthew Dillon error = journal_remove_vfs_journal(mp, ap->a_ctl); 1932281065eSMatthew Dillon if (TAILQ_EMPTY(&mp->mnt_jlist)) 1942281065eSMatthew Dillon journal_detach(mp); 1952281065eSMatthew Dillon break; 1962281065eSMatthew Dillon case MOUNTCTL_RESYNC_VFS_JOURNAL: 1972281065eSMatthew Dillon if (ap->a_ctllen != 0) 1982281065eSMatthew Dillon error = EINVAL; 1992281065eSMatthew Dillon error = journal_resync_vfs_journal(mp, ap->a_ctl); 2002281065eSMatthew Dillon break; 2012281065eSMatthew Dillon default: 2022281065eSMatthew Dillon error = EOPNOTSUPP; 2032281065eSMatthew Dillon break; 2042281065eSMatthew Dillon } 2052281065eSMatthew Dillon } 
2062281065eSMatthew Dillon return (error); 2072281065eSMatthew Dillon } 2082281065eSMatthew Dillon 2092281065eSMatthew Dillon /* 2102281065eSMatthew Dillon * High level mount point setup. When a 2112281065eSMatthew Dillon */ 2122281065eSMatthew Dillon static int 2136ddb7618SMatthew Dillon journal_attach(struct mount *mp) 2146ddb7618SMatthew Dillon { 2156ddb7618SMatthew Dillon vfs_add_vnodeops(mp, &mp->mnt_vn_journal_ops, journal_vnodeop_entries); 2166ddb7618SMatthew Dillon return(0); 2176ddb7618SMatthew Dillon } 2186ddb7618SMatthew Dillon 2192281065eSMatthew Dillon static void 2206ddb7618SMatthew Dillon journal_detach(struct mount *mp) 2216ddb7618SMatthew Dillon { 2226ddb7618SMatthew Dillon if (mp->mnt_vn_journal_ops) 2236ddb7618SMatthew Dillon vfs_rm_vnodeops(&mp->mnt_vn_journal_ops); 2246ddb7618SMatthew Dillon } 2256ddb7618SMatthew Dillon 2262281065eSMatthew Dillon /* 227*82eaef15SMatthew Dillon * Install a journal on a mount point. Each journal has an associated worker 228*82eaef15SMatthew Dillon * thread which is responsible for buffering and spooling the data to the 229*82eaef15SMatthew Dillon * target. A mount point may have multiple journals attached to it. An 230*82eaef15SMatthew Dillon * initial start record is generated when the journal is associated. 
2312281065eSMatthew Dillon */ 2322281065eSMatthew Dillon static int 2332281065eSMatthew Dillon journal_install_vfs_journal(struct mount *mp, struct file *fp, 2342281065eSMatthew Dillon const struct mountctl_install_journal *info) 2352281065eSMatthew Dillon { 2362281065eSMatthew Dillon struct journal *jo; 237*82eaef15SMatthew Dillon struct jrecord jrec; 2382281065eSMatthew Dillon int error = 0; 2392281065eSMatthew Dillon int size; 2402281065eSMatthew Dillon 2412281065eSMatthew Dillon jo = malloc(sizeof(struct journal), M_JOURNAL, M_WAITOK|M_ZERO); 2422281065eSMatthew Dillon bcopy(info->id, jo->id, sizeof(jo->id)); 2432281065eSMatthew Dillon jo->flags = info->flags & ~(MC_JOURNAL_ACTIVE | MC_JOURNAL_STOP_REQ); 2442281065eSMatthew Dillon 2452281065eSMatthew Dillon /* 2462281065eSMatthew Dillon * Memory FIFO size, round to nearest power of 2 2472281065eSMatthew Dillon */ 248*82eaef15SMatthew Dillon if (info->membufsize) { 2492281065eSMatthew Dillon if (info->membufsize < 65536) 2502281065eSMatthew Dillon size = 65536; 2512281065eSMatthew Dillon else if (info->membufsize > 128 * 1024 * 1024) 2522281065eSMatthew Dillon size = 128 * 1024 * 1024; 2532281065eSMatthew Dillon else 2542281065eSMatthew Dillon size = (int)info->membufsize; 2552281065eSMatthew Dillon } else { 2562281065eSMatthew Dillon size = 1024 * 1024; 2572281065eSMatthew Dillon } 2582281065eSMatthew Dillon jo->fifo.size = 1; 2592281065eSMatthew Dillon while (jo->fifo.size < size) 2602281065eSMatthew Dillon jo->fifo.size <<= 1; 2612281065eSMatthew Dillon 2622281065eSMatthew Dillon /* 2632281065eSMatthew Dillon * Other parameters. If not specified the starting transaction id 2642281065eSMatthew Dillon * will be the current date. 
2652281065eSMatthew Dillon */ 266*82eaef15SMatthew Dillon if (info->transid) { 2672281065eSMatthew Dillon jo->transid = info->transid; 2682281065eSMatthew Dillon } else { 2692281065eSMatthew Dillon struct timespec ts; 2702281065eSMatthew Dillon getnanotime(&ts); 2712281065eSMatthew Dillon jo->transid = ((int64_t)ts.tv_sec << 30) | ts.tv_nsec; 2722281065eSMatthew Dillon } 2732281065eSMatthew Dillon 2742281065eSMatthew Dillon jo->fp = fp; 2752281065eSMatthew Dillon 2762281065eSMatthew Dillon /* 2772281065eSMatthew Dillon * Allocate the memory FIFO 2782281065eSMatthew Dillon */ 2792281065eSMatthew Dillon jo->fifo.mask = jo->fifo.size - 1; 2802281065eSMatthew Dillon jo->fifo.membase = malloc(jo->fifo.size, M_JFIFO, M_WAITOK|M_ZERO|M_NULLOK); 2812281065eSMatthew Dillon if (jo->fifo.membase == NULL) 2822281065eSMatthew Dillon error = ENOMEM; 2832281065eSMatthew Dillon 284*82eaef15SMatthew Dillon /* 285*82eaef15SMatthew Dillon * Create the worker thread and generate the association record. 286*82eaef15SMatthew Dillon */ 2872281065eSMatthew Dillon if (error) { 2882281065eSMatthew Dillon free(jo, M_JOURNAL); 2892281065eSMatthew Dillon } else { 2902281065eSMatthew Dillon fhold(fp); 2912281065eSMatthew Dillon jo->flags |= MC_JOURNAL_ACTIVE; 2922281065eSMatthew Dillon lwkt_create(journal_thread, jo, NULL, &jo->thread, 2932281065eSMatthew Dillon TDF_STOPREQ, -1, "journal %.*s", JIDMAX, jo->id); 2942281065eSMatthew Dillon lwkt_setpri(&jo->thread, TDPRI_KERN_DAEMON); 2952281065eSMatthew Dillon lwkt_schedule(&jo->thread); 2962281065eSMatthew Dillon 297*82eaef15SMatthew Dillon jrecord_init(jo, &jrec, JREC_STREAMID_DISCONT); 298*82eaef15SMatthew Dillon jrecord_write(&jrec, JTYPE_ASSOCIATE, 0); 299*82eaef15SMatthew Dillon jrecord_done(&jrec, 0); 3002281065eSMatthew Dillon TAILQ_INSERT_TAIL(&mp->mnt_jlist, jo, jentry); 3012281065eSMatthew Dillon } 3022281065eSMatthew Dillon return(error); 3032281065eSMatthew Dillon } 3042281065eSMatthew Dillon 305*82eaef15SMatthew Dillon /* 
306*82eaef15SMatthew Dillon * Disassociate a journal from a mount point and terminate its worker thread. 307*82eaef15SMatthew Dillon * A final termination record is written out before the file pointer is 308*82eaef15SMatthew Dillon * dropped. 309*82eaef15SMatthew Dillon */ 3102281065eSMatthew Dillon static int 311*82eaef15SMatthew Dillon journal_remove_vfs_journal(struct mount *mp, 312*82eaef15SMatthew Dillon const struct mountctl_remove_journal *info) 3132281065eSMatthew Dillon { 3142281065eSMatthew Dillon struct journal *jo; 315*82eaef15SMatthew Dillon struct jrecord jrec; 3162281065eSMatthew Dillon int error; 3172281065eSMatthew Dillon 3182281065eSMatthew Dillon TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 3192281065eSMatthew Dillon if (bcmp(jo->id, info->id, sizeof(jo->id)) == 0) 3202281065eSMatthew Dillon break; 3212281065eSMatthew Dillon } 3222281065eSMatthew Dillon if (jo) { 3232281065eSMatthew Dillon error = 0; 3242281065eSMatthew Dillon TAILQ_REMOVE(&mp->mnt_jlist, jo, jentry); 325*82eaef15SMatthew Dillon 326*82eaef15SMatthew Dillon jrecord_init(jo, &jrec, JREC_STREAMID_DISCONT); 327*82eaef15SMatthew Dillon jrecord_write(&jrec, JTYPE_DISASSOCIATE, 0); 328*82eaef15SMatthew Dillon jrecord_done(&jrec, 0); 329*82eaef15SMatthew Dillon 3302281065eSMatthew Dillon jo->flags |= MC_JOURNAL_STOP_REQ | (info->flags & MC_JOURNAL_STOP_IMM); 3312281065eSMatthew Dillon wakeup(&jo->fifo); 3322281065eSMatthew Dillon while (jo->flags & MC_JOURNAL_ACTIVE) { 3332281065eSMatthew Dillon tsleep(jo, 0, "jwait", 0); 3342281065eSMatthew Dillon } 3352281065eSMatthew Dillon lwkt_free_thread(&jo->thread); /* XXX SMP */ 3362281065eSMatthew Dillon if (jo->fp) 3372281065eSMatthew Dillon fdrop(jo->fp, curthread); 3382281065eSMatthew Dillon if (jo->fifo.membase) 3392281065eSMatthew Dillon free(jo->fifo.membase, M_JFIFO); 3402281065eSMatthew Dillon free(jo, M_JOURNAL); 3412281065eSMatthew Dillon } else { 3422281065eSMatthew Dillon error = EINVAL; 3432281065eSMatthew Dillon } 
3442281065eSMatthew Dillon return (error); 3452281065eSMatthew Dillon } 3462281065eSMatthew Dillon 3472281065eSMatthew Dillon static int 3482281065eSMatthew Dillon journal_resync_vfs_journal(struct mount *mp, const void *ctl) 3492281065eSMatthew Dillon { 3502281065eSMatthew Dillon return(EINVAL); 3512281065eSMatthew Dillon } 3522281065eSMatthew Dillon 353*82eaef15SMatthew Dillon /* 354*82eaef15SMatthew Dillon * The per-journal worker thread is responsible for writing out the 355*82eaef15SMatthew Dillon * journal's FIFO to the target stream. 356*82eaef15SMatthew Dillon */ 3572281065eSMatthew Dillon static void 3582281065eSMatthew Dillon journal_thread(void *info) 3592281065eSMatthew Dillon { 3602281065eSMatthew Dillon struct journal *jo = info; 361*82eaef15SMatthew Dillon struct journal_rawrecbeg *rawp; 3622281065eSMatthew Dillon int bytes; 3632281065eSMatthew Dillon int error; 364*82eaef15SMatthew Dillon int avail; 3652281065eSMatthew Dillon int res; 3662281065eSMatthew Dillon 3672281065eSMatthew Dillon for (;;) { 368*82eaef15SMatthew Dillon /* 369*82eaef15SMatthew Dillon * Calculate the number of bytes available to write. This buffer 370*82eaef15SMatthew Dillon * area may contain reserved records so we can't just write it out 371*82eaef15SMatthew Dillon * without further checks. 372*82eaef15SMatthew Dillon */ 373*82eaef15SMatthew Dillon bytes = jo->fifo.windex - jo->fifo.rindex; 374*82eaef15SMatthew Dillon 375*82eaef15SMatthew Dillon /* 376*82eaef15SMatthew Dillon * sleep if no bytes are available or if an incomplete record is 377*82eaef15SMatthew Dillon * encountered (it needs to be filled in before we can write it 378*82eaef15SMatthew Dillon * out), and skip any pad records that we encounter. 
379*82eaef15SMatthew Dillon */ 380*82eaef15SMatthew Dillon if (bytes == 0) { 3812281065eSMatthew Dillon if (jo->flags & MC_JOURNAL_STOP_REQ) 3822281065eSMatthew Dillon break; 383*82eaef15SMatthew Dillon tsleep(&jo->fifo, 0, "jfifo", hz); 384*82eaef15SMatthew Dillon continue; 3852281065eSMatthew Dillon } 386*82eaef15SMatthew Dillon rawp = (void *)(jo->fifo.membase + (jo->fifo.rindex & jo->fifo.mask)); 387*82eaef15SMatthew Dillon if (rawp->begmagic == JREC_INCOMPLETEMAGIC) { 388*82eaef15SMatthew Dillon tsleep(&jo->fifo, 0, "jpad", hz); 389*82eaef15SMatthew Dillon continue; 390*82eaef15SMatthew Dillon } 391*82eaef15SMatthew Dillon if (rawp->streamid == JREC_STREAMID_PAD) { 392*82eaef15SMatthew Dillon jo->fifo.rindex += (rawp->recsize + 15) & ~15; 393*82eaef15SMatthew Dillon KKASSERT(jo->fifo.windex - jo->fifo.rindex > 0); 394*82eaef15SMatthew Dillon continue; 395*82eaef15SMatthew Dillon } 396*82eaef15SMatthew Dillon 397*82eaef15SMatthew Dillon /* 398*82eaef15SMatthew Dillon * Figure out how much we can write out, beware the buffer wrap 399*82eaef15SMatthew Dillon * case. 400*82eaef15SMatthew Dillon */ 401*82eaef15SMatthew Dillon res = 0; 402*82eaef15SMatthew Dillon avail = jo->fifo.size - (jo->fifo.rindex & jo->fifo.mask); 403*82eaef15SMatthew Dillon while (res < bytes && rawp->begmagic == JREC_BEGMAGIC) { 404*82eaef15SMatthew Dillon res += (rawp->recsize + 15) & ~15; 405*82eaef15SMatthew Dillon if (res >= avail) { 406*82eaef15SMatthew Dillon KKASSERT(res == avail); 407*82eaef15SMatthew Dillon break; 408*82eaef15SMatthew Dillon } 409*82eaef15SMatthew Dillon } 410*82eaef15SMatthew Dillon 411*82eaef15SMatthew Dillon /* 412*82eaef15SMatthew Dillon * Issue the write and deal with any errors or other conditions. 413*82eaef15SMatthew Dillon * For now assume blocking I/O. Since we are record-aware the 414*82eaef15SMatthew Dillon * code cannot yet handle partial writes. 
415*82eaef15SMatthew Dillon * 416*82eaef15SMatthew Dillon * XXX EWOULDBLOCK/NBIO 417*82eaef15SMatthew Dillon * XXX notification on failure 418*82eaef15SMatthew Dillon * XXX two-way acknowledgement stream in the return direction / xindex 419*82eaef15SMatthew Dillon */ 420*82eaef15SMatthew Dillon bytes = res; 421*82eaef15SMatthew Dillon error = fp_write(jo->fp, 422*82eaef15SMatthew Dillon jo->fifo.membase + (jo->fifo.rindex & jo->fifo.mask), 423*82eaef15SMatthew Dillon bytes, &res); 4242281065eSMatthew Dillon if (error) { 4252281065eSMatthew Dillon printf("journal_thread(%s) write, error %d\n", jo->id, error); 426*82eaef15SMatthew Dillon /* XXX */ 4272281065eSMatthew Dillon } else { 428*82eaef15SMatthew Dillon KKASSERT(res == bytes); 4292281065eSMatthew Dillon printf("journal_thread(%s) write %d\n", jo->id, res); 430*82eaef15SMatthew Dillon } 431*82eaef15SMatthew Dillon 432*82eaef15SMatthew Dillon /* 433*82eaef15SMatthew Dillon * Advance rindex. XXX for now also advance xindex, which will 434*82eaef15SMatthew Dillon * eventually be advanced when the target acknowledges the sequence 435*82eaef15SMatthew Dillon * space. 
436*82eaef15SMatthew Dillon */ 437*82eaef15SMatthew Dillon jo->fifo.rindex += bytes; 438*82eaef15SMatthew Dillon jo->fifo.xindex += bytes; 4392281065eSMatthew Dillon if (jo->flags & MC_JOURNAL_WWAIT) { 4402281065eSMatthew Dillon jo->flags &= ~MC_JOURNAL_WWAIT; /* XXX hysteresis */ 4412281065eSMatthew Dillon wakeup(&jo->fifo.windex); 4422281065eSMatthew Dillon } 4432281065eSMatthew Dillon } 4442281065eSMatthew Dillon jo->flags &= ~MC_JOURNAL_ACTIVE; 4452281065eSMatthew Dillon wakeup(jo); 4462281065eSMatthew Dillon wakeup(&jo->fifo.windex); 4472281065eSMatthew Dillon } 4482281065eSMatthew Dillon 449*82eaef15SMatthew Dillon static __inline 4502281065eSMatthew Dillon void 451*82eaef15SMatthew Dillon journal_build_pad(struct journal_rawrecbeg *rawp, int recsize) 4522281065eSMatthew Dillon { 453*82eaef15SMatthew Dillon struct journal_rawrecend *rendp; 4542281065eSMatthew Dillon 455*82eaef15SMatthew Dillon KKASSERT((recsize & 15) == 0 && recsize >= 16); 456*82eaef15SMatthew Dillon 457*82eaef15SMatthew Dillon rawp->begmagic = JREC_BEGMAGIC; 458*82eaef15SMatthew Dillon rawp->streamid = JREC_STREAMID_PAD; 459*82eaef15SMatthew Dillon rawp->recsize = recsize; /* must be 16-byte aligned */ 460*82eaef15SMatthew Dillon rawp->seqno = 0; 461*82eaef15SMatthew Dillon /* 462*82eaef15SMatthew Dillon * WARNING, rendp may overlap rawp->seqno. This is necessary to 463*82eaef15SMatthew Dillon * allow PAD records to fit in 16 bytes. Use cpu_mb1() to 464*82eaef15SMatthew Dillon * hopefully cause the compiler to not make any assumptions. 
465*82eaef15SMatthew Dillon */ 466*82eaef15SMatthew Dillon cpu_mb1(); 467*82eaef15SMatthew Dillon rendp = (void *)((char *)rawp + rawp->recsize - sizeof(*rendp)); 468*82eaef15SMatthew Dillon rendp->endmagic = JREC_ENDMAGIC; 469*82eaef15SMatthew Dillon rendp->check = 0; 470*82eaef15SMatthew Dillon rendp->recsize = rawp->recsize; 4712281065eSMatthew Dillon } 4722281065eSMatthew Dillon 473*82eaef15SMatthew Dillon /* 474*82eaef15SMatthew Dillon * Wake up the worker thread if the FIFO is more then half full or if 475*82eaef15SMatthew Dillon * someone is waiting for space to be freed up. Otherwise let the 476*82eaef15SMatthew Dillon * heartbeat deal with it. Being able to avoid waking up the worker 477*82eaef15SMatthew Dillon * is the key to the journal's cpu efficiency. 478*82eaef15SMatthew Dillon */ 479*82eaef15SMatthew Dillon static __inline 4802281065eSMatthew Dillon void 481*82eaef15SMatthew Dillon journal_commit_wakeup(struct journal *jo) 4822281065eSMatthew Dillon { 4832281065eSMatthew Dillon int avail; 4842281065eSMatthew Dillon 485*82eaef15SMatthew Dillon avail = jo->fifo.size - (jo->fifo.windex - jo->fifo.xindex); 486*82eaef15SMatthew Dillon KKASSERT(avail >= 0); 487*82eaef15SMatthew Dillon if ((avail < (jo->fifo.size >> 1)) || (jo->flags & MC_JOURNAL_WWAIT)) 488*82eaef15SMatthew Dillon wakeup(&jo->fifo); 489*82eaef15SMatthew Dillon } 490*82eaef15SMatthew Dillon 491*82eaef15SMatthew Dillon /* 492*82eaef15SMatthew Dillon * Create a new BEGIN stream record with the specified streamid and the 493*82eaef15SMatthew Dillon * specified amount of payload space. *rawpp will be set to point to the 494*82eaef15SMatthew Dillon * base of the new stream record and a pointer to the base of the payload 495*82eaef15SMatthew Dillon * space will be returned. *rawpp does not need to be pre-NULLd prior to 496*82eaef15SMatthew Dillon * making this call. 
497*82eaef15SMatthew Dillon * 498*82eaef15SMatthew Dillon * A stream can be extended, aborted, or committed by other API calls 499*82eaef15SMatthew Dillon * below. This may result in a sequence of potentially disconnected 500*82eaef15SMatthew Dillon * stream records to be output to the journaling target. The first record 501*82eaef15SMatthew Dillon * (the one created by this function) will be marked JREC_STREAMCTL_BEGIN, 502*82eaef15SMatthew Dillon * while the last record on commit or abort will be marked JREC_STREAMCTL_END 503*82eaef15SMatthew Dillon * (and possibly also JREC_STREAMCTL_ABORTED). The last record could wind 504*82eaef15SMatthew Dillon * up being the same as the first, in which case the bits are all set in 505*82eaef15SMatthew Dillon * the first record. 506*82eaef15SMatthew Dillon * 507*82eaef15SMatthew Dillon * The stream record is created in an incomplete state by setting the begin 508*82eaef15SMatthew Dillon * magic to JREC_INCOMPLETEMAGIC. This prevents the worker thread from 509*82eaef15SMatthew Dillon * flushing the fifo past our record until we have finished populating it. 510*82eaef15SMatthew Dillon * Other threads can reserve and operate on their own space without stalling 511*82eaef15SMatthew Dillon * but the stream output will stall until we have completed operations. The 512*82eaef15SMatthew Dillon * memory FIFO is intended to be large enough to absorb such situations 513*82eaef15SMatthew Dillon * without stalling out other threads. 
514*82eaef15SMatthew Dillon */ 515*82eaef15SMatthew Dillon static 516*82eaef15SMatthew Dillon void * 517*82eaef15SMatthew Dillon journal_reserve(struct journal *jo, struct journal_rawrecbeg **rawpp, 518*82eaef15SMatthew Dillon int16_t streamid, int bytes) 519*82eaef15SMatthew Dillon { 520*82eaef15SMatthew Dillon struct journal_rawrecbeg *rawp; 521*82eaef15SMatthew Dillon int avail; 522*82eaef15SMatthew Dillon int availtoend; 523*82eaef15SMatthew Dillon int req; 524*82eaef15SMatthew Dillon 525*82eaef15SMatthew Dillon /* 526*82eaef15SMatthew Dillon * Add header and trailer overheads to the passed payload. Note that 527*82eaef15SMatthew Dillon * the passed payload size need not be aligned in any way. 528*82eaef15SMatthew Dillon */ 529*82eaef15SMatthew Dillon bytes += sizeof(struct journal_rawrecbeg); 530*82eaef15SMatthew Dillon bytes += sizeof(struct journal_rawrecend); 531*82eaef15SMatthew Dillon 532*82eaef15SMatthew Dillon for (;;) { 533*82eaef15SMatthew Dillon /* 534*82eaef15SMatthew Dillon * First, check boundary conditions. If the request would wrap around 535*82eaef15SMatthew Dillon * we have to skip past the ending block and return to the beginning 536*82eaef15SMatthew Dillon * of the FIFO's buffer. Calculate 'req' which is the actual number 537*82eaef15SMatthew Dillon * of bytes being reserved, including wrap-around dead space. 538*82eaef15SMatthew Dillon * 539*82eaef15SMatthew Dillon * Note that availtoend is not truncated to avail and so cannot be 540*82eaef15SMatthew Dillon * used to determine whether the reservation is possible by itself. 541*82eaef15SMatthew Dillon * Also, since all fifo ops are 16-byte aligned, we can check 542*82eaef15SMatthew Dillon * the size before calculating the aligned size. 
543*82eaef15SMatthew Dillon */ 544*82eaef15SMatthew Dillon availtoend = jo->fifo.size - (jo->fifo.windex & jo->fifo.mask); 545*82eaef15SMatthew Dillon if (bytes > availtoend) 546*82eaef15SMatthew Dillon req = bytes + availtoend; /* add pad to end */ 547*82eaef15SMatthew Dillon else 548*82eaef15SMatthew Dillon req = bytes; 549*82eaef15SMatthew Dillon 550*82eaef15SMatthew Dillon /* 551*82eaef15SMatthew Dillon * Next calculate the total available space and see if it is 552*82eaef15SMatthew Dillon * sufficient. We cannot overwrite previously buffered data 553*82eaef15SMatthew Dillon * past xindex because otherwise we would not be able to restart 554*82eaef15SMatthew Dillon * a broken link at the target's last point of commit. 555*82eaef15SMatthew Dillon */ 556*82eaef15SMatthew Dillon avail = jo->fifo.size - (jo->fifo.windex - jo->fifo.xindex); 557*82eaef15SMatthew Dillon KKASSERT(avail >= 0 && (avail & 15) == 0); 558*82eaef15SMatthew Dillon 559*82eaef15SMatthew Dillon if (avail < req) { 560*82eaef15SMatthew Dillon /* XXX MC_JOURNAL_STOP_IMM */ 5612281065eSMatthew Dillon jo->flags |= MC_JOURNAL_WWAIT; 5622281065eSMatthew Dillon tsleep(&jo->fifo.windex, 0, "jwrite", 0); 5632281065eSMatthew Dillon continue; 5642281065eSMatthew Dillon } 565*82eaef15SMatthew Dillon 566*82eaef15SMatthew Dillon /* 567*82eaef15SMatthew Dillon * Create a pad record for any dead space and create an incomplete 568*82eaef15SMatthew Dillon * record for the live space, then return a pointer to the 569*82eaef15SMatthew Dillon * contiguous buffer space that was requested. 570*82eaef15SMatthew Dillon * 571*82eaef15SMatthew Dillon * NOTE: The worker thread will not flush past an incomplete 572*82eaef15SMatthew Dillon * record, so the reserved space can be filled in at-will. 
The 573*82eaef15SMatthew Dillon * journaling code must also be aware the reserved sections occuring 574*82eaef15SMatthew Dillon * after this one will also not be written out even if completed 575*82eaef15SMatthew Dillon * until this one is completed. 576*82eaef15SMatthew Dillon */ 577*82eaef15SMatthew Dillon rawp = (void *)(jo->fifo.membase + (jo->fifo.windex & jo->fifo.mask)); 578*82eaef15SMatthew Dillon if (req != bytes) { 579*82eaef15SMatthew Dillon journal_build_pad(rawp, req - bytes); 580*82eaef15SMatthew Dillon rawp = (void *)jo->fifo.membase; 5812281065eSMatthew Dillon } 582*82eaef15SMatthew Dillon rawp->begmagic = JREC_INCOMPLETEMAGIC; /* updated by abort/commit */ 583*82eaef15SMatthew Dillon rawp->recsize = bytes; /* (unaligned size) */ 584*82eaef15SMatthew Dillon rawp->streamid = streamid | JREC_STREAMCTL_BEGIN; 585*82eaef15SMatthew Dillon rawp->seqno = 0; /* set by caller */ 586*82eaef15SMatthew Dillon 587*82eaef15SMatthew Dillon /* 588*82eaef15SMatthew Dillon * Issue a memory barrier to guarentee that the record data has been 589*82eaef15SMatthew Dillon * properly initialized before we advance the write index and return 590*82eaef15SMatthew Dillon * a pointer to the reserved record. Otherwise the worker thread 591*82eaef15SMatthew Dillon * could accidently run past us. 592*82eaef15SMatthew Dillon * 593*82eaef15SMatthew Dillon * Note that stream records are always 16-byte aligned. 594*82eaef15SMatthew Dillon */ 595*82eaef15SMatthew Dillon cpu_mb1(); 596*82eaef15SMatthew Dillon jo->fifo.windex += (req + 15) & ~15; 597*82eaef15SMatthew Dillon *rawpp = rawp; 598*82eaef15SMatthew Dillon return(rawp + 1); 599*82eaef15SMatthew Dillon } 600*82eaef15SMatthew Dillon /* not reached */ 601*82eaef15SMatthew Dillon *rawpp = NULL; 602*82eaef15SMatthew Dillon return(NULL); 603*82eaef15SMatthew Dillon } 604*82eaef15SMatthew Dillon 605*82eaef15SMatthew Dillon /* 606*82eaef15SMatthew Dillon * Extend a previous reservation by the specified number of payload bytes. 
607*82eaef15SMatthew Dillon * If it is not possible to extend the existing reservation due to either 608*82eaef15SMatthew Dillon * another thread having reserved space after us or due to a boundary 609*82eaef15SMatthew Dillon * condition, the current reservation will be committed and possibly 610*82eaef15SMatthew Dillon * truncated and a new reservation with the specified payload size will 611*82eaef15SMatthew Dillon * be created. *rawpp is set to the new reservation in this case but the 612*82eaef15SMatthew Dillon * caller cannot depend on a comparison with the old rawp to determine if 613*82eaef15SMatthew Dillon * this case occurs because we could end up using the same memory FIFO 614*82eaef15SMatthew Dillon * offset for the new stream record. 615*82eaef15SMatthew Dillon * 616*82eaef15SMatthew Dillon * In either case this function will return a pointer to the base of the 617*82eaef15SMatthew Dillon * extended payload space. 618*82eaef15SMatthew Dillon * 619*82eaef15SMatthew Dillon * If a new stream block is created the caller needs to recalculate payload 620*82eaef15SMatthew Dillon * byte counts, if the same stream block is used the caller needs to extend 621*82eaef15SMatthew Dillon * its current notion of the payload byte count. 
622*82eaef15SMatthew Dillon */ 623*82eaef15SMatthew Dillon static void * 624*82eaef15SMatthew Dillon journal_extend(struct journal *jo, struct journal_rawrecbeg **rawpp, 625*82eaef15SMatthew Dillon int truncbytes, int bytes, int *newstreamrecp) 626*82eaef15SMatthew Dillon { 627*82eaef15SMatthew Dillon struct journal_rawrecbeg *rawp; 628*82eaef15SMatthew Dillon int16_t streamid; 629*82eaef15SMatthew Dillon int availtoend; 630*82eaef15SMatthew Dillon int avail; 631*82eaef15SMatthew Dillon int osize; 632*82eaef15SMatthew Dillon int nsize; 633*82eaef15SMatthew Dillon int wbase; 634*82eaef15SMatthew Dillon void *rptr; 635*82eaef15SMatthew Dillon 636*82eaef15SMatthew Dillon *newstreamrecp = 0; 637*82eaef15SMatthew Dillon rawp = *rawpp; 638*82eaef15SMatthew Dillon osize = (rawp->recsize + 15) & ~15; 639*82eaef15SMatthew Dillon nsize = (rawp->recsize + bytes + 15) & ~15; 640*82eaef15SMatthew Dillon wbase = (char *)rawp - jo->fifo.membase; 641*82eaef15SMatthew Dillon 642*82eaef15SMatthew Dillon /* 643*82eaef15SMatthew Dillon * If the aligned record size does not change we can trivially extend 644*82eaef15SMatthew Dillon * the record. 645*82eaef15SMatthew Dillon */ 646*82eaef15SMatthew Dillon if (nsize == osize) { 647*82eaef15SMatthew Dillon rawp->recsize += bytes; 648*82eaef15SMatthew Dillon return((char *)rawp + rawp->recsize - bytes); 649*82eaef15SMatthew Dillon } 650*82eaef15SMatthew Dillon 651*82eaef15SMatthew Dillon /* 652*82eaef15SMatthew Dillon * If the fifo's write index hasn't been modified since we made the 653*82eaef15SMatthew Dillon * reservation and we do not hit any boundary conditions, we can 654*82eaef15SMatthew Dillon * trivially extend the record. 
655*82eaef15SMatthew Dillon */ 656*82eaef15SMatthew Dillon if ((jo->fifo.windex & jo->fifo.mask) == wbase + osize) { 657*82eaef15SMatthew Dillon availtoend = jo->fifo.size - wbase; 658*82eaef15SMatthew Dillon avail = jo->fifo.size - (jo->fifo.windex - jo->fifo.xindex) + osize; 659*82eaef15SMatthew Dillon KKASSERT((availtoend & 15) == 0); 660*82eaef15SMatthew Dillon KKASSERT((avail & 15) == 0); 661*82eaef15SMatthew Dillon if (nsize <= avail && nsize <= availtoend) { 662*82eaef15SMatthew Dillon jo->fifo.windex += nsize - osize; 663*82eaef15SMatthew Dillon rawp->recsize += bytes; 664*82eaef15SMatthew Dillon return((char *)rawp + rawp->recsize - bytes); 665*82eaef15SMatthew Dillon } 666*82eaef15SMatthew Dillon } 667*82eaef15SMatthew Dillon 668*82eaef15SMatthew Dillon /* 669*82eaef15SMatthew Dillon * It was not possible to extend the buffer. Commit the current 670*82eaef15SMatthew Dillon * buffer and create a new one. We manually clear the BEGIN mark that 671*82eaef15SMatthew Dillon * journal_reserve() creates (because this is a continuing record, not 672*82eaef15SMatthew Dillon * the start of a new stream). 673*82eaef15SMatthew Dillon */ 674*82eaef15SMatthew Dillon streamid = rawp->streamid & JREC_STREAMID_MASK; 675*82eaef15SMatthew Dillon journal_commit(jo, rawpp, truncbytes, 0); 676*82eaef15SMatthew Dillon rptr = journal_reserve(jo, rawpp, streamid, bytes); 677*82eaef15SMatthew Dillon rawp = *rawpp; 678*82eaef15SMatthew Dillon rawp->streamid &= ~JREC_STREAMCTL_BEGIN; 679*82eaef15SMatthew Dillon *newstreamrecp = 1; 680*82eaef15SMatthew Dillon return(rptr); 681*82eaef15SMatthew Dillon } 682*82eaef15SMatthew Dillon 683*82eaef15SMatthew Dillon /* 684*82eaef15SMatthew Dillon * Abort a journal record. If the transaction record represents a stream 685*82eaef15SMatthew Dillon * BEGIN and we can reverse the fifo's write index we can simply reverse 686*82eaef15SMatthew Dillon * index the entire record, as if it were never reserved in the first place. 
687*82eaef15SMatthew Dillon * 688*82eaef15SMatthew Dillon * Otherwise we set the JREC_STREAMCTL_ABORTED bit and commit the record 689*82eaef15SMatthew Dillon * with the payload truncated to 0 bytes. 690*82eaef15SMatthew Dillon */ 691*82eaef15SMatthew Dillon static void 692*82eaef15SMatthew Dillon journal_abort(struct journal *jo, struct journal_rawrecbeg **rawpp) 693*82eaef15SMatthew Dillon { 694*82eaef15SMatthew Dillon struct journal_rawrecbeg *rawp; 695*82eaef15SMatthew Dillon int osize; 696*82eaef15SMatthew Dillon 697*82eaef15SMatthew Dillon rawp = *rawpp; 698*82eaef15SMatthew Dillon osize = (rawp->recsize + 15) & ~15; 699*82eaef15SMatthew Dillon 700*82eaef15SMatthew Dillon if ((rawp->streamid & JREC_STREAMCTL_BEGIN) && 701*82eaef15SMatthew Dillon (jo->fifo.windex & jo->fifo.mask) == 702*82eaef15SMatthew Dillon (char *)rawp - jo->fifo.membase + osize) 703*82eaef15SMatthew Dillon { 704*82eaef15SMatthew Dillon jo->fifo.windex -= osize; 705*82eaef15SMatthew Dillon *rawpp = NULL; 706*82eaef15SMatthew Dillon } else { 707*82eaef15SMatthew Dillon rawp->streamid |= JREC_STREAMCTL_ABORTED; 708*82eaef15SMatthew Dillon journal_commit(jo, rawpp, 0, 1); 709*82eaef15SMatthew Dillon } 710*82eaef15SMatthew Dillon } 711*82eaef15SMatthew Dillon 712*82eaef15SMatthew Dillon /* 713*82eaef15SMatthew Dillon * Commit a journal record and potentially truncate it to the specified 714*82eaef15SMatthew Dillon * number of payload bytes. If you do not want to truncate the record, 715*82eaef15SMatthew Dillon * simply pass -1 for the bytes parameter. Do not pass rawp->recsize, that 716*82eaef15SMatthew Dillon * field includes header and trailer and will not be correct. Note that 717*82eaef15SMatthew Dillon * passing 0 will truncate the entire data payload of the record. 718*82eaef15SMatthew Dillon * 719*82eaef15SMatthew Dillon * The logical stream is terminated by this function. 
720*82eaef15SMatthew Dillon * 721*82eaef15SMatthew Dillon * If truncation occurs, and it is not possible to physically optimize the 722*82eaef15SMatthew Dillon * memory FIFO due to other threads having reserved space after ours, 723*82eaef15SMatthew Dillon * the remaining reserved space will be covered by a pad record. 724*82eaef15SMatthew Dillon */ 725*82eaef15SMatthew Dillon static void 726*82eaef15SMatthew Dillon journal_commit(struct journal *jo, struct journal_rawrecbeg **rawpp, 727*82eaef15SMatthew Dillon int bytes, int closeout) 728*82eaef15SMatthew Dillon { 729*82eaef15SMatthew Dillon struct journal_rawrecbeg *rawp; 730*82eaef15SMatthew Dillon struct journal_rawrecend *rendp; 731*82eaef15SMatthew Dillon int osize; 732*82eaef15SMatthew Dillon int nsize; 733*82eaef15SMatthew Dillon 734*82eaef15SMatthew Dillon rawp = *rawpp; 735*82eaef15SMatthew Dillon *rawpp = NULL; 736*82eaef15SMatthew Dillon 737*82eaef15SMatthew Dillon KKASSERT((char *)rawp >= jo->fifo.membase && 738*82eaef15SMatthew Dillon (char *)rawp + rawp->recsize <= jo->fifo.membase + jo->fifo.size); 739*82eaef15SMatthew Dillon KKASSERT(((intptr_t)rawp & 15) == 0); 740*82eaef15SMatthew Dillon 741*82eaef15SMatthew Dillon /* 742*82eaef15SMatthew Dillon * Truncate the record if requested. If the FIFO write index as still 743*82eaef15SMatthew Dillon * at the end of our record we can optimally backindex it. Otherwise 744*82eaef15SMatthew Dillon * we have to insert a pad record. 745*82eaef15SMatthew Dillon * 746*82eaef15SMatthew Dillon * We calculate osize which is the 16-byte-aligned original recsize. 747*82eaef15SMatthew Dillon * We calculate nsize which is the 16-byte-aligned new recsize. 748*82eaef15SMatthew Dillon * 749*82eaef15SMatthew Dillon * Due to alignment issues or in case the passed truncation bytes is 750*82eaef15SMatthew Dillon * the same as the original payload, windex will be equal to nindex. 
751*82eaef15SMatthew Dillon */ 752*82eaef15SMatthew Dillon if (bytes >= 0) { 753*82eaef15SMatthew Dillon KKASSERT(bytes >= 0 && bytes <= rawp->recsize - sizeof(struct journal_rawrecbeg) - sizeof(struct journal_rawrecend)); 754*82eaef15SMatthew Dillon osize = (rawp->recsize + 15) & ~15; 755*82eaef15SMatthew Dillon rawp->recsize = bytes + sizeof(struct journal_rawrecbeg) + 756*82eaef15SMatthew Dillon sizeof(struct journal_rawrecend); 757*82eaef15SMatthew Dillon nsize = (rawp->recsize + 15) & ~15; 758*82eaef15SMatthew Dillon if (osize == nsize) { 759*82eaef15SMatthew Dillon /* do nothing */ 760*82eaef15SMatthew Dillon } else if ((jo->fifo.windex & jo->fifo.mask) == (char *)rawp - jo->fifo.membase + osize) { 761*82eaef15SMatthew Dillon /* we are able to backindex the fifo */ 762*82eaef15SMatthew Dillon jo->fifo.windex -= osize - nsize; 763*82eaef15SMatthew Dillon } else { 764*82eaef15SMatthew Dillon /* we cannot backindex the fifo, emplace a pad in the dead space */ 765*82eaef15SMatthew Dillon journal_build_pad((void *)((char *)rawp + osize), osize - nsize); 766*82eaef15SMatthew Dillon } 767*82eaef15SMatthew Dillon } 768*82eaef15SMatthew Dillon 769*82eaef15SMatthew Dillon /* 770*82eaef15SMatthew Dillon * Fill in the trailer. Note that unlike pad records, the trailer will 771*82eaef15SMatthew Dillon * never overlap the header. 772*82eaef15SMatthew Dillon */ 773*82eaef15SMatthew Dillon rendp = (void *)((char *)rawp + 774*82eaef15SMatthew Dillon ((rawp->recsize + 15) & ~15) - sizeof(*rendp)); 775*82eaef15SMatthew Dillon rendp->endmagic = JREC_ENDMAGIC; 776*82eaef15SMatthew Dillon rendp->recsize = rawp->recsize; 777*82eaef15SMatthew Dillon rendp->check = 0; /* XXX check word, disabled for now */ 778*82eaef15SMatthew Dillon 779*82eaef15SMatthew Dillon /* 780*82eaef15SMatthew Dillon * Fill in begmagic last. This will allow the worker thread to proceed. 781*82eaef15SMatthew Dillon * Use a memory barrier to guarentee write ordering. 
Mark the stream 782*82eaef15SMatthew Dillon * as terminated if closeout is set. This is the typical case. 783*82eaef15SMatthew Dillon */ 784*82eaef15SMatthew Dillon if (closeout) 785*82eaef15SMatthew Dillon rawp->streamid |= JREC_STREAMCTL_END; 786*82eaef15SMatthew Dillon cpu_mb1(); /* memory barrier */ 787*82eaef15SMatthew Dillon rawp->begmagic = JREC_BEGMAGIC; 788*82eaef15SMatthew Dillon 789*82eaef15SMatthew Dillon journal_commit_wakeup(jo); 790*82eaef15SMatthew Dillon } 791*82eaef15SMatthew Dillon 792*82eaef15SMatthew Dillon /************************************************************************ 793*82eaef15SMatthew Dillon * TRANSACTION SUPPORT ROUTINES * 794*82eaef15SMatthew Dillon ************************************************************************ 795*82eaef15SMatthew Dillon * 796*82eaef15SMatthew Dillon * JRECORD_*() - routines to create subrecord transactions and embed them 797*82eaef15SMatthew Dillon * in the logical streams managed by the journal_*() routines. 798*82eaef15SMatthew Dillon */ 799*82eaef15SMatthew Dillon 800*82eaef15SMatthew Dillon static int16_t sid = JREC_STREAMID_JMIN; 801*82eaef15SMatthew Dillon 802*82eaef15SMatthew Dillon /* 803*82eaef15SMatthew Dillon * Initialize the passed jrecord structure and start a new stream transaction 804*82eaef15SMatthew Dillon * by reserving an initial build space in the journal's memory FIFO. 805*82eaef15SMatthew Dillon */ 806*82eaef15SMatthew Dillon static void 807*82eaef15SMatthew Dillon jrecord_init(struct journal *jo, struct jrecord *jrec, int16_t streamid) 808*82eaef15SMatthew Dillon { 809*82eaef15SMatthew Dillon bzero(jrec, sizeof(*jrec)); 810*82eaef15SMatthew Dillon jrec->jo = jo; 811*82eaef15SMatthew Dillon if (streamid < 0) { 812*82eaef15SMatthew Dillon streamid = sid++; /* XXX need to track stream ids! 
*/ 813*82eaef15SMatthew Dillon if (sid == JREC_STREAMID_JMAX) 814*82eaef15SMatthew Dillon sid = JREC_STREAMID_JMIN; 815*82eaef15SMatthew Dillon } 816*82eaef15SMatthew Dillon jrec->streamid = streamid; 817*82eaef15SMatthew Dillon jrec->stream_residual = JREC_DEFAULTSIZE; 818*82eaef15SMatthew Dillon jrec->stream_reserved = jrec->stream_residual; 819*82eaef15SMatthew Dillon jrec->stream_ptr = 820*82eaef15SMatthew Dillon journal_reserve(jo, &jrec->rawp, streamid, jrec->stream_reserved); 821*82eaef15SMatthew Dillon } 822*82eaef15SMatthew Dillon 823*82eaef15SMatthew Dillon /* 824*82eaef15SMatthew Dillon * Push a recursive record type. All pushes should have matching pops. 825*82eaef15SMatthew Dillon * The old parent is returned and the newly pushed record becomes the 826*82eaef15SMatthew Dillon * new parent. Note that the old parent's pointer may already be invalid 827*82eaef15SMatthew Dillon * or may become invalid if jrecord_write() had to build a new stream 828*82eaef15SMatthew Dillon * record, so the caller should not mess with the returned pointer in 829*82eaef15SMatthew Dillon * any way other then to save it. 
830*82eaef15SMatthew Dillon */ 831*82eaef15SMatthew Dillon static 832*82eaef15SMatthew Dillon struct journal_subrecord * 833*82eaef15SMatthew Dillon jrecord_push(struct jrecord *jrec, int16_t rectype) 834*82eaef15SMatthew Dillon { 835*82eaef15SMatthew Dillon struct journal_subrecord *save; 836*82eaef15SMatthew Dillon 837*82eaef15SMatthew Dillon save = jrec->parent; 838*82eaef15SMatthew Dillon jrec->parent = jrecord_write(jrec, rectype|JMASK_NESTED, 0); 839*82eaef15SMatthew Dillon jrec->last = NULL; 840*82eaef15SMatthew Dillon KKASSERT(jrec->parent != NULL); 841*82eaef15SMatthew Dillon ++jrec->pushcount; 842*82eaef15SMatthew Dillon ++jrec->pushptrgood; /* cleared on flush */ 843*82eaef15SMatthew Dillon return(save); 844*82eaef15SMatthew Dillon } 845*82eaef15SMatthew Dillon 846*82eaef15SMatthew Dillon /* 847*82eaef15SMatthew Dillon * Pop a previously pushed sub-transaction. We must set JMASK_LAST 848*82eaef15SMatthew Dillon * on the last record written within the subtransaction. If the last 849*82eaef15SMatthew Dillon * record written is not accessible or if the subtransaction is empty, 850*82eaef15SMatthew Dillon * we must write out a pad record with JMASK_LAST set before popping. 851*82eaef15SMatthew Dillon * 852*82eaef15SMatthew Dillon * When popping a subtransaction the parent record's recsize field 853*82eaef15SMatthew Dillon * will be properly set. If the parent pointer is no longer valid 854*82eaef15SMatthew Dillon * (which can occur if the data has already been flushed out to the 855*82eaef15SMatthew Dillon * stream), the protocol spec allows us to leave it 0. 856*82eaef15SMatthew Dillon * 857*82eaef15SMatthew Dillon * The saved parent pointer which we restore may or may not be valid, 858*82eaef15SMatthew Dillon * and if not valid may or may not be NULL, depending on the value 859*82eaef15SMatthew Dillon * of pushptrgood. 
860*82eaef15SMatthew Dillon */ 861*82eaef15SMatthew Dillon static void 862*82eaef15SMatthew Dillon jrecord_pop(struct jrecord *jrec, struct journal_subrecord *save) 863*82eaef15SMatthew Dillon { 864*82eaef15SMatthew Dillon struct journal_subrecord *last; 865*82eaef15SMatthew Dillon 866*82eaef15SMatthew Dillon KKASSERT(jrec->pushcount > 0); 867*82eaef15SMatthew Dillon KKASSERT(jrec->residual == 0); 868*82eaef15SMatthew Dillon 869*82eaef15SMatthew Dillon /* 870*82eaef15SMatthew Dillon * Set JMASK_LAST on the last record we wrote at the current 871*82eaef15SMatthew Dillon * level. If last is NULL we either no longer have access to the 872*82eaef15SMatthew Dillon * record or the subtransaction was empty and we must write out a pad 873*82eaef15SMatthew Dillon * record. 874*82eaef15SMatthew Dillon */ 875*82eaef15SMatthew Dillon if ((last = jrec->last) == NULL) { 876*82eaef15SMatthew Dillon jrecord_write(jrec, JLEAF_PAD|JMASK_LAST, 0); 877*82eaef15SMatthew Dillon last = jrec->last; /* reload after possible flush */ 878*82eaef15SMatthew Dillon } else { 879*82eaef15SMatthew Dillon last->rectype |= JMASK_LAST; 880*82eaef15SMatthew Dillon } 881*82eaef15SMatthew Dillon 882*82eaef15SMatthew Dillon /* 883*82eaef15SMatthew Dillon * pushptrgood tells us how many levels of parent record pointers 884*82eaef15SMatthew Dillon * are valid. The jrec only stores the current parent record pointer 885*82eaef15SMatthew Dillon * (and it is only valid if pushptrgood != 0). The higher level parent 886*82eaef15SMatthew Dillon * record pointers are saved by the routines calling jrecord_push() and 887*82eaef15SMatthew Dillon * jrecord_pop(). These pointers may become stale and we determine 888*82eaef15SMatthew Dillon * that fact by tracking the count of valid parent pointers with 889*82eaef15SMatthew Dillon * pushptrgood. Pointers become invalid when their related stream 890*82eaef15SMatthew Dillon * record gets pushed out. 
891*82eaef15SMatthew Dillon * 892*82eaef15SMatthew Dillon * [parentA] 893*82eaef15SMatthew Dillon * [node X] 894*82eaef15SMatthew Dillon * [parentB] 895*82eaef15SMatthew Dillon * [node Y] 896*82eaef15SMatthew Dillon * [node Z] 897*82eaef15SMatthew Dillon * (pop B) see NOTE B 898*82eaef15SMatthew Dillon * (pop A) see NOTE A 899*82eaef15SMatthew Dillon * 900*82eaef15SMatthew Dillon * NOTE B: This pop sets LAST in node Z if the node is still accessible, 901*82eaef15SMatthew Dillon * else a PAD record is appended and LAST is set in that. 902*82eaef15SMatthew Dillon * 903*82eaef15SMatthew Dillon * This pop sets the record size in parentB if parentB is still 904*82eaef15SMatthew Dillon * accessible, else the record size is left 0 (the scanner must 905*82eaef15SMatthew Dillon * deal with that). 906*82eaef15SMatthew Dillon * 907*82eaef15SMatthew Dillon * This pop sets the new 'last' record to parentB, the pointer 908*82eaef15SMatthew Dillon * to which may or may not still be accessible. 909*82eaef15SMatthew Dillon * 910*82eaef15SMatthew Dillon * NOTE A: This pop sets LAST in parentB if the node is still accessible, 911*82eaef15SMatthew Dillon * else a PAD record is appended and LAST is set in that. 912*82eaef15SMatthew Dillon * 913*82eaef15SMatthew Dillon * This pop sets the record size in parentA if parentA is still 914*82eaef15SMatthew Dillon * accessible, else the record size is left 0 (the scanner must 915*82eaef15SMatthew Dillon * deal with that). 916*82eaef15SMatthew Dillon * 917*82eaef15SMatthew Dillon * This pop sets the new 'last' record to parentA, the pointer 918*82eaef15SMatthew Dillon * to which may or may not still be accessible. 919*82eaef15SMatthew Dillon * 920*82eaef15SMatthew Dillon * Also note that the last record in the stream transaction, which in 921*82eaef15SMatthew Dillon * the above example is parentA, does not currently have the LAST bit 922*82eaef15SMatthew Dillon * set. 
923*82eaef15SMatthew Dillon * 924*82eaef15SMatthew Dillon * The current parent becomes the last record relative to the 925*82eaef15SMatthew Dillon * saved parent passed into us. It's validity is based on 926*82eaef15SMatthew Dillon * whether pushptrgood is non-zero prior to decrementing. The saved 927*82eaef15SMatthew Dillon * parent becomes the new parent, and its validity is based on whether 928*82eaef15SMatthew Dillon * pushptrgood is non-zero after decrementing. 929*82eaef15SMatthew Dillon * 930*82eaef15SMatthew Dillon * The old jrec->parent may be NULL if it is no longer accessible. 931*82eaef15SMatthew Dillon * If pushptrgood is non-zero, however, it is guarenteed to not 932*82eaef15SMatthew Dillon * be NULL (since no flush occured). 933*82eaef15SMatthew Dillon */ 934*82eaef15SMatthew Dillon jrec->last = jrec->parent; 935*82eaef15SMatthew Dillon --jrec->pushcount; 936*82eaef15SMatthew Dillon if (jrec->pushptrgood) { 937*82eaef15SMatthew Dillon KKASSERT(jrec->last != NULL && last != NULL); 938*82eaef15SMatthew Dillon if (--jrec->pushptrgood == 0) { 939*82eaef15SMatthew Dillon jrec->parent = NULL; /* 'save' contains garbage or NULL */ 940*82eaef15SMatthew Dillon } else { 941*82eaef15SMatthew Dillon KKASSERT(save != NULL); 942*82eaef15SMatthew Dillon jrec->parent = save; /* 'save' must not be NULL */ 943*82eaef15SMatthew Dillon } 944*82eaef15SMatthew Dillon 945*82eaef15SMatthew Dillon /* 946*82eaef15SMatthew Dillon * Set the record size in the old parent. 'last' still points to 947*82eaef15SMatthew Dillon * the original last record in the subtransaction being popped, 948*82eaef15SMatthew Dillon * jrec->last points to the old parent (which became the last 949*82eaef15SMatthew Dillon * record relative to the new parent being popped into). 
950*82eaef15SMatthew Dillon */ 951*82eaef15SMatthew Dillon jrec->last->recsize = (char *)last + last->recsize - (char *)jrec->last; 952*82eaef15SMatthew Dillon } else { 953*82eaef15SMatthew Dillon jrec->parent = NULL; 954*82eaef15SMatthew Dillon KKASSERT(jrec->last == NULL); 955*82eaef15SMatthew Dillon } 956*82eaef15SMatthew Dillon } 957*82eaef15SMatthew Dillon 958*82eaef15SMatthew Dillon /* 959*82eaef15SMatthew Dillon * Write a leaf record out and return a pointer to its base. The leaf 960*82eaef15SMatthew Dillon * record may contain potentially megabytes of data which is supplied 961*82eaef15SMatthew Dillon * in jrecord_data() calls. The exact amount must be specified in this 962*82eaef15SMatthew Dillon * call. 963*82eaef15SMatthew Dillon */ 964*82eaef15SMatthew Dillon static 965*82eaef15SMatthew Dillon struct journal_subrecord * 966*82eaef15SMatthew Dillon jrecord_write(struct jrecord *jrec, int16_t rectype, int bytes) 967*82eaef15SMatthew Dillon { 968*82eaef15SMatthew Dillon struct journal_subrecord *last; 969*82eaef15SMatthew Dillon int pusheditout; 970*82eaef15SMatthew Dillon 971*82eaef15SMatthew Dillon /* 972*82eaef15SMatthew Dillon * Try to catch some obvious errors. Nesting records must specify a 973*82eaef15SMatthew Dillon * size of 0, and there should be no left-overs from previous operations 974*82eaef15SMatthew Dillon * (such as incomplete data writeouts). 975*82eaef15SMatthew Dillon */ 976*82eaef15SMatthew Dillon KKASSERT(bytes == 0 || (rectype & JMASK_NESTED) == 0); 977*82eaef15SMatthew Dillon KKASSERT(jrec->residual == 0); 978*82eaef15SMatthew Dillon 979*82eaef15SMatthew Dillon /* 980*82eaef15SMatthew Dillon * Check to see if the current stream record has enough room for 981*82eaef15SMatthew Dillon * the new subrecord header. If it doesn't we extend the current 982*82eaef15SMatthew Dillon * stream record. 
983*82eaef15SMatthew Dillon * 984*82eaef15SMatthew Dillon * This may have the side effect of pushing out the current stream record 985*82eaef15SMatthew Dillon * and creating a new one. We must adjust our stream tracking fields 986*82eaef15SMatthew Dillon * accordingly. 987*82eaef15SMatthew Dillon */ 988*82eaef15SMatthew Dillon if (jrec->stream_residual < sizeof(struct journal_subrecord)) { 989*82eaef15SMatthew Dillon jrec->stream_ptr = journal_extend(jrec->jo, &jrec->rawp, 990*82eaef15SMatthew Dillon jrec->stream_reserved - jrec->stream_residual, 991*82eaef15SMatthew Dillon JREC_DEFAULTSIZE, &pusheditout); 992*82eaef15SMatthew Dillon if (pusheditout) { 993*82eaef15SMatthew Dillon jrec->stream_reserved = JREC_DEFAULTSIZE; 994*82eaef15SMatthew Dillon jrec->stream_residual = JREC_DEFAULTSIZE; 995*82eaef15SMatthew Dillon jrec->parent = NULL; /* no longer accessible */ 996*82eaef15SMatthew Dillon jrec->pushptrgood = 0; /* restored parents in pops no good */ 997*82eaef15SMatthew Dillon } else { 998*82eaef15SMatthew Dillon jrec->stream_reserved += JREC_DEFAULTSIZE; 999*82eaef15SMatthew Dillon jrec->stream_residual += JREC_DEFAULTSIZE; 1000*82eaef15SMatthew Dillon } 1001*82eaef15SMatthew Dillon } 1002*82eaef15SMatthew Dillon last = (void *)jrec->stream_ptr; 1003*82eaef15SMatthew Dillon last->rectype = rectype; 1004*82eaef15SMatthew Dillon last->reserved = 0; 1005*82eaef15SMatthew Dillon last->recsize = sizeof(struct journal_subrecord) + bytes; 1006*82eaef15SMatthew Dillon jrec->last = last; 1007*82eaef15SMatthew Dillon jrec->residual = bytes; /* remaining data to be posted */ 1008*82eaef15SMatthew Dillon jrec->residual_align = -bytes & 7; /* post-data alignment required */ 1009*82eaef15SMatthew Dillon return(last); 1010*82eaef15SMatthew Dillon } 1011*82eaef15SMatthew Dillon 1012*82eaef15SMatthew Dillon /* 1013*82eaef15SMatthew Dillon * Write out the data associated with a leaf record. 
Any number of calls 1014*82eaef15SMatthew Dillon * to this routine may be made as long as the byte count adds up to the 1015*82eaef15SMatthew Dillon * amount originally specified in jrecord_write(). 1016*82eaef15SMatthew Dillon * 1017*82eaef15SMatthew Dillon * The act of writing out the leaf data may result in numerous stream records 1018*82eaef15SMatthew Dillon * being pushed out. Callers should be aware that even the associated 1019*82eaef15SMatthew Dillon * subrecord header may become inaccessible due to stream record pushouts. 1020*82eaef15SMatthew Dillon */ 1021*82eaef15SMatthew Dillon static void 1022*82eaef15SMatthew Dillon jrecord_data(struct jrecord *jrec, const void *buf, int bytes) 1023*82eaef15SMatthew Dillon { 1024*82eaef15SMatthew Dillon int pusheditout; 1025*82eaef15SMatthew Dillon int extsize; 1026*82eaef15SMatthew Dillon 1027*82eaef15SMatthew Dillon KKASSERT(bytes >= 0 && bytes <= jrec->residual); 1028*82eaef15SMatthew Dillon 1029*82eaef15SMatthew Dillon /* 1030*82eaef15SMatthew Dillon * Push out stream records as long as there is insufficient room to hold 1031*82eaef15SMatthew Dillon * the remaining data. 1032*82eaef15SMatthew Dillon */ 1033*82eaef15SMatthew Dillon while (jrec->stream_residual < bytes) { 1034*82eaef15SMatthew Dillon /* 1035*82eaef15SMatthew Dillon * Fill in any remaining space in the current stream record. 
1036*82eaef15SMatthew Dillon */ 1037*82eaef15SMatthew Dillon bcopy(buf, jrec->stream_ptr, jrec->stream_residual); 1038*82eaef15SMatthew Dillon buf = (const char *)buf + jrec->stream_residual; 1039*82eaef15SMatthew Dillon bytes -= jrec->stream_residual; 1040*82eaef15SMatthew Dillon /*jrec->stream_ptr += jrec->stream_residual;*/ 1041*82eaef15SMatthew Dillon jrec->stream_residual = 0; 1042*82eaef15SMatthew Dillon jrec->residual -= jrec->stream_residual; 1043*82eaef15SMatthew Dillon 1044*82eaef15SMatthew Dillon /* 1045*82eaef15SMatthew Dillon * Try to extend the current stream record, but no more then 1/4 1046*82eaef15SMatthew Dillon * the size of the FIFO. 1047*82eaef15SMatthew Dillon */ 1048*82eaef15SMatthew Dillon extsize = jrec->jo->fifo.size >> 2; 1049*82eaef15SMatthew Dillon if (extsize > bytes) 1050*82eaef15SMatthew Dillon extsize = (bytes + 15) & ~15; 1051*82eaef15SMatthew Dillon 1052*82eaef15SMatthew Dillon jrec->stream_ptr = journal_extend(jrec->jo, &jrec->rawp, 1053*82eaef15SMatthew Dillon jrec->stream_reserved - jrec->stream_residual, 1054*82eaef15SMatthew Dillon extsize, &pusheditout); 1055*82eaef15SMatthew Dillon if (pusheditout) { 1056*82eaef15SMatthew Dillon jrec->stream_reserved = extsize; 1057*82eaef15SMatthew Dillon jrec->stream_residual = extsize; 1058*82eaef15SMatthew Dillon jrec->parent = NULL; /* no longer accessible */ 1059*82eaef15SMatthew Dillon jrec->last = NULL; /* no longer accessible */ 1060*82eaef15SMatthew Dillon jrec->pushptrgood = 0; /* restored parents in pops no good */ 1061*82eaef15SMatthew Dillon } else { 1062*82eaef15SMatthew Dillon jrec->stream_reserved += extsize; 1063*82eaef15SMatthew Dillon jrec->stream_residual += extsize; 1064*82eaef15SMatthew Dillon } 1065*82eaef15SMatthew Dillon } 1066*82eaef15SMatthew Dillon 1067*82eaef15SMatthew Dillon /* 1068*82eaef15SMatthew Dillon * Push out any remaining bytes into the current stream record. 
1069*82eaef15SMatthew Dillon */ 1070*82eaef15SMatthew Dillon if (bytes) { 1071*82eaef15SMatthew Dillon bcopy(buf, jrec->stream_ptr, bytes); 1072*82eaef15SMatthew Dillon jrec->stream_ptr += bytes; 1073*82eaef15SMatthew Dillon jrec->stream_residual -= bytes; 1074*82eaef15SMatthew Dillon jrec->residual -= bytes; 1075*82eaef15SMatthew Dillon } 1076*82eaef15SMatthew Dillon 1077*82eaef15SMatthew Dillon /* 1078*82eaef15SMatthew Dillon * Handle data alignment requirements for the subrecord. Because the 1079*82eaef15SMatthew Dillon * stream record's data space is more strictly aligned, it must already 1080*82eaef15SMatthew Dillon * have sufficient space to hold any subrecord alignment slop. 1081*82eaef15SMatthew Dillon */ 1082*82eaef15SMatthew Dillon if (jrec->residual == 0 && jrec->residual_align) { 1083*82eaef15SMatthew Dillon KKASSERT(jrec->residual_align <= jrec->stream_residual); 1084*82eaef15SMatthew Dillon bzero(jrec->stream_ptr, jrec->residual_align); 1085*82eaef15SMatthew Dillon jrec->stream_ptr += jrec->residual_align; 1086*82eaef15SMatthew Dillon jrec->stream_residual -= jrec->residual_align; 1087*82eaef15SMatthew Dillon jrec->residual_align = 0; 1088*82eaef15SMatthew Dillon } 1089*82eaef15SMatthew Dillon } 1090*82eaef15SMatthew Dillon 1091*82eaef15SMatthew Dillon /* 1092*82eaef15SMatthew Dillon * We are finished with a transaction. If abortit is not set then we must 1093*82eaef15SMatthew Dillon * be at the top level with no residual subrecord data left to output. 1094*82eaef15SMatthew Dillon * If abortit is set then we can be in any state. 1095*82eaef15SMatthew Dillon * 1096*82eaef15SMatthew Dillon * The stream record will be committed or aborted as specified and jrecord 1097*82eaef15SMatthew Dillon * resources will be cleaned up. 
1098*82eaef15SMatthew Dillon */ 1099*82eaef15SMatthew Dillon static void 1100*82eaef15SMatthew Dillon jrecord_done(struct jrecord *jrec, int abortit) 1101*82eaef15SMatthew Dillon { 1102*82eaef15SMatthew Dillon KKASSERT(jrec->rawp != NULL); 1103*82eaef15SMatthew Dillon 1104*82eaef15SMatthew Dillon if (abortit) { 1105*82eaef15SMatthew Dillon journal_abort(jrec->jo, &jrec->rawp); 1106*82eaef15SMatthew Dillon } else { 1107*82eaef15SMatthew Dillon KKASSERT(jrec->pushcount == 0 && jrec->residual == 0); 1108*82eaef15SMatthew Dillon journal_commit(jrec->jo, &jrec->rawp, 1109*82eaef15SMatthew Dillon jrec->stream_reserved - jrec->stream_residual, 1); 1110*82eaef15SMatthew Dillon } 1111*82eaef15SMatthew Dillon 1112*82eaef15SMatthew Dillon /* 1113*82eaef15SMatthew Dillon * jrec should not be used beyond this point without another init, 1114*82eaef15SMatthew Dillon * but clean up some fields to ensure that we panic if it is. 1115*82eaef15SMatthew Dillon * 1116*82eaef15SMatthew Dillon * Note that jrec->rawp is NULLd out by journal_abort/journal_commit. 1117*82eaef15SMatthew Dillon */ 1118*82eaef15SMatthew Dillon jrec->jo = NULL; 1119*82eaef15SMatthew Dillon jrec->stream_ptr = NULL; 1120*82eaef15SMatthew Dillon } 1121*82eaef15SMatthew Dillon 1122*82eaef15SMatthew Dillon /************************************************************************ 1123*82eaef15SMatthew Dillon * LEAF RECORD SUPPORT ROUTINES * 1124*82eaef15SMatthew Dillon ************************************************************************ 1125*82eaef15SMatthew Dillon * 1126*82eaef15SMatthew Dillon * These routine create leaf subrecords representing common filesystem 1127*82eaef15SMatthew Dillon * structures. 
1128*82eaef15SMatthew Dillon */ 1129*82eaef15SMatthew Dillon 1130*82eaef15SMatthew Dillon static void 1131*82eaef15SMatthew Dillon jrecord_write_path(struct jrecord *jrec, int16_t rectype, struct namecache *ncp) 1132*82eaef15SMatthew Dillon { 1133*82eaef15SMatthew Dillon } 1134*82eaef15SMatthew Dillon 1135*82eaef15SMatthew Dillon static void 1136*82eaef15SMatthew Dillon jrecord_write_vattr(struct jrecord *jrec, struct vattr *vat) 1137*82eaef15SMatthew Dillon { 11382281065eSMatthew Dillon } 11392281065eSMatthew Dillon 11402281065eSMatthew Dillon /************************************************************************ 11412281065eSMatthew Dillon * JOURNAL VNOPS * 11422281065eSMatthew Dillon ************************************************************************/ 11432281065eSMatthew Dillon 11442281065eSMatthew Dillon static 11452281065eSMatthew Dillon int 11462281065eSMatthew Dillon journal_nmkdir(struct vop_nmkdir_args *ap) 11472281065eSMatthew Dillon { 1148*82eaef15SMatthew Dillon struct mount *mp; 1149*82eaef15SMatthew Dillon struct journal *jo; 1150*82eaef15SMatthew Dillon struct jrecord jrec; 1151*82eaef15SMatthew Dillon void *save; /* warning, save pointers do not always remain valid */ 11522281065eSMatthew Dillon int error; 11532281065eSMatthew Dillon 11542281065eSMatthew Dillon error = vop_journal_operate_ap(&ap->a_head); 1155*82eaef15SMatthew Dillon mp = ap->a_head.a_ops->vv_mount; 1156*82eaef15SMatthew Dillon if (error == 0) { 1157*82eaef15SMatthew Dillon TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1158*82eaef15SMatthew Dillon jrecord_init(jo, &jrec, -1); 1159*82eaef15SMatthew Dillon if (jo->flags & MC_JOURNAL_WANT_REVERSABLE) { 1160*82eaef15SMatthew Dillon save = jrecord_push(&jrec, JTYPE_UNDO); 1161*82eaef15SMatthew Dillon /* XXX undo operations */ 1162*82eaef15SMatthew Dillon jrecord_pop(&jrec, save); 1163*82eaef15SMatthew Dillon } 1164*82eaef15SMatthew Dillon #if 0 1165*82eaef15SMatthew Dillon if (jo->flags & MC_JOURNAL_WANT_AUDIT) { 
1166*82eaef15SMatthew Dillon jrecord_write_audit(&jrec); 1167*82eaef15SMatthew Dillon } 1168*82eaef15SMatthew Dillon #endif 1169*82eaef15SMatthew Dillon save = jrecord_push(&jrec, JTYPE_MKDIR); 1170*82eaef15SMatthew Dillon jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp); 1171*82eaef15SMatthew Dillon jrecord_write_vattr(&jrec, ap->a_vap); 1172*82eaef15SMatthew Dillon jrecord_pop(&jrec, save); 1173*82eaef15SMatthew Dillon jrecord_done(&jrec, 0); 1174*82eaef15SMatthew Dillon } 1175*82eaef15SMatthew Dillon } 11762281065eSMatthew Dillon return (error); 11772281065eSMatthew Dillon } 11782281065eSMatthew Dillon 1179