xref: /dragonfly/lib/libdmsg/msg.c (revision 157f2a25)
10c3a8cd0SMatthew Dillon /*
2e96cef49SMatthew Dillon  * Copyright (c) 2011-2015 The DragonFly Project.  All rights reserved.
30c3a8cd0SMatthew Dillon  *
40c3a8cd0SMatthew Dillon  * This code is derived from software contributed to The DragonFly Project
50c3a8cd0SMatthew Dillon  * by Matthew Dillon <dillon@dragonflybsd.org>
60c3a8cd0SMatthew Dillon  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
70c3a8cd0SMatthew Dillon  *
80c3a8cd0SMatthew Dillon  * Redistribution and use in source and binary forms, with or without
90c3a8cd0SMatthew Dillon  * modification, are permitted provided that the following conditions
100c3a8cd0SMatthew Dillon  * are met:
110c3a8cd0SMatthew Dillon  *
120c3a8cd0SMatthew Dillon  * 1. Redistributions of source code must retain the above copyright
130c3a8cd0SMatthew Dillon  *    notice, this list of conditions and the following disclaimer.
140c3a8cd0SMatthew Dillon  * 2. Redistributions in binary form must reproduce the above copyright
150c3a8cd0SMatthew Dillon  *    notice, this list of conditions and the following disclaimer in
160c3a8cd0SMatthew Dillon  *    the documentation and/or other materials provided with the
170c3a8cd0SMatthew Dillon  *    distribution.
180c3a8cd0SMatthew Dillon  * 3. Neither the name of The DragonFly Project nor the names of its
190c3a8cd0SMatthew Dillon  *    contributors may be used to endorse or promote products derived
200c3a8cd0SMatthew Dillon  *    from this software without specific, prior written permission.
210c3a8cd0SMatthew Dillon  *
220c3a8cd0SMatthew Dillon  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
230c3a8cd0SMatthew Dillon  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
240c3a8cd0SMatthew Dillon  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
250c3a8cd0SMatthew Dillon  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
260c3a8cd0SMatthew Dillon  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
270c3a8cd0SMatthew Dillon  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
280c3a8cd0SMatthew Dillon  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
290c3a8cd0SMatthew Dillon  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
300c3a8cd0SMatthew Dillon  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
310c3a8cd0SMatthew Dillon  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
320c3a8cd0SMatthew Dillon  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
330c3a8cd0SMatthew Dillon  * SUCH DAMAGE.
340c3a8cd0SMatthew Dillon  */
350c3a8cd0SMatthew Dillon 
360c3a8cd0SMatthew Dillon #include "dmsg_local.h"
370c3a8cd0SMatthew Dillon 
380a9eefcaSMatthew Dillon #define DMSG_BLOCK_DEBUG
390a9eefcaSMatthew Dillon 
400c3a8cd0SMatthew Dillon int DMsgDebugOpt;
41*157f2a25STomohiro Kusumi static unsigned int dmsg_state_count;
427adbba57SMatthew Dillon #ifdef DMSG_BLOCK_DEBUG
43*157f2a25STomohiro Kusumi static unsigned int biocount;
447adbba57SMatthew Dillon #endif
450c3a8cd0SMatthew Dillon 
460a9eefcaSMatthew Dillon static int dmsg_state_msgrx(dmsg_msg_t *msg, int mstate);
471b8eded1SMatthew Dillon static void dmsg_state_cleanuptx(dmsg_iocom_t *iocom, dmsg_msg_t *msg);
48a2179323SMatthew Dillon static void dmsg_msg_free_locked(dmsg_msg_t *msg);
49323c0947SMatthew Dillon static void dmsg_state_free(dmsg_state_t *state);
500a9eefcaSMatthew Dillon static void dmsg_subq_delete(dmsg_state_t *state);
510a9eefcaSMatthew Dillon static void dmsg_simulate_failure(dmsg_state_t *state, int meto, int error);
520a9eefcaSMatthew Dillon static void dmsg_state_abort(dmsg_state_t *state);
530a9eefcaSMatthew Dillon static void dmsg_state_dying(dmsg_state_t *state);
540c3a8cd0SMatthew Dillon 
550d20ec8aSMatthew Dillon RB_GENERATE(dmsg_state_tree, dmsg_state, rbnode, dmsg_state_cmp);
560c3a8cd0SMatthew Dillon 
570c3a8cd0SMatthew Dillon /*
580c3a8cd0SMatthew Dillon  * STATE TREE - Represents open transactions which are indexed by their
590d20ec8aSMatthew Dillon  *		{ msgid } relative to the governing iocom.
600c3a8cd0SMatthew Dillon  */
610c3a8cd0SMatthew Dillon int
620c3a8cd0SMatthew Dillon dmsg_state_cmp(dmsg_state_t *state1, dmsg_state_t *state2)
630c3a8cd0SMatthew Dillon {
640c3a8cd0SMatthew Dillon 	if (state1->msgid < state2->msgid)
650c3a8cd0SMatthew Dillon 		return(-1);
660c3a8cd0SMatthew Dillon 	if (state1->msgid > state2->msgid)
670c3a8cd0SMatthew Dillon 		return(1);
680c3a8cd0SMatthew Dillon 	return(0);
690c3a8cd0SMatthew Dillon }
700c3a8cd0SMatthew Dillon 
710d20ec8aSMatthew Dillon /*
720c3a8cd0SMatthew Dillon  * Initialize a low-level ioq
730c3a8cd0SMatthew Dillon  */
740c3a8cd0SMatthew Dillon void
750c3a8cd0SMatthew Dillon dmsg_ioq_init(dmsg_iocom_t *iocom __unused, dmsg_ioq_t *ioq)
760c3a8cd0SMatthew Dillon {
770c3a8cd0SMatthew Dillon 	bzero(ioq, sizeof(*ioq));
780c3a8cd0SMatthew Dillon 	ioq->state = DMSG_MSGQ_STATE_HEADER1;
790c3a8cd0SMatthew Dillon 	TAILQ_INIT(&ioq->msgq);
800c3a8cd0SMatthew Dillon }
810c3a8cd0SMatthew Dillon 
820c3a8cd0SMatthew Dillon /*
830c3a8cd0SMatthew Dillon  * Cleanup queue.
840c3a8cd0SMatthew Dillon  *
850c3a8cd0SMatthew Dillon  * caller holds iocom->mtx.
860c3a8cd0SMatthew Dillon  */
870c3a8cd0SMatthew Dillon void
880c3a8cd0SMatthew Dillon dmsg_ioq_done(dmsg_iocom_t *iocom __unused, dmsg_ioq_t *ioq)
890c3a8cd0SMatthew Dillon {
900c3a8cd0SMatthew Dillon 	dmsg_msg_t *msg;
910c3a8cd0SMatthew Dillon 
920c3a8cd0SMatthew Dillon 	while ((msg = TAILQ_FIRST(&ioq->msgq)) != NULL) {
930c3a8cd0SMatthew Dillon 		assert(0);	/* shouldn't happen */
940c3a8cd0SMatthew Dillon 		TAILQ_REMOVE(&ioq->msgq, msg, qentry);
950c3a8cd0SMatthew Dillon 		dmsg_msg_free(msg);
960c3a8cd0SMatthew Dillon 	}
970c3a8cd0SMatthew Dillon 	if ((msg = ioq->msg) != NULL) {
980c3a8cd0SMatthew Dillon 		ioq->msg = NULL;
990c3a8cd0SMatthew Dillon 		dmsg_msg_free(msg);
1000c3a8cd0SMatthew Dillon 	}
1010c3a8cd0SMatthew Dillon }
1020c3a8cd0SMatthew Dillon 
1030c3a8cd0SMatthew Dillon /*
1040c3a8cd0SMatthew Dillon  * Initialize a low-level communications channel.
1050c3a8cd0SMatthew Dillon  *
1060c3a8cd0SMatthew Dillon  * NOTE: The signal_func() is called at least once from the loop and can be
1070c3a8cd0SMatthew Dillon  *	 re-armed via dmsg_iocom_restate().
1080c3a8cd0SMatthew Dillon  */
1090c3a8cd0SMatthew Dillon void
1100c3a8cd0SMatthew Dillon dmsg_iocom_init(dmsg_iocom_t *iocom, int sock_fd, int alt_fd,
11101e43224SMatthew Dillon 		   void (*signal_func)(dmsg_iocom_t *iocom),
11201e43224SMatthew Dillon 		   void (*rcvmsg_func)(dmsg_msg_t *msg),
11301e43224SMatthew Dillon 		   void (*usrmsg_func)(dmsg_msg_t *msg, int unmanaged),
11401e43224SMatthew Dillon 		   void (*altmsg_func)(dmsg_iocom_t *iocom))
1150c3a8cd0SMatthew Dillon {
1160c3a8cd0SMatthew Dillon 	struct stat st;
1170c3a8cd0SMatthew Dillon 
1180c3a8cd0SMatthew Dillon 	bzero(iocom, sizeof(*iocom));
1190c3a8cd0SMatthew Dillon 
120f306de83SMatthew Dillon 	asprintf(&iocom->label, "iocom-%p", iocom);
1210d20ec8aSMatthew Dillon 	iocom->signal_callback = signal_func;
1220d20ec8aSMatthew Dillon 	iocom->rcvmsg_callback = rcvmsg_func;
1230d20ec8aSMatthew Dillon 	iocom->altmsg_callback = altmsg_func;
12401e43224SMatthew Dillon 	iocom->usrmsg_callback = usrmsg_func;
1250c3a8cd0SMatthew Dillon 
1260c3a8cd0SMatthew Dillon 	pthread_mutex_init(&iocom->mtx, NULL);
1271b8eded1SMatthew Dillon 	RB_INIT(&iocom->staterd_tree);
1281b8eded1SMatthew Dillon 	RB_INIT(&iocom->statewr_tree);
1290d20ec8aSMatthew Dillon 	TAILQ_INIT(&iocom->txmsgq);
1300c3a8cd0SMatthew Dillon 	iocom->sock_fd = sock_fd;
1310c3a8cd0SMatthew Dillon 	iocom->alt_fd = alt_fd;
13298126869SMatthew Dillon 	iocom->flags = DMSG_IOCOMF_RREQ | DMSG_IOCOMF_CLOSEALT;
1330c3a8cd0SMatthew Dillon 	if (signal_func)
1340c3a8cd0SMatthew Dillon 		iocom->flags |= DMSG_IOCOMF_SWORK;
1350c3a8cd0SMatthew Dillon 	dmsg_ioq_init(iocom, &iocom->ioq_rx);
1360c3a8cd0SMatthew Dillon 	dmsg_ioq_init(iocom, &iocom->ioq_tx);
137323c0947SMatthew Dillon 	iocom->state0.refs = 1;		/* should never trigger a free */
1381b8eded1SMatthew Dillon 	iocom->state0.iocom = iocom;
1391b8eded1SMatthew Dillon 	iocom->state0.parent = &iocom->state0;
140d30cab67SMatthew Dillon 	iocom->state0.flags = DMSG_STATE_ROOT;
1411b8eded1SMatthew Dillon 	TAILQ_INIT(&iocom->state0.subq);
1421b8eded1SMatthew Dillon 
1430c3a8cd0SMatthew Dillon 	if (pipe(iocom->wakeupfds) < 0)
1440c3a8cd0SMatthew Dillon 		assert(0);
1450c3a8cd0SMatthew Dillon 	fcntl(iocom->wakeupfds[0], F_SETFL, O_NONBLOCK);
1460c3a8cd0SMatthew Dillon 	fcntl(iocom->wakeupfds[1], F_SETFL, O_NONBLOCK);
1470c3a8cd0SMatthew Dillon 
1480c3a8cd0SMatthew Dillon 	/*
1490c3a8cd0SMatthew Dillon 	 * Negotiate session crypto synchronously.  This will mark the
1500c3a8cd0SMatthew Dillon 	 * connection as error'd if it fails.  If this is a pipe it's
1510c3a8cd0SMatthew Dillon 	 * a linkage that we set up ourselves to the filesystem and there
1520c3a8cd0SMatthew Dillon 	 * is no crypto.
1530c3a8cd0SMatthew Dillon 	 */
1540c3a8cd0SMatthew Dillon 	if (fstat(sock_fd, &st) < 0)
1550c3a8cd0SMatthew Dillon 		assert(0);
1560c3a8cd0SMatthew Dillon 	if (S_ISSOCK(st.st_mode))
1570c3a8cd0SMatthew Dillon 		dmsg_crypto_negotiate(iocom);
1580c3a8cd0SMatthew Dillon 
1590c3a8cd0SMatthew Dillon 	/*
1600c3a8cd0SMatthew Dillon 	 * Make sure our fds are set to non-blocking for the iocom core.
1610c3a8cd0SMatthew Dillon 	 */
1620c3a8cd0SMatthew Dillon 	if (sock_fd >= 0)
1630c3a8cd0SMatthew Dillon 		fcntl(sock_fd, F_SETFL, O_NONBLOCK);
1640c3a8cd0SMatthew Dillon #if 0
1650c3a8cd0SMatthew Dillon 	/* if line buffered our single fgets() should be fine */
1660c3a8cd0SMatthew Dillon 	if (alt_fd >= 0)
1670c3a8cd0SMatthew Dillon 		fcntl(alt_fd, F_SETFL, O_NONBLOCK);
1680c3a8cd0SMatthew Dillon #endif
1690c3a8cd0SMatthew Dillon }
1700c3a8cd0SMatthew Dillon 
171f306de83SMatthew Dillon void
172f306de83SMatthew Dillon dmsg_iocom_label(dmsg_iocom_t *iocom, const char *ctl, ...)
173f306de83SMatthew Dillon {
174f306de83SMatthew Dillon 	va_list va;
175f306de83SMatthew Dillon 	char *optr;
176f306de83SMatthew Dillon 
177f306de83SMatthew Dillon 	va_start(va, ctl);
178f306de83SMatthew Dillon 	optr = iocom->label;
179f306de83SMatthew Dillon 	vasprintf(&iocom->label, ctl, va);
180f306de83SMatthew Dillon 	va_end(va);
181f306de83SMatthew Dillon 	if (optr)
182f306de83SMatthew Dillon 		free(optr);
183f306de83SMatthew Dillon }
184f306de83SMatthew Dillon 
1850c3a8cd0SMatthew Dillon /*
1860c3a8cd0SMatthew Dillon  * May only be called from a callback from iocom_core.
1870c3a8cd0SMatthew Dillon  *
1880c3a8cd0SMatthew Dillon  * Adjust state machine functions, set flags to guarantee that both
1890c3a8cd0SMatthew Dillon  * the recevmsg_func and the sendmsg_func is called at least once.
1900c3a8cd0SMatthew Dillon  */
1910c3a8cd0SMatthew Dillon void
1920d20ec8aSMatthew Dillon dmsg_iocom_restate(dmsg_iocom_t *iocom,
1930d20ec8aSMatthew Dillon 		   void (*signal_func)(dmsg_iocom_t *),
19401e43224SMatthew Dillon 		   void (*rcvmsg_func)(dmsg_msg_t *msg))
1950c3a8cd0SMatthew Dillon {
196a2179323SMatthew Dillon 	pthread_mutex_lock(&iocom->mtx);
1970d20ec8aSMatthew Dillon 	iocom->signal_callback = signal_func;
1980d20ec8aSMatthew Dillon 	iocom->rcvmsg_callback = rcvmsg_func;
1990c3a8cd0SMatthew Dillon 	if (signal_func)
200a2179323SMatthew Dillon 		atomic_set_int(&iocom->flags, DMSG_IOCOMF_SWORK);
2010c3a8cd0SMatthew Dillon 	else
202a2179323SMatthew Dillon 		atomic_clear_int(&iocom->flags, DMSG_IOCOMF_SWORK);
203a2179323SMatthew Dillon 	pthread_mutex_unlock(&iocom->mtx);
2040c3a8cd0SMatthew Dillon }
2050c3a8cd0SMatthew Dillon 
2060c3a8cd0SMatthew Dillon void
2070d20ec8aSMatthew Dillon dmsg_iocom_signal(dmsg_iocom_t *iocom)
2080c3a8cd0SMatthew Dillon {
209a2179323SMatthew Dillon 	pthread_mutex_lock(&iocom->mtx);
2100d20ec8aSMatthew Dillon 	if (iocom->signal_callback)
211a2179323SMatthew Dillon 		atomic_set_int(&iocom->flags, DMSG_IOCOMF_SWORK);
212a2179323SMatthew Dillon 	pthread_mutex_unlock(&iocom->mtx);
2130c3a8cd0SMatthew Dillon }
2140c3a8cd0SMatthew Dillon 
2150c3a8cd0SMatthew Dillon /*
2160c3a8cd0SMatthew Dillon  * Cleanup a terminating iocom.
2170c3a8cd0SMatthew Dillon  *
2180c3a8cd0SMatthew Dillon  * Caller should not hold iocom->mtx.  The iocom has already been disconnected
2190c3a8cd0SMatthew Dillon  * from all possible references to it.
2200c3a8cd0SMatthew Dillon  */
2210c3a8cd0SMatthew Dillon void
2220c3a8cd0SMatthew Dillon dmsg_iocom_done(dmsg_iocom_t *iocom)
2230c3a8cd0SMatthew Dillon {
2240c3a8cd0SMatthew Dillon 	if (iocom->sock_fd >= 0) {
2250c3a8cd0SMatthew Dillon 		close(iocom->sock_fd);
2260c3a8cd0SMatthew Dillon 		iocom->sock_fd = -1;
2270c3a8cd0SMatthew Dillon 	}
22898126869SMatthew Dillon 	if (iocom->alt_fd >= 0 && (iocom->flags & DMSG_IOCOMF_CLOSEALT)) {
2290c3a8cd0SMatthew Dillon 		close(iocom->alt_fd);
2300c3a8cd0SMatthew Dillon 		iocom->alt_fd = -1;
2310c3a8cd0SMatthew Dillon 	}
2320c3a8cd0SMatthew Dillon 	dmsg_ioq_done(iocom, &iocom->ioq_rx);
2330c3a8cd0SMatthew Dillon 	dmsg_ioq_done(iocom, &iocom->ioq_tx);
2340c3a8cd0SMatthew Dillon 	if (iocom->wakeupfds[0] >= 0) {
2350c3a8cd0SMatthew Dillon 		close(iocom->wakeupfds[0]);
2360c3a8cd0SMatthew Dillon 		iocom->wakeupfds[0] = -1;
2370c3a8cd0SMatthew Dillon 	}
2380c3a8cd0SMatthew Dillon 	if (iocom->wakeupfds[1] >= 0) {
2390c3a8cd0SMatthew Dillon 		close(iocom->wakeupfds[1]);
2400c3a8cd0SMatthew Dillon 		iocom->wakeupfds[1] = -1;
2410c3a8cd0SMatthew Dillon 	}
2420c3a8cd0SMatthew Dillon 	pthread_mutex_destroy(&iocom->mtx);
2430c3a8cd0SMatthew Dillon }
2440c3a8cd0SMatthew Dillon 
2450c3a8cd0SMatthew Dillon /*
2461b8eded1SMatthew Dillon  * Allocate a new message using the specified transaction state.
247a2179323SMatthew Dillon  *
2481b8eded1SMatthew Dillon  * If CREATE is set a new transaction is allocated relative to the passed-in
249d30cab67SMatthew Dillon  * transaction (the 'state' argument becomes pstate).
2501b8eded1SMatthew Dillon  *
2511b8eded1SMatthew Dillon  * If CREATE is not set the message is associated with the passed-in
2521b8eded1SMatthew Dillon  * transaction.
2530c3a8cd0SMatthew Dillon  */
2540c3a8cd0SMatthew Dillon dmsg_msg_t *
2551b8eded1SMatthew Dillon dmsg_msg_alloc(dmsg_state_t *state,
2560d20ec8aSMatthew Dillon 	       size_t aux_size, uint32_t cmd,
2570c3a8cd0SMatthew Dillon 	       void (*func)(dmsg_msg_t *), void *data)
2580c3a8cd0SMatthew Dillon {
2591b8eded1SMatthew Dillon 	dmsg_iocom_t *iocom = state->iocom;
260323c0947SMatthew Dillon 	dmsg_msg_t *msg;
261323c0947SMatthew Dillon 
262323c0947SMatthew Dillon 	pthread_mutex_lock(&iocom->mtx);
263323c0947SMatthew Dillon 	msg = dmsg_msg_alloc_locked(state, aux_size, cmd, func, data);
264323c0947SMatthew Dillon 	pthread_mutex_unlock(&iocom->mtx);
265323c0947SMatthew Dillon 
266323c0947SMatthew Dillon 	return msg;
267323c0947SMatthew Dillon }
268323c0947SMatthew Dillon 
269323c0947SMatthew Dillon dmsg_msg_t *
270323c0947SMatthew Dillon dmsg_msg_alloc_locked(dmsg_state_t *state,
271323c0947SMatthew Dillon 	       size_t aux_size, uint32_t cmd,
272323c0947SMatthew Dillon 	       void (*func)(dmsg_msg_t *), void *data)
273323c0947SMatthew Dillon {
274323c0947SMatthew Dillon 	dmsg_iocom_t *iocom = state->iocom;
2751b8eded1SMatthew Dillon 	dmsg_state_t *pstate;
2760c3a8cd0SMatthew Dillon 	dmsg_msg_t *msg;
2770c3a8cd0SMatthew Dillon 	int hbytes;
278f306de83SMatthew Dillon 	size_t aligned_size;
2790c3a8cd0SMatthew Dillon 
280f306de83SMatthew Dillon 	aligned_size = DMSG_DOALIGN(aux_size);
2810c3a8cd0SMatthew Dillon 	if ((cmd & (DMSGF_CREATE | DMSGF_REPLY)) == DMSGF_CREATE) {
2820c3a8cd0SMatthew Dillon 		/*
2831b8eded1SMatthew Dillon 		 * When CREATE is set without REPLY the caller is
2841b8eded1SMatthew Dillon 		 * initiating a new transaction stacked under the specified
2851b8eded1SMatthew Dillon 		 * circuit.
2860c3a8cd0SMatthew Dillon 		 *
2870a9eefcaSMatthew Dillon 		 * It is possible to race a circuit failure, inherit the
2880a9eefcaSMatthew Dillon 		 * parent's STATE_DYING flag to trigger an abort sequence
2890a9eefcaSMatthew Dillon 		 * in the transmit path.  By not inheriting ABORTING the
2900a9eefcaSMatthew Dillon 		 * abort sequence can recurse.
2910a9eefcaSMatthew Dillon 		 *
2920c3a8cd0SMatthew Dillon 		 * NOTE: CREATE in txcmd handled by dmsg_msg_write()
2930c3a8cd0SMatthew Dillon 		 * NOTE: DELETE in txcmd handled by dmsg_state_cleanuptx()
2940c3a8cd0SMatthew Dillon 		 */
2951b8eded1SMatthew Dillon 		pstate = state;
2960c3a8cd0SMatthew Dillon 		state = malloc(sizeof(*state));
2970c3a8cd0SMatthew Dillon 		bzero(state, sizeof(*state));
2980a9eefcaSMatthew Dillon 		atomic_add_int(&dmsg_state_count, 1);
2990a9eefcaSMatthew Dillon 
3001b8eded1SMatthew Dillon 		TAILQ_INIT(&state->subq);
3011b8eded1SMatthew Dillon 		state->parent = pstate;
3020c3a8cd0SMatthew Dillon 		state->iocom = iocom;
3030c3a8cd0SMatthew Dillon 		state->flags = DMSG_STATE_DYNAMIC;
3040c3a8cd0SMatthew Dillon 		state->msgid = (uint64_t)(uintptr_t)state;
3050c3a8cd0SMatthew Dillon 		state->txcmd = cmd & ~(DMSGF_CREATE | DMSGF_DELETE);
3060c3a8cd0SMatthew Dillon 		state->rxcmd = DMSGF_REPLY;
3070d20ec8aSMatthew Dillon 		state->icmd = state->txcmd & DMSGF_BASECMDMASK;
3080c3a8cd0SMatthew Dillon 		state->func = func;
3090c3a8cd0SMatthew Dillon 		state->any.any = data;
310d30cab67SMatthew Dillon 
311a06d536bSMatthew Dillon 		state->flags |= DMSG_STATE_SUBINSERTED |
312a06d536bSMatthew Dillon 				DMSG_STATE_RBINSERTED;
3130a9eefcaSMatthew Dillon 		state->flags |= pstate->flags & DMSG_STATE_DYING;
3140a9eefcaSMatthew Dillon 		if (TAILQ_EMPTY(&pstate->subq))
3150a9eefcaSMatthew Dillon 			dmsg_state_hold(pstate);
3160a9eefcaSMatthew Dillon 		RB_INSERT(dmsg_state_tree, &iocom->statewr_tree, state);
3170a9eefcaSMatthew Dillon 		TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
3180a9eefcaSMatthew Dillon 		dmsg_state_hold(state);		/* state on pstate->subq */
3190a9eefcaSMatthew Dillon 		dmsg_state_hold(state);		/* state on rbtree */
3200a9eefcaSMatthew Dillon 		dmsg_state_hold(state);		/* msg->state */
3211b8eded1SMatthew Dillon 	} else {
3221b8eded1SMatthew Dillon 		/*
3231b8eded1SMatthew Dillon 		 * Otherwise the message is transmitted over the existing
3241b8eded1SMatthew Dillon 		 * open transaction.
3251b8eded1SMatthew Dillon 		 */
3261b8eded1SMatthew Dillon 		pstate = state->parent;
3270a9eefcaSMatthew Dillon 		dmsg_state_hold(state);		/* msg->state */
3280c3a8cd0SMatthew Dillon 	}
3291b8eded1SMatthew Dillon 
330a2179323SMatthew Dillon 	/* XXX SMP race for state */
331a2179323SMatthew Dillon 	hbytes = (cmd & DMSGF_SIZE) * DMSG_ALIGN;
3320a9eefcaSMatthew Dillon 	assert((size_t)hbytes >= sizeof(struct dmsg_hdr));
3330a9eefcaSMatthew Dillon 	msg = malloc(offsetof(struct dmsg_msg, any.head) + hbytes);
334a2179323SMatthew Dillon 	bzero(msg, offsetof(struct dmsg_msg, any.head));
335f306de83SMatthew Dillon 
336f306de83SMatthew Dillon 	/*
337f306de83SMatthew Dillon 	 * [re]allocate the auxillary data buffer.  The caller knows that
338f306de83SMatthew Dillon 	 * a size-aligned buffer will be allocated but we do not want to
339f306de83SMatthew Dillon 	 * force the caller to zero any tail piece, so we do that ourself.
340f306de83SMatthew Dillon 	 */
3410c3a8cd0SMatthew Dillon 	if (msg->aux_size != aux_size) {
3420c3a8cd0SMatthew Dillon 		if (msg->aux_data) {
3430c3a8cd0SMatthew Dillon 			free(msg->aux_data);
3440c3a8cd0SMatthew Dillon 			msg->aux_data = NULL;
3450c3a8cd0SMatthew Dillon 			msg->aux_size = 0;
3460c3a8cd0SMatthew Dillon 		}
3470c3a8cd0SMatthew Dillon 		if (aux_size) {
348f306de83SMatthew Dillon 			msg->aux_data = malloc(aligned_size);
3490c3a8cd0SMatthew Dillon 			msg->aux_size = aux_size;
350f306de83SMatthew Dillon 			if (aux_size != aligned_size) {
351f306de83SMatthew Dillon 				bzero(msg->aux_data + aux_size,
352f306de83SMatthew Dillon 				      aligned_size - aux_size);
353f306de83SMatthew Dillon 			}
3540c3a8cd0SMatthew Dillon 		}
3550c3a8cd0SMatthew Dillon 	}
3561b8eded1SMatthew Dillon 
3571b8eded1SMatthew Dillon 	/*
3581b8eded1SMatthew Dillon 	 * Set REVTRANS if the transaction was remotely initiated
3591b8eded1SMatthew Dillon 	 * Set REVCIRC if the circuit was remotely initiated
3601b8eded1SMatthew Dillon 	 */
3611b8eded1SMatthew Dillon 	if (state->flags & DMSG_STATE_OPPOSITE)
3621b8eded1SMatthew Dillon 		cmd |= DMSGF_REVTRANS;
3631b8eded1SMatthew Dillon 	if (pstate->flags & DMSG_STATE_OPPOSITE)
3641b8eded1SMatthew Dillon 		cmd |= DMSGF_REVCIRC;
3651b8eded1SMatthew Dillon 
3661b8eded1SMatthew Dillon 	/*
3671b8eded1SMatthew Dillon 	 * Finish filling out the header.
3681b8eded1SMatthew Dillon 	 */
3690c3a8cd0SMatthew Dillon 	bzero(&msg->any.head, hbytes);
3700c3a8cd0SMatthew Dillon 	msg->hdr_size = hbytes;
3710d20ec8aSMatthew Dillon 	msg->any.head.magic = DMSG_HDR_MAGIC;
3720c3a8cd0SMatthew Dillon 	msg->any.head.cmd = cmd;
3730c3a8cd0SMatthew Dillon 	msg->any.head.aux_descr = 0;
3740c3a8cd0SMatthew Dillon 	msg->any.head.aux_crc = 0;
3750c3a8cd0SMatthew Dillon 	msg->any.head.msgid = state->msgid;
3761b8eded1SMatthew Dillon 	msg->any.head.circuit = pstate->msgid;
3771b8eded1SMatthew Dillon 	msg->state = state;
3781b8eded1SMatthew Dillon 
3790c3a8cd0SMatthew Dillon 	return (msg);
3800c3a8cd0SMatthew Dillon }
3810c3a8cd0SMatthew Dillon 
3820c3a8cd0SMatthew Dillon /*
3830c3a8cd0SMatthew Dillon  * Free a message so it can be reused afresh.
3840c3a8cd0SMatthew Dillon  *
3850c3a8cd0SMatthew Dillon  * NOTE: aux_size can be 0 with a non-NULL aux_data.
3860c3a8cd0SMatthew Dillon  */
3870c3a8cd0SMatthew Dillon static
3880c3a8cd0SMatthew Dillon void
3890c3a8cd0SMatthew Dillon dmsg_msg_free_locked(dmsg_msg_t *msg)
3900c3a8cd0SMatthew Dillon {
3910a9eefcaSMatthew Dillon 	dmsg_state_t *state;
392e96cef49SMatthew Dillon 
3930a9eefcaSMatthew Dillon 	if ((state = msg->state) != NULL) {
3940a9eefcaSMatthew Dillon 		dmsg_state_drop(state);
395323c0947SMatthew Dillon 		msg->state = NULL;	/* safety */
3960a9eefcaSMatthew Dillon 	}
397a2179323SMatthew Dillon 	if (msg->aux_data) {
398a2179323SMatthew Dillon 		free(msg->aux_data);
3990a9eefcaSMatthew Dillon 		msg->aux_data = NULL;	/* safety */
400a2179323SMatthew Dillon 	}
401a2179323SMatthew Dillon 	msg->aux_size = 0;
402a2179323SMatthew Dillon 	free (msg);
4030c3a8cd0SMatthew Dillon }
4040c3a8cd0SMatthew Dillon 
4050c3a8cd0SMatthew Dillon void
4060c3a8cd0SMatthew Dillon dmsg_msg_free(dmsg_msg_t *msg)
4070c3a8cd0SMatthew Dillon {
4081b8eded1SMatthew Dillon 	dmsg_iocom_t *iocom = msg->state->iocom;
4090c3a8cd0SMatthew Dillon 
4100c3a8cd0SMatthew Dillon 	pthread_mutex_lock(&iocom->mtx);
4110c3a8cd0SMatthew Dillon 	dmsg_msg_free_locked(msg);
4120c3a8cd0SMatthew Dillon 	pthread_mutex_unlock(&iocom->mtx);
4130c3a8cd0SMatthew Dillon }
4140c3a8cd0SMatthew Dillon 
4150c3a8cd0SMatthew Dillon /*
4160c3a8cd0SMatthew Dillon  * I/O core loop for an iocom.
4170c3a8cd0SMatthew Dillon  *
4180c3a8cd0SMatthew Dillon  * Thread localized, iocom->mtx not held.
4190c3a8cd0SMatthew Dillon  */
4200c3a8cd0SMatthew Dillon void
4210c3a8cd0SMatthew Dillon dmsg_iocom_core(dmsg_iocom_t *iocom)
4220c3a8cd0SMatthew Dillon {
4230c3a8cd0SMatthew Dillon 	struct pollfd fds[3];
4240c3a8cd0SMatthew Dillon 	char dummybuf[256];
4250c3a8cd0SMatthew Dillon 	dmsg_msg_t *msg;
4260c3a8cd0SMatthew Dillon 	int timeout;
4270c3a8cd0SMatthew Dillon 	int count;
4280c3a8cd0SMatthew Dillon 	int wi;	/* wakeup pipe */
4290c3a8cd0SMatthew Dillon 	int si;	/* socket */
4300c3a8cd0SMatthew Dillon 	int ai;	/* alt bulk path socket */
4310c3a8cd0SMatthew Dillon 
4320c3a8cd0SMatthew Dillon 	while ((iocom->flags & DMSG_IOCOMF_EOF) == 0) {
433a2179323SMatthew Dillon 		/*
434a2179323SMatthew Dillon 		 * These iocom->flags are only manipulated within the
435a2179323SMatthew Dillon 		 * context of the current thread.  However, modifications
436a2179323SMatthew Dillon 		 * still require atomic ops.
437a2179323SMatthew Dillon 		 */
4385ab1caedSMatthew Dillon 		dmio_printf(iocom, 5, "iocom %p %08x\n",
4395ab1caedSMatthew Dillon 			    iocom, iocom->flags);
4400c3a8cd0SMatthew Dillon 		if ((iocom->flags & (DMSG_IOCOMF_RWORK |
4410c3a8cd0SMatthew Dillon 				     DMSG_IOCOMF_WWORK |
4420c3a8cd0SMatthew Dillon 				     DMSG_IOCOMF_PWORK |
4430c3a8cd0SMatthew Dillon 				     DMSG_IOCOMF_SWORK |
4440c3a8cd0SMatthew Dillon 				     DMSG_IOCOMF_ARWORK |
4450c3a8cd0SMatthew Dillon 				     DMSG_IOCOMF_AWWORK)) == 0) {
4460c3a8cd0SMatthew Dillon 			/*
4470c3a8cd0SMatthew Dillon 			 * Only poll if no immediate work is pending.
4480c3a8cd0SMatthew Dillon 			 * Otherwise we are just wasting our time calling
4490c3a8cd0SMatthew Dillon 			 * poll.
4500c3a8cd0SMatthew Dillon 			 */
4510c3a8cd0SMatthew Dillon 			timeout = 5000;
4520c3a8cd0SMatthew Dillon 
4530c3a8cd0SMatthew Dillon 			count = 0;
4540c3a8cd0SMatthew Dillon 			wi = -1;
4550c3a8cd0SMatthew Dillon 			si = -1;
4560c3a8cd0SMatthew Dillon 			ai = -1;
4570c3a8cd0SMatthew Dillon 
4580c3a8cd0SMatthew Dillon 			/*
4590c3a8cd0SMatthew Dillon 			 * Always check the inter-thread pipe, e.g.
4600c3a8cd0SMatthew Dillon 			 * for iocom->txmsgq work.
4610c3a8cd0SMatthew Dillon 			 */
4620c3a8cd0SMatthew Dillon 			wi = count++;
4630c3a8cd0SMatthew Dillon 			fds[wi].fd = iocom->wakeupfds[0];
4640c3a8cd0SMatthew Dillon 			fds[wi].events = POLLIN;
4650c3a8cd0SMatthew Dillon 			fds[wi].revents = 0;
4660c3a8cd0SMatthew Dillon 
4670c3a8cd0SMatthew Dillon 			/*
4680c3a8cd0SMatthew Dillon 			 * Check the socket input/output direction as
4690c3a8cd0SMatthew Dillon 			 * requested
4700c3a8cd0SMatthew Dillon 			 */
4710c3a8cd0SMatthew Dillon 			if (iocom->flags & (DMSG_IOCOMF_RREQ |
4720c3a8cd0SMatthew Dillon 					    DMSG_IOCOMF_WREQ)) {
4730c3a8cd0SMatthew Dillon 				si = count++;
4740c3a8cd0SMatthew Dillon 				fds[si].fd = iocom->sock_fd;
4750c3a8cd0SMatthew Dillon 				fds[si].events = 0;
4760c3a8cd0SMatthew Dillon 				fds[si].revents = 0;
4770c3a8cd0SMatthew Dillon 
4780c3a8cd0SMatthew Dillon 				if (iocom->flags & DMSG_IOCOMF_RREQ)
4790c3a8cd0SMatthew Dillon 					fds[si].events |= POLLIN;
4800c3a8cd0SMatthew Dillon 				if (iocom->flags & DMSG_IOCOMF_WREQ)
4810c3a8cd0SMatthew Dillon 					fds[si].events |= POLLOUT;
4820c3a8cd0SMatthew Dillon 			}
4830c3a8cd0SMatthew Dillon 
4840c3a8cd0SMatthew Dillon 			/*
4850c3a8cd0SMatthew Dillon 			 * Check the alternative fd for work.
4860c3a8cd0SMatthew Dillon 			 */
4870c3a8cd0SMatthew Dillon 			if (iocom->alt_fd >= 0) {
4880c3a8cd0SMatthew Dillon 				ai = count++;
4890c3a8cd0SMatthew Dillon 				fds[ai].fd = iocom->alt_fd;
4900c3a8cd0SMatthew Dillon 				fds[ai].events = POLLIN;
4910c3a8cd0SMatthew Dillon 				fds[ai].revents = 0;
4920c3a8cd0SMatthew Dillon 			}
4930c3a8cd0SMatthew Dillon 			poll(fds, count, timeout);
4940c3a8cd0SMatthew Dillon 
4950c3a8cd0SMatthew Dillon 			if (wi >= 0 && (fds[wi].revents & POLLIN))
496a2179323SMatthew Dillon 				atomic_set_int(&iocom->flags,
497a2179323SMatthew Dillon 					       DMSG_IOCOMF_PWORK);
4980c3a8cd0SMatthew Dillon 			if (si >= 0 && (fds[si].revents & POLLIN))
499a2179323SMatthew Dillon 				atomic_set_int(&iocom->flags,
500a2179323SMatthew Dillon 					       DMSG_IOCOMF_RWORK);
5010c3a8cd0SMatthew Dillon 			if (si >= 0 && (fds[si].revents & POLLOUT))
502a2179323SMatthew Dillon 				atomic_set_int(&iocom->flags,
503a2179323SMatthew Dillon 					       DMSG_IOCOMF_WWORK);
5040c3a8cd0SMatthew Dillon 			if (wi >= 0 && (fds[wi].revents & POLLOUT))
505a2179323SMatthew Dillon 				atomic_set_int(&iocom->flags,
506a2179323SMatthew Dillon 					       DMSG_IOCOMF_WWORK);
5070c3a8cd0SMatthew Dillon 			if (ai >= 0 && (fds[ai].revents & POLLIN))
508a2179323SMatthew Dillon 				atomic_set_int(&iocom->flags,
509a2179323SMatthew Dillon 					       DMSG_IOCOMF_ARWORK);
5100c3a8cd0SMatthew Dillon 		} else {
5110c3a8cd0SMatthew Dillon 			/*
5120c3a8cd0SMatthew Dillon 			 * Always check the pipe
5130c3a8cd0SMatthew Dillon 			 */
514a2179323SMatthew Dillon 			atomic_set_int(&iocom->flags, DMSG_IOCOMF_PWORK);
5150c3a8cd0SMatthew Dillon 		}
5160c3a8cd0SMatthew Dillon 
5170c3a8cd0SMatthew Dillon 		if (iocom->flags & DMSG_IOCOMF_SWORK) {
518a2179323SMatthew Dillon 			atomic_clear_int(&iocom->flags, DMSG_IOCOMF_SWORK);
5190d20ec8aSMatthew Dillon 			iocom->signal_callback(iocom);
5200c3a8cd0SMatthew Dillon 		}
5210c3a8cd0SMatthew Dillon 
5220c3a8cd0SMatthew Dillon 		/*
5230c3a8cd0SMatthew Dillon 		 * Pending message queues from other threads wake us up
5240c3a8cd0SMatthew Dillon 		 * with a write to the wakeupfds[] pipe.  We have to clear
5250c3a8cd0SMatthew Dillon 		 * the pipe with a dummy read.
5260c3a8cd0SMatthew Dillon 		 */
5270c3a8cd0SMatthew Dillon 		if (iocom->flags & DMSG_IOCOMF_PWORK) {
528a2179323SMatthew Dillon 			atomic_clear_int(&iocom->flags, DMSG_IOCOMF_PWORK);
5290c3a8cd0SMatthew Dillon 			read(iocom->wakeupfds[0], dummybuf, sizeof(dummybuf));
530a2179323SMatthew Dillon 			atomic_set_int(&iocom->flags, DMSG_IOCOMF_RWORK);
531a2179323SMatthew Dillon 			atomic_set_int(&iocom->flags, DMSG_IOCOMF_WWORK);
5320c3a8cd0SMatthew Dillon 		}
5330c3a8cd0SMatthew Dillon 
5340c3a8cd0SMatthew Dillon 		/*
5350c3a8cd0SMatthew Dillon 		 * Message write sequencing
5360c3a8cd0SMatthew Dillon 		 */
5370c3a8cd0SMatthew Dillon 		if (iocom->flags & DMSG_IOCOMF_WWORK)
5380c3a8cd0SMatthew Dillon 			dmsg_iocom_flush1(iocom);
5390c3a8cd0SMatthew Dillon 
5400c3a8cd0SMatthew Dillon 		/*
5410c3a8cd0SMatthew Dillon 		 * Message read sequencing.  Run this after the write
5420c3a8cd0SMatthew Dillon 		 * sequencing in case the write sequencing allowed another
5430c3a8cd0SMatthew Dillon 		 * auto-DELETE to occur on the read side.
5440c3a8cd0SMatthew Dillon 		 */
5450c3a8cd0SMatthew Dillon 		if (iocom->flags & DMSG_IOCOMF_RWORK) {
5460c3a8cd0SMatthew Dillon 			while ((iocom->flags & DMSG_IOCOMF_EOF) == 0 &&
5470c3a8cd0SMatthew Dillon 			       (msg = dmsg_ioq_read(iocom)) != NULL) {
5485ab1caedSMatthew Dillon 				dmio_printf(iocom, 4, "receive %s\n",
5490c3a8cd0SMatthew Dillon 					    dmsg_msg_str(msg));
5500d20ec8aSMatthew Dillon 				iocom->rcvmsg_callback(msg);
5510a9eefcaSMatthew Dillon 				pthread_mutex_lock(&iocom->mtx);
5520c3a8cd0SMatthew Dillon 				dmsg_state_cleanuprx(iocom, msg);
5530a9eefcaSMatthew Dillon 				pthread_mutex_unlock(&iocom->mtx);
5540c3a8cd0SMatthew Dillon 			}
5550c3a8cd0SMatthew Dillon 		}
5560c3a8cd0SMatthew Dillon 
5570c3a8cd0SMatthew Dillon 		if (iocom->flags & DMSG_IOCOMF_ARWORK) {
558a2179323SMatthew Dillon 			atomic_clear_int(&iocom->flags, DMSG_IOCOMF_ARWORK);
5590d20ec8aSMatthew Dillon 			iocom->altmsg_callback(iocom);
5600c3a8cd0SMatthew Dillon 		}
5610c3a8cd0SMatthew Dillon 	}
5620c3a8cd0SMatthew Dillon }
5630c3a8cd0SMatthew Dillon 
5640c3a8cd0SMatthew Dillon /*
5650c3a8cd0SMatthew Dillon  * Make sure there's enough room in the FIFO to hold the
5660c3a8cd0SMatthew Dillon  * needed data.
5670c3a8cd0SMatthew Dillon  *
5680c3a8cd0SMatthew Dillon  * Assume worst case encrypted form is 2x the size of the
5690c3a8cd0SMatthew Dillon  * plaintext equivalent.
5700c3a8cd0SMatthew Dillon  */
5710c3a8cd0SMatthew Dillon static
5720c3a8cd0SMatthew Dillon size_t
5730c3a8cd0SMatthew Dillon dmsg_ioq_makeroom(dmsg_ioq_t *ioq, size_t needed)
5740c3a8cd0SMatthew Dillon {
5750c3a8cd0SMatthew Dillon 	size_t bytes;
5760c3a8cd0SMatthew Dillon 	size_t nmax;
5770c3a8cd0SMatthew Dillon 
5780c3a8cd0SMatthew Dillon 	bytes = ioq->fifo_cdx - ioq->fifo_beg;
5790c3a8cd0SMatthew Dillon 	nmax = sizeof(ioq->buf) - ioq->fifo_end;
5800c3a8cd0SMatthew Dillon 	if (bytes + nmax / 2 < needed) {
5810c3a8cd0SMatthew Dillon 		if (bytes) {
5820c3a8cd0SMatthew Dillon 			bcopy(ioq->buf + ioq->fifo_beg,
5830c3a8cd0SMatthew Dillon 			      ioq->buf,
5840c3a8cd0SMatthew Dillon 			      bytes);
5850c3a8cd0SMatthew Dillon 		}
5860c3a8cd0SMatthew Dillon 		ioq->fifo_cdx -= ioq->fifo_beg;
5870c3a8cd0SMatthew Dillon 		ioq->fifo_beg = 0;
5880c3a8cd0SMatthew Dillon 		if (ioq->fifo_cdn < ioq->fifo_end) {
5890c3a8cd0SMatthew Dillon 			bcopy(ioq->buf + ioq->fifo_cdn,
5900c3a8cd0SMatthew Dillon 			      ioq->buf + ioq->fifo_cdx,
5910c3a8cd0SMatthew Dillon 			      ioq->fifo_end - ioq->fifo_cdn);
5920c3a8cd0SMatthew Dillon 		}
5930c3a8cd0SMatthew Dillon 		ioq->fifo_end -= ioq->fifo_cdn - ioq->fifo_cdx;
5940c3a8cd0SMatthew Dillon 		ioq->fifo_cdn = ioq->fifo_cdx;
5950c3a8cd0SMatthew Dillon 		nmax = sizeof(ioq->buf) - ioq->fifo_end;
5960c3a8cd0SMatthew Dillon 	}
5970c3a8cd0SMatthew Dillon 	return(nmax);
5980c3a8cd0SMatthew Dillon }
5990c3a8cd0SMatthew Dillon 
6000c3a8cd0SMatthew Dillon /*
6010c3a8cd0SMatthew Dillon  * Read the next ready message from the ioq, issuing I/O if needed.
6020c3a8cd0SMatthew Dillon  * Caller should retry on a read-event when NULL is returned.
6030c3a8cd0SMatthew Dillon  *
6040c3a8cd0SMatthew Dillon  * If an error occurs during reception a DMSG_LNK_ERROR msg will
6050c3a8cd0SMatthew Dillon  * be returned for each open transaction, then the ioq and iocom
6060c3a8cd0SMatthew Dillon  * will be errored out and a non-transactional DMSG_LNK_ERROR
6070c3a8cd0SMatthew Dillon  * msg will be returned as the final message.  The caller should not call
6080c3a8cd0SMatthew Dillon  * us again after the final message is returned.
6090c3a8cd0SMatthew Dillon  *
6100c3a8cd0SMatthew Dillon  * Thread localized, iocom->mtx not held.
6110c3a8cd0SMatthew Dillon  */
6120c3a8cd0SMatthew Dillon dmsg_msg_t *
6130c3a8cd0SMatthew Dillon dmsg_ioq_read(dmsg_iocom_t *iocom)
6140c3a8cd0SMatthew Dillon {
6150c3a8cd0SMatthew Dillon 	dmsg_ioq_t *ioq = &iocom->ioq_rx;
6160c3a8cd0SMatthew Dillon 	dmsg_msg_t *msg;
6170c3a8cd0SMatthew Dillon 	dmsg_hdr_t *head;
6180c3a8cd0SMatthew Dillon 	ssize_t n;
6190c3a8cd0SMatthew Dillon 	size_t bytes;
6200c3a8cd0SMatthew Dillon 	size_t nmax;
621f306de83SMatthew Dillon 	uint32_t aux_size;
6220c3a8cd0SMatthew Dillon 	uint32_t xcrc32;
6230c3a8cd0SMatthew Dillon 	int error;
6240c3a8cd0SMatthew Dillon 
6250c3a8cd0SMatthew Dillon again:
6260c3a8cd0SMatthew Dillon 	/*
6270c3a8cd0SMatthew Dillon 	 * If a message is already pending we can just remove and
6280c3a8cd0SMatthew Dillon 	 * return it.  Message state has already been processed.
6290c3a8cd0SMatthew Dillon 	 * (currently not implemented)
6300c3a8cd0SMatthew Dillon 	 */
6310c3a8cd0SMatthew Dillon 	if ((msg = TAILQ_FIRST(&ioq->msgq)) != NULL) {
6320c3a8cd0SMatthew Dillon 		TAILQ_REMOVE(&ioq->msgq, msg, qentry);
633a06d536bSMatthew Dillon 
634a06d536bSMatthew Dillon 		if (msg->state == &iocom->state0) {
635a06d536bSMatthew Dillon 			atomic_set_int(&iocom->flags, DMSG_IOCOMF_EOF);
6365ab1caedSMatthew Dillon 			dmio_printf(iocom, 1,
6375ab1caedSMatthew Dillon 				    "EOF ON SOCKET %d\n",
6385ab1caedSMatthew Dillon 				    iocom->sock_fd);
639a06d536bSMatthew Dillon 		}
6400c3a8cd0SMatthew Dillon 		return (msg);
6410c3a8cd0SMatthew Dillon 	}
642a2179323SMatthew Dillon 	atomic_clear_int(&iocom->flags, DMSG_IOCOMF_RREQ | DMSG_IOCOMF_RWORK);
6430c3a8cd0SMatthew Dillon 
6440c3a8cd0SMatthew Dillon 	/*
6450c3a8cd0SMatthew Dillon 	 * If the stream is errored out we stop processing it.
6460c3a8cd0SMatthew Dillon 	 */
6470c3a8cd0SMatthew Dillon 	if (ioq->error)
6480c3a8cd0SMatthew Dillon 		goto skip;
6490c3a8cd0SMatthew Dillon 
6500c3a8cd0SMatthew Dillon 	/*
6510c3a8cd0SMatthew Dillon 	 * Message read in-progress (msg is NULL at the moment).  We don't
6520c3a8cd0SMatthew Dillon 	 * allocate a msg until we have its core header.
6530c3a8cd0SMatthew Dillon 	 */
6540c3a8cd0SMatthew Dillon 	nmax = sizeof(ioq->buf) - ioq->fifo_end;
6550c3a8cd0SMatthew Dillon 	bytes = ioq->fifo_cdx - ioq->fifo_beg;		/* already decrypted */
6560c3a8cd0SMatthew Dillon 	msg = ioq->msg;
6570c3a8cd0SMatthew Dillon 
6580c3a8cd0SMatthew Dillon 	switch(ioq->state) {
6590c3a8cd0SMatthew Dillon 	case DMSG_MSGQ_STATE_HEADER1:
6600c3a8cd0SMatthew Dillon 		/*
6610c3a8cd0SMatthew Dillon 		 * Load the primary header, fail on any non-trivial read
6620c3a8cd0SMatthew Dillon 		 * error or on EOF.  Since the primary header is the same
6630c3a8cd0SMatthew Dillon 		 * size is the message alignment it will never straddle
6640c3a8cd0SMatthew Dillon 		 * the end of the buffer.
6650c3a8cd0SMatthew Dillon 		 */
6660c3a8cd0SMatthew Dillon 		nmax = dmsg_ioq_makeroom(ioq, sizeof(msg->any.head));
6670c3a8cd0SMatthew Dillon 		if (bytes < sizeof(msg->any.head)) {
6680c3a8cd0SMatthew Dillon 			n = read(iocom->sock_fd,
6690c3a8cd0SMatthew Dillon 				 ioq->buf + ioq->fifo_end,
6700c3a8cd0SMatthew Dillon 				 nmax);
6710c3a8cd0SMatthew Dillon 			if (n <= 0) {
6720c3a8cd0SMatthew Dillon 				if (n == 0) {
6730c3a8cd0SMatthew Dillon 					ioq->error = DMSG_IOQ_ERROR_EOF;
6740c3a8cd0SMatthew Dillon 					break;
6750c3a8cd0SMatthew Dillon 				}
6760c3a8cd0SMatthew Dillon 				if (errno != EINTR &&
6770c3a8cd0SMatthew Dillon 				    errno != EINPROGRESS &&
6780c3a8cd0SMatthew Dillon 				    errno != EAGAIN) {
6790c3a8cd0SMatthew Dillon 					ioq->error = DMSG_IOQ_ERROR_SOCK;
6800c3a8cd0SMatthew Dillon 					break;
6810c3a8cd0SMatthew Dillon 				}
6820c3a8cd0SMatthew Dillon 				n = 0;
6830c3a8cd0SMatthew Dillon 				/* fall through */
6840c3a8cd0SMatthew Dillon 			}
6850c3a8cd0SMatthew Dillon 			ioq->fifo_end += (size_t)n;
6860c3a8cd0SMatthew Dillon 			nmax -= (size_t)n;
6870c3a8cd0SMatthew Dillon 		}
6880c3a8cd0SMatthew Dillon 
6890c3a8cd0SMatthew Dillon 		/*
6900c3a8cd0SMatthew Dillon 		 * Decrypt data received so far.  Data will be decrypted
6910c3a8cd0SMatthew Dillon 		 * in-place but might create gaps in the FIFO.  Partial
6920c3a8cd0SMatthew Dillon 		 * blocks are not immediately decrypted.
6930c3a8cd0SMatthew Dillon 		 *
6940c3a8cd0SMatthew Dillon 		 * WARNING!  The header might be in the wrong endian, we
6950c3a8cd0SMatthew Dillon 		 *	     do not fix it up until we get the entire
6960c3a8cd0SMatthew Dillon 		 *	     extended header.
6970c3a8cd0SMatthew Dillon 		 */
6980c3a8cd0SMatthew Dillon 		if (iocom->flags & DMSG_IOCOMF_CRYPTED) {
6990c3a8cd0SMatthew Dillon 			dmsg_crypto_decrypt(iocom, ioq);
7000c3a8cd0SMatthew Dillon 		} else {
7010c3a8cd0SMatthew Dillon 			ioq->fifo_cdx = ioq->fifo_end;
7020c3a8cd0SMatthew Dillon 			ioq->fifo_cdn = ioq->fifo_end;
7030c3a8cd0SMatthew Dillon 		}
7040c3a8cd0SMatthew Dillon 		bytes = ioq->fifo_cdx - ioq->fifo_beg;
7050c3a8cd0SMatthew Dillon 
7060c3a8cd0SMatthew Dillon 		/*
7070c3a8cd0SMatthew Dillon 		 * Insufficient data accumulated (msg is NULL, caller will
7080c3a8cd0SMatthew Dillon 		 * retry on event).
7090c3a8cd0SMatthew Dillon 		 */
7100c3a8cd0SMatthew Dillon 		assert(msg == NULL);
7110c3a8cd0SMatthew Dillon 		if (bytes < sizeof(msg->any.head))
7120c3a8cd0SMatthew Dillon 			break;
7130c3a8cd0SMatthew Dillon 
7140c3a8cd0SMatthew Dillon 		/*
7150c3a8cd0SMatthew Dillon 		 * Check and fixup the core header.  Note that the icrc
7160c3a8cd0SMatthew Dillon 		 * has to be calculated before any fixups, but the crc
7170c3a8cd0SMatthew Dillon 		 * fields in the msg may have to be swapped like everything
7180c3a8cd0SMatthew Dillon 		 * else.
7190c3a8cd0SMatthew Dillon 		 */
7200c3a8cd0SMatthew Dillon 		head = (void *)(ioq->buf + ioq->fifo_beg);
7210c3a8cd0SMatthew Dillon 		if (head->magic != DMSG_HDR_MAGIC &&
7220c3a8cd0SMatthew Dillon 		    head->magic != DMSG_HDR_MAGIC_REV) {
7235ab1caedSMatthew Dillon 			dmio_printf(iocom, 1,
7245ab1caedSMatthew Dillon 				    "%s: head->magic is bad %02x\n",
725f306de83SMatthew Dillon 				    iocom->label, head->magic);
726f306de83SMatthew Dillon 			if (iocom->flags & DMSG_IOCOMF_CRYPTED)
7275ab1caedSMatthew Dillon 				dmio_printf(iocom, 1, "%s\n",
7285ab1caedSMatthew Dillon 					    "(on encrypted link)");
7290c3a8cd0SMatthew Dillon 			ioq->error = DMSG_IOQ_ERROR_SYNC;
7300c3a8cd0SMatthew Dillon 			break;
7310c3a8cd0SMatthew Dillon 		}
7320c3a8cd0SMatthew Dillon 
7330c3a8cd0SMatthew Dillon 		/*
7340c3a8cd0SMatthew Dillon 		 * Calculate the full header size and aux data size
7350c3a8cd0SMatthew Dillon 		 */
7360c3a8cd0SMatthew Dillon 		if (head->magic == DMSG_HDR_MAGIC_REV) {
7370c3a8cd0SMatthew Dillon 			ioq->hbytes = (bswap32(head->cmd) & DMSGF_SIZE) *
7380c3a8cd0SMatthew Dillon 				      DMSG_ALIGN;
739f306de83SMatthew Dillon 			aux_size = bswap32(head->aux_bytes);
7400c3a8cd0SMatthew Dillon 		} else {
7410c3a8cd0SMatthew Dillon 			ioq->hbytes = (head->cmd & DMSGF_SIZE) *
7420c3a8cd0SMatthew Dillon 				      DMSG_ALIGN;
743f306de83SMatthew Dillon 			aux_size = head->aux_bytes;
7440c3a8cd0SMatthew Dillon 		}
745f306de83SMatthew Dillon 		ioq->abytes = DMSG_DOALIGN(aux_size);
746f306de83SMatthew Dillon 		ioq->unaligned_aux_size = aux_size;
7470c3a8cd0SMatthew Dillon 		if (ioq->hbytes < sizeof(msg->any.head) ||
7480c3a8cd0SMatthew Dillon 		    ioq->hbytes > sizeof(msg->any) ||
7490c3a8cd0SMatthew Dillon 		    ioq->abytes > DMSG_AUX_MAX) {
7500c3a8cd0SMatthew Dillon 			ioq->error = DMSG_IOQ_ERROR_FIELD;
7510c3a8cd0SMatthew Dillon 			break;
7520c3a8cd0SMatthew Dillon 		}
7530c3a8cd0SMatthew Dillon 
7540c3a8cd0SMatthew Dillon 		/*
7550c3a8cd0SMatthew Dillon 		 * Allocate the message, the next state will fill it in.
7561b8eded1SMatthew Dillon 		 *
7571b8eded1SMatthew Dillon 		 * NOTE: The aux_data buffer will be sized to an aligned
7581b8eded1SMatthew Dillon 		 *	 value and the aligned remainder zero'd for
7591b8eded1SMatthew Dillon 		 *	 convenience.
7601b8eded1SMatthew Dillon 		 *
7611b8eded1SMatthew Dillon 		 * NOTE: Supply dummy state and a degenerate cmd without
7621b8eded1SMatthew Dillon 		 *	 CREATE set.  The message will temporarily be
7631b8eded1SMatthew Dillon 		 *	 associated with state0 until later post-processing.
7640c3a8cd0SMatthew Dillon 		 */
7651b8eded1SMatthew Dillon 		msg = dmsg_msg_alloc(&iocom->state0, aux_size,
766a2179323SMatthew Dillon 				     ioq->hbytes / DMSG_ALIGN,
7670c3a8cd0SMatthew Dillon 				     NULL, NULL);
7680c3a8cd0SMatthew Dillon 		ioq->msg = msg;
7690c3a8cd0SMatthew Dillon 
7700c3a8cd0SMatthew Dillon 		/*
7710c3a8cd0SMatthew Dillon 		 * Fall through to the next state.  Make sure that the
7720c3a8cd0SMatthew Dillon 		 * extended header does not straddle the end of the buffer.
7730c3a8cd0SMatthew Dillon 		 * We still want to issue larger reads into our buffer,
7740c3a8cd0SMatthew Dillon 		 * book-keeping is easier if we don't bcopy() yet.
7750c3a8cd0SMatthew Dillon 		 *
7760c3a8cd0SMatthew Dillon 		 * Make sure there is enough room for bloated encrypt data.
7770c3a8cd0SMatthew Dillon 		 */
7780c3a8cd0SMatthew Dillon 		nmax = dmsg_ioq_makeroom(ioq, ioq->hbytes);
7790c3a8cd0SMatthew Dillon 		ioq->state = DMSG_MSGQ_STATE_HEADER2;
7800c3a8cd0SMatthew Dillon 		/* fall through */
7810c3a8cd0SMatthew Dillon 	case DMSG_MSGQ_STATE_HEADER2:
7820c3a8cd0SMatthew Dillon 		/*
7830c3a8cd0SMatthew Dillon 		 * Fill out the extended header.
7840c3a8cd0SMatthew Dillon 		 */
7850c3a8cd0SMatthew Dillon 		assert(msg != NULL);
7860c3a8cd0SMatthew Dillon 		if (bytes < ioq->hbytes) {
7870a9eefcaSMatthew Dillon 			assert(nmax > 0);
7880c3a8cd0SMatthew Dillon 			n = read(iocom->sock_fd,
7890c3a8cd0SMatthew Dillon 				 ioq->buf + ioq->fifo_end,
7900c3a8cd0SMatthew Dillon 				 nmax);
7910c3a8cd0SMatthew Dillon 			if (n <= 0) {
7920c3a8cd0SMatthew Dillon 				if (n == 0) {
7930c3a8cd0SMatthew Dillon 					ioq->error = DMSG_IOQ_ERROR_EOF;
7940c3a8cd0SMatthew Dillon 					break;
7950c3a8cd0SMatthew Dillon 				}
7960c3a8cd0SMatthew Dillon 				if (errno != EINTR &&
7970c3a8cd0SMatthew Dillon 				    errno != EINPROGRESS &&
7980c3a8cd0SMatthew Dillon 				    errno != EAGAIN) {
7990c3a8cd0SMatthew Dillon 					ioq->error = DMSG_IOQ_ERROR_SOCK;
8000c3a8cd0SMatthew Dillon 					break;
8010c3a8cd0SMatthew Dillon 				}
8020c3a8cd0SMatthew Dillon 				n = 0;
8030c3a8cd0SMatthew Dillon 				/* fall through */
8040c3a8cd0SMatthew Dillon 			}
8050c3a8cd0SMatthew Dillon 			ioq->fifo_end += (size_t)n;
8060c3a8cd0SMatthew Dillon 			nmax -= (size_t)n;
8070c3a8cd0SMatthew Dillon 		}
8080c3a8cd0SMatthew Dillon 
8090c3a8cd0SMatthew Dillon 		if (iocom->flags & DMSG_IOCOMF_CRYPTED) {
8100c3a8cd0SMatthew Dillon 			dmsg_crypto_decrypt(iocom, ioq);
8110c3a8cd0SMatthew Dillon 		} else {
8120c3a8cd0SMatthew Dillon 			ioq->fifo_cdx = ioq->fifo_end;
8130c3a8cd0SMatthew Dillon 			ioq->fifo_cdn = ioq->fifo_end;
8140c3a8cd0SMatthew Dillon 		}
8150c3a8cd0SMatthew Dillon 		bytes = ioq->fifo_cdx - ioq->fifo_beg;
8160c3a8cd0SMatthew Dillon 
8170c3a8cd0SMatthew Dillon 		/*
8180c3a8cd0SMatthew Dillon 		 * Insufficient data accumulated (set msg NULL so caller will
8190c3a8cd0SMatthew Dillon 		 * retry on event).
8200c3a8cd0SMatthew Dillon 		 */
8210c3a8cd0SMatthew Dillon 		if (bytes < ioq->hbytes) {
8220c3a8cd0SMatthew Dillon 			msg = NULL;
8230c3a8cd0SMatthew Dillon 			break;
8240c3a8cd0SMatthew Dillon 		}
8250c3a8cd0SMatthew Dillon 
8260c3a8cd0SMatthew Dillon 		/*
8270c3a8cd0SMatthew Dillon 		 * Calculate the extended header, decrypt data received
8280c3a8cd0SMatthew Dillon 		 * so far.  Handle endian-conversion for the entire extended
8290c3a8cd0SMatthew Dillon 		 * header.
8300c3a8cd0SMatthew Dillon 		 */
8310c3a8cd0SMatthew Dillon 		head = (void *)(ioq->buf + ioq->fifo_beg);
8320c3a8cd0SMatthew Dillon 
8330c3a8cd0SMatthew Dillon 		/*
8340c3a8cd0SMatthew Dillon 		 * Check the CRC.
8350c3a8cd0SMatthew Dillon 		 */
8360c3a8cd0SMatthew Dillon 		if (head->magic == DMSG_HDR_MAGIC_REV)
8370c3a8cd0SMatthew Dillon 			xcrc32 = bswap32(head->hdr_crc);
8380c3a8cd0SMatthew Dillon 		else
8390c3a8cd0SMatthew Dillon 			xcrc32 = head->hdr_crc;
8400c3a8cd0SMatthew Dillon 		head->hdr_crc = 0;
8410c3a8cd0SMatthew Dillon 		if (dmsg_icrc32(head, ioq->hbytes) != xcrc32) {
8420c3a8cd0SMatthew Dillon 			ioq->error = DMSG_IOQ_ERROR_XCRC;
8435ab1caedSMatthew Dillon 			dmio_printf(iocom, 1, "BAD-XCRC(%08x,%08x) %s\n",
8440c3a8cd0SMatthew Dillon 				    xcrc32, dmsg_icrc32(head, ioq->hbytes),
8450c3a8cd0SMatthew Dillon 				    dmsg_msg_str(msg));
8460c3a8cd0SMatthew Dillon 			assert(0);
8470c3a8cd0SMatthew Dillon 			break;
8480c3a8cd0SMatthew Dillon 		}
8490c3a8cd0SMatthew Dillon 		head->hdr_crc = xcrc32;
8500c3a8cd0SMatthew Dillon 
8510c3a8cd0SMatthew Dillon 		if (head->magic == DMSG_HDR_MAGIC_REV) {
8520c3a8cd0SMatthew Dillon 			dmsg_bswap_head(head);
8530c3a8cd0SMatthew Dillon 		}
8540c3a8cd0SMatthew Dillon 
8550c3a8cd0SMatthew Dillon 		/*
8560c3a8cd0SMatthew Dillon 		 * Copy the extended header into the msg and adjust the
8570c3a8cd0SMatthew Dillon 		 * FIFO.
8580c3a8cd0SMatthew Dillon 		 */
8590c3a8cd0SMatthew Dillon 		bcopy(head, &msg->any, ioq->hbytes);
8600c3a8cd0SMatthew Dillon 
8610c3a8cd0SMatthew Dillon 		/*
8620c3a8cd0SMatthew Dillon 		 * We are either done or we fall-through.
8630c3a8cd0SMatthew Dillon 		 */
8640c3a8cd0SMatthew Dillon 		if (ioq->abytes == 0) {
8650c3a8cd0SMatthew Dillon 			ioq->fifo_beg += ioq->hbytes;
8660c3a8cd0SMatthew Dillon 			break;
8670c3a8cd0SMatthew Dillon 		}
8680c3a8cd0SMatthew Dillon 
8690c3a8cd0SMatthew Dillon 		/*
8700c3a8cd0SMatthew Dillon 		 * Must adjust bytes (and the state) when falling through.
8710c3a8cd0SMatthew Dillon 		 * nmax doesn't change.
8720c3a8cd0SMatthew Dillon 		 */
8730c3a8cd0SMatthew Dillon 		ioq->fifo_beg += ioq->hbytes;
8740c3a8cd0SMatthew Dillon 		bytes -= ioq->hbytes;
8750c3a8cd0SMatthew Dillon 		ioq->state = DMSG_MSGQ_STATE_AUXDATA1;
8760c3a8cd0SMatthew Dillon 		/* fall through */
8770c3a8cd0SMatthew Dillon 	case DMSG_MSGQ_STATE_AUXDATA1:
8780c3a8cd0SMatthew Dillon 		/*
879a2179323SMatthew Dillon 		 * Copy the partial or complete [decrypted] payload from
880a2179323SMatthew Dillon 		 * remaining bytes in the FIFO in order to optimize the
881a2179323SMatthew Dillon 		 * makeroom call in the AUXDATA2 state.  We have to
882a2179323SMatthew Dillon 		 * fall-through either way so we can check the crc.
8830c3a8cd0SMatthew Dillon 		 *
8840c3a8cd0SMatthew Dillon 		 * msg->aux_size tracks our aux data.
885a2179323SMatthew Dillon 		 *
886a2179323SMatthew Dillon 		 * (Lets not complicate matters if the data is encrypted,
887a2179323SMatthew Dillon 		 *  since the data in-stream is not the same size as the
888a2179323SMatthew Dillon 		 *  data decrypted).
8890c3a8cd0SMatthew Dillon 		 */
8900c3a8cd0SMatthew Dillon 		if (bytes >= ioq->abytes) {
8910c3a8cd0SMatthew Dillon 			bcopy(ioq->buf + ioq->fifo_beg, msg->aux_data,
8920c3a8cd0SMatthew Dillon 			      ioq->abytes);
8930c3a8cd0SMatthew Dillon 			msg->aux_size = ioq->abytes;
8940c3a8cd0SMatthew Dillon 			ioq->fifo_beg += ioq->abytes;
8950c3a8cd0SMatthew Dillon 			assert(ioq->fifo_beg <= ioq->fifo_cdx);
8960c3a8cd0SMatthew Dillon 			assert(ioq->fifo_cdx <= ioq->fifo_cdn);
8970c3a8cd0SMatthew Dillon 			bytes -= ioq->abytes;
8980c3a8cd0SMatthew Dillon 		} else if (bytes) {
8990c3a8cd0SMatthew Dillon 			bcopy(ioq->buf + ioq->fifo_beg, msg->aux_data,
9000c3a8cd0SMatthew Dillon 			      bytes);
9010c3a8cd0SMatthew Dillon 			msg->aux_size = bytes;
9020c3a8cd0SMatthew Dillon 			ioq->fifo_beg += bytes;
9030c3a8cd0SMatthew Dillon 			if (ioq->fifo_cdx < ioq->fifo_beg)
9040c3a8cd0SMatthew Dillon 				ioq->fifo_cdx = ioq->fifo_beg;
9050c3a8cd0SMatthew Dillon 			assert(ioq->fifo_beg <= ioq->fifo_cdx);
9060c3a8cd0SMatthew Dillon 			assert(ioq->fifo_cdx <= ioq->fifo_cdn);
9070c3a8cd0SMatthew Dillon 			bytes = 0;
9080c3a8cd0SMatthew Dillon 		} else {
9090c3a8cd0SMatthew Dillon 			msg->aux_size = 0;
9100c3a8cd0SMatthew Dillon 		}
9110c3a8cd0SMatthew Dillon 		ioq->state = DMSG_MSGQ_STATE_AUXDATA2;
9120c3a8cd0SMatthew Dillon 		/* fall through */
9130c3a8cd0SMatthew Dillon 	case DMSG_MSGQ_STATE_AUXDATA2:
9140c3a8cd0SMatthew Dillon 		/*
9150c3a8cd0SMatthew Dillon 		 * Make sure there is enough room for more data.
9160c3a8cd0SMatthew Dillon 		 */
9170c3a8cd0SMatthew Dillon 		assert(msg);
9180c3a8cd0SMatthew Dillon 		nmax = dmsg_ioq_makeroom(ioq, ioq->abytes - msg->aux_size);
9190c3a8cd0SMatthew Dillon 
9200c3a8cd0SMatthew Dillon 		/*
9210c3a8cd0SMatthew Dillon 		 * Read and decrypt more of the payload.
9220c3a8cd0SMatthew Dillon 		 */
9230c3a8cd0SMatthew Dillon 		if (msg->aux_size < ioq->abytes) {
9240a9eefcaSMatthew Dillon 			assert(nmax > 0);
9250c3a8cd0SMatthew Dillon 			assert(bytes == 0);
9260c3a8cd0SMatthew Dillon 			n = read(iocom->sock_fd,
9270c3a8cd0SMatthew Dillon 				 ioq->buf + ioq->fifo_end,
9280c3a8cd0SMatthew Dillon 				 nmax);
9290c3a8cd0SMatthew Dillon 			if (n <= 0) {
9300c3a8cd0SMatthew Dillon 				if (n == 0) {
9310c3a8cd0SMatthew Dillon 					ioq->error = DMSG_IOQ_ERROR_EOF;
9320c3a8cd0SMatthew Dillon 					break;
9330c3a8cd0SMatthew Dillon 				}
9340c3a8cd0SMatthew Dillon 				if (errno != EINTR &&
9350c3a8cd0SMatthew Dillon 				    errno != EINPROGRESS &&
9360c3a8cd0SMatthew Dillon 				    errno != EAGAIN) {
9370c3a8cd0SMatthew Dillon 					ioq->error = DMSG_IOQ_ERROR_SOCK;
9380c3a8cd0SMatthew Dillon 					break;
9390c3a8cd0SMatthew Dillon 				}
9400c3a8cd0SMatthew Dillon 				n = 0;
9410c3a8cd0SMatthew Dillon 				/* fall through */
9420c3a8cd0SMatthew Dillon 			}
9430c3a8cd0SMatthew Dillon 			ioq->fifo_end += (size_t)n;
9440c3a8cd0SMatthew Dillon 			nmax -= (size_t)n;
9450c3a8cd0SMatthew Dillon 		}
9460c3a8cd0SMatthew Dillon 
9470c3a8cd0SMatthew Dillon 		if (iocom->flags & DMSG_IOCOMF_CRYPTED) {
9480c3a8cd0SMatthew Dillon 			dmsg_crypto_decrypt(iocom, ioq);
9490c3a8cd0SMatthew Dillon 		} else {
9500c3a8cd0SMatthew Dillon 			ioq->fifo_cdx = ioq->fifo_end;
9510c3a8cd0SMatthew Dillon 			ioq->fifo_cdn = ioq->fifo_end;
9520c3a8cd0SMatthew Dillon 		}
9530c3a8cd0SMatthew Dillon 		bytes = ioq->fifo_cdx - ioq->fifo_beg;
9540c3a8cd0SMatthew Dillon 
9550c3a8cd0SMatthew Dillon 		if (bytes > ioq->abytes - msg->aux_size)
9560c3a8cd0SMatthew Dillon 			bytes = ioq->abytes - msg->aux_size;
9570c3a8cd0SMatthew Dillon 
9580c3a8cd0SMatthew Dillon 		if (bytes) {
9590c3a8cd0SMatthew Dillon 			bcopy(ioq->buf + ioq->fifo_beg,
9600c3a8cd0SMatthew Dillon 			      msg->aux_data + msg->aux_size,
9610c3a8cd0SMatthew Dillon 			      bytes);
9620c3a8cd0SMatthew Dillon 			msg->aux_size += bytes;
9630c3a8cd0SMatthew Dillon 			ioq->fifo_beg += bytes;
9640c3a8cd0SMatthew Dillon 		}
9650c3a8cd0SMatthew Dillon 
9660c3a8cd0SMatthew Dillon 		/*
9670c3a8cd0SMatthew Dillon 		 * Insufficient data accumulated (set msg NULL so caller will
9680c3a8cd0SMatthew Dillon 		 * retry on event).
969f306de83SMatthew Dillon 		 *
970f306de83SMatthew Dillon 		 * Assert the auxillary data size is correct, then record the
971f306de83SMatthew Dillon 		 * original unaligned size from the message header.
9720c3a8cd0SMatthew Dillon 		 */
9730c3a8cd0SMatthew Dillon 		if (msg->aux_size < ioq->abytes) {
9740c3a8cd0SMatthew Dillon 			msg = NULL;
9750c3a8cd0SMatthew Dillon 			break;
9760c3a8cd0SMatthew Dillon 		}
9770c3a8cd0SMatthew Dillon 		assert(msg->aux_size == ioq->abytes);
978f306de83SMatthew Dillon 		msg->aux_size = ioq->unaligned_aux_size;
9790c3a8cd0SMatthew Dillon 
9800c3a8cd0SMatthew Dillon 		/*
981f306de83SMatthew Dillon 		 * Check aux_crc, then we are done.  Note that the crc
982f306de83SMatthew Dillon 		 * is calculated over the aligned size, not the actual
983f306de83SMatthew Dillon 		 * size.
9840c3a8cd0SMatthew Dillon 		 */
985f306de83SMatthew Dillon 		xcrc32 = dmsg_icrc32(msg->aux_data, ioq->abytes);
9860c3a8cd0SMatthew Dillon 		if (xcrc32 != msg->any.head.aux_crc) {
9870c3a8cd0SMatthew Dillon 			ioq->error = DMSG_IOQ_ERROR_ACRC;
9885ab1caedSMatthew Dillon 			dmio_printf(iocom, 1,
989d30cab67SMatthew Dillon 				    "iocom: ACRC error %08x vs %08x "
990d30cab67SMatthew Dillon 				    "msgid %016jx msgcmd %08x auxsize %d\n",
991d30cab67SMatthew Dillon 				    xcrc32,
992d30cab67SMatthew Dillon 				    msg->any.head.aux_crc,
993d30cab67SMatthew Dillon 				    (intmax_t)msg->any.head.msgid,
994d30cab67SMatthew Dillon 				    msg->any.head.cmd,
995d30cab67SMatthew Dillon 				    msg->any.head.aux_bytes);
9960c3a8cd0SMatthew Dillon 			break;
9970c3a8cd0SMatthew Dillon 		}
9980c3a8cd0SMatthew Dillon 		break;
9990c3a8cd0SMatthew Dillon 	case DMSG_MSGQ_STATE_ERROR:
10000c3a8cd0SMatthew Dillon 		/*
10010c3a8cd0SMatthew Dillon 		 * Continued calls to drain recorded transactions (returning
10020c3a8cd0SMatthew Dillon 		 * a LNK_ERROR for each one), before we return the final
10030c3a8cd0SMatthew Dillon 		 * LNK_ERROR.
10040c3a8cd0SMatthew Dillon 		 */
10050c3a8cd0SMatthew Dillon 		assert(msg == NULL);
10060c3a8cd0SMatthew Dillon 		break;
10070c3a8cd0SMatthew Dillon 	default:
10080c3a8cd0SMatthew Dillon 		/*
10090c3a8cd0SMatthew Dillon 		 * We don't double-return errors, the caller should not
10100c3a8cd0SMatthew Dillon 		 * have called us again after getting an error msg.
10110c3a8cd0SMatthew Dillon 		 */
10120c3a8cd0SMatthew Dillon 		assert(0);
10130c3a8cd0SMatthew Dillon 		break;
10140c3a8cd0SMatthew Dillon 	}
10150c3a8cd0SMatthew Dillon 
10160c3a8cd0SMatthew Dillon 	/*
10170c3a8cd0SMatthew Dillon 	 * Check the message sequence.  The iv[] should prevent any
10180c3a8cd0SMatthew Dillon 	 * possibility of a replay but we add this check anyway.
10190c3a8cd0SMatthew Dillon 	 */
10200c3a8cd0SMatthew Dillon 	if (msg && ioq->error == 0) {
10210c3a8cd0SMatthew Dillon 		if ((msg->any.head.salt & 255) != (ioq->seq & 255)) {
10220c3a8cd0SMatthew Dillon 			ioq->error = DMSG_IOQ_ERROR_MSGSEQ;
10230c3a8cd0SMatthew Dillon 		} else {
10240c3a8cd0SMatthew Dillon 			++ioq->seq;
10250c3a8cd0SMatthew Dillon 		}
10260c3a8cd0SMatthew Dillon 	}
10270c3a8cd0SMatthew Dillon 
10280c3a8cd0SMatthew Dillon 	/*
10290c3a8cd0SMatthew Dillon 	 * Handle error, RREQ, or completion
10300c3a8cd0SMatthew Dillon 	 *
10310c3a8cd0SMatthew Dillon 	 * NOTE: nmax and bytes are invalid at this point, we don't bother
10320c3a8cd0SMatthew Dillon 	 *	 to update them when breaking out.
10330c3a8cd0SMatthew Dillon 	 */
10340c3a8cd0SMatthew Dillon 	if (ioq->error) {
10350c3a8cd0SMatthew Dillon skip:
10360c3a8cd0SMatthew Dillon 		/*
10370c3a8cd0SMatthew Dillon 		 * An unrecoverable error causes all active receive
10380c3a8cd0SMatthew Dillon 		 * transactions to be terminated with a LNK_ERROR message.
10390c3a8cd0SMatthew Dillon 		 *
10400c3a8cd0SMatthew Dillon 		 * Once all active transactions are exhausted we set the
10410c3a8cd0SMatthew Dillon 		 * iocom ERROR flag and return a non-transactional LNK_ERROR
10420c3a8cd0SMatthew Dillon 		 * message, which should cause master processing loops to
10430c3a8cd0SMatthew Dillon 		 * terminate.
10440c3a8cd0SMatthew Dillon 		 */
10455ab1caedSMatthew Dillon 		dmio_printf(iocom, 1, "IOQ ERROR %d\n", ioq->error);
10460c3a8cd0SMatthew Dillon 		assert(ioq->msg == msg);
10470c3a8cd0SMatthew Dillon 		if (msg) {
10480c3a8cd0SMatthew Dillon 			dmsg_msg_free(msg);
10490c3a8cd0SMatthew Dillon 			ioq->msg = NULL;
1050323c0947SMatthew Dillon 			msg = NULL;
10510c3a8cd0SMatthew Dillon 		}
10520c3a8cd0SMatthew Dillon 
10530c3a8cd0SMatthew Dillon 		/*
10540c3a8cd0SMatthew Dillon 		 * No more I/O read processing
10550c3a8cd0SMatthew Dillon 		 */
10560c3a8cd0SMatthew Dillon 		ioq->state = DMSG_MSGQ_STATE_ERROR;
10570c3a8cd0SMatthew Dillon 
10580c3a8cd0SMatthew Dillon 		/*
10590c3a8cd0SMatthew Dillon 		 * Simulate a remote LNK_ERROR DELETE msg for any open
10600c3a8cd0SMatthew Dillon 		 * transactions, ending with a final non-transactional
10610c3a8cd0SMatthew Dillon 		 * LNK_ERROR (that the session can detect) when no
10620c3a8cd0SMatthew Dillon 		 * transactions remain.
10630d20ec8aSMatthew Dillon 		 *
10641b8eded1SMatthew Dillon 		 * NOTE: Temporarily supply state0 and a degenerate cmd
10651b8eded1SMatthew Dillon 		 *	 without CREATE set.  The real state will be
10661b8eded1SMatthew Dillon 		 *	 assigned in the loop.
10671b8eded1SMatthew Dillon 		 *
10681b8eded1SMatthew Dillon 		 * NOTE: We are simulating a received message using our
10691b8eded1SMatthew Dillon 		 *	 side of the state, so the DMSGF_REV* bits have
10701b8eded1SMatthew Dillon 		 *	 to be reversed.
10710c3a8cd0SMatthew Dillon 		 */
10720c3a8cd0SMatthew Dillon 		pthread_mutex_lock(&iocom->mtx);
10730c3a8cd0SMatthew Dillon 		dmsg_iocom_drain(iocom);
10740a9eefcaSMatthew Dillon 		dmsg_simulate_failure(&iocom->state0, 0, ioq->error);
10750c3a8cd0SMatthew Dillon 		pthread_mutex_unlock(&iocom->mtx);
1076323c0947SMatthew Dillon 		if (TAILQ_FIRST(&ioq->msgq))
1077323c0947SMatthew Dillon 			goto again;
10780c3a8cd0SMatthew Dillon 
1079323c0947SMatthew Dillon #if 0
10800c3a8cd0SMatthew Dillon 		/*
10810c3a8cd0SMatthew Dillon 		 * For the iocom error case we want to set RWORK to indicate
10820c3a8cd0SMatthew Dillon 		 * that more messages might be pending.
10830c3a8cd0SMatthew Dillon 		 *
10840c3a8cd0SMatthew Dillon 		 * It is possible to return NULL when there is more work to
10850c3a8cd0SMatthew Dillon 		 * do because each message has to be DELETEd in both
10860c3a8cd0SMatthew Dillon 		 * directions before we continue on with the next (though
10870c3a8cd0SMatthew Dillon 		 * this could be optimized).  The transmit direction will
10880c3a8cd0SMatthew Dillon 		 * re-set RWORK.
10890c3a8cd0SMatthew Dillon 		 */
10900c3a8cd0SMatthew Dillon 		if (msg)
1091a2179323SMatthew Dillon 			atomic_set_int(&iocom->flags, DMSG_IOCOMF_RWORK);
1092323c0947SMatthew Dillon #endif
10930c3a8cd0SMatthew Dillon 	} else if (msg == NULL) {
10940c3a8cd0SMatthew Dillon 		/*
10950c3a8cd0SMatthew Dillon 		 * Insufficient data received to finish building the message,
10960c3a8cd0SMatthew Dillon 		 * set RREQ and return NULL.
10970c3a8cd0SMatthew Dillon 		 *
10980c3a8cd0SMatthew Dillon 		 * Leave ioq->msg intact.
10990c3a8cd0SMatthew Dillon 		 * Leave the FIFO intact.
11000c3a8cd0SMatthew Dillon 		 */
1101a2179323SMatthew Dillon 		atomic_set_int(&iocom->flags, DMSG_IOCOMF_RREQ);
11020c3a8cd0SMatthew Dillon 	} else {
11030c3a8cd0SMatthew Dillon 		/*
11040d20ec8aSMatthew Dillon 		 * Continue processing msg.
11050c3a8cd0SMatthew Dillon 		 *
11060c3a8cd0SMatthew Dillon 		 * The fifo has already been advanced past the message.
11070c3a8cd0SMatthew Dillon 		 * Trivially reset the FIFO indices if possible.
11080c3a8cd0SMatthew Dillon 		 *
11090c3a8cd0SMatthew Dillon 		 * clear the FIFO if it is now empty and set RREQ to wait
11100c3a8cd0SMatthew Dillon 		 * for more from the socket.  If the FIFO is not empty set
11110c3a8cd0SMatthew Dillon 		 * TWORK to bypass the poll so we loop immediately.
11120c3a8cd0SMatthew Dillon 		 */
11130c3a8cd0SMatthew Dillon 		if (ioq->fifo_beg == ioq->fifo_cdx &&
11140c3a8cd0SMatthew Dillon 		    ioq->fifo_cdn == ioq->fifo_end) {
1115a2179323SMatthew Dillon 			atomic_set_int(&iocom->flags, DMSG_IOCOMF_RREQ);
11160c3a8cd0SMatthew Dillon 			ioq->fifo_cdx = 0;
11170c3a8cd0SMatthew Dillon 			ioq->fifo_cdn = 0;
11180c3a8cd0SMatthew Dillon 			ioq->fifo_beg = 0;
11190c3a8cd0SMatthew Dillon 			ioq->fifo_end = 0;
11200c3a8cd0SMatthew Dillon 		} else {
1121a2179323SMatthew Dillon 			atomic_set_int(&iocom->flags, DMSG_IOCOMF_RWORK);
11220c3a8cd0SMatthew Dillon 		}
11230c3a8cd0SMatthew Dillon 		ioq->state = DMSG_MSGQ_STATE_HEADER1;
11240c3a8cd0SMatthew Dillon 		ioq->msg = NULL;
11250d20ec8aSMatthew Dillon 
11260d20ec8aSMatthew Dillon 		/*
11270d20ec8aSMatthew Dillon 		 * Handle message routing.  Validates non-zero sources
11280d20ec8aSMatthew Dillon 		 * and routes message.  Error will be 0 if the message is
11290d20ec8aSMatthew Dillon 		 * destined for us.
11300d20ec8aSMatthew Dillon 		 *
11310d20ec8aSMatthew Dillon 		 * State processing only occurs for messages destined for us.
11320d20ec8aSMatthew Dillon 		 */
11335ab1caedSMatthew Dillon 		dmio_printf(iocom, 5,
11340a9eefcaSMatthew Dillon 			    "rxmsg cmd=%08x circ=%016jx\n",
1135a2179323SMatthew Dillon 			    msg->any.head.cmd,
1136a2179323SMatthew Dillon 			    (intmax_t)msg->any.head.circuit);
11377adbba57SMatthew Dillon 
11380a9eefcaSMatthew Dillon 		error = dmsg_state_msgrx(msg, 0);
11390d20ec8aSMatthew Dillon 
11400d20ec8aSMatthew Dillon 		if (error) {
11410d20ec8aSMatthew Dillon 			/*
11420d20ec8aSMatthew Dillon 			 * Abort-after-closure, throw message away and
11430d20ec8aSMatthew Dillon 			 * start reading another.
11440d20ec8aSMatthew Dillon 			 */
11450d20ec8aSMatthew Dillon 			if (error == DMSG_IOQ_ERROR_EALREADY) {
11460d20ec8aSMatthew Dillon 				dmsg_msg_free(msg);
11470d20ec8aSMatthew Dillon 				goto again;
11480d20ec8aSMatthew Dillon 			}
11490d20ec8aSMatthew Dillon 
11500d20ec8aSMatthew Dillon 			/*
11510d20ec8aSMatthew Dillon 			 * Process real error and throw away message.
11520d20ec8aSMatthew Dillon 			 */
11530d20ec8aSMatthew Dillon 			ioq->error = error;
11540d20ec8aSMatthew Dillon 			goto skip;
11550d20ec8aSMatthew Dillon 		}
1156a06d536bSMatthew Dillon 
1157a06d536bSMatthew Dillon 		/*
1158a06d536bSMatthew Dillon 		 * No error and not routed
1159a06d536bSMatthew Dillon 		 */
11600d20ec8aSMatthew Dillon 		/* no error, not routed.  Fall through and return msg */
11610c3a8cd0SMatthew Dillon 	}
11620c3a8cd0SMatthew Dillon 	return (msg);
11630c3a8cd0SMatthew Dillon }
11640c3a8cd0SMatthew Dillon 
11650c3a8cd0SMatthew Dillon /*
11660c3a8cd0SMatthew Dillon  * Calculate the header and data crc's and write a low-level message to
11670c3a8cd0SMatthew Dillon  * the connection.  If aux_crc is non-zero the aux_data crc is already
11680c3a8cd0SMatthew Dillon  * assumed to have been set.
11690c3a8cd0SMatthew Dillon  *
11700c3a8cd0SMatthew Dillon  * A non-NULL msg is added to the queue but not necessarily flushed.
11710c3a8cd0SMatthew Dillon  * Calling this function with msg == NULL will get a flush going.
11720c3a8cd0SMatthew Dillon  *
1173a2179323SMatthew Dillon  * (called from iocom_core only)
11740c3a8cd0SMatthew Dillon  */
11750c3a8cd0SMatthew Dillon void
11760c3a8cd0SMatthew Dillon dmsg_iocom_flush1(dmsg_iocom_t *iocom)
11770c3a8cd0SMatthew Dillon {
11780c3a8cd0SMatthew Dillon 	dmsg_ioq_t *ioq = &iocom->ioq_tx;
11790c3a8cd0SMatthew Dillon 	dmsg_msg_t *msg;
11800c3a8cd0SMatthew Dillon 	uint32_t xcrc32;
1181f306de83SMatthew Dillon 	size_t hbytes;
1182f306de83SMatthew Dillon 	size_t abytes;
11830c3a8cd0SMatthew Dillon 	dmsg_msg_queue_t tmpq;
11840c3a8cd0SMatthew Dillon 
1185a2179323SMatthew Dillon 	atomic_clear_int(&iocom->flags, DMSG_IOCOMF_WREQ | DMSG_IOCOMF_WWORK);
11860c3a8cd0SMatthew Dillon 	TAILQ_INIT(&tmpq);
11870c3a8cd0SMatthew Dillon 	pthread_mutex_lock(&iocom->mtx);
11880d20ec8aSMatthew Dillon 	while ((msg = TAILQ_FIRST(&iocom->txmsgq)) != NULL) {
11890d20ec8aSMatthew Dillon 		TAILQ_REMOVE(&iocom->txmsgq, msg, qentry);
11900c3a8cd0SMatthew Dillon 		TAILQ_INSERT_TAIL(&tmpq, msg, qentry);
11910c3a8cd0SMatthew Dillon 	}
11920c3a8cd0SMatthew Dillon 	pthread_mutex_unlock(&iocom->mtx);
11930c3a8cd0SMatthew Dillon 
11940a9eefcaSMatthew Dillon 	/*
11950a9eefcaSMatthew Dillon 	 * Flush queue, doing all required encryption and CRC generation,
11960a9eefcaSMatthew Dillon 	 * with the mutex unlocked.
11970a9eefcaSMatthew Dillon 	 */
11980c3a8cd0SMatthew Dillon 	while ((msg = TAILQ_FIRST(&tmpq)) != NULL) {
11990c3a8cd0SMatthew Dillon 		/*
12000c3a8cd0SMatthew Dillon 		 * Process terminal connection errors.
12010c3a8cd0SMatthew Dillon 		 */
12020c3a8cd0SMatthew Dillon 		TAILQ_REMOVE(&tmpq, msg, qentry);
12030c3a8cd0SMatthew Dillon 		if (ioq->error) {
12040c3a8cd0SMatthew Dillon 			TAILQ_INSERT_TAIL(&ioq->msgq, msg, qentry);
12050c3a8cd0SMatthew Dillon 			++ioq->msgcount;
12060c3a8cd0SMatthew Dillon 			continue;
12070c3a8cd0SMatthew Dillon 		}
12080c3a8cd0SMatthew Dillon 
12090c3a8cd0SMatthew Dillon 		/*
12100c3a8cd0SMatthew Dillon 		 * Finish populating the msg fields.  The salt ensures that
12110c3a8cd0SMatthew Dillon 		 * the iv[] array is ridiculously randomized and we also
12120c3a8cd0SMatthew Dillon 		 * re-seed our PRNG every 32768 messages just to be sure.
12130c3a8cd0SMatthew Dillon 		 */
12140c3a8cd0SMatthew Dillon 		msg->any.head.magic = DMSG_HDR_MAGIC;
12150c3a8cd0SMatthew Dillon 		msg->any.head.salt = (random() << 8) | (ioq->seq & 255);
12160c3a8cd0SMatthew Dillon 		++ioq->seq;
12170a9eefcaSMatthew Dillon 		if ((ioq->seq & 32767) == 0) {
12180a9eefcaSMatthew Dillon 			pthread_mutex_lock(&iocom->mtx);
12190c3a8cd0SMatthew Dillon 			srandomdev();
12200a9eefcaSMatthew Dillon 			pthread_mutex_unlock(&iocom->mtx);
12210a9eefcaSMatthew Dillon 		}
12220c3a8cd0SMatthew Dillon 
12230c3a8cd0SMatthew Dillon 		/*
12240c3a8cd0SMatthew Dillon 		 * Calculate aux_crc if 0, then calculate hdr_crc.
12250c3a8cd0SMatthew Dillon 		 */
12260c3a8cd0SMatthew Dillon 		if (msg->aux_size && msg->any.head.aux_crc == 0) {
1227f306de83SMatthew Dillon 			abytes = DMSG_DOALIGN(msg->aux_size);
1228f306de83SMatthew Dillon 			xcrc32 = dmsg_icrc32(msg->aux_data, abytes);
12290c3a8cd0SMatthew Dillon 			msg->any.head.aux_crc = xcrc32;
12300c3a8cd0SMatthew Dillon 		}
1231f306de83SMatthew Dillon 		msg->any.head.aux_bytes = msg->aux_size;
12320c3a8cd0SMatthew Dillon 
12330c3a8cd0SMatthew Dillon 		hbytes = (msg->any.head.cmd & DMSGF_SIZE) *
12340c3a8cd0SMatthew Dillon 			 DMSG_ALIGN;
12350c3a8cd0SMatthew Dillon 		msg->any.head.hdr_crc = 0;
12360c3a8cd0SMatthew Dillon 		msg->any.head.hdr_crc = dmsg_icrc32(&msg->any.head, hbytes);
12370c3a8cd0SMatthew Dillon 
12380c3a8cd0SMatthew Dillon 		/*
12390c3a8cd0SMatthew Dillon 		 * Enqueue the message (the flush codes handles stream
12400c3a8cd0SMatthew Dillon 		 * encryption).
12410c3a8cd0SMatthew Dillon 		 */
12420c3a8cd0SMatthew Dillon 		TAILQ_INSERT_TAIL(&ioq->msgq, msg, qentry);
12430c3a8cd0SMatthew Dillon 		++ioq->msgcount;
12440c3a8cd0SMatthew Dillon 	}
12450c3a8cd0SMatthew Dillon 	dmsg_iocom_flush2(iocom);
12460c3a8cd0SMatthew Dillon }
12470c3a8cd0SMatthew Dillon 
12480c3a8cd0SMatthew Dillon /*
12490c3a8cd0SMatthew Dillon  * Thread localized, iocom->mtx not held by caller.
1250a2179323SMatthew Dillon  *
1251a2179323SMatthew Dillon  * (called from iocom_core via iocom_flush1 only)
12520c3a8cd0SMatthew Dillon  */
12530c3a8cd0SMatthew Dillon void
12540c3a8cd0SMatthew Dillon dmsg_iocom_flush2(dmsg_iocom_t *iocom)
12550c3a8cd0SMatthew Dillon {
12560c3a8cd0SMatthew Dillon 	dmsg_ioq_t *ioq = &iocom->ioq_tx;
12570c3a8cd0SMatthew Dillon 	dmsg_msg_t *msg;
12580c3a8cd0SMatthew Dillon 	ssize_t n;
12590c3a8cd0SMatthew Dillon 	struct iovec iov[DMSG_IOQ_MAXIOVEC];
12600c3a8cd0SMatthew Dillon 	size_t nact;
12610c3a8cd0SMatthew Dillon 	size_t hbytes;
12620c3a8cd0SMatthew Dillon 	size_t abytes;
12630c3a8cd0SMatthew Dillon 	size_t hoff;
12640c3a8cd0SMatthew Dillon 	size_t aoff;
12650c3a8cd0SMatthew Dillon 	int iovcnt;
12667adbba57SMatthew Dillon 	int save_errno;
12670c3a8cd0SMatthew Dillon 
12680c3a8cd0SMatthew Dillon 	if (ioq->error) {
12690c3a8cd0SMatthew Dillon 		dmsg_iocom_drain(iocom);
12700c3a8cd0SMatthew Dillon 		return;
12710c3a8cd0SMatthew Dillon 	}
12720c3a8cd0SMatthew Dillon 
12730c3a8cd0SMatthew Dillon 	/*
12740c3a8cd0SMatthew Dillon 	 * Pump messages out the connection by building an iovec.
12750c3a8cd0SMatthew Dillon 	 *
12760c3a8cd0SMatthew Dillon 	 * ioq->hbytes/ioq->abytes tracks how much of the first message
12770c3a8cd0SMatthew Dillon 	 * in the queue has been successfully written out, so we can
12780c3a8cd0SMatthew Dillon 	 * resume writing.
12790c3a8cd0SMatthew Dillon 	 */
12800c3a8cd0SMatthew Dillon 	iovcnt = 0;
12810c3a8cd0SMatthew Dillon 	nact = 0;
12820c3a8cd0SMatthew Dillon 	hoff = ioq->hbytes;
12830c3a8cd0SMatthew Dillon 	aoff = ioq->abytes;
12840c3a8cd0SMatthew Dillon 
12850c3a8cd0SMatthew Dillon 	TAILQ_FOREACH(msg, &ioq->msgq, qentry) {
12860c3a8cd0SMatthew Dillon 		hbytes = (msg->any.head.cmd & DMSGF_SIZE) *
12870c3a8cd0SMatthew Dillon 			 DMSG_ALIGN;
12888d6d37b8SMatthew Dillon 		abytes = DMSG_DOALIGN(msg->aux_size);
12890c3a8cd0SMatthew Dillon 		assert(hoff <= hbytes && aoff <= abytes);
12900c3a8cd0SMatthew Dillon 
12910c3a8cd0SMatthew Dillon 		if (hoff < hbytes) {
1292024de405SMatthew Dillon 			size_t maxlen = hbytes - hoff;
1293024de405SMatthew Dillon 			if (maxlen > sizeof(ioq->buf) / 2)
1294024de405SMatthew Dillon 				maxlen = sizeof(ioq->buf) / 2;
12950c3a8cd0SMatthew Dillon 			iov[iovcnt].iov_base = (char *)&msg->any.head + hoff;
1296024de405SMatthew Dillon 			iov[iovcnt].iov_len = maxlen;
1297024de405SMatthew Dillon 			nact += maxlen;
12980c3a8cd0SMatthew Dillon 			++iovcnt;
1299024de405SMatthew Dillon 			if (iovcnt == DMSG_IOQ_MAXIOVEC ||
1300024de405SMatthew Dillon 			    maxlen != hbytes - hoff) {
13010c3a8cd0SMatthew Dillon 				break;
13020c3a8cd0SMatthew Dillon 			}
1303024de405SMatthew Dillon 		}
13040c3a8cd0SMatthew Dillon 		if (aoff < abytes) {
1305024de405SMatthew Dillon 			size_t maxlen = abytes - aoff;
1306024de405SMatthew Dillon 			if (maxlen > sizeof(ioq->buf) / 2)
1307024de405SMatthew Dillon 				maxlen = sizeof(ioq->buf) / 2;
1308024de405SMatthew Dillon 
13090c3a8cd0SMatthew Dillon 			assert(msg->aux_data != NULL);
13100c3a8cd0SMatthew Dillon 			iov[iovcnt].iov_base = (char *)msg->aux_data + aoff;
1311024de405SMatthew Dillon 			iov[iovcnt].iov_len = maxlen;
1312024de405SMatthew Dillon 			nact += maxlen;
13130c3a8cd0SMatthew Dillon 			++iovcnt;
1314024de405SMatthew Dillon 			if (iovcnt == DMSG_IOQ_MAXIOVEC ||
1315024de405SMatthew Dillon 			    maxlen != abytes - aoff) {
13160c3a8cd0SMatthew Dillon 				break;
13170c3a8cd0SMatthew Dillon 			}
1318024de405SMatthew Dillon 		}
13190c3a8cd0SMatthew Dillon 		hoff = 0;
13200c3a8cd0SMatthew Dillon 		aoff = 0;
13210c3a8cd0SMatthew Dillon 	}
13227adbba57SMatthew Dillon 
13237adbba57SMatthew Dillon 	/*
13247adbba57SMatthew Dillon 	 * Shortcut if no work to do.  Be sure to check for old work still
13257adbba57SMatthew Dillon 	 * pending in the FIFO.
13267adbba57SMatthew Dillon 	 */
13277adbba57SMatthew Dillon 	if (iovcnt == 0 && ioq->fifo_beg == ioq->fifo_cdx)
13280c3a8cd0SMatthew Dillon 		return;
13290c3a8cd0SMatthew Dillon 
13300c3a8cd0SMatthew Dillon 	/*
13310c3a8cd0SMatthew Dillon 	 * Encrypt and write the data.  The crypto code will move the
13320c3a8cd0SMatthew Dillon 	 * data into the fifo and adjust the iov as necessary.  If
13330c3a8cd0SMatthew Dillon 	 * encryption is disabled the iov is left alone.
13340c3a8cd0SMatthew Dillon 	 *
13350c3a8cd0SMatthew Dillon 	 * May return a smaller iov (thus a smaller n), with aggregated
13360c3a8cd0SMatthew Dillon 	 * chunks.  May reduce nmax to what fits in the FIFO.
13370c3a8cd0SMatthew Dillon 	 *
13380c3a8cd0SMatthew Dillon 	 * This function sets nact to the number of original bytes now
13390c3a8cd0SMatthew Dillon 	 * encrypted, adding to the FIFO some number of bytes that might
13400c3a8cd0SMatthew Dillon 	 * be greater depending on the crypto mechanic.  iov[] is adjusted
13410c3a8cd0SMatthew Dillon 	 * to point at the FIFO if necessary.
13420c3a8cd0SMatthew Dillon 	 *
1343f2239a4eSMatthew Dillon 	 * NOTE: nact is the number of bytes eaten from the message.  For
1344f2239a4eSMatthew Dillon 	 *	 encrypted data this is the number of bytes processed for
1345f2239a4eSMatthew Dillon 	 *	 encryption and not necessarily the number of bytes writable.
1346f2239a4eSMatthew Dillon 	 *	 The return value from the writev() is the post-encrypted
1347f2239a4eSMatthew Dillon 	 *	 byte count which might be larger.
1348f2239a4eSMatthew Dillon 	 *
1349f2239a4eSMatthew Dillon 	 * NOTE: For direct writes, nact is the return value from the writev().
13500c3a8cd0SMatthew Dillon 	 */
13510c3a8cd0SMatthew Dillon 	if (iocom->flags & DMSG_IOCOMF_CRYPTED) {
13520c3a8cd0SMatthew Dillon 		/*
13530c3a8cd0SMatthew Dillon 		 * Make sure the FIFO has a reasonable amount of space
13540c3a8cd0SMatthew Dillon 		 * left (if not completely full).
1355a2179323SMatthew Dillon 		 *
1356a2179323SMatthew Dillon 		 * In this situation we are staging the encrypted message
1357a2179323SMatthew Dillon 		 * data in the FIFO.  (nact) represents how much plaintext
1358a2179323SMatthew Dillon 		 * has been staged, (n) represents how much encrypted data
1359a2179323SMatthew Dillon 		 * has been flushed.  The two are independent of each other.
13600c3a8cd0SMatthew Dillon 		 */
13610c3a8cd0SMatthew Dillon 		if (ioq->fifo_beg > sizeof(ioq->buf) / 2 &&
1362a2179323SMatthew Dillon 		    sizeof(ioq->buf) - ioq->fifo_end < DMSG_ALIGN * 2) {
13630c3a8cd0SMatthew Dillon 			bcopy(ioq->buf + ioq->fifo_beg, ioq->buf,
13640c3a8cd0SMatthew Dillon 			      ioq->fifo_end - ioq->fifo_beg);
13650c3a8cd0SMatthew Dillon 			ioq->fifo_cdx -= ioq->fifo_beg;
13660c3a8cd0SMatthew Dillon 			ioq->fifo_cdn -= ioq->fifo_beg;
13670c3a8cd0SMatthew Dillon 			ioq->fifo_end -= ioq->fifo_beg;
13680c3a8cd0SMatthew Dillon 			ioq->fifo_beg = 0;
13690c3a8cd0SMatthew Dillon 		}
13700c3a8cd0SMatthew Dillon 
1371f2239a4eSMatthew Dillon 		/*
1372f2239a4eSMatthew Dillon 		 * beg .... cdx ............ cdn ............. end
1373f2239a4eSMatthew Dillon 		 * [WRITABLE] [PARTIALENCRYPT] [NOTYETENCRYPTED]
1374f2239a4eSMatthew Dillon 		 *
1375e96cef49SMatthew Dillon 		 * Advance fifo_beg on a successful write.
1376f2239a4eSMatthew Dillon 		 */
13770c3a8cd0SMatthew Dillon 		iovcnt = dmsg_crypto_encrypt(iocom, ioq, iov, iovcnt, &nact);
13780c3a8cd0SMatthew Dillon 		n = writev(iocom->sock_fd, iov, iovcnt);
13797adbba57SMatthew Dillon 		save_errno = errno;
13800c3a8cd0SMatthew Dillon 		if (n > 0) {
13810c3a8cd0SMatthew Dillon 			ioq->fifo_beg += n;
13820c3a8cd0SMatthew Dillon 			if (ioq->fifo_beg == ioq->fifo_end) {
13830c3a8cd0SMatthew Dillon 				ioq->fifo_beg = 0;
13840c3a8cd0SMatthew Dillon 				ioq->fifo_cdn = 0;
13850c3a8cd0SMatthew Dillon 				ioq->fifo_cdx = 0;
13860c3a8cd0SMatthew Dillon 				ioq->fifo_end = 0;
13870c3a8cd0SMatthew Dillon 			}
13880c3a8cd0SMatthew Dillon 		}
13897adbba57SMatthew Dillon 
1390a2179323SMatthew Dillon 		/*
1391a2179323SMatthew Dillon 		 * We don't mess with the nact returned by the crypto_encrypt
1392a2179323SMatthew Dillon 		 * call, which represents the filling of the FIFO.  (n) tells
1393a2179323SMatthew Dillon 		 * us how much we were able to write from the FIFO.  The two
1394a2179323SMatthew Dillon 		 * are different beasts when encrypting.
1395a2179323SMatthew Dillon 		 */
13960c3a8cd0SMatthew Dillon 	} else {
1397a2179323SMatthew Dillon 		/*
1398a2179323SMatthew Dillon 		 * In this situation we are not staging the messages to the
1399a2179323SMatthew Dillon 		 * FIFO but instead writing them directly from the msg
1400f2239a4eSMatthew Dillon 		 * structure(s) unencrypted, so (nact) is basically (n).
1401a2179323SMatthew Dillon 		 */
14020c3a8cd0SMatthew Dillon 		n = writev(iocom->sock_fd, iov, iovcnt);
14037adbba57SMatthew Dillon 		save_errno = errno;
14040c3a8cd0SMatthew Dillon 		if (n > 0)
14050c3a8cd0SMatthew Dillon 			nact = n;
14060c3a8cd0SMatthew Dillon 		else
14070c3a8cd0SMatthew Dillon 			nact = 0;
14080c3a8cd0SMatthew Dillon 	}
14090c3a8cd0SMatthew Dillon 
14100c3a8cd0SMatthew Dillon 	/*
14110c3a8cd0SMatthew Dillon 	 * Clean out the transmit queue based on what we successfully
14127adbba57SMatthew Dillon 	 * encrypted (nact is the plaintext count) and is now in the FIFO.
14137adbba57SMatthew Dillon 	 * ioq->hbytes/abytes represents the portion of the first message
14147adbba57SMatthew Dillon 	 * previously sent.
14150c3a8cd0SMatthew Dillon 	 */
14160c3a8cd0SMatthew Dillon 	while ((msg = TAILQ_FIRST(&ioq->msgq)) != NULL) {
14170c3a8cd0SMatthew Dillon 		hbytes = (msg->any.head.cmd & DMSGF_SIZE) *
14180c3a8cd0SMatthew Dillon 			 DMSG_ALIGN;
14198d6d37b8SMatthew Dillon 		abytes = DMSG_DOALIGN(msg->aux_size);
14200c3a8cd0SMatthew Dillon 
14210c3a8cd0SMatthew Dillon 		if ((size_t)nact < hbytes - ioq->hbytes) {
14220c3a8cd0SMatthew Dillon 			ioq->hbytes += nact;
14230c3a8cd0SMatthew Dillon 			nact = 0;
14240c3a8cd0SMatthew Dillon 			break;
14250c3a8cd0SMatthew Dillon 		}
14260c3a8cd0SMatthew Dillon 		nact -= hbytes - ioq->hbytes;
14270c3a8cd0SMatthew Dillon 		ioq->hbytes = hbytes;
14280c3a8cd0SMatthew Dillon 		if ((size_t)nact < abytes - ioq->abytes) {
14290c3a8cd0SMatthew Dillon 			ioq->abytes += nact;
14300c3a8cd0SMatthew Dillon 			nact = 0;
14310c3a8cd0SMatthew Dillon 			break;
14320c3a8cd0SMatthew Dillon 		}
14330c3a8cd0SMatthew Dillon 		nact -= abytes - ioq->abytes;
1434a2179323SMatthew Dillon 		/* ioq->abytes = abytes; optimized out */
1435a2179323SMatthew Dillon 
14365ab1caedSMatthew Dillon 		dmio_printf(iocom, 5,
14370a9eefcaSMatthew Dillon 			    "txmsg cmd=%08x circ=%016jx\n",
1438a2179323SMatthew Dillon 			    msg->any.head.cmd,
1439a2179323SMatthew Dillon 			    (intmax_t)msg->any.head.circuit);
14400c3a8cd0SMatthew Dillon 
14417adbba57SMatthew Dillon #ifdef DMSG_BLOCK_DEBUG
14427adbba57SMatthew Dillon 		uint32_t tcmd;
14437adbba57SMatthew Dillon 
14447adbba57SMatthew Dillon 		if (msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE)) {
14457adbba57SMatthew Dillon 			if ((msg->state->flags & DMSG_STATE_ROOT) == 0) {
14467adbba57SMatthew Dillon 				tcmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
14477adbba57SMatthew Dillon 					    (msg->any.head.cmd & (DMSGF_CREATE |
14487adbba57SMatthew Dillon 								  DMSGF_DELETE |
14497adbba57SMatthew Dillon 								  DMSGF_REPLY));
14507adbba57SMatthew Dillon 			} else {
14517adbba57SMatthew Dillon 				tcmd = 0;
14527adbba57SMatthew Dillon 			}
14537adbba57SMatthew Dillon 		} else {
14547adbba57SMatthew Dillon 			tcmd = msg->any.head.cmd & DMSGF_CMDSWMASK;
14557adbba57SMatthew Dillon 		}
14567adbba57SMatthew Dillon 
14577adbba57SMatthew Dillon 		switch (tcmd) {
14587adbba57SMatthew Dillon 		case DMSG_BLK_READ | DMSGF_CREATE | DMSGF_DELETE:
14597adbba57SMatthew Dillon 		case DMSG_BLK_WRITE | DMSGF_CREATE | DMSGF_DELETE:
14605ab1caedSMatthew Dillon 			dmio_printf(iocom, 4,
14615ab1caedSMatthew Dillon 				    "write BIO %-3d %016jx %d@%016jx\n",
14627adbba57SMatthew Dillon 				    biocount, msg->any.head.msgid,
14637adbba57SMatthew Dillon 				    msg->any.blk_read.bytes,
14647adbba57SMatthew Dillon 				    msg->any.blk_read.offset);
14657adbba57SMatthew Dillon 			break;
14667adbba57SMatthew Dillon 		case DMSG_BLK_READ | DMSGF_CREATE | DMSGF_DELETE | DMSGF_REPLY:
14677adbba57SMatthew Dillon 		case DMSG_BLK_WRITE | DMSGF_CREATE | DMSGF_DELETE | DMSGF_REPLY:
14685ab1caedSMatthew Dillon 			dmio_printf(iocom, 4,
14695ab1caedSMatthew Dillon 				    "wretr BIO %-3d %016jx %d@%016jx\n",
14707adbba57SMatthew Dillon 				    biocount, msg->any.head.msgid,
14717adbba57SMatthew Dillon 				    msg->any.blk_read.bytes,
14727adbba57SMatthew Dillon 				    msg->any.blk_read.offset);
14737adbba57SMatthew Dillon 			break;
14747adbba57SMatthew Dillon 		default:
14757adbba57SMatthew Dillon 			break;
14767adbba57SMatthew Dillon 		}
14777adbba57SMatthew Dillon #endif
14787adbba57SMatthew Dillon 
14790c3a8cd0SMatthew Dillon 		TAILQ_REMOVE(&ioq->msgq, msg, qentry);
14800c3a8cd0SMatthew Dillon 		--ioq->msgcount;
14810c3a8cd0SMatthew Dillon 		ioq->hbytes = 0;
14820c3a8cd0SMatthew Dillon 		ioq->abytes = 0;
1483323c0947SMatthew Dillon 		dmsg_msg_free(msg);
14840c3a8cd0SMatthew Dillon 	}
14850c3a8cd0SMatthew Dillon 	assert(nact == 0);
14860c3a8cd0SMatthew Dillon 
14870c3a8cd0SMatthew Dillon 	/*
14880c3a8cd0SMatthew Dillon 	 * Process the return value from the write w/regards to blocking.
14890c3a8cd0SMatthew Dillon 	 */
14900c3a8cd0SMatthew Dillon 	if (n < 0) {
14917adbba57SMatthew Dillon 		if (save_errno != EINTR &&
14927adbba57SMatthew Dillon 		    save_errno != EINPROGRESS &&
14937adbba57SMatthew Dillon 		    save_errno != EAGAIN) {
14940c3a8cd0SMatthew Dillon 			/*
14950c3a8cd0SMatthew Dillon 			 * Fatal write error
14960c3a8cd0SMatthew Dillon 			 */
14970c3a8cd0SMatthew Dillon 			ioq->error = DMSG_IOQ_ERROR_SOCK;
14980c3a8cd0SMatthew Dillon 			dmsg_iocom_drain(iocom);
14990c3a8cd0SMatthew Dillon 		} else {
15000c3a8cd0SMatthew Dillon 			/*
15017adbba57SMatthew Dillon 			 * Wait for socket buffer space, do not try to
15027adbba57SMatthew Dillon 			 * process more packets for transmit until space
15037adbba57SMatthew Dillon 			 * is available.
15040c3a8cd0SMatthew Dillon 			 */
1505a2179323SMatthew Dillon 			atomic_set_int(&iocom->flags, DMSG_IOCOMF_WREQ);
15060c3a8cd0SMatthew Dillon 		}
15077adbba57SMatthew Dillon 	} else if (TAILQ_FIRST(&ioq->msgq) ||
15087adbba57SMatthew Dillon 		   TAILQ_FIRST(&iocom->txmsgq) ||
15097adbba57SMatthew Dillon 		   ioq->fifo_beg != ioq->fifo_cdx) {
15107adbba57SMatthew Dillon 		/*
15117adbba57SMatthew Dillon 		 * If the write succeeded and more messages are pending
15127adbba57SMatthew Dillon 		 * in either msgq, or the FIFO WWORK must remain set.
15137adbba57SMatthew Dillon 		 */
15147adbba57SMatthew Dillon 		atomic_set_int(&iocom->flags, DMSG_IOCOMF_WWORK);
15150c3a8cd0SMatthew Dillon 	}
15167adbba57SMatthew Dillon 	/* else no transmit-side work remains */
15177adbba57SMatthew Dillon 
15180c3a8cd0SMatthew Dillon 	if (ioq->error) {
15190c3a8cd0SMatthew Dillon 		dmsg_iocom_drain(iocom);
15200c3a8cd0SMatthew Dillon 	}
15210c3a8cd0SMatthew Dillon }
15220c3a8cd0SMatthew Dillon 
15230c3a8cd0SMatthew Dillon /*
15240c3a8cd0SMatthew Dillon  * Kill pending msgs on ioq_tx and adjust the flags such that no more
15250c3a8cd0SMatthew Dillon  * write events will occur.  We don't kill read msgs because we want
15260c3a8cd0SMatthew Dillon  * the caller to pull off our contrived terminal error msg to detect
15270c3a8cd0SMatthew Dillon  * the connection failure.
15280c3a8cd0SMatthew Dillon  *
1529a2179323SMatthew Dillon  * Localized to iocom_core thread, iocom->mtx not held by caller.
15300c3a8cd0SMatthew Dillon  */
15310c3a8cd0SMatthew Dillon void
15320c3a8cd0SMatthew Dillon dmsg_iocom_drain(dmsg_iocom_t *iocom)
15330c3a8cd0SMatthew Dillon {
15340c3a8cd0SMatthew Dillon 	dmsg_ioq_t *ioq = &iocom->ioq_tx;
15350c3a8cd0SMatthew Dillon 	dmsg_msg_t *msg;
15360c3a8cd0SMatthew Dillon 
1537a2179323SMatthew Dillon 	atomic_clear_int(&iocom->flags, DMSG_IOCOMF_WREQ | DMSG_IOCOMF_WWORK);
15380c3a8cd0SMatthew Dillon 	ioq->hbytes = 0;
15390c3a8cd0SMatthew Dillon 	ioq->abytes = 0;
15400c3a8cd0SMatthew Dillon 
15410c3a8cd0SMatthew Dillon 	while ((msg = TAILQ_FIRST(&ioq->msgq)) != NULL) {
15420c3a8cd0SMatthew Dillon 		TAILQ_REMOVE(&ioq->msgq, msg, qentry);
15430c3a8cd0SMatthew Dillon 		--ioq->msgcount;
1544323c0947SMatthew Dillon 		dmsg_msg_free(msg);
15450c3a8cd0SMatthew Dillon 	}
15460c3a8cd0SMatthew Dillon }
15470c3a8cd0SMatthew Dillon 
15480c3a8cd0SMatthew Dillon /*
15490c3a8cd0SMatthew Dillon  * Write a message to an iocom, with additional state processing.
15500c3a8cd0SMatthew Dillon  */
15510c3a8cd0SMatthew Dillon void
15520c3a8cd0SMatthew Dillon dmsg_msg_write(dmsg_msg_t *msg)
15530c3a8cd0SMatthew Dillon {
15541b8eded1SMatthew Dillon 	dmsg_iocom_t *iocom = msg->state->iocom;
15550c3a8cd0SMatthew Dillon 	dmsg_state_t *state;
15560c3a8cd0SMatthew Dillon 	char dummy;
15570c3a8cd0SMatthew Dillon 
15580c3a8cd0SMatthew Dillon 	pthread_mutex_lock(&iocom->mtx);
15591b8eded1SMatthew Dillon 	state = msg->state;
1560d30cab67SMatthew Dillon 
15615ab1caedSMatthew Dillon 	dmio_printf(iocom, 5,
15620a9eefcaSMatthew Dillon 		    "msgtx: cmd=%08x msgid=%016jx "
15630a9eefcaSMatthew Dillon 		    "state %p(%08x) error=%d\n",
15640a9eefcaSMatthew Dillon 		    msg->any.head.cmd, msg->any.head.msgid,
15650a9eefcaSMatthew Dillon 		    state, (state ? state->icmd : 0),
15660a9eefcaSMatthew Dillon 		    msg->any.head.error);
15670a9eefcaSMatthew Dillon 
15680a9eefcaSMatthew Dillon 
1569a06d536bSMatthew Dillon #if 0
1570323c0947SMatthew Dillon 	/*
1571323c0947SMatthew Dillon 	 * Make sure the parent transaction is still open in the transmit
1572323c0947SMatthew Dillon 	 * direction.  If it isn't the message is dead and we have to
1573323c0947SMatthew Dillon 	 * potentially simulate a rxmsg terminating the transaction.
1574323c0947SMatthew Dillon 	 */
1575a06d536bSMatthew Dillon 	if ((state->parent->txcmd & DMSGF_DELETE) ||
1576a06d536bSMatthew Dillon 	    (state->parent->rxcmd & DMSGF_DELETE)) {
15775ab1caedSMatthew Dillon 		dmio_printf(iocom, 4, "dmsg_msg_write: EARLY TERMINATION\n");
1578a06d536bSMatthew Dillon 		dmsg_simulate_failure(state, DMSG_ERR_LOSTLINK);
1579323c0947SMatthew Dillon 		dmsg_state_cleanuptx(iocom, msg);
1580323c0947SMatthew Dillon 		dmsg_msg_free(msg);
1581323c0947SMatthew Dillon 		pthread_mutex_unlock(&iocom->mtx);
1582323c0947SMatthew Dillon 		return;
1583323c0947SMatthew Dillon 	}
1584a06d536bSMatthew Dillon #endif
1585323c0947SMatthew Dillon 	/*
1586323c0947SMatthew Dillon 	 * Process state data into the message as needed, then update the
1587323c0947SMatthew Dillon 	 * state based on the message.
1588323c0947SMatthew Dillon 	 */
1589d30cab67SMatthew Dillon 	if ((state->flags & DMSG_STATE_ROOT) == 0) {
15900c3a8cd0SMatthew Dillon 		/*
15910c3a8cd0SMatthew Dillon 		 * Existing transaction (could be reply).  It is also
15920c3a8cd0SMatthew Dillon 		 * possible for this to be the first reply (CREATE is set),
15930c3a8cd0SMatthew Dillon 		 * in which case we populate state->txcmd.
15940c3a8cd0SMatthew Dillon 		 *
15950c3a8cd0SMatthew Dillon 		 * state->txcmd is adjusted to hold the final message cmd,
15960c3a8cd0SMatthew Dillon 		 * and we also be sure to set the CREATE bit here.  We did
15970c3a8cd0SMatthew Dillon 		 * not set it in dmsg_msg_alloc() because that would have
15980c3a8cd0SMatthew Dillon 		 * not been serialized (state could have gotten ripped out
15990c3a8cd0SMatthew Dillon 		 * from under the message prior to it being transmitted).
16000c3a8cd0SMatthew Dillon 		 */
16010c3a8cd0SMatthew Dillon 		if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_REPLY)) ==
16020c3a8cd0SMatthew Dillon 		    DMSGF_CREATE) {
16030c3a8cd0SMatthew Dillon 			state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
16040d20ec8aSMatthew Dillon 			state->icmd = state->txcmd & DMSGF_BASECMDMASK;
16050a9eefcaSMatthew Dillon 			state->flags &= ~DMSG_STATE_NEW;
16060c3a8cd0SMatthew Dillon 		}
16070c3a8cd0SMatthew Dillon 		msg->any.head.msgid = state->msgid;
16081b8eded1SMatthew Dillon 
16090d20ec8aSMatthew Dillon 		if (msg->any.head.cmd & DMSGF_CREATE) {
16100c3a8cd0SMatthew Dillon 			state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
16110c3a8cd0SMatthew Dillon 		}
16120d20ec8aSMatthew Dillon 	}
16131b8eded1SMatthew Dillon 
16140c3a8cd0SMatthew Dillon 	/*
16150a9eefcaSMatthew Dillon 	 * Discard messages sent to transactions which are already dead.
16160c3a8cd0SMatthew Dillon 	 */
16170a9eefcaSMatthew Dillon 	if (state && (state->txcmd & DMSGF_DELETE)) {
16185ab1caedSMatthew Dillon 		dmio_printf(iocom, 4,
16195ab1caedSMatthew Dillon 			    "dmsg_msg_write: drop msg %08x to dead "
16200a9eefcaSMatthew Dillon 			    "circuit state=%p\n",
16210a9eefcaSMatthew Dillon 			    msg->any.head.cmd, state);
16220a9eefcaSMatthew Dillon 		dmsg_msg_free(msg);
16230a9eefcaSMatthew Dillon 		return;
16240a9eefcaSMatthew Dillon 	}
16250a9eefcaSMatthew Dillon 
16260a9eefcaSMatthew Dillon 	/*
16270a9eefcaSMatthew Dillon 	 * Normally we queue the msg for output.  However, if the circuit is
16280a9eefcaSMatthew Dillon 	 * dead or dying we must simulate a failure in the return direction
16290a9eefcaSMatthew Dillon 	 * and throw the message away.  The other end is not expecting any
16300a9eefcaSMatthew Dillon 	 * further messages from us on this state.
16310a9eefcaSMatthew Dillon 	 *
16320a9eefcaSMatthew Dillon 	 * Note that the I/O thread is responsible for generating the CRCs
16330a9eefcaSMatthew Dillon 	 * and encryption.
16340a9eefcaSMatthew Dillon 	 */
16350a9eefcaSMatthew Dillon 	if (state->flags & DMSG_STATE_DYING) {
16360a9eefcaSMatthew Dillon #if 0
16370a9eefcaSMatthew Dillon 	if ((state->parent->txcmd & DMSGF_DELETE) ||
16380a9eefcaSMatthew Dillon 	    (state->parent->flags & DMSG_STATE_DYING) ||
16390a9eefcaSMatthew Dillon 	    (state->flags & DMSG_STATE_DYING)) {
16400a9eefcaSMatthew Dillon #endif
16410a9eefcaSMatthew Dillon 		/*
16420a9eefcaSMatthew Dillon 		 * Illegal message, kill state and related sub-state.
16430a9eefcaSMatthew Dillon 		 * Cannot transmit if state is already dying.
16440a9eefcaSMatthew Dillon 		 */
16455ab1caedSMatthew Dillon 		dmio_printf(iocom, 4,
16465ab1caedSMatthew Dillon 			    "dmsg_msg_write: Write to dying circuit "
16470a9eefcaSMatthew Dillon 			    "ptxcmd=%08x prxcmd=%08x flags=%08x\n",
16480a9eefcaSMatthew Dillon 			    state->parent->rxcmd,
16490a9eefcaSMatthew Dillon 			    state->parent->txcmd,
16500a9eefcaSMatthew Dillon 			    state->parent->flags);
16510a9eefcaSMatthew Dillon 		dmsg_state_hold(state);
16520a9eefcaSMatthew Dillon 		dmsg_state_cleanuptx(iocom, msg);
16530a9eefcaSMatthew Dillon 		if ((state->flags & DMSG_STATE_ABORTING) == 0) {
16540a9eefcaSMatthew Dillon 			dmsg_simulate_failure(state, 1, DMSG_ERR_LOSTLINK);
16550a9eefcaSMatthew Dillon 		}
16560a9eefcaSMatthew Dillon 		dmsg_state_drop(state);
16570a9eefcaSMatthew Dillon 		dmsg_msg_free(msg);
16580a9eefcaSMatthew Dillon 	} else {
16590a9eefcaSMatthew Dillon 		/*
16600a9eefcaSMatthew Dillon 		 * Queue the message, clean up transmit state prior to queueing
16610a9eefcaSMatthew Dillon 		 * to avoid SMP races.
16620a9eefcaSMatthew Dillon 		 */
16635ab1caedSMatthew Dillon 		dmio_printf(iocom, 5,
16645ab1caedSMatthew Dillon 			    "dmsg_msg_write: commit msg state=%p to txkmsgq\n",
16655ab1caedSMatthew Dillon 			    state);
16660a9eefcaSMatthew Dillon 		dmsg_state_cleanuptx(iocom, msg);
16670d20ec8aSMatthew Dillon 		TAILQ_INSERT_TAIL(&iocom->txmsgq, msg, qentry);
16680c3a8cd0SMatthew Dillon 		dummy = 0;
16690c3a8cd0SMatthew Dillon 		write(iocom->wakeupfds[1], &dummy, 1);	/* XXX optimize me */
16700a9eefcaSMatthew Dillon 	}
16710c3a8cd0SMatthew Dillon 	pthread_mutex_unlock(&iocom->mtx);
16720c3a8cd0SMatthew Dillon }
16730c3a8cd0SMatthew Dillon 
16740c3a8cd0SMatthew Dillon /*
16750a9eefcaSMatthew Dillon  * Remove state from its parent's subq.  This can wind up recursively
16760a9eefcaSMatthew Dillon  * dropping the parent upward.
16770a9eefcaSMatthew Dillon  *
16780a9eefcaSMatthew Dillon  * NOTE: iocom must be locked.
16790a9eefcaSMatthew Dillon  *
16800a9eefcaSMatthew Dillon  * NOTE: Once we drop the parent, our pstate pointer may become invalid.
16810a9eefcaSMatthew Dillon  */
16820a9eefcaSMatthew Dillon static
16830a9eefcaSMatthew Dillon void
16840a9eefcaSMatthew Dillon dmsg_subq_delete(dmsg_state_t *state)
16850a9eefcaSMatthew Dillon {
16860a9eefcaSMatthew Dillon 	dmsg_state_t *pstate;
16870a9eefcaSMatthew Dillon 
16880a9eefcaSMatthew Dillon 	if (state->flags & DMSG_STATE_SUBINSERTED) {
16890a9eefcaSMatthew Dillon 		pstate = state->parent;
16900a9eefcaSMatthew Dillon 		assert(pstate);
16910a9eefcaSMatthew Dillon 		if (pstate->scan == state)
16920a9eefcaSMatthew Dillon 			pstate->scan = NULL;
16930a9eefcaSMatthew Dillon 		TAILQ_REMOVE(&pstate->subq, state, entry);
16940a9eefcaSMatthew Dillon 		state->flags &= ~DMSG_STATE_SUBINSERTED;
16950a9eefcaSMatthew Dillon 		state->parent = NULL;
16960a9eefcaSMatthew Dillon 		if (TAILQ_EMPTY(&pstate->subq))
16970a9eefcaSMatthew Dillon 			dmsg_state_drop(pstate);/* pstate->subq */
16980a9eefcaSMatthew Dillon 		pstate = NULL;			/* safety */
16990a9eefcaSMatthew Dillon 		dmsg_state_drop(state);         /* pstate->subq */
17000a9eefcaSMatthew Dillon 	} else {
17010a9eefcaSMatthew Dillon 		assert(state->parent == NULL);
17020a9eefcaSMatthew Dillon 	}
17030a9eefcaSMatthew Dillon }
17040a9eefcaSMatthew Dillon 
17050a9eefcaSMatthew Dillon /*
1706a06d536bSMatthew Dillon  * Simulate reception of a transaction DELETE message when the link goes
1707a06d536bSMatthew Dillon  * bad.  This routine must recurse through state->subq and generate messages
1708a06d536bSMatthew Dillon  * and callbacks bottom-up.
1709a06d536bSMatthew Dillon  *
1710323c0947SMatthew Dillon  * iocom->mtx must be held by caller.
1711323c0947SMatthew Dillon  */
1712323c0947SMatthew Dillon static
1713323c0947SMatthew Dillon void
17140a9eefcaSMatthew Dillon dmsg_simulate_failure(dmsg_state_t *state, int meto, int error)
1715323c0947SMatthew Dillon {
1716a06d536bSMatthew Dillon 	dmsg_state_t *substate;
17170a9eefcaSMatthew Dillon 
17180a9eefcaSMatthew Dillon 	dmsg_state_hold(state);
17190a9eefcaSMatthew Dillon 	if (meto)
17200a9eefcaSMatthew Dillon 		dmsg_state_abort(state);
17210a9eefcaSMatthew Dillon 
17220a9eefcaSMatthew Dillon 	/*
17230a9eefcaSMatthew Dillon 	 * Recurse through sub-states.
17240a9eefcaSMatthew Dillon 	 */
17250a9eefcaSMatthew Dillon again:
17260a9eefcaSMatthew Dillon 	TAILQ_FOREACH(substate, &state->subq, entry) {
17270a9eefcaSMatthew Dillon 		if (substate->flags & DMSG_STATE_ABORTING)
17280a9eefcaSMatthew Dillon 			continue;
17290a9eefcaSMatthew Dillon 		state->scan = substate;
17300a9eefcaSMatthew Dillon 		dmsg_simulate_failure(substate, 1, error);
17310a9eefcaSMatthew Dillon 		if (state->scan != substate)
17320a9eefcaSMatthew Dillon 			goto again;
17330a9eefcaSMatthew Dillon 	}
17340a9eefcaSMatthew Dillon 
17350a9eefcaSMatthew Dillon 	dmsg_state_drop(state);
17360a9eefcaSMatthew Dillon }
17370a9eefcaSMatthew Dillon 
17380a9eefcaSMatthew Dillon static
17390a9eefcaSMatthew Dillon void
17400a9eefcaSMatthew Dillon dmsg_state_abort(dmsg_state_t *state)
17410a9eefcaSMatthew Dillon {
1742a06d536bSMatthew Dillon 	dmsg_iocom_t *iocom;
1743323c0947SMatthew Dillon 	dmsg_msg_t *msg;
1744323c0947SMatthew Dillon 
17450a9eefcaSMatthew Dillon 	/*
17460a9eefcaSMatthew Dillon 	 * Set ABORTING and DYING, return if already set.  If the state was
17470a9eefcaSMatthew Dillon 	 * just allocated we defer the abort operation until the related
17480a9eefcaSMatthew Dillon 	 * message is processed.
17490a9eefcaSMatthew Dillon 	 */
17500a9eefcaSMatthew Dillon 	if (state->flags & DMSG_STATE_ABORTING)
17510a9eefcaSMatthew Dillon 		return;
17520a9eefcaSMatthew Dillon 	state->flags |= DMSG_STATE_ABORTING;
17530a9eefcaSMatthew Dillon 	dmsg_state_dying(state);
17540a9eefcaSMatthew Dillon 	if (state->flags & DMSG_STATE_NEW) {
17555ab1caedSMatthew Dillon 		dmio_printf(iocom, 4,
17565ab1caedSMatthew Dillon 			    "dmsg_state_abort(0): state %p rxcmd %08x "
17575ab1caedSMatthew Dillon 			    "txcmd %08x flags %08x - in NEW state\n",
17585ab1caedSMatthew Dillon 			    state, state->rxcmd,
17595ab1caedSMatthew Dillon 			    state->txcmd, state->flags);
17600a9eefcaSMatthew Dillon 		return;
1761a06d536bSMatthew Dillon 	}
1762323c0947SMatthew Dillon 
1763323c0947SMatthew Dillon 	/*
17640a9eefcaSMatthew Dillon 	 * Simulate parent state failure before child states.  Device
17650a9eefcaSMatthew Dillon 	 * drivers need to understand this and flag the situation but might
17660a9eefcaSMatthew Dillon 	 * have asynchronous operations in progress that they cannot stop.
17670a9eefcaSMatthew Dillon 	 * To make things easier, parent states will not actually disappear
17680a9eefcaSMatthew Dillon 	 * until the children are all gone.
1769323c0947SMatthew Dillon 	 */
1770a06d536bSMatthew Dillon 	if ((state->rxcmd & DMSGF_DELETE) == 0) {
17715ab1caedSMatthew Dillon 		dmio_printf(iocom, 5,
17725ab1caedSMatthew Dillon 			    "dmsg_state_abort() on state %p\n",
17735ab1caedSMatthew Dillon 			    state);
17740a9eefcaSMatthew Dillon 		msg = dmsg_msg_alloc_locked(state, 0, DMSG_LNK_ERROR,
1775323c0947SMatthew Dillon 					    NULL, NULL);
1776323c0947SMatthew Dillon 		if ((state->rxcmd & DMSGF_CREATE) == 0)
1777323c0947SMatthew Dillon 			msg->any.head.cmd |= DMSGF_CREATE;
17780a9eefcaSMatthew Dillon 		msg->any.head.cmd |= DMSGF_DELETE |
17790a9eefcaSMatthew Dillon 				     (state->rxcmd & DMSGF_REPLY);
17800a9eefcaSMatthew Dillon 		msg->any.head.cmd ^= (DMSGF_REVTRANS | DMSGF_REVCIRC);
17810a9eefcaSMatthew Dillon 		msg->any.head.error = DMSG_ERR_LOSTLINK;
17820a9eefcaSMatthew Dillon 		msg->any.head.cmd |= DMSGF_ABORT;
17830a9eefcaSMatthew Dillon 
17840a9eefcaSMatthew Dillon 		/*
17850a9eefcaSMatthew Dillon 		 * Issue callback synchronously even though this isn't
17860a9eefcaSMatthew Dillon 		 * the receiver thread.  We need to issue the callback
17870a9eefcaSMatthew Dillon 		 * before removing state from the subq in order to allow
17880a9eefcaSMatthew Dillon 		 * the callback to reply.
17890a9eefcaSMatthew Dillon 		 */
17900a9eefcaSMatthew Dillon 		iocom = state->iocom;
17910a9eefcaSMatthew Dillon 		dmsg_state_msgrx(msg, 1);
17920a9eefcaSMatthew Dillon 		pthread_mutex_unlock(&iocom->mtx);
17930a9eefcaSMatthew Dillon 		iocom->rcvmsg_callback(msg);
17940a9eefcaSMatthew Dillon 		pthread_mutex_lock(&iocom->mtx);
17950a9eefcaSMatthew Dillon 		dmsg_state_cleanuprx(iocom, msg);
17960a9eefcaSMatthew Dillon #if 0
1797323c0947SMatthew Dillon 		TAILQ_INSERT_TAIL(&iocom->ioq_rx.msgq, msg, qentry);
1798323c0947SMatthew Dillon 		atomic_set_int(&iocom->flags, DMSG_IOCOMF_RWORK);
17990a9eefcaSMatthew Dillon #endif
18000a9eefcaSMatthew Dillon 	}
18010a9eefcaSMatthew Dillon }
18020a9eefcaSMatthew Dillon 
18030a9eefcaSMatthew Dillon 
18040a9eefcaSMatthew Dillon /*
18050a9eefcaSMatthew Dillon  * Recursively sets DMSG_STATE_DYING on state and all sub-states, preventing
18060a9eefcaSMatthew Dillon  * the transmission of any new messages on these states.  This is done
18070a9eefcaSMatthew Dillon  * atomically when parent state is terminating, whereas setting ABORTING is
18080a9eefcaSMatthew Dillon  * not atomic and can leak races.
18090a9eefcaSMatthew Dillon  */
18100a9eefcaSMatthew Dillon static
18110a9eefcaSMatthew Dillon void
18120a9eefcaSMatthew Dillon dmsg_state_dying(dmsg_state_t *state)
18130a9eefcaSMatthew Dillon {
18140a9eefcaSMatthew Dillon 	dmsg_state_t *scan;
18150a9eefcaSMatthew Dillon 
18160a9eefcaSMatthew Dillon 	if ((state->flags & DMSG_STATE_DYING) == 0) {
18170a9eefcaSMatthew Dillon 		state->flags |= DMSG_STATE_DYING;
18180a9eefcaSMatthew Dillon 		TAILQ_FOREACH(scan, &state->subq, entry)
18190a9eefcaSMatthew Dillon 			dmsg_state_dying(scan);
1820323c0947SMatthew Dillon 	}
1821323c0947SMatthew Dillon }
1822323c0947SMatthew Dillon 
1823323c0947SMatthew Dillon /*
18240c3a8cd0SMatthew Dillon  * This is a shortcut to formulate a reply to msg with a simple error code,
18250c3a8cd0SMatthew Dillon  * It can reply to and terminate a transaction, or it can reply to a one-way
18260c3a8cd0SMatthew Dillon  * messages.  A DMSG_LNK_ERROR command code is utilized to encode
18270c3a8cd0SMatthew Dillon  * the error code (which can be 0).  Not all transactions are terminated
18280c3a8cd0SMatthew Dillon  * with DMSG_LNK_ERROR status (the low level only cares about the
18290c3a8cd0SMatthew Dillon  * MSGF_DELETE flag), but most are.
18300c3a8cd0SMatthew Dillon  *
18310c3a8cd0SMatthew Dillon  * Replies to one-way messages are a bit of an oxymoron but the feature
18320c3a8cd0SMatthew Dillon  * is used by the debug (DBG) protocol.
18330c3a8cd0SMatthew Dillon  *
18340c3a8cd0SMatthew Dillon  * The reply contains no extended data.
18350c3a8cd0SMatthew Dillon  */
18360c3a8cd0SMatthew Dillon void
18370c3a8cd0SMatthew Dillon dmsg_msg_reply(dmsg_msg_t *msg, uint32_t error)
18380c3a8cd0SMatthew Dillon {
18390c3a8cd0SMatthew Dillon 	dmsg_state_t *state = msg->state;
18400c3a8cd0SMatthew Dillon 	dmsg_msg_t *nmsg;
18410c3a8cd0SMatthew Dillon 	uint32_t cmd;
18420c3a8cd0SMatthew Dillon 
18430c3a8cd0SMatthew Dillon 	/*
18440c3a8cd0SMatthew Dillon 	 * Reply with a simple error code and terminate the transaction.
18450c3a8cd0SMatthew Dillon 	 */
18460c3a8cd0SMatthew Dillon 	cmd = DMSG_LNK_ERROR;
18470c3a8cd0SMatthew Dillon 
18480c3a8cd0SMatthew Dillon 	/*
18490c3a8cd0SMatthew Dillon 	 * Check if our direction has even been initiated yet, set CREATE.
18500c3a8cd0SMatthew Dillon 	 *
18510c3a8cd0SMatthew Dillon 	 * Check what direction this is (command or reply direction).  Note
18520c3a8cd0SMatthew Dillon 	 * that txcmd might not have been initiated yet.
18530c3a8cd0SMatthew Dillon 	 *
18540c3a8cd0SMatthew Dillon 	 * If our direction has already been closed we just return without
18550c3a8cd0SMatthew Dillon 	 * doing anything.
18560c3a8cd0SMatthew Dillon 	 */
1857d30cab67SMatthew Dillon 	if ((state->flags & DMSG_STATE_ROOT) == 0) {
18580c3a8cd0SMatthew Dillon 		if (state->txcmd & DMSGF_DELETE)
18590c3a8cd0SMatthew Dillon 			return;
18600c3a8cd0SMatthew Dillon 		if (state->txcmd & DMSGF_REPLY)
18610c3a8cd0SMatthew Dillon 			cmd |= DMSGF_REPLY;
18620c3a8cd0SMatthew Dillon 		cmd |= DMSGF_DELETE;
18630c3a8cd0SMatthew Dillon 	} else {
18640c3a8cd0SMatthew Dillon 		if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
18650c3a8cd0SMatthew Dillon 			cmd |= DMSGF_REPLY;
18660c3a8cd0SMatthew Dillon 	}
18670c3a8cd0SMatthew Dillon 
18680c3a8cd0SMatthew Dillon 	/*
18690c3a8cd0SMatthew Dillon 	 * Allocate the message and associate it with the existing state.
18700d20ec8aSMatthew Dillon 	 * We cannot pass DMSGF_CREATE to msg_alloc() because that may
18710c3a8cd0SMatthew Dillon 	 * allocate new state.  We have our state already.
18720c3a8cd0SMatthew Dillon 	 */
18731b8eded1SMatthew Dillon 	nmsg = dmsg_msg_alloc(state, 0, cmd, NULL, NULL);
1874d30cab67SMatthew Dillon 	if ((state->flags & DMSG_STATE_ROOT) == 0) {
18750c3a8cd0SMatthew Dillon 		if ((state->txcmd & DMSGF_CREATE) == 0)
18760c3a8cd0SMatthew Dillon 			nmsg->any.head.cmd |= DMSGF_CREATE;
18770c3a8cd0SMatthew Dillon 	}
18780c3a8cd0SMatthew Dillon 	nmsg->any.head.error = error;
18791b8eded1SMatthew Dillon 
18800c3a8cd0SMatthew Dillon 	dmsg_msg_write(nmsg);
18810c3a8cd0SMatthew Dillon }
18820c3a8cd0SMatthew Dillon 
18830c3a8cd0SMatthew Dillon /*
18840c3a8cd0SMatthew Dillon  * Similar to dmsg_msg_reply() but leave the transaction open.  That is,
18850c3a8cd0SMatthew Dillon  * we are generating a streaming reply or an intermediate acknowledgement
18860c3a8cd0SMatthew Dillon  * of some sort as part of the higher level protocol, with more to come
18870c3a8cd0SMatthew Dillon  * later.
18880c3a8cd0SMatthew Dillon  */
18890c3a8cd0SMatthew Dillon void
18900c3a8cd0SMatthew Dillon dmsg_msg_result(dmsg_msg_t *msg, uint32_t error)
18910c3a8cd0SMatthew Dillon {
18920c3a8cd0SMatthew Dillon 	dmsg_state_t *state = msg->state;
18930c3a8cd0SMatthew Dillon 	dmsg_msg_t *nmsg;
18940c3a8cd0SMatthew Dillon 	uint32_t cmd;
18950c3a8cd0SMatthew Dillon 
18960c3a8cd0SMatthew Dillon 
18970c3a8cd0SMatthew Dillon 	/*
18980c3a8cd0SMatthew Dillon 	 * Reply with a simple error code and terminate the transaction.
18990c3a8cd0SMatthew Dillon 	 */
19000c3a8cd0SMatthew Dillon 	cmd = DMSG_LNK_ERROR;
19010c3a8cd0SMatthew Dillon 
19020c3a8cd0SMatthew Dillon 	/*
19030c3a8cd0SMatthew Dillon 	 * Check if our direction has even been initiated yet, set CREATE.
19040c3a8cd0SMatthew Dillon 	 *
19050c3a8cd0SMatthew Dillon 	 * Check what direction this is (command or reply direction).  Note
19060c3a8cd0SMatthew Dillon 	 * that txcmd might not have been initiated yet.
19070c3a8cd0SMatthew Dillon 	 *
19080c3a8cd0SMatthew Dillon 	 * If our direction has already been closed we just return without
19090c3a8cd0SMatthew Dillon 	 * doing anything.
19100c3a8cd0SMatthew Dillon 	 */
1911d30cab67SMatthew Dillon 	if ((state->flags & DMSG_STATE_ROOT) == 0) {
19120c3a8cd0SMatthew Dillon 		if (state->txcmd & DMSGF_DELETE)
19130c3a8cd0SMatthew Dillon 			return;
19140c3a8cd0SMatthew Dillon 		if (state->txcmd & DMSGF_REPLY)
19150c3a8cd0SMatthew Dillon 			cmd |= DMSGF_REPLY;
19160c3a8cd0SMatthew Dillon 		/* continuing transaction, do not set MSGF_DELETE */
19170c3a8cd0SMatthew Dillon 	} else {
19180c3a8cd0SMatthew Dillon 		if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
19190c3a8cd0SMatthew Dillon 			cmd |= DMSGF_REPLY;
19200c3a8cd0SMatthew Dillon 	}
19211b8eded1SMatthew Dillon 	nmsg = dmsg_msg_alloc(state, 0, cmd, NULL, NULL);
1922d30cab67SMatthew Dillon 	if ((state->flags & DMSG_STATE_ROOT) == 0) {
19230c3a8cd0SMatthew Dillon 		if ((state->txcmd & DMSGF_CREATE) == 0)
19240c3a8cd0SMatthew Dillon 			nmsg->any.head.cmd |= DMSGF_CREATE;
19250c3a8cd0SMatthew Dillon 	}
19260c3a8cd0SMatthew Dillon 	nmsg->any.head.error = error;
19271b8eded1SMatthew Dillon 
19280c3a8cd0SMatthew Dillon 	dmsg_msg_write(nmsg);
19290c3a8cd0SMatthew Dillon }
19300c3a8cd0SMatthew Dillon 
19310c3a8cd0SMatthew Dillon /*
19320c3a8cd0SMatthew Dillon  * Terminate a transaction given a state structure by issuing a DELETE.
19331b8eded1SMatthew Dillon  * (the state structure must not be &iocom->state0)
19340c3a8cd0SMatthew Dillon  */
19350c3a8cd0SMatthew Dillon void
19360c3a8cd0SMatthew Dillon dmsg_state_reply(dmsg_state_t *state, uint32_t error)
19370c3a8cd0SMatthew Dillon {
19380c3a8cd0SMatthew Dillon 	dmsg_msg_t *nmsg;
19390c3a8cd0SMatthew Dillon 	uint32_t cmd = DMSG_LNK_ERROR | DMSGF_DELETE;
19400c3a8cd0SMatthew Dillon 
19410c3a8cd0SMatthew Dillon 	/*
19420c3a8cd0SMatthew Dillon 	 * Nothing to do if we already transmitted a delete
19430c3a8cd0SMatthew Dillon 	 */
19440c3a8cd0SMatthew Dillon 	if (state->txcmd & DMSGF_DELETE)
19450c3a8cd0SMatthew Dillon 		return;
19460c3a8cd0SMatthew Dillon 
19470c3a8cd0SMatthew Dillon 	/*
19480c3a8cd0SMatthew Dillon 	 * Set REPLY if the other end initiated the command.  Otherwise
19490c3a8cd0SMatthew Dillon 	 * we are the command direction.
19500c3a8cd0SMatthew Dillon 	 */
19510c3a8cd0SMatthew Dillon 	if (state->txcmd & DMSGF_REPLY)
19520c3a8cd0SMatthew Dillon 		cmd |= DMSGF_REPLY;
19530c3a8cd0SMatthew Dillon 
19541b8eded1SMatthew Dillon 	nmsg = dmsg_msg_alloc(state, 0, cmd, NULL, NULL);
1955d30cab67SMatthew Dillon 	if ((state->flags & DMSG_STATE_ROOT) == 0) {
19560c3a8cd0SMatthew Dillon 		if ((state->txcmd & DMSGF_CREATE) == 0)
19570c3a8cd0SMatthew Dillon 			nmsg->any.head.cmd |= DMSGF_CREATE;
19580c3a8cd0SMatthew Dillon 	}
19590c3a8cd0SMatthew Dillon 	nmsg->any.head.error = error;
19600d20ec8aSMatthew Dillon 	dmsg_msg_write(nmsg);
19610d20ec8aSMatthew Dillon }
19620d20ec8aSMatthew Dillon 
19630d20ec8aSMatthew Dillon /*
19640d20ec8aSMatthew Dillon  * Terminate a transaction given a state structure by issuing a DELETE.
19651b8eded1SMatthew Dillon  * (the state structure must not be &iocom->state0)
19660d20ec8aSMatthew Dillon  */
19670d20ec8aSMatthew Dillon void
19680d20ec8aSMatthew Dillon dmsg_state_result(dmsg_state_t *state, uint32_t error)
19690d20ec8aSMatthew Dillon {
19700d20ec8aSMatthew Dillon 	dmsg_msg_t *nmsg;
19710d20ec8aSMatthew Dillon 	uint32_t cmd = DMSG_LNK_ERROR;
19720d20ec8aSMatthew Dillon 
19730d20ec8aSMatthew Dillon 	/*
19740d20ec8aSMatthew Dillon 	 * Nothing to do if we already transmitted a delete
19750d20ec8aSMatthew Dillon 	 */
19760d20ec8aSMatthew Dillon 	if (state->txcmd & DMSGF_DELETE)
19770d20ec8aSMatthew Dillon 		return;
19780d20ec8aSMatthew Dillon 
19790d20ec8aSMatthew Dillon 	/*
19800d20ec8aSMatthew Dillon 	 * Set REPLY if the other end initiated the command.  Otherwise
19810d20ec8aSMatthew Dillon 	 * we are the command direction.
19820d20ec8aSMatthew Dillon 	 */
19830d20ec8aSMatthew Dillon 	if (state->txcmd & DMSGF_REPLY)
19840d20ec8aSMatthew Dillon 		cmd |= DMSGF_REPLY;
19850d20ec8aSMatthew Dillon 
19861b8eded1SMatthew Dillon 	nmsg = dmsg_msg_alloc(state, 0, cmd, NULL, NULL);
1987d30cab67SMatthew Dillon 	if ((state->flags & DMSG_STATE_ROOT) == 0) {
19880d20ec8aSMatthew Dillon 		if ((state->txcmd & DMSGF_CREATE) == 0)
19890d20ec8aSMatthew Dillon 			nmsg->any.head.cmd |= DMSGF_CREATE;
19900d20ec8aSMatthew Dillon 	}
19910d20ec8aSMatthew Dillon 	nmsg->any.head.error = error;
19920c3a8cd0SMatthew Dillon 	dmsg_msg_write(nmsg);
19930c3a8cd0SMatthew Dillon }
19940c3a8cd0SMatthew Dillon 
19950c3a8cd0SMatthew Dillon /************************************************************************
19960c3a8cd0SMatthew Dillon  *			TRANSACTION STATE HANDLING			*
19970c3a8cd0SMatthew Dillon  ************************************************************************
19980c3a8cd0SMatthew Dillon  *
19990c3a8cd0SMatthew Dillon  */
20000c3a8cd0SMatthew Dillon 
20010c3a8cd0SMatthew Dillon /*
2002d30cab67SMatthew Dillon  * Process state tracking for a message after reception, prior to execution.
2003d30cab67SMatthew Dillon  * Possibly route the message (consuming it).
20040c3a8cd0SMatthew Dillon  *
20050c3a8cd0SMatthew Dillon  * Called with msglk held and the msg dequeued.
20060c3a8cd0SMatthew Dillon  *
20070c3a8cd0SMatthew Dillon  * All messages are called with dummy state and return actual state.
20080c3a8cd0SMatthew Dillon  * (One-off messages often just return the same dummy state).
20090c3a8cd0SMatthew Dillon  *
20100c3a8cd0SMatthew Dillon  * May request that caller discard the message by setting *discardp to 1.
20110c3a8cd0SMatthew Dillon  * The returned state is not used in this case and is allowed to be NULL.
20120c3a8cd0SMatthew Dillon  *
20130c3a8cd0SMatthew Dillon  * --
20140c3a8cd0SMatthew Dillon  *
20150c3a8cd0SMatthew Dillon  * These routines handle persistent and command/reply message state via the
20160c3a8cd0SMatthew Dillon  * CREATE and DELETE flags.  The first message in a command or reply sequence
20170c3a8cd0SMatthew Dillon  * sets CREATE, the last message in a command or reply sequence sets DELETE.
20180c3a8cd0SMatthew Dillon  *
 * There can be any number of intermediate messages belonging to the same
 * sequence sent in between the CREATE message and the DELETE message,
 * which set neither flag.  This represents a streaming command or reply.
20220c3a8cd0SMatthew Dillon  *
20230c3a8cd0SMatthew Dillon  * Any command message received with CREATE set expects a reply sequence to
20240c3a8cd0SMatthew Dillon  * be returned.  Reply sequences work the same as command sequences except the
20250c3a8cd0SMatthew Dillon  * REPLY bit is also sent.  Both the command side and reply side can
20260c3a8cd0SMatthew Dillon  * degenerate into a single message with both CREATE and DELETE set.  Note
20270c3a8cd0SMatthew Dillon  * that one side can be streaming and the other side not, or neither, or both.
20280c3a8cd0SMatthew Dillon  *
20290c3a8cd0SMatthew Dillon  * The msgid is unique for the initiator.  That is, two sides sending a new
20300c3a8cd0SMatthew Dillon  * message can use the same msgid without colliding.
20310c3a8cd0SMatthew Dillon  *
20320c3a8cd0SMatthew Dillon  * --
20330c3a8cd0SMatthew Dillon  *
 * The message may be running over a circuit.  If the circuit is half-deleted,
 * the message is typically racing against a link failure and must be thrown
2036a06d536bSMatthew Dillon  * out.  As the circuit deletion propagates the library will automatically
2037a06d536bSMatthew Dillon  * generate terminations for sub states.
2038a06d536bSMatthew Dillon  *
2039a06d536bSMatthew Dillon  * --
2040a06d536bSMatthew Dillon  *
20410c3a8cd0SMatthew Dillon  * ABORT sequences work by setting the ABORT flag along with normal message
20420c3a8cd0SMatthew Dillon  * state.  However, ABORTs can also be sent on half-closed messages, that is
20430c3a8cd0SMatthew Dillon  * even if the command or reply side has already sent a DELETE, as long as
20440c3a8cd0SMatthew Dillon  * the message has not been fully closed it can still send an ABORT+DELETE
20450c3a8cd0SMatthew Dillon  * to terminate the half-closed message state.
20460c3a8cd0SMatthew Dillon  *
20470c3a8cd0SMatthew Dillon  * Since ABORT+DELETEs can race we silently discard ABORT's for message
20480c3a8cd0SMatthew Dillon  * state which has already been fully closed.  REPLY+ABORT+DELETEs can
20490c3a8cd0SMatthew Dillon  * also race, and in this situation the other side might have already
20500c3a8cd0SMatthew Dillon  * initiated a new unrelated command with the same message id.  Since
20510c3a8cd0SMatthew Dillon  * the abort has not set the CREATE flag the situation can be detected
20520c3a8cd0SMatthew Dillon  * and the message will also be discarded.
20530c3a8cd0SMatthew Dillon  *
20540c3a8cd0SMatthew Dillon  * Non-blocking requests can be initiated with ABORT+CREATE[+DELETE].
20550c3a8cd0SMatthew Dillon  * The ABORT request is essentially integrated into the command instead
20560c3a8cd0SMatthew Dillon  * of being sent later on.  In this situation the command implementation
20570c3a8cd0SMatthew Dillon  * detects that CREATE and ABORT are both set (vs ABORT alone) and can
20580c3a8cd0SMatthew Dillon  * special-case non-blocking operation for the command.
20590c3a8cd0SMatthew Dillon  *
20600c3a8cd0SMatthew Dillon  * NOTE!  Messages with ABORT set without CREATE or DELETE are considered
20610c3a8cd0SMatthew Dillon  *	  to be mid-stream aborts for command/reply sequences.  ABORTs on
20620c3a8cd0SMatthew Dillon  *	  one-way messages are not supported.
20630c3a8cd0SMatthew Dillon  *
20640c3a8cd0SMatthew Dillon  * NOTE!  If a command sequence does not support aborts the ABORT flag is
20650c3a8cd0SMatthew Dillon  *	  simply ignored.
20660c3a8cd0SMatthew Dillon  *
20670c3a8cd0SMatthew Dillon  * --
20680c3a8cd0SMatthew Dillon  *
2069d30cab67SMatthew Dillon  * One-off messages (no reply expected) are sent without an established
2070d30cab67SMatthew Dillon  * transaction.  CREATE and DELETE are left clear and the msgid is usually 0.
2071d30cab67SMatthew Dillon  * For one-off messages sent over circuits msgid generally MUST be 0.
2072d30cab67SMatthew Dillon  *
2073d30cab67SMatthew Dillon  * One-off messages cannot be aborted and typically aren't processed
2074d30cab67SMatthew Dillon  * by these routines.  Order is still guaranteed for messages sent over
2075d30cab67SMatthew Dillon  * the same circuit.  The REPLY bit can be used to distinguish whether
2076d30cab67SMatthew Dillon  * a one-off message is a command or reply.  For example, one-off replies
20770c3a8cd0SMatthew Dillon  * will typically just contain status updates.
20780c3a8cd0SMatthew Dillon  */
20790c3a8cd0SMatthew Dillon static int
20800a9eefcaSMatthew Dillon dmsg_state_msgrx(dmsg_msg_t *msg, int mstate)
20810c3a8cd0SMatthew Dillon {
20821b8eded1SMatthew Dillon 	dmsg_iocom_t *iocom = msg->state->iocom;
20830c3a8cd0SMatthew Dillon 	dmsg_state_t *state;
20841b8eded1SMatthew Dillon 	dmsg_state_t *pstate;
20850d20ec8aSMatthew Dillon 	dmsg_state_t sdummy;
20860c3a8cd0SMatthew Dillon 	int error;
20870c3a8cd0SMatthew Dillon 
20880d20ec8aSMatthew Dillon 	pthread_mutex_lock(&iocom->mtx);
20890d20ec8aSMatthew Dillon 
20900a9eefcaSMatthew Dillon 	if (DMsgDebugOpt) {
20915ab1caedSMatthew Dillon 		dmio_printf(iocom, 5,
20920a9eefcaSMatthew Dillon 			    "msgrx: cmd=%08x msgid=%016jx "
20930a9eefcaSMatthew Dillon 			    "circuit=%016jx error=%d\n",
20940a9eefcaSMatthew Dillon 			    msg->any.head.cmd,
20950a9eefcaSMatthew Dillon 			    msg->any.head.msgid,
20960a9eefcaSMatthew Dillon 			    msg->any.head.circuit,
20970a9eefcaSMatthew Dillon 			    msg->any.head.error);
20980a9eefcaSMatthew Dillon 	}
20990a9eefcaSMatthew Dillon 
21000c3a8cd0SMatthew Dillon 	/*
2101d30cab67SMatthew Dillon 	 * Lookup the circuit (pstate).  The circuit will be an open
2102d30cab67SMatthew Dillon 	 * transaction.  The REVCIRC bit in the message tells us which side
2103d30cab67SMatthew Dillon 	 * initiated it.
21040a9eefcaSMatthew Dillon 	 *
21050a9eefcaSMatthew Dillon 	 * If mstate is non-zero the state has already been incorporated
21060a9eefcaSMatthew Dillon 	 * into the message as part of a simulated abort.  Note that in this
21070a9eefcaSMatthew Dillon 	 * situation the parent state may have already been removed from
21080a9eefcaSMatthew Dillon 	 * the RBTREE.
21091b8eded1SMatthew Dillon 	 */
21100a9eefcaSMatthew Dillon 	if (mstate) {
21110a9eefcaSMatthew Dillon 		pstate = msg->state->parent;
21120a9eefcaSMatthew Dillon 	} else if (msg->any.head.circuit) {
21131b8eded1SMatthew Dillon 		sdummy.msgid = msg->any.head.circuit;
21141b8eded1SMatthew Dillon 
21151b8eded1SMatthew Dillon 		if (msg->any.head.cmd & DMSGF_REVCIRC) {
21161b8eded1SMatthew Dillon 			pstate = RB_FIND(dmsg_state_tree,
21171b8eded1SMatthew Dillon 					 &iocom->statewr_tree,
21181b8eded1SMatthew Dillon 					 &sdummy);
21191b8eded1SMatthew Dillon 		} else {
21201b8eded1SMatthew Dillon 			pstate = RB_FIND(dmsg_state_tree,
21211b8eded1SMatthew Dillon 					 &iocom->staterd_tree,
21221b8eded1SMatthew Dillon 					 &sdummy);
21231b8eded1SMatthew Dillon 		}
21240a9eefcaSMatthew Dillon 
21250a9eefcaSMatthew Dillon 		/*
21260a9eefcaSMatthew Dillon 		 * If we cannot find the circuit throw the message away.
21270a9eefcaSMatthew Dillon 		 * The state will have already been taken care of by
21280a9eefcaSMatthew Dillon 		 * the simulated failure code.  This case can occur due
21290a9eefcaSMatthew Dillon 		 * to a failure propagating in one direction crossing a
21300a9eefcaSMatthew Dillon 		 * request on the failed circuit propagating in the other
21310a9eefcaSMatthew Dillon 		 * direction.
21320a9eefcaSMatthew Dillon 		 */
21331b8eded1SMatthew Dillon 		if (pstate == NULL) {
21345ab1caedSMatthew Dillon 			dmio_printf(iocom, 4,
21351b8eded1SMatthew Dillon 				    "missing parent in stacked trans %s\n",
21361b8eded1SMatthew Dillon 				    dmsg_msg_str(msg));
21371b8eded1SMatthew Dillon 			pthread_mutex_unlock(&iocom->mtx);
21380a9eefcaSMatthew Dillon 			error = DMSG_IOQ_ERROR_EALREADY;
21390a9eefcaSMatthew Dillon 
21400a9eefcaSMatthew Dillon 			return error;
21411b8eded1SMatthew Dillon 		}
21421b8eded1SMatthew Dillon 	} else {
21431b8eded1SMatthew Dillon 		pstate = &iocom->state0;
21441b8eded1SMatthew Dillon 	}
21450a9eefcaSMatthew Dillon 	/* WARNING: pstate not (yet) refd */
21461b8eded1SMatthew Dillon 
21471b8eded1SMatthew Dillon 	/*
2148d30cab67SMatthew Dillon 	 * Lookup the msgid.
2149d30cab67SMatthew Dillon 	 *
21500a9eefcaSMatthew Dillon 	 * If mstate is non-zero the state has already been incorporated
21510a9eefcaSMatthew Dillon 	 * into the message as part of a simulated abort.  Note that in this
21520a9eefcaSMatthew Dillon 	 * situation the state may have already been removed from the RBTREE.
21530a9eefcaSMatthew Dillon 	 *
2154d30cab67SMatthew Dillon 	 * If received msg is a command state is on staterd_tree.
2155d30cab67SMatthew Dillon 	 * If received msg is a reply state is on statewr_tree.
2156d30cab67SMatthew Dillon 	 * Otherwise there is no state (retain &iocom->state0)
2157d30cab67SMatthew Dillon 	 */
21580a9eefcaSMatthew Dillon 	if (mstate) {
21590a9eefcaSMatthew Dillon 		state = msg->state;
21600a9eefcaSMatthew Dillon 	} else {
2161d30cab67SMatthew Dillon 		sdummy.msgid = msg->any.head.msgid;
21620a9eefcaSMatthew Dillon 		if (msg->any.head.cmd & DMSGF_REVTRANS) {
21630a9eefcaSMatthew Dillon 			state = RB_FIND(dmsg_state_tree,
21640a9eefcaSMatthew Dillon 					&iocom->statewr_tree, &sdummy);
21650a9eefcaSMatthew Dillon 		} else {
21660a9eefcaSMatthew Dillon 			state = RB_FIND(dmsg_state_tree,
21670a9eefcaSMatthew Dillon 					&iocom->staterd_tree, &sdummy);
21680a9eefcaSMatthew Dillon 		}
21690a9eefcaSMatthew Dillon 	}
2170d30cab67SMatthew Dillon 
21710a9eefcaSMatthew Dillon 	if (DMsgDebugOpt) {
21725ab1caedSMatthew Dillon 		dmio_printf(iocom, 5, "msgrx:\tstate %p(%08x)",
21730a9eefcaSMatthew Dillon 			    state, (state ? state->icmd : 0));
21740a9eefcaSMatthew Dillon 		if (pstate != &iocom->state0) {
21755ab1caedSMatthew Dillon 			dmio_printf(iocom, 5,
21760a9eefcaSMatthew Dillon 				    " pstate %p(%08x)",
21770a9eefcaSMatthew Dillon 				    pstate, pstate->icmd);
21780a9eefcaSMatthew Dillon 		}
21795ab1caedSMatthew Dillon 		dmio_printf(iocom, 5, "%s\n", "");
21800a9eefcaSMatthew Dillon 	}
21810a9eefcaSMatthew Dillon 
21820a9eefcaSMatthew Dillon 	if (mstate) {
21830a9eefcaSMatthew Dillon 		/* state already assigned to msg */
21840a9eefcaSMatthew Dillon 	} else if (state) {
2185d30cab67SMatthew Dillon 		/*
2186d30cab67SMatthew Dillon 		 * Message over an existing transaction (CREATE should not
2187d30cab67SMatthew Dillon 		 * be set).
2188d30cab67SMatthew Dillon 		 */
21890a9eefcaSMatthew Dillon 		dmsg_state_drop(msg->state);
21900a9eefcaSMatthew Dillon 		dmsg_state_hold(state);
2191d30cab67SMatthew Dillon 		msg->state = state;
2192d30cab67SMatthew Dillon 		assert(pstate == state->parent);
2193d30cab67SMatthew Dillon 	} else {
2194d30cab67SMatthew Dillon 		/*
2195d30cab67SMatthew Dillon 		 * Either a new transaction (if CREATE set) or a one-off.
2196d30cab67SMatthew Dillon 		 */
2197d30cab67SMatthew Dillon 		state = pstate;
2198d30cab67SMatthew Dillon 	}
2199d30cab67SMatthew Dillon 
2200d30cab67SMatthew Dillon 	/*
2201d30cab67SMatthew Dillon 	 * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
2202d30cab67SMatthew Dillon 	 * inside the case statements.
2203d30cab67SMatthew Dillon 	 *
2204d30cab67SMatthew Dillon 	 * Construct new state as necessary.
2205d30cab67SMatthew Dillon 	 */
2206d30cab67SMatthew Dillon 	switch(msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
2207d30cab67SMatthew Dillon 				    DMSGF_REPLY)) {
2208d30cab67SMatthew Dillon 	case DMSGF_CREATE:
2209d30cab67SMatthew Dillon 	case DMSGF_CREATE | DMSGF_DELETE:
2210d30cab67SMatthew Dillon 		/*
2211d30cab67SMatthew Dillon 		 * Create new sub-transaction under pstate.
2212d30cab67SMatthew Dillon 		 * (any DELETE is handled in post-processing of msg).
2213d30cab67SMatthew Dillon 		 *
2214d30cab67SMatthew Dillon 		 * (During routing the msgid was made unique for this
2215d30cab67SMatthew Dillon 		 * direction over the comlink, so our RB trees can be
2216d30cab67SMatthew Dillon 		 * iocom-based instead of state-based).
2217d30cab67SMatthew Dillon 		 */
2218d30cab67SMatthew Dillon 		if (state != pstate) {
22195ab1caedSMatthew Dillon 			dmio_printf(iocom, 2,
2220d30cab67SMatthew Dillon 				    "duplicate transaction %s\n",
2221d30cab67SMatthew Dillon 				    dmsg_msg_str(msg));
2222d30cab67SMatthew Dillon 			error = DMSG_IOQ_ERROR_TRANS;
2223d30cab67SMatthew Dillon 			assert(0);
2224d30cab67SMatthew Dillon 			break;
2225d30cab67SMatthew Dillon 		}
2226d30cab67SMatthew Dillon 
2227d30cab67SMatthew Dillon 		/*
2228d30cab67SMatthew Dillon 		 * Allocate the new state.
22291b8eded1SMatthew Dillon 		 */
22300c3a8cd0SMatthew Dillon 		state = malloc(sizeof(*state));
22310c3a8cd0SMatthew Dillon 		bzero(state, sizeof(*state));
22320a9eefcaSMatthew Dillon 		atomic_add_int(&dmsg_state_count, 1);
22330a9eefcaSMatthew Dillon 
22341b8eded1SMatthew Dillon 		TAILQ_INIT(&state->subq);
2235323c0947SMatthew Dillon 		dmsg_state_hold(pstate);
22361b8eded1SMatthew Dillon 		state->parent = pstate;
22370c3a8cd0SMatthew Dillon 		state->iocom = iocom;
22381b8eded1SMatthew Dillon 		state->flags = DMSG_STATE_DYNAMIC |
2239d30cab67SMatthew Dillon 			       DMSG_STATE_OPPOSITE;
22401b8eded1SMatthew Dillon 		state->msgid = msg->any.head.msgid;
22410c3a8cd0SMatthew Dillon 		state->txcmd = DMSGF_REPLY;
22420c3a8cd0SMatthew Dillon 		state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
22430d20ec8aSMatthew Dillon 		state->icmd = state->rxcmd & DMSGF_BASECMDMASK;
22440a9eefcaSMatthew Dillon 		state->flags &= ~DMSG_STATE_NEW;
22450c3a8cd0SMatthew Dillon 		msg->state = state;
22460a9eefcaSMatthew Dillon 
22471b8eded1SMatthew Dillon 		RB_INSERT(dmsg_state_tree, &iocom->staterd_tree, state);
22480a9eefcaSMatthew Dillon 		if (TAILQ_EMPTY(&pstate->subq))
22490a9eefcaSMatthew Dillon 			dmsg_state_hold(pstate);/* pstate->subq */
22501b8eded1SMatthew Dillon 		TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
2251a06d536bSMatthew Dillon 		state->flags |= DMSG_STATE_SUBINSERTED |
2252a06d536bSMatthew Dillon 				DMSG_STATE_RBINSERTED;
22530a9eefcaSMatthew Dillon 		dmsg_state_hold(state);		/* pstate->subq */
22540a9eefcaSMatthew Dillon 		dmsg_state_hold(state);		/* state on rbtree */
22550a9eefcaSMatthew Dillon 		dmsg_state_hold(state);		/* msg->state */
2256d30cab67SMatthew Dillon 
2257d30cab67SMatthew Dillon 		/*
2258d30cab67SMatthew Dillon 		 * If the parent is a relay set up the state handler to
2259d30cab67SMatthew Dillon 		 * automatically route the message.  Local processing will
2260d30cab67SMatthew Dillon 		 * not occur if set.
2261d30cab67SMatthew Dillon 		 *
2262d30cab67SMatthew Dillon 		 * (state relays are seeded by SPAN processing)
2263d30cab67SMatthew Dillon 		 */
2264d30cab67SMatthew Dillon 		if (pstate->relay)
2265d30cab67SMatthew Dillon 			state->func = dmsg_state_relay;
22660c3a8cd0SMatthew Dillon 		error = 0;
22670c3a8cd0SMatthew Dillon 		break;
22680c3a8cd0SMatthew Dillon 	case DMSGF_DELETE:
22690c3a8cd0SMatthew Dillon 		/*
22700c3a8cd0SMatthew Dillon 		 * Persistent state is expected but might not exist if an
22710c3a8cd0SMatthew Dillon 		 * ABORT+DELETE races the close.
2272d30cab67SMatthew Dillon 		 *
2273d30cab67SMatthew Dillon 		 * (any DELETE is handled in post-processing of msg).
22740c3a8cd0SMatthew Dillon 		 */
2275d30cab67SMatthew Dillon 		if (state == pstate) {
22760c3a8cd0SMatthew Dillon 			if (msg->any.head.cmd & DMSGF_ABORT) {
22770c3a8cd0SMatthew Dillon 				error = DMSG_IOQ_ERROR_EALREADY;
22780c3a8cd0SMatthew Dillon 			} else {
22795ab1caedSMatthew Dillon 				dmio_printf(iocom, 2,
22805ab1caedSMatthew Dillon 					    "missing-state %s\n",
22810c3a8cd0SMatthew Dillon 					    dmsg_msg_str(msg));
22820c3a8cd0SMatthew Dillon 				error = DMSG_IOQ_ERROR_TRANS;
22830c3a8cd0SMatthew Dillon 				assert(0);
22840c3a8cd0SMatthew Dillon 			}
22850c3a8cd0SMatthew Dillon 			break;
22860c3a8cd0SMatthew Dillon 		}
22870c3a8cd0SMatthew Dillon 
22880c3a8cd0SMatthew Dillon 		/*
22890c3a8cd0SMatthew Dillon 		 * Handle another ABORT+DELETE case if the msgid has already
22900c3a8cd0SMatthew Dillon 		 * been reused.
22910c3a8cd0SMatthew Dillon 		 */
22920c3a8cd0SMatthew Dillon 		if ((state->rxcmd & DMSGF_CREATE) == 0) {
22930c3a8cd0SMatthew Dillon 			if (msg->any.head.cmd & DMSGF_ABORT) {
22940c3a8cd0SMatthew Dillon 				error = DMSG_IOQ_ERROR_EALREADY;
22950c3a8cd0SMatthew Dillon 			} else {
22965ab1caedSMatthew Dillon 				dmio_printf(iocom, 2,
22975ab1caedSMatthew Dillon 					    "reused-state %s\n",
22980c3a8cd0SMatthew Dillon 					    dmsg_msg_str(msg));
22990c3a8cd0SMatthew Dillon 				error = DMSG_IOQ_ERROR_TRANS;
23000c3a8cd0SMatthew Dillon 				assert(0);
23010c3a8cd0SMatthew Dillon 			}
23020c3a8cd0SMatthew Dillon 			break;
23030c3a8cd0SMatthew Dillon 		}
23040c3a8cd0SMatthew Dillon 		error = 0;
23050c3a8cd0SMatthew Dillon 		break;
23060c3a8cd0SMatthew Dillon 	default:
23070c3a8cd0SMatthew Dillon 		/*
23080c3a8cd0SMatthew Dillon 		 * Check for mid-stream ABORT command received, otherwise
23090c3a8cd0SMatthew Dillon 		 * allow.
23100c3a8cd0SMatthew Dillon 		 */
23110c3a8cd0SMatthew Dillon 		if (msg->any.head.cmd & DMSGF_ABORT) {
2312d30cab67SMatthew Dillon 			if ((state == pstate) ||
23130c3a8cd0SMatthew Dillon 			    (state->rxcmd & DMSGF_CREATE) == 0) {
23140c3a8cd0SMatthew Dillon 				error = DMSG_IOQ_ERROR_EALREADY;
23150c3a8cd0SMatthew Dillon 				break;
23160c3a8cd0SMatthew Dillon 			}
23170c3a8cd0SMatthew Dillon 		}
23180c3a8cd0SMatthew Dillon 		error = 0;
23190c3a8cd0SMatthew Dillon 		break;
23200c3a8cd0SMatthew Dillon 	case DMSGF_REPLY | DMSGF_CREATE:
23210c3a8cd0SMatthew Dillon 	case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
23220c3a8cd0SMatthew Dillon 		/*
23230c3a8cd0SMatthew Dillon 		 * When receiving a reply with CREATE set the original
23240c3a8cd0SMatthew Dillon 		 * persistent state message should already exist.
23250c3a8cd0SMatthew Dillon 		 */
2326d30cab67SMatthew Dillon 		if (state == pstate) {
23275ab1caedSMatthew Dillon 			dmio_printf(iocom, 2, "no-state(r) %s\n",
23280c3a8cd0SMatthew Dillon 				    dmsg_msg_str(msg));
23290c3a8cd0SMatthew Dillon 			error = DMSG_IOQ_ERROR_TRANS;
23300c3a8cd0SMatthew Dillon 			assert(0);
23310c3a8cd0SMatthew Dillon 			break;
23320c3a8cd0SMatthew Dillon 		}
2333d30cab67SMatthew Dillon 		assert(((state->rxcmd ^ msg->any.head.cmd) & DMSGF_REPLY) == 0);
23340c3a8cd0SMatthew Dillon 		state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
23350c3a8cd0SMatthew Dillon 		error = 0;
23360c3a8cd0SMatthew Dillon 		break;
23370c3a8cd0SMatthew Dillon 	case DMSGF_REPLY | DMSGF_DELETE:
23380c3a8cd0SMatthew Dillon 		/*
23390c3a8cd0SMatthew Dillon 		 * Received REPLY+ABORT+DELETE in case where msgid has
23400c3a8cd0SMatthew Dillon 		 * already been fully closed, ignore the message.
23410c3a8cd0SMatthew Dillon 		 */
2342d30cab67SMatthew Dillon 		if (state == pstate) {
23430c3a8cd0SMatthew Dillon 			if (msg->any.head.cmd & DMSGF_ABORT) {
23440c3a8cd0SMatthew Dillon 				error = DMSG_IOQ_ERROR_EALREADY;
23450c3a8cd0SMatthew Dillon 			} else {
23465ab1caedSMatthew Dillon 				dmio_printf(iocom, 2,
23475ab1caedSMatthew Dillon 					    "no-state(r,d) %s\n",
23480c3a8cd0SMatthew Dillon 					    dmsg_msg_str(msg));
23490c3a8cd0SMatthew Dillon 				error = DMSG_IOQ_ERROR_TRANS;
23500c3a8cd0SMatthew Dillon 				assert(0);
23510c3a8cd0SMatthew Dillon 			}
23520c3a8cd0SMatthew Dillon 			break;
23530c3a8cd0SMatthew Dillon 		}
23540c3a8cd0SMatthew Dillon 
23550c3a8cd0SMatthew Dillon 		/*
23560c3a8cd0SMatthew Dillon 		 * Received REPLY+ABORT+DELETE in case where msgid has
23570c3a8cd0SMatthew Dillon 		 * already been reused for an unrelated message,
23580c3a8cd0SMatthew Dillon 		 * ignore the message.
23590c3a8cd0SMatthew Dillon 		 */
23600c3a8cd0SMatthew Dillon 		if ((state->rxcmd & DMSGF_CREATE) == 0) {
23610c3a8cd0SMatthew Dillon 			if (msg->any.head.cmd & DMSGF_ABORT) {
23620c3a8cd0SMatthew Dillon 				error = DMSG_IOQ_ERROR_EALREADY;
23630c3a8cd0SMatthew Dillon 			} else {
23645ab1caedSMatthew Dillon 				dmio_printf(iocom, 2,
23655ab1caedSMatthew Dillon 					    "reused-state(r,d) %s\n",
23660c3a8cd0SMatthew Dillon 					    dmsg_msg_str(msg));
23670c3a8cd0SMatthew Dillon 				error = DMSG_IOQ_ERROR_TRANS;
23680c3a8cd0SMatthew Dillon 				assert(0);
23690c3a8cd0SMatthew Dillon 			}
23700c3a8cd0SMatthew Dillon 			break;
23710c3a8cd0SMatthew Dillon 		}
23720c3a8cd0SMatthew Dillon 		error = 0;
23730c3a8cd0SMatthew Dillon 		break;
23740c3a8cd0SMatthew Dillon 	case DMSGF_REPLY:
23750c3a8cd0SMatthew Dillon 		/*
23760c3a8cd0SMatthew Dillon 		 * Check for mid-stream ABORT reply received to sent command.
23770c3a8cd0SMatthew Dillon 		 */
23780c3a8cd0SMatthew Dillon 		if (msg->any.head.cmd & DMSGF_ABORT) {
2379d30cab67SMatthew Dillon 			if (state == pstate ||
23800c3a8cd0SMatthew Dillon 			    (state->rxcmd & DMSGF_CREATE) == 0) {
23810c3a8cd0SMatthew Dillon 				error = DMSG_IOQ_ERROR_EALREADY;
23820c3a8cd0SMatthew Dillon 				break;
23830c3a8cd0SMatthew Dillon 			}
23840c3a8cd0SMatthew Dillon 		}
23850c3a8cd0SMatthew Dillon 		error = 0;
23860c3a8cd0SMatthew Dillon 		break;
23870c3a8cd0SMatthew Dillon 	}
23888e226bc8SMatthew Dillon 
23898e226bc8SMatthew Dillon 	/*
23908e226bc8SMatthew Dillon 	 * Calculate the easy-switch() transactional command.  Represents
23918e226bc8SMatthew Dillon 	 * the outer-transaction command for any transaction-create or
23928e226bc8SMatthew Dillon 	 * transaction-delete, and the inner message command for any
23938e226bc8SMatthew Dillon 	 * non-transaction or inside-transaction command.  tcmd will be
23948e226bc8SMatthew Dillon 	 * set to 0 for any messaging error condition.
23958e226bc8SMatthew Dillon 	 *
23968e226bc8SMatthew Dillon 	 * The two can be told apart because outer-transaction commands
23978e226bc8SMatthew Dillon 	 * always have a DMSGF_CREATE and/or DMSGF_DELETE flag.
23988e226bc8SMatthew Dillon 	 */
23998e226bc8SMatthew Dillon 	if (msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE)) {
24007adbba57SMatthew Dillon 		if ((msg->state->flags & DMSG_STATE_ROOT) == 0) {
24010a9eefcaSMatthew Dillon 			msg->tcmd = (state->icmd & DMSGF_BASECMDMASK) |
24028e226bc8SMatthew Dillon 				    (msg->any.head.cmd & (DMSGF_CREATE |
24038e226bc8SMatthew Dillon 							  DMSGF_DELETE |
24048e226bc8SMatthew Dillon 							  DMSGF_REPLY));
24058e226bc8SMatthew Dillon 		} else {
24068e226bc8SMatthew Dillon 			msg->tcmd = 0;
24078e226bc8SMatthew Dillon 		}
24088e226bc8SMatthew Dillon 	} else {
24098e226bc8SMatthew Dillon 		msg->tcmd = msg->any.head.cmd & DMSGF_CMDSWMASK;
24108e226bc8SMatthew Dillon 	}
24117adbba57SMatthew Dillon 
24127adbba57SMatthew Dillon #ifdef DMSG_BLOCK_DEBUG
24137adbba57SMatthew Dillon 	switch (msg->tcmd) {
24147adbba57SMatthew Dillon 	case DMSG_BLK_READ | DMSGF_CREATE | DMSGF_DELETE:
24157adbba57SMatthew Dillon 	case DMSG_BLK_WRITE | DMSGF_CREATE | DMSGF_DELETE:
24165ab1caedSMatthew Dillon 		dmio_printf(iocom, 4,
24175ab1caedSMatthew Dillon 			    "read  BIO %-3d %016jx %d@%016jx\n",
24187adbba57SMatthew Dillon 			    biocount, msg->any.head.msgid,
24197adbba57SMatthew Dillon 			    msg->any.blk_read.bytes,
24207adbba57SMatthew Dillon 			    msg->any.blk_read.offset);
24217adbba57SMatthew Dillon 		break;
24227adbba57SMatthew Dillon 	case DMSG_BLK_READ | DMSGF_CREATE | DMSGF_DELETE | DMSGF_REPLY:
24237adbba57SMatthew Dillon 	case DMSG_BLK_WRITE | DMSGF_CREATE | DMSGF_DELETE | DMSGF_REPLY:
24245ab1caedSMatthew Dillon 		dmio_printf(iocom, 4,
24255ab1caedSMatthew Dillon 			    "rread BIO %-3d %016jx %d@%016jx\n",
24267adbba57SMatthew Dillon 			    biocount, msg->any.head.msgid,
24277adbba57SMatthew Dillon 			    msg->any.blk_read.bytes,
24287adbba57SMatthew Dillon 			    msg->any.blk_read.offset);
24297adbba57SMatthew Dillon 		break;
24307adbba57SMatthew Dillon 	default:
24317adbba57SMatthew Dillon 		break;
24327adbba57SMatthew Dillon 	}
24337adbba57SMatthew Dillon #endif
24347adbba57SMatthew Dillon 
24350a9eefcaSMatthew Dillon 	/*
24360a9eefcaSMatthew Dillon 	 * Adjust state, mark receive side as DELETED if appropriate and
24370a9eefcaSMatthew Dillon 	 * adjust RB tree if both sides are DELETED.  cleanuprx handles
24380a9eefcaSMatthew Dillon 	 * the rest after the state callback returns.
24390a9eefcaSMatthew Dillon 	 */
24400a9eefcaSMatthew Dillon 	assert(msg->state->iocom == iocom);
24410a9eefcaSMatthew Dillon 	assert(msg->state == state);
24420a9eefcaSMatthew Dillon 
24430a9eefcaSMatthew Dillon 	if (state->flags & DMSG_STATE_ROOT) {
24440a9eefcaSMatthew Dillon 		/*
24450a9eefcaSMatthew Dillon 		 * Nothing to do for non-transactional messages.
24460a9eefcaSMatthew Dillon 		 */
24470a9eefcaSMatthew Dillon 	} else if (msg->any.head.cmd & DMSGF_DELETE) {
24480a9eefcaSMatthew Dillon 		/*
24490a9eefcaSMatthew Dillon 		 * Message terminating transaction, remove the state from
24500a9eefcaSMatthew Dillon 		 * the RB tree if the full transaction is now complete.
24510a9eefcaSMatthew Dillon 		 * The related state, subq, and parent link is retained
24520a9eefcaSMatthew Dillon 		 * until after the state callback is complete.
24530a9eefcaSMatthew Dillon 		 */
24540a9eefcaSMatthew Dillon 		assert((state->rxcmd & DMSGF_DELETE) == 0);
24550a9eefcaSMatthew Dillon 		state->rxcmd |= DMSGF_DELETE;
24560a9eefcaSMatthew Dillon 		if (state->txcmd & DMSGF_DELETE) {
24570a9eefcaSMatthew Dillon 			assert(state->flags & DMSG_STATE_RBINSERTED);
24580a9eefcaSMatthew Dillon 			if (state->rxcmd & DMSGF_REPLY) {
24590a9eefcaSMatthew Dillon 				assert(msg->any.head.cmd & DMSGF_REPLY);
24600a9eefcaSMatthew Dillon 				RB_REMOVE(dmsg_state_tree,
24610a9eefcaSMatthew Dillon 					  &iocom->statewr_tree, state);
24620a9eefcaSMatthew Dillon 			} else {
24630a9eefcaSMatthew Dillon 				assert((msg->any.head.cmd & DMSGF_REPLY) == 0);
24640a9eefcaSMatthew Dillon 				RB_REMOVE(dmsg_state_tree,
24650a9eefcaSMatthew Dillon 					  &iocom->staterd_tree, state);
24660a9eefcaSMatthew Dillon 			}
24670a9eefcaSMatthew Dillon 			state->flags &= ~DMSG_STATE_RBINSERTED;
24680a9eefcaSMatthew Dillon 			dmsg_state_drop(state);
24690a9eefcaSMatthew Dillon 		}
24700a9eefcaSMatthew Dillon 	}
24710a9eefcaSMatthew Dillon 
24720a9eefcaSMatthew Dillon 	pthread_mutex_unlock(&iocom->mtx);
24730a9eefcaSMatthew Dillon 
24740a9eefcaSMatthew Dillon 	if (DMsgDebugOpt && error)
24755ab1caedSMatthew Dillon 		dmio_printf(iocom, 1, "msgrx: error %d\n", error);
24760a9eefcaSMatthew Dillon 
24770c3a8cd0SMatthew Dillon 	return (error);
24780c3a8cd0SMatthew Dillon }
24790c3a8cd0SMatthew Dillon 
24801b8eded1SMatthew Dillon /*
2481d30cab67SMatthew Dillon  * Route the message and handle pair-state processing.
24821b8eded1SMatthew Dillon  */
2483d30cab67SMatthew Dillon void
2484d30cab67SMatthew Dillon dmsg_state_relay(dmsg_msg_t *lmsg)
24851b8eded1SMatthew Dillon {
2486d30cab67SMatthew Dillon 	dmsg_state_t *lpstate;
2487d30cab67SMatthew Dillon 	dmsg_state_t *rpstate;
2488d30cab67SMatthew Dillon 	dmsg_state_t *lstate;
2489d30cab67SMatthew Dillon 	dmsg_state_t *rstate;
2490d30cab67SMatthew Dillon 	dmsg_msg_t *rmsg;
24911b8eded1SMatthew Dillon 
24927adbba57SMatthew Dillon #ifdef DMSG_BLOCK_DEBUG
24937adbba57SMatthew Dillon 	switch (lmsg->tcmd) {
24940a9eefcaSMatthew Dillon 	case DMSG_BLK_OPEN | DMSGF_CREATE:
24955ab1caedSMatthew Dillon 		dmio_printf(iocom, 4, "%s\n",
24965ab1caedSMatthew Dillon 			    "relay BIO_OPEN (CREATE)");
24970a9eefcaSMatthew Dillon 		break;
24980a9eefcaSMatthew Dillon 	case DMSG_BLK_OPEN | DMSGF_DELETE:
24995ab1caedSMatthew Dillon 		dmio_printf(iocom, 4, "%s\n",
25005ab1caedSMatthew Dillon 			    "relay BIO_OPEN (DELETE)");
25010a9eefcaSMatthew Dillon 		break;
25027adbba57SMatthew Dillon 	case DMSG_BLK_READ | DMSGF_CREATE | DMSGF_DELETE:
25037adbba57SMatthew Dillon 	case DMSG_BLK_WRITE | DMSGF_CREATE | DMSGF_DELETE:
25047adbba57SMatthew Dillon 		atomic_add_int(&biocount, 1);
25055ab1caedSMatthew Dillon 		dmio_printf(iocom, 4,
25065ab1caedSMatthew Dillon 			    "relay BIO %-3d %016jx %d@%016jx\n",
25077adbba57SMatthew Dillon 			    biocount, lmsg->any.head.msgid,
25087adbba57SMatthew Dillon 			    lmsg->any.blk_read.bytes,
25097adbba57SMatthew Dillon 			    lmsg->any.blk_read.offset);
25107adbba57SMatthew Dillon 		break;
25117adbba57SMatthew Dillon 	case DMSG_BLK_READ | DMSGF_CREATE | DMSGF_DELETE | DMSGF_REPLY:
25127adbba57SMatthew Dillon 	case DMSG_BLK_WRITE | DMSGF_CREATE | DMSGF_DELETE | DMSGF_REPLY:
25135ab1caedSMatthew Dillon 		dmio_printf(iocom, 4,
25145ab1caedSMatthew Dillon 			    "retrn BIO %-3d %016jx %d@%016jx\n",
25157adbba57SMatthew Dillon 			    biocount, lmsg->any.head.msgid,
25167adbba57SMatthew Dillon 			    lmsg->any.blk_read.bytes,
25177adbba57SMatthew Dillon 			    lmsg->any.blk_read.offset);
25187adbba57SMatthew Dillon 		atomic_add_int(&biocount, -1);
25197adbba57SMatthew Dillon 		break;
25207adbba57SMatthew Dillon 	default:
25217adbba57SMatthew Dillon 		break;
25227adbba57SMatthew Dillon 	}
25237adbba57SMatthew Dillon #endif
25247adbba57SMatthew Dillon 
2525d30cab67SMatthew Dillon 	if ((lmsg->any.head.cmd & (DMSGF_CREATE | DMSGF_REPLY)) ==
2526d30cab67SMatthew Dillon 	    DMSGF_CREATE) {
25271b8eded1SMatthew Dillon 		/*
2528d30cab67SMatthew Dillon 		 * New sub-transaction, establish new state and relay.
25291b8eded1SMatthew Dillon 		 */
2530d30cab67SMatthew Dillon 		lstate = lmsg->state;
2531d30cab67SMatthew Dillon 		lpstate = lstate->parent;
2532d30cab67SMatthew Dillon 		rpstate = lpstate->relay;
2533d30cab67SMatthew Dillon 		assert(lstate->relay == NULL);
2534d30cab67SMatthew Dillon 		assert(rpstate != NULL);
25351b8eded1SMatthew Dillon 
2536e96cef49SMatthew Dillon 		rmsg = dmsg_msg_alloc(rpstate, 0,
2537d30cab67SMatthew Dillon 				      lmsg->any.head.cmd,
2538d30cab67SMatthew Dillon 				      dmsg_state_relay, NULL);
2539d30cab67SMatthew Dillon 		rstate = rmsg->state;
2540d30cab67SMatthew Dillon 		rstate->relay = lstate;
2541d30cab67SMatthew Dillon 		lstate->relay = rstate;
2542323c0947SMatthew Dillon 		dmsg_state_hold(lstate);
2543323c0947SMatthew Dillon 		dmsg_state_hold(rstate);
25441b8eded1SMatthew Dillon 	} else {
25451b8eded1SMatthew Dillon 		/*
2546d30cab67SMatthew Dillon 		 * State & relay already established
25471b8eded1SMatthew Dillon 		 */
2548d30cab67SMatthew Dillon 		lstate = lmsg->state;
2549d30cab67SMatthew Dillon 		rstate = lstate->relay;
2550d30cab67SMatthew Dillon 		assert(rstate != NULL);
2551d30cab67SMatthew Dillon 
25520a9eefcaSMatthew Dillon 		assert((rstate->txcmd & DMSGF_DELETE) == 0);
25530a9eefcaSMatthew Dillon 
25540a9eefcaSMatthew Dillon #if 0
25550a9eefcaSMatthew Dillon 		if (lstate->flags & DMSG_STATE_ABORTING) {
25565ab1caedSMatthew Dillon 			dmio_printf(iocom, 4,
25570a9eefcaSMatthew Dillon 				    "relay: relay lost link l=%p r=%p\n",
25580a9eefcaSMatthew Dillon 				    lstate, rstate);
25590a9eefcaSMatthew Dillon 			dmsg_simulate_failure(rstate, 0, DMSG_ERR_LOSTLINK);
25600a9eefcaSMatthew Dillon 		}
25610a9eefcaSMatthew Dillon #endif
25620a9eefcaSMatthew Dillon 
2563e96cef49SMatthew Dillon 		rmsg = dmsg_msg_alloc(rstate, 0,
2564d30cab67SMatthew Dillon 				      lmsg->any.head.cmd,
2565d30cab67SMatthew Dillon 				      dmsg_state_relay, NULL);
25661b8eded1SMatthew Dillon 	}
2567d30cab67SMatthew Dillon 	if (lmsg->hdr_size > sizeof(lmsg->any.head)) {
2568d30cab67SMatthew Dillon 		bcopy(&lmsg->any.head + 1, &rmsg->any.head + 1,
2569d30cab67SMatthew Dillon 		      lmsg->hdr_size - sizeof(lmsg->any.head));
2570d30cab67SMatthew Dillon 	}
2571d30cab67SMatthew Dillon 	rmsg->any.head.error = lmsg->any.head.error;
2572d30cab67SMatthew Dillon 	rmsg->any.head.reserved02 = lmsg->any.head.reserved02;
2573d30cab67SMatthew Dillon 	rmsg->any.head.reserved18 = lmsg->any.head.reserved18;
2574e96cef49SMatthew Dillon 	rmsg->aux_size = lmsg->aux_size;
2575d30cab67SMatthew Dillon 	rmsg->aux_data = lmsg->aux_data;
2576d30cab67SMatthew Dillon 	lmsg->aux_data = NULL;
25770a9eefcaSMatthew Dillon 
2578d30cab67SMatthew Dillon 	dmsg_msg_write(rmsg);
25791b8eded1SMatthew Dillon }
25801b8eded1SMatthew Dillon 
2581d30cab67SMatthew Dillon /*
25820a9eefcaSMatthew Dillon  * Cleanup and retire msg after issuing the state callback.  The state
25830a9eefcaSMatthew Dillon  * has already been removed from the RB tree.  The subq and msg must be
25840a9eefcaSMatthew Dillon  * cleaned up.
25850a9eefcaSMatthew Dillon  *
25860a9eefcaSMatthew Dillon  * Called with the iocom mutex held (to handle subq disconnection).
2587d30cab67SMatthew Dillon  */
25880c3a8cd0SMatthew Dillon void
25890c3a8cd0SMatthew Dillon dmsg_state_cleanuprx(dmsg_iocom_t *iocom, dmsg_msg_t *msg)
25900c3a8cd0SMatthew Dillon {
25910c3a8cd0SMatthew Dillon 	dmsg_state_t *state;
25920c3a8cd0SMatthew Dillon 
25931b8eded1SMatthew Dillon 	assert(msg->state->iocom == iocom);
25941b8eded1SMatthew Dillon 	state = msg->state;
2595d30cab67SMatthew Dillon 	if (state->flags & DMSG_STATE_ROOT) {
25960c3a8cd0SMatthew Dillon 		/*
25970c3a8cd0SMatthew Dillon 		 * Free a non-transactional message, there is no state
25980c3a8cd0SMatthew Dillon 		 * to worry about.
25990c3a8cd0SMatthew Dillon 		 */
26000c3a8cd0SMatthew Dillon 		dmsg_msg_free(msg);
26010a9eefcaSMatthew Dillon 	} else if ((state->flags & DMSG_STATE_SUBINSERTED) &&
26020a9eefcaSMatthew Dillon 		   (state->rxcmd & DMSGF_DELETE) &&
26030a9eefcaSMatthew Dillon 		   (state->txcmd & DMSGF_DELETE)) {
26040c3a8cd0SMatthew Dillon 		/*
26050a9eefcaSMatthew Dillon 		 * Must disconnect from parent and drop relay.
26060c3a8cd0SMatthew Dillon 		 */
26070a9eefcaSMatthew Dillon 		dmsg_subq_delete(state);
2608d30cab67SMatthew Dillon 		if (state->relay) {
2609323c0947SMatthew Dillon 			dmsg_state_drop(state->relay);
2610d30cab67SMatthew Dillon 			state->relay = NULL;
2611d30cab67SMatthew Dillon 		}
26121b8eded1SMatthew Dillon 		dmsg_msg_free(msg);
26131b8eded1SMatthew Dillon 	} else {
26140c3a8cd0SMatthew Dillon 		/*
26150c3a8cd0SMatthew Dillon 		 * Message not terminating transaction, leave state intact
26160c3a8cd0SMatthew Dillon 		 * and free message if it isn't the CREATE message.
26170c3a8cd0SMatthew Dillon 		 */
26180c3a8cd0SMatthew Dillon 		dmsg_msg_free(msg);
26190c3a8cd0SMatthew Dillon 	}
26200c3a8cd0SMatthew Dillon }
26210c3a8cd0SMatthew Dillon 
2622323c0947SMatthew Dillon /*
2623323c0947SMatthew Dillon  * Clean up the state after pulling out needed fields and queueing the
2624323c0947SMatthew Dillon  * message for transmission.   This occurs in dmsg_msg_write().
26250a9eefcaSMatthew Dillon  *
26260a9eefcaSMatthew Dillon  * Called with the mutex locked.
2627323c0947SMatthew Dillon  */
26280c3a8cd0SMatthew Dillon static void
26291b8eded1SMatthew Dillon dmsg_state_cleanuptx(dmsg_iocom_t *iocom, dmsg_msg_t *msg)
26300c3a8cd0SMatthew Dillon {
26310c3a8cd0SMatthew Dillon 	dmsg_state_t *state;
26320c3a8cd0SMatthew Dillon 
26331b8eded1SMatthew Dillon 	assert(iocom == msg->state->iocom);
26341b8eded1SMatthew Dillon 	state = msg->state;
26350a9eefcaSMatthew Dillon 
26360a9eefcaSMatthew Dillon 	dmsg_state_hold(state);
26370a9eefcaSMatthew Dillon 
2638d30cab67SMatthew Dillon 	if (state->flags & DMSG_STATE_ROOT) {
2639323c0947SMatthew Dillon 		;
26400c3a8cd0SMatthew Dillon 	} else if (msg->any.head.cmd & DMSGF_DELETE) {
2641323c0947SMatthew Dillon 		/*
2642323c0947SMatthew Dillon 		 * Message terminating transaction, destroy the related
2643323c0947SMatthew Dillon 		 * state, the original message, and this message (if it
2644323c0947SMatthew Dillon 		 * isn't the original message due to a CREATE|DELETE).
2645323c0947SMatthew Dillon 		 *
2646323c0947SMatthew Dillon 		 * It's possible for governing state to terminate while
2647323c0947SMatthew Dillon 		 * sub-transactions still exist.  This is allowed but
2648323c0947SMatthew Dillon 		 * will cause sub-transactions to recursively fail.
2649323c0947SMatthew Dillon 		 * Further reception of sub-transaction messages will be
2650323c0947SMatthew Dillon 		 * impossible because the circuit will no longer exist.
2651323c0947SMatthew Dillon 		 * (XXX need code to make sure that happens properly).
26520a9eefcaSMatthew Dillon 		 *
26530a9eefcaSMatthew Dillon 		 * NOTE: It is possible for a fafilure to terminate the
26540a9eefcaSMatthew Dillon 		 *	 state after we have written the message but before
26550a9eefcaSMatthew Dillon 		 *	 we are able to call cleanuptx, so txcmd might already
26560a9eefcaSMatthew Dillon 		 *	 have DMSGF_DELETE set.
2657323c0947SMatthew Dillon 		 */
26580a9eefcaSMatthew Dillon 		if ((state->txcmd & DMSGF_DELETE) == 0 &&
26590a9eefcaSMatthew Dillon 		    (state->rxcmd & DMSGF_DELETE)) {
26600c3a8cd0SMatthew Dillon 			state->txcmd |= DMSGF_DELETE;
2661a06d536bSMatthew Dillon 			assert(state->flags & DMSG_STATE_RBINSERTED);
26620c3a8cd0SMatthew Dillon 			if (state->txcmd & DMSGF_REPLY) {
26630c3a8cd0SMatthew Dillon 				assert(msg->any.head.cmd & DMSGF_REPLY);
26640c3a8cd0SMatthew Dillon 				RB_REMOVE(dmsg_state_tree,
26651b8eded1SMatthew Dillon 					  &iocom->staterd_tree, state);
26660c3a8cd0SMatthew Dillon 			} else {
26670c3a8cd0SMatthew Dillon 				assert((msg->any.head.cmd & DMSGF_REPLY) == 0);
26680c3a8cd0SMatthew Dillon 				RB_REMOVE(dmsg_state_tree,
26691b8eded1SMatthew Dillon 					  &iocom->statewr_tree, state);
26701b8eded1SMatthew Dillon 			}
2671a06d536bSMatthew Dillon 			state->flags &= ~DMSG_STATE_RBINSERTED;
26720a9eefcaSMatthew Dillon 			dmsg_subq_delete(state);
2673d30cab67SMatthew Dillon 
2674d30cab67SMatthew Dillon 			if (state->relay) {
2675323c0947SMatthew Dillon 				dmsg_state_drop(state->relay);
2676d30cab67SMatthew Dillon 				state->relay = NULL;
2677d30cab67SMatthew Dillon 			}
26780a9eefcaSMatthew Dillon 			dmsg_state_drop(state);	/* state->rbtree */
26790a9eefcaSMatthew Dillon 		} else if ((state->txcmd & DMSGF_DELETE) == 0) {
26800a9eefcaSMatthew Dillon 			state->txcmd |= DMSGF_DELETE;
26810c3a8cd0SMatthew Dillon 		}
26820c3a8cd0SMatthew Dillon 	}
26830a9eefcaSMatthew Dillon 
26840a9eefcaSMatthew Dillon 	/*
26850a9eefcaSMatthew Dillon 	 * Deferred abort after transmission.
26860a9eefcaSMatthew Dillon 	 */
26870a9eefcaSMatthew Dillon 	if ((state->flags & (DMSG_STATE_ABORTING | DMSG_STATE_DYING)) &&
26880a9eefcaSMatthew Dillon 	    (state->rxcmd & DMSGF_DELETE) == 0) {
26895ab1caedSMatthew Dillon 		dmio_printf(iocom, 4,
26905ab1caedSMatthew Dillon 			    "cleanuptx: state=%p "
26910a9eefcaSMatthew Dillon 			    "executing deferred abort\n",
26920a9eefcaSMatthew Dillon 			    state);
26930a9eefcaSMatthew Dillon 		state->flags &= ~DMSG_STATE_ABORTING;
26940a9eefcaSMatthew Dillon 		dmsg_simulate_failure(state, 1, DMSG_ERR_LOSTLINK);
26950a9eefcaSMatthew Dillon 	}
26960a9eefcaSMatthew Dillon 
26970a9eefcaSMatthew Dillon 	dmsg_state_drop(state);
26980c3a8cd0SMatthew Dillon }
26990c3a8cd0SMatthew Dillon 
27000c3a8cd0SMatthew Dillon /*
2701323c0947SMatthew Dillon  * Called with or without locks
2702323c0947SMatthew Dillon  */
2703323c0947SMatthew Dillon void
2704323c0947SMatthew Dillon dmsg_state_hold(dmsg_state_t *state)
2705323c0947SMatthew Dillon {
2706323c0947SMatthew Dillon 	atomic_add_int(&state->refs, 1);
2707323c0947SMatthew Dillon }
2708323c0947SMatthew Dillon 
2709323c0947SMatthew Dillon void
2710323c0947SMatthew Dillon dmsg_state_drop(dmsg_state_t *state)
2711323c0947SMatthew Dillon {
27120a9eefcaSMatthew Dillon 	assert(state->refs > 0);
2713323c0947SMatthew Dillon 	if (atomic_fetchadd_int(&state->refs, -1) == 1)
2714323c0947SMatthew Dillon 		dmsg_state_free(state);
2715323c0947SMatthew Dillon }
2716323c0947SMatthew Dillon 
2717323c0947SMatthew Dillon /*
27180c3a8cd0SMatthew Dillon  * Called with iocom locked
27190c3a8cd0SMatthew Dillon  */
2720323c0947SMatthew Dillon static void
27210c3a8cd0SMatthew Dillon dmsg_state_free(dmsg_state_t *state)
27220c3a8cd0SMatthew Dillon {
2723323c0947SMatthew Dillon 	atomic_add_int(&dmsg_state_count, -1);
27245ab1caedSMatthew Dillon 	dmio_printf(state->iocom, 5, "terminate state %p\n", state);
2725a06d536bSMatthew Dillon 	assert((state->flags & (DMSG_STATE_ROOT |
2726a06d536bSMatthew Dillon 				DMSG_STATE_SUBINSERTED |
2727a06d536bSMatthew Dillon 				DMSG_STATE_RBINSERTED)) == 0);
2728323c0947SMatthew Dillon 	assert(TAILQ_EMPTY(&state->subq));
2729323c0947SMatthew Dillon 	assert(state->refs == 0);
2730f306de83SMatthew Dillon 	if (state->any.any != NULL)   /* XXX avoid deadlock w/exit & kernel */
2731f306de83SMatthew Dillon 		closefrom(3);
27320c3a8cd0SMatthew Dillon 	assert(state->any.any == NULL);
27330c3a8cd0SMatthew Dillon 	free(state);
27340d20ec8aSMatthew Dillon }
27350c3a8cd0SMatthew Dillon 
27360c3a8cd0SMatthew Dillon /*
27370c3a8cd0SMatthew Dillon  * This swaps endian for a hammer2_msg_hdr.  Note that the extended
27380c3a8cd0SMatthew Dillon  * header is not adjusted, just the core header.
27390c3a8cd0SMatthew Dillon  */
27400c3a8cd0SMatthew Dillon void
27410c3a8cd0SMatthew Dillon dmsg_bswap_head(dmsg_hdr_t *head)
27420c3a8cd0SMatthew Dillon {
27430c3a8cd0SMatthew Dillon 	head->magic	= bswap16(head->magic);
27440c3a8cd0SMatthew Dillon 	head->reserved02 = bswap16(head->reserved02);
27450c3a8cd0SMatthew Dillon 	head->salt	= bswap32(head->salt);
27460c3a8cd0SMatthew Dillon 
27470c3a8cd0SMatthew Dillon 	head->msgid	= bswap64(head->msgid);
27480d20ec8aSMatthew Dillon 	head->circuit	= bswap64(head->circuit);
27490d20ec8aSMatthew Dillon 	head->reserved18= bswap64(head->reserved18);
27500c3a8cd0SMatthew Dillon 
27510c3a8cd0SMatthew Dillon 	head->cmd	= bswap32(head->cmd);
27520c3a8cd0SMatthew Dillon 	head->aux_crc	= bswap32(head->aux_crc);
27530c3a8cd0SMatthew Dillon 	head->aux_bytes	= bswap32(head->aux_bytes);
27540c3a8cd0SMatthew Dillon 	head->error	= bswap32(head->error);
27550c3a8cd0SMatthew Dillon 	head->aux_descr = bswap64(head->aux_descr);
27560c3a8cd0SMatthew Dillon 	head->reserved38= bswap32(head->reserved38);
27570c3a8cd0SMatthew Dillon 	head->hdr_crc	= bswap32(head->hdr_crc);
27580c3a8cd0SMatthew Dillon }
2759