xref: /dragonfly/lib/libdmsg/msg.c (revision a988b43e)
/*
 * Copyright (c) 2011-2015 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
350c3a8cd0SMatthew Dillon 
360c3a8cd0SMatthew Dillon #include "dmsg_local.h"
370c3a8cd0SMatthew Dillon 
380a9eefcaSMatthew Dillon #define DMSG_BLOCK_DEBUG
390a9eefcaSMatthew Dillon 
400c3a8cd0SMatthew Dillon int DMsgDebugOpt;
41157f2a25STomohiro Kusumi static unsigned int dmsg_state_count;
427adbba57SMatthew Dillon #ifdef DMSG_BLOCK_DEBUG
43157f2a25STomohiro Kusumi static unsigned int biocount;
447adbba57SMatthew Dillon #endif
450c3a8cd0SMatthew Dillon 
460a9eefcaSMatthew Dillon static int dmsg_state_msgrx(dmsg_msg_t *msg, int mstate);
471b8eded1SMatthew Dillon static void dmsg_state_cleanuptx(dmsg_iocom_t *iocom, dmsg_msg_t *msg);
48a2179323SMatthew Dillon static void dmsg_msg_free_locked(dmsg_msg_t *msg);
49323c0947SMatthew Dillon static void dmsg_state_free(dmsg_state_t *state);
500a9eefcaSMatthew Dillon static void dmsg_subq_delete(dmsg_state_t *state);
510a9eefcaSMatthew Dillon static void dmsg_simulate_failure(dmsg_state_t *state, int meto, int error);
520a9eefcaSMatthew Dillon static void dmsg_state_abort(dmsg_state_t *state);
530a9eefcaSMatthew Dillon static void dmsg_state_dying(dmsg_state_t *state);
540c3a8cd0SMatthew Dillon 
550d20ec8aSMatthew Dillon RB_GENERATE(dmsg_state_tree, dmsg_state, rbnode, dmsg_state_cmp);
560c3a8cd0SMatthew Dillon 
570c3a8cd0SMatthew Dillon /*
580c3a8cd0SMatthew Dillon  * STATE TREE - Represents open transactions which are indexed by their
590d20ec8aSMatthew Dillon  *		{ msgid } relative to the governing iocom.
600c3a8cd0SMatthew Dillon  */
610c3a8cd0SMatthew Dillon int
dmsg_state_cmp(dmsg_state_t * state1,dmsg_state_t * state2)620c3a8cd0SMatthew Dillon dmsg_state_cmp(dmsg_state_t *state1, dmsg_state_t *state2)
630c3a8cd0SMatthew Dillon {
640c3a8cd0SMatthew Dillon 	if (state1->msgid < state2->msgid)
650c3a8cd0SMatthew Dillon 		return(-1);
660c3a8cd0SMatthew Dillon 	if (state1->msgid > state2->msgid)
670c3a8cd0SMatthew Dillon 		return(1);
680c3a8cd0SMatthew Dillon 	return(0);
690c3a8cd0SMatthew Dillon }
700c3a8cd0SMatthew Dillon 
710d20ec8aSMatthew Dillon /*
720c3a8cd0SMatthew Dillon  * Initialize a low-level ioq
730c3a8cd0SMatthew Dillon  */
740c3a8cd0SMatthew Dillon void
dmsg_ioq_init(dmsg_iocom_t * iocom __unused,dmsg_ioq_t * ioq)750c3a8cd0SMatthew Dillon dmsg_ioq_init(dmsg_iocom_t *iocom __unused, dmsg_ioq_t *ioq)
760c3a8cd0SMatthew Dillon {
770c3a8cd0SMatthew Dillon 	bzero(ioq, sizeof(*ioq));
780c3a8cd0SMatthew Dillon 	ioq->state = DMSG_MSGQ_STATE_HEADER1;
790c3a8cd0SMatthew Dillon 	TAILQ_INIT(&ioq->msgq);
800c3a8cd0SMatthew Dillon }
810c3a8cd0SMatthew Dillon 
820c3a8cd0SMatthew Dillon /*
830c3a8cd0SMatthew Dillon  * Cleanup queue.
840c3a8cd0SMatthew Dillon  *
850c3a8cd0SMatthew Dillon  * caller holds iocom->mtx.
860c3a8cd0SMatthew Dillon  */
870c3a8cd0SMatthew Dillon void
dmsg_ioq_done(dmsg_iocom_t * iocom __unused,dmsg_ioq_t * ioq)880c3a8cd0SMatthew Dillon dmsg_ioq_done(dmsg_iocom_t *iocom __unused, dmsg_ioq_t *ioq)
890c3a8cd0SMatthew Dillon {
900c3a8cd0SMatthew Dillon 	dmsg_msg_t *msg;
910c3a8cd0SMatthew Dillon 
920c3a8cd0SMatthew Dillon 	while ((msg = TAILQ_FIRST(&ioq->msgq)) != NULL) {
930c3a8cd0SMatthew Dillon 		assert(0);	/* shouldn't happen */
940c3a8cd0SMatthew Dillon 		TAILQ_REMOVE(&ioq->msgq, msg, qentry);
950c3a8cd0SMatthew Dillon 		dmsg_msg_free(msg);
960c3a8cd0SMatthew Dillon 	}
970c3a8cd0SMatthew Dillon 	if ((msg = ioq->msg) != NULL) {
980c3a8cd0SMatthew Dillon 		ioq->msg = NULL;
990c3a8cd0SMatthew Dillon 		dmsg_msg_free(msg);
1000c3a8cd0SMatthew Dillon 	}
1010c3a8cd0SMatthew Dillon }
1020c3a8cd0SMatthew Dillon 
1030c3a8cd0SMatthew Dillon /*
1040c3a8cd0SMatthew Dillon  * Initialize a low-level communications channel.
1050c3a8cd0SMatthew Dillon  *
1060c3a8cd0SMatthew Dillon  * NOTE: The signal_func() is called at least once from the loop and can be
1070c3a8cd0SMatthew Dillon  *	 re-armed via dmsg_iocom_restate().
1080c3a8cd0SMatthew Dillon  */
1090c3a8cd0SMatthew Dillon void
dmsg_iocom_init(dmsg_iocom_t * iocom,int sock_fd,int alt_fd,void (* signal_func)(dmsg_iocom_t * iocom),void (* rcvmsg_func)(dmsg_msg_t * msg),void (* usrmsg_func)(dmsg_msg_t * msg,int unmanaged),void (* altmsg_func)(dmsg_iocom_t * iocom))1100c3a8cd0SMatthew Dillon dmsg_iocom_init(dmsg_iocom_t *iocom, int sock_fd, int alt_fd,
11101e43224SMatthew Dillon 		   void (*signal_func)(dmsg_iocom_t *iocom),
11201e43224SMatthew Dillon 		   void (*rcvmsg_func)(dmsg_msg_t *msg),
11301e43224SMatthew Dillon 		   void (*usrmsg_func)(dmsg_msg_t *msg, int unmanaged),
11401e43224SMatthew Dillon 		   void (*altmsg_func)(dmsg_iocom_t *iocom))
1150c3a8cd0SMatthew Dillon {
1160c3a8cd0SMatthew Dillon 	struct stat st;
1170c3a8cd0SMatthew Dillon 
1180c3a8cd0SMatthew Dillon 	bzero(iocom, sizeof(*iocom));
1190c3a8cd0SMatthew Dillon 
120f306de83SMatthew Dillon 	asprintf(&iocom->label, "iocom-%p", iocom);
1210d20ec8aSMatthew Dillon 	iocom->signal_callback = signal_func;
1220d20ec8aSMatthew Dillon 	iocom->rcvmsg_callback = rcvmsg_func;
1230d20ec8aSMatthew Dillon 	iocom->altmsg_callback = altmsg_func;
12401e43224SMatthew Dillon 	iocom->usrmsg_callback = usrmsg_func;
1250c3a8cd0SMatthew Dillon 
1260c3a8cd0SMatthew Dillon 	pthread_mutex_init(&iocom->mtx, NULL);
1271b8eded1SMatthew Dillon 	RB_INIT(&iocom->staterd_tree);
1281b8eded1SMatthew Dillon 	RB_INIT(&iocom->statewr_tree);
1290d20ec8aSMatthew Dillon 	TAILQ_INIT(&iocom->txmsgq);
1300c3a8cd0SMatthew Dillon 	iocom->sock_fd = sock_fd;
1310c3a8cd0SMatthew Dillon 	iocom->alt_fd = alt_fd;
13298126869SMatthew Dillon 	iocom->flags = DMSG_IOCOMF_RREQ | DMSG_IOCOMF_CLOSEALT;
1330c3a8cd0SMatthew Dillon 	if (signal_func)
1340c3a8cd0SMatthew Dillon 		iocom->flags |= DMSG_IOCOMF_SWORK;
1350c3a8cd0SMatthew Dillon 	dmsg_ioq_init(iocom, &iocom->ioq_rx);
1360c3a8cd0SMatthew Dillon 	dmsg_ioq_init(iocom, &iocom->ioq_tx);
137323c0947SMatthew Dillon 	iocom->state0.refs = 1;		/* should never trigger a free */
1381b8eded1SMatthew Dillon 	iocom->state0.iocom = iocom;
1391b8eded1SMatthew Dillon 	iocom->state0.parent = &iocom->state0;
140d30cab67SMatthew Dillon 	iocom->state0.flags = DMSG_STATE_ROOT;
1411b8eded1SMatthew Dillon 	TAILQ_INIT(&iocom->state0.subq);
1421b8eded1SMatthew Dillon 
1430c3a8cd0SMatthew Dillon 	if (pipe(iocom->wakeupfds) < 0)
1440c3a8cd0SMatthew Dillon 		assert(0);
1450c3a8cd0SMatthew Dillon 	fcntl(iocom->wakeupfds[0], F_SETFL, O_NONBLOCK);
1460c3a8cd0SMatthew Dillon 	fcntl(iocom->wakeupfds[1], F_SETFL, O_NONBLOCK);
1470c3a8cd0SMatthew Dillon 
1480c3a8cd0SMatthew Dillon 	/*
1490c3a8cd0SMatthew Dillon 	 * Negotiate session crypto synchronously.  This will mark the
1500c3a8cd0SMatthew Dillon 	 * connection as error'd if it fails.  If this is a pipe it's
1510c3a8cd0SMatthew Dillon 	 * a linkage that we set up ourselves to the filesystem and there
1520c3a8cd0SMatthew Dillon 	 * is no crypto.
1530c3a8cd0SMatthew Dillon 	 */
1540c3a8cd0SMatthew Dillon 	if (fstat(sock_fd, &st) < 0)
1550c3a8cd0SMatthew Dillon 		assert(0);
1560c3a8cd0SMatthew Dillon 	if (S_ISSOCK(st.st_mode))
1570c3a8cd0SMatthew Dillon 		dmsg_crypto_negotiate(iocom);
1580c3a8cd0SMatthew Dillon 
1590c3a8cd0SMatthew Dillon 	/*
1600c3a8cd0SMatthew Dillon 	 * Make sure our fds are set to non-blocking for the iocom core.
1610c3a8cd0SMatthew Dillon 	 */
1620c3a8cd0SMatthew Dillon 	if (sock_fd >= 0)
1630c3a8cd0SMatthew Dillon 		fcntl(sock_fd, F_SETFL, O_NONBLOCK);
1640c3a8cd0SMatthew Dillon #if 0
1650c3a8cd0SMatthew Dillon 	/* if line buffered our single fgets() should be fine */
1660c3a8cd0SMatthew Dillon 	if (alt_fd >= 0)
1670c3a8cd0SMatthew Dillon 		fcntl(alt_fd, F_SETFL, O_NONBLOCK);
1680c3a8cd0SMatthew Dillon #endif
1690c3a8cd0SMatthew Dillon }
1700c3a8cd0SMatthew Dillon 
171f306de83SMatthew Dillon void
dmsg_iocom_label(dmsg_iocom_t * iocom,const char * ctl,...)172f306de83SMatthew Dillon dmsg_iocom_label(dmsg_iocom_t *iocom, const char *ctl, ...)
173f306de83SMatthew Dillon {
174f306de83SMatthew Dillon 	va_list va;
175f306de83SMatthew Dillon 	char *optr;
176f306de83SMatthew Dillon 
177f306de83SMatthew Dillon 	va_start(va, ctl);
178f306de83SMatthew Dillon 	optr = iocom->label;
179f306de83SMatthew Dillon 	vasprintf(&iocom->label, ctl, va);
180f306de83SMatthew Dillon 	va_end(va);
181f306de83SMatthew Dillon 	if (optr)
182f306de83SMatthew Dillon 		free(optr);
183f306de83SMatthew Dillon }
184f306de83SMatthew Dillon 
1850c3a8cd0SMatthew Dillon /*
1860c3a8cd0SMatthew Dillon  * May only be called from a callback from iocom_core.
1870c3a8cd0SMatthew Dillon  *
1880c3a8cd0SMatthew Dillon  * Adjust state machine functions, set flags to guarantee that both
1890c3a8cd0SMatthew Dillon  * the recevmsg_func and the sendmsg_func is called at least once.
1900c3a8cd0SMatthew Dillon  */
1910c3a8cd0SMatthew Dillon void
dmsg_iocom_restate(dmsg_iocom_t * iocom,void (* signal_func)(dmsg_iocom_t *),void (* rcvmsg_func)(dmsg_msg_t * msg))1920d20ec8aSMatthew Dillon dmsg_iocom_restate(dmsg_iocom_t *iocom,
1930d20ec8aSMatthew Dillon 		   void (*signal_func)(dmsg_iocom_t *),
19401e43224SMatthew Dillon 		   void (*rcvmsg_func)(dmsg_msg_t *msg))
1950c3a8cd0SMatthew Dillon {
196a2179323SMatthew Dillon 	pthread_mutex_lock(&iocom->mtx);
1970d20ec8aSMatthew Dillon 	iocom->signal_callback = signal_func;
1980d20ec8aSMatthew Dillon 	iocom->rcvmsg_callback = rcvmsg_func;
1990c3a8cd0SMatthew Dillon 	if (signal_func)
200a2179323SMatthew Dillon 		atomic_set_int(&iocom->flags, DMSG_IOCOMF_SWORK);
2010c3a8cd0SMatthew Dillon 	else
202a2179323SMatthew Dillon 		atomic_clear_int(&iocom->flags, DMSG_IOCOMF_SWORK);
203a2179323SMatthew Dillon 	pthread_mutex_unlock(&iocom->mtx);
2040c3a8cd0SMatthew Dillon }
2050c3a8cd0SMatthew Dillon 
2060c3a8cd0SMatthew Dillon void
dmsg_iocom_signal(dmsg_iocom_t * iocom)2070d20ec8aSMatthew Dillon dmsg_iocom_signal(dmsg_iocom_t *iocom)
2080c3a8cd0SMatthew Dillon {
209a2179323SMatthew Dillon 	pthread_mutex_lock(&iocom->mtx);
2100d20ec8aSMatthew Dillon 	if (iocom->signal_callback)
211a2179323SMatthew Dillon 		atomic_set_int(&iocom->flags, DMSG_IOCOMF_SWORK);
212a2179323SMatthew Dillon 	pthread_mutex_unlock(&iocom->mtx);
2130c3a8cd0SMatthew Dillon }
2140c3a8cd0SMatthew Dillon 
2150c3a8cd0SMatthew Dillon /*
2160c3a8cd0SMatthew Dillon  * Cleanup a terminating iocom.
2170c3a8cd0SMatthew Dillon  *
2180c3a8cd0SMatthew Dillon  * Caller should not hold iocom->mtx.  The iocom has already been disconnected
2190c3a8cd0SMatthew Dillon  * from all possible references to it.
2200c3a8cd0SMatthew Dillon  */
2210c3a8cd0SMatthew Dillon void
dmsg_iocom_done(dmsg_iocom_t * iocom)2220c3a8cd0SMatthew Dillon dmsg_iocom_done(dmsg_iocom_t *iocom)
2230c3a8cd0SMatthew Dillon {
224*a988b43eSMatthew Dillon 	dmsg_crypto_terminate(iocom);
2250c3a8cd0SMatthew Dillon 	if (iocom->sock_fd >= 0) {
2260c3a8cd0SMatthew Dillon 		close(iocom->sock_fd);
2270c3a8cd0SMatthew Dillon 		iocom->sock_fd = -1;
2280c3a8cd0SMatthew Dillon 	}
22998126869SMatthew Dillon 	if (iocom->alt_fd >= 0 && (iocom->flags & DMSG_IOCOMF_CLOSEALT)) {
2300c3a8cd0SMatthew Dillon 		close(iocom->alt_fd);
2310c3a8cd0SMatthew Dillon 		iocom->alt_fd = -1;
2320c3a8cd0SMatthew Dillon 	}
2330c3a8cd0SMatthew Dillon 	dmsg_ioq_done(iocom, &iocom->ioq_rx);
2340c3a8cd0SMatthew Dillon 	dmsg_ioq_done(iocom, &iocom->ioq_tx);
2350c3a8cd0SMatthew Dillon 	if (iocom->wakeupfds[0] >= 0) {
2360c3a8cd0SMatthew Dillon 		close(iocom->wakeupfds[0]);
2370c3a8cd0SMatthew Dillon 		iocom->wakeupfds[0] = -1;
2380c3a8cd0SMatthew Dillon 	}
2390c3a8cd0SMatthew Dillon 	if (iocom->wakeupfds[1] >= 0) {
2400c3a8cd0SMatthew Dillon 		close(iocom->wakeupfds[1]);
2410c3a8cd0SMatthew Dillon 		iocom->wakeupfds[1] = -1;
2420c3a8cd0SMatthew Dillon 	}
2430c3a8cd0SMatthew Dillon 	pthread_mutex_destroy(&iocom->mtx);
2440c3a8cd0SMatthew Dillon }
2450c3a8cd0SMatthew Dillon 
2460c3a8cd0SMatthew Dillon /*
2471b8eded1SMatthew Dillon  * Allocate a new message using the specified transaction state.
248a2179323SMatthew Dillon  *
2491b8eded1SMatthew Dillon  * If CREATE is set a new transaction is allocated relative to the passed-in
250d30cab67SMatthew Dillon  * transaction (the 'state' argument becomes pstate).
2511b8eded1SMatthew Dillon  *
2521b8eded1SMatthew Dillon  * If CREATE is not set the message is associated with the passed-in
2531b8eded1SMatthew Dillon  * transaction.
2540c3a8cd0SMatthew Dillon  */
2550c3a8cd0SMatthew Dillon dmsg_msg_t *
dmsg_msg_alloc(dmsg_state_t * state,size_t aux_size,uint32_t cmd,void (* func)(dmsg_msg_t *),void * data)2561b8eded1SMatthew Dillon dmsg_msg_alloc(dmsg_state_t *state,
2570d20ec8aSMatthew Dillon 	       size_t aux_size, uint32_t cmd,
2580c3a8cd0SMatthew Dillon 	       void (*func)(dmsg_msg_t *), void *data)
2590c3a8cd0SMatthew Dillon {
2601b8eded1SMatthew Dillon 	dmsg_iocom_t *iocom = state->iocom;
261323c0947SMatthew Dillon 	dmsg_msg_t *msg;
262323c0947SMatthew Dillon 
263323c0947SMatthew Dillon 	pthread_mutex_lock(&iocom->mtx);
264323c0947SMatthew Dillon 	msg = dmsg_msg_alloc_locked(state, aux_size, cmd, func, data);
265323c0947SMatthew Dillon 	pthread_mutex_unlock(&iocom->mtx);
266323c0947SMatthew Dillon 
267323c0947SMatthew Dillon 	return msg;
268323c0947SMatthew Dillon }
269323c0947SMatthew Dillon 
270323c0947SMatthew Dillon dmsg_msg_t *
dmsg_msg_alloc_locked(dmsg_state_t * state,size_t aux_size,uint32_t cmd,void (* func)(dmsg_msg_t *),void * data)271323c0947SMatthew Dillon dmsg_msg_alloc_locked(dmsg_state_t *state,
272323c0947SMatthew Dillon 	       size_t aux_size, uint32_t cmd,
273323c0947SMatthew Dillon 	       void (*func)(dmsg_msg_t *), void *data)
274323c0947SMatthew Dillon {
275323c0947SMatthew Dillon 	dmsg_iocom_t *iocom = state->iocom;
2761b8eded1SMatthew Dillon 	dmsg_state_t *pstate;
2770c3a8cd0SMatthew Dillon 	dmsg_msg_t *msg;
2780c3a8cd0SMatthew Dillon 	int hbytes;
279f306de83SMatthew Dillon 	size_t aligned_size;
2800c3a8cd0SMatthew Dillon 
281f306de83SMatthew Dillon 	aligned_size = DMSG_DOALIGN(aux_size);
2820c3a8cd0SMatthew Dillon 	if ((cmd & (DMSGF_CREATE | DMSGF_REPLY)) == DMSGF_CREATE) {
2830c3a8cd0SMatthew Dillon 		/*
2841b8eded1SMatthew Dillon 		 * When CREATE is set without REPLY the caller is
2851b8eded1SMatthew Dillon 		 * initiating a new transaction stacked under the specified
2861b8eded1SMatthew Dillon 		 * circuit.
2870c3a8cd0SMatthew Dillon 		 *
2880a9eefcaSMatthew Dillon 		 * It is possible to race a circuit failure, inherit the
2890a9eefcaSMatthew Dillon 		 * parent's STATE_DYING flag to trigger an abort sequence
2900a9eefcaSMatthew Dillon 		 * in the transmit path.  By not inheriting ABORTING the
2910a9eefcaSMatthew Dillon 		 * abort sequence can recurse.
2920a9eefcaSMatthew Dillon 		 *
2930c3a8cd0SMatthew Dillon 		 * NOTE: CREATE in txcmd handled by dmsg_msg_write()
2940c3a8cd0SMatthew Dillon 		 * NOTE: DELETE in txcmd handled by dmsg_state_cleanuptx()
2950c3a8cd0SMatthew Dillon 		 */
2961b8eded1SMatthew Dillon 		pstate = state;
2970c3a8cd0SMatthew Dillon 		state = malloc(sizeof(*state));
2980c3a8cd0SMatthew Dillon 		bzero(state, sizeof(*state));
2990a9eefcaSMatthew Dillon 		atomic_add_int(&dmsg_state_count, 1);
3000a9eefcaSMatthew Dillon 
3011b8eded1SMatthew Dillon 		TAILQ_INIT(&state->subq);
3021b8eded1SMatthew Dillon 		state->parent = pstate;
3030c3a8cd0SMatthew Dillon 		state->iocom = iocom;
3040c3a8cd0SMatthew Dillon 		state->flags = DMSG_STATE_DYNAMIC;
3050c3a8cd0SMatthew Dillon 		state->msgid = (uint64_t)(uintptr_t)state;
3060c3a8cd0SMatthew Dillon 		state->txcmd = cmd & ~(DMSGF_CREATE | DMSGF_DELETE);
3070c3a8cd0SMatthew Dillon 		state->rxcmd = DMSGF_REPLY;
3080d20ec8aSMatthew Dillon 		state->icmd = state->txcmd & DMSGF_BASECMDMASK;
3090c3a8cd0SMatthew Dillon 		state->func = func;
3100c3a8cd0SMatthew Dillon 		state->any.any = data;
311d30cab67SMatthew Dillon 
312a06d536bSMatthew Dillon 		state->flags |= DMSG_STATE_SUBINSERTED |
313a06d536bSMatthew Dillon 				DMSG_STATE_RBINSERTED;
3140a9eefcaSMatthew Dillon 		state->flags |= pstate->flags & DMSG_STATE_DYING;
3150a9eefcaSMatthew Dillon 		if (TAILQ_EMPTY(&pstate->subq))
3160a9eefcaSMatthew Dillon 			dmsg_state_hold(pstate);
3170a9eefcaSMatthew Dillon 		RB_INSERT(dmsg_state_tree, &iocom->statewr_tree, state);
3180a9eefcaSMatthew Dillon 		TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
3190a9eefcaSMatthew Dillon 		dmsg_state_hold(state);		/* state on pstate->subq */
3200a9eefcaSMatthew Dillon 		dmsg_state_hold(state);		/* state on rbtree */
3210a9eefcaSMatthew Dillon 		dmsg_state_hold(state);		/* msg->state */
3221b8eded1SMatthew Dillon 	} else {
3231b8eded1SMatthew Dillon 		/*
3241b8eded1SMatthew Dillon 		 * Otherwise the message is transmitted over the existing
3251b8eded1SMatthew Dillon 		 * open transaction.
3261b8eded1SMatthew Dillon 		 */
3271b8eded1SMatthew Dillon 		pstate = state->parent;
3280a9eefcaSMatthew Dillon 		dmsg_state_hold(state);		/* msg->state */
3290c3a8cd0SMatthew Dillon 	}
3301b8eded1SMatthew Dillon 
331a2179323SMatthew Dillon 	/* XXX SMP race for state */
332a2179323SMatthew Dillon 	hbytes = (cmd & DMSGF_SIZE) * DMSG_ALIGN;
3330a9eefcaSMatthew Dillon 	assert((size_t)hbytes >= sizeof(struct dmsg_hdr));
3340a9eefcaSMatthew Dillon 	msg = malloc(offsetof(struct dmsg_msg, any.head) + hbytes);
335a2179323SMatthew Dillon 	bzero(msg, offsetof(struct dmsg_msg, any.head));
336f306de83SMatthew Dillon 
337f306de83SMatthew Dillon 	/*
338f306de83SMatthew Dillon 	 * [re]allocate the auxillary data buffer.  The caller knows that
339f306de83SMatthew Dillon 	 * a size-aligned buffer will be allocated but we do not want to
340f306de83SMatthew Dillon 	 * force the caller to zero any tail piece, so we do that ourself.
341f306de83SMatthew Dillon 	 */
3420c3a8cd0SMatthew Dillon 	if (msg->aux_size != aux_size) {
3430c3a8cd0SMatthew Dillon 		if (msg->aux_data) {
3440c3a8cd0SMatthew Dillon 			free(msg->aux_data);
3450c3a8cd0SMatthew Dillon 			msg->aux_data = NULL;
3460c3a8cd0SMatthew Dillon 			msg->aux_size = 0;
3470c3a8cd0SMatthew Dillon 		}
3480c3a8cd0SMatthew Dillon 		if (aux_size) {
349f306de83SMatthew Dillon 			msg->aux_data = malloc(aligned_size);
3500c3a8cd0SMatthew Dillon 			msg->aux_size = aux_size;
351f306de83SMatthew Dillon 			if (aux_size != aligned_size) {
352f306de83SMatthew Dillon 				bzero(msg->aux_data + aux_size,
353f306de83SMatthew Dillon 				      aligned_size - aux_size);
354f306de83SMatthew Dillon 			}
3550c3a8cd0SMatthew Dillon 		}
3560c3a8cd0SMatthew Dillon 	}
3571b8eded1SMatthew Dillon 
3581b8eded1SMatthew Dillon 	/*
3591b8eded1SMatthew Dillon 	 * Set REVTRANS if the transaction was remotely initiated
3601b8eded1SMatthew Dillon 	 * Set REVCIRC if the circuit was remotely initiated
3611b8eded1SMatthew Dillon 	 */
3621b8eded1SMatthew Dillon 	if (state->flags & DMSG_STATE_OPPOSITE)
3631b8eded1SMatthew Dillon 		cmd |= DMSGF_REVTRANS;
3641b8eded1SMatthew Dillon 	if (pstate->flags & DMSG_STATE_OPPOSITE)
3651b8eded1SMatthew Dillon 		cmd |= DMSGF_REVCIRC;
3661b8eded1SMatthew Dillon 
3671b8eded1SMatthew Dillon 	/*
3681b8eded1SMatthew Dillon 	 * Finish filling out the header.
3691b8eded1SMatthew Dillon 	 */
3700c3a8cd0SMatthew Dillon 	bzero(&msg->any.head, hbytes);
3710c3a8cd0SMatthew Dillon 	msg->hdr_size = hbytes;
3720d20ec8aSMatthew Dillon 	msg->any.head.magic = DMSG_HDR_MAGIC;
3730c3a8cd0SMatthew Dillon 	msg->any.head.cmd = cmd;
3740c3a8cd0SMatthew Dillon 	msg->any.head.aux_descr = 0;
3750c3a8cd0SMatthew Dillon 	msg->any.head.aux_crc = 0;
3760c3a8cd0SMatthew Dillon 	msg->any.head.msgid = state->msgid;
3771b8eded1SMatthew Dillon 	msg->any.head.circuit = pstate->msgid;
3781b8eded1SMatthew Dillon 	msg->state = state;
3791b8eded1SMatthew Dillon 
3800c3a8cd0SMatthew Dillon 	return (msg);
3810c3a8cd0SMatthew Dillon }
3820c3a8cd0SMatthew Dillon 
3830c3a8cd0SMatthew Dillon /*
3840c3a8cd0SMatthew Dillon  * Free a message so it can be reused afresh.
3850c3a8cd0SMatthew Dillon  *
3860c3a8cd0SMatthew Dillon  * NOTE: aux_size can be 0 with a non-NULL aux_data.
3870c3a8cd0SMatthew Dillon  */
3880c3a8cd0SMatthew Dillon static
3890c3a8cd0SMatthew Dillon void
dmsg_msg_free_locked(dmsg_msg_t * msg)3900c3a8cd0SMatthew Dillon dmsg_msg_free_locked(dmsg_msg_t *msg)
3910c3a8cd0SMatthew Dillon {
3920a9eefcaSMatthew Dillon 	dmsg_state_t *state;
393e96cef49SMatthew Dillon 
3940a9eefcaSMatthew Dillon 	if ((state = msg->state) != NULL) {
3950a9eefcaSMatthew Dillon 		dmsg_state_drop(state);
396323c0947SMatthew Dillon 		msg->state = NULL;	/* safety */
3970a9eefcaSMatthew Dillon 	}
398a2179323SMatthew Dillon 	if (msg->aux_data) {
399a2179323SMatthew Dillon 		free(msg->aux_data);
4000a9eefcaSMatthew Dillon 		msg->aux_data = NULL;	/* safety */
401a2179323SMatthew Dillon 	}
402a2179323SMatthew Dillon 	msg->aux_size = 0;
403a2179323SMatthew Dillon 	free (msg);
4040c3a8cd0SMatthew Dillon }
4050c3a8cd0SMatthew Dillon 
4060c3a8cd0SMatthew Dillon void
dmsg_msg_free(dmsg_msg_t * msg)4070c3a8cd0SMatthew Dillon dmsg_msg_free(dmsg_msg_t *msg)
4080c3a8cd0SMatthew Dillon {
4091b8eded1SMatthew Dillon 	dmsg_iocom_t *iocom = msg->state->iocom;
4100c3a8cd0SMatthew Dillon 
4110c3a8cd0SMatthew Dillon 	pthread_mutex_lock(&iocom->mtx);
4120c3a8cd0SMatthew Dillon 	dmsg_msg_free_locked(msg);
4130c3a8cd0SMatthew Dillon 	pthread_mutex_unlock(&iocom->mtx);
4140c3a8cd0SMatthew Dillon }
4150c3a8cd0SMatthew Dillon 
4160c3a8cd0SMatthew Dillon /*
4170c3a8cd0SMatthew Dillon  * I/O core loop for an iocom.
4180c3a8cd0SMatthew Dillon  *
4190c3a8cd0SMatthew Dillon  * Thread localized, iocom->mtx not held.
4200c3a8cd0SMatthew Dillon  */
4210c3a8cd0SMatthew Dillon void
dmsg_iocom_core(dmsg_iocom_t * iocom)4220c3a8cd0SMatthew Dillon dmsg_iocom_core(dmsg_iocom_t *iocom)
4230c3a8cd0SMatthew Dillon {
4240c3a8cd0SMatthew Dillon 	struct pollfd fds[3];
4250c3a8cd0SMatthew Dillon 	char dummybuf[256];
4260c3a8cd0SMatthew Dillon 	dmsg_msg_t *msg;
4270c3a8cd0SMatthew Dillon 	int timeout;
4280c3a8cd0SMatthew Dillon 	int count;
4290c3a8cd0SMatthew Dillon 	int wi;	/* wakeup pipe */
4300c3a8cd0SMatthew Dillon 	int si;	/* socket */
4310c3a8cd0SMatthew Dillon 	int ai;	/* alt bulk path socket */
4320c3a8cd0SMatthew Dillon 
4330c3a8cd0SMatthew Dillon 	while ((iocom->flags & DMSG_IOCOMF_EOF) == 0) {
434a2179323SMatthew Dillon 		/*
435a2179323SMatthew Dillon 		 * These iocom->flags are only manipulated within the
436a2179323SMatthew Dillon 		 * context of the current thread.  However, modifications
437a2179323SMatthew Dillon 		 * still require atomic ops.
438a2179323SMatthew Dillon 		 */
4395ab1caedSMatthew Dillon 		dmio_printf(iocom, 5, "iocom %p %08x\n",
4405ab1caedSMatthew Dillon 			    iocom, iocom->flags);
4410c3a8cd0SMatthew Dillon 		if ((iocom->flags & (DMSG_IOCOMF_RWORK |
4420c3a8cd0SMatthew Dillon 				     DMSG_IOCOMF_WWORK |
4430c3a8cd0SMatthew Dillon 				     DMSG_IOCOMF_PWORK |
4440c3a8cd0SMatthew Dillon 				     DMSG_IOCOMF_SWORK |
4450c3a8cd0SMatthew Dillon 				     DMSG_IOCOMF_ARWORK |
4460c3a8cd0SMatthew Dillon 				     DMSG_IOCOMF_AWWORK)) == 0) {
4470c3a8cd0SMatthew Dillon 			/*
4480c3a8cd0SMatthew Dillon 			 * Only poll if no immediate work is pending.
4490c3a8cd0SMatthew Dillon 			 * Otherwise we are just wasting our time calling
4500c3a8cd0SMatthew Dillon 			 * poll.
4510c3a8cd0SMatthew Dillon 			 */
4520c3a8cd0SMatthew Dillon 			timeout = 5000;
4530c3a8cd0SMatthew Dillon 
4540c3a8cd0SMatthew Dillon 			count = 0;
4550c3a8cd0SMatthew Dillon 			wi = -1;
4560c3a8cd0SMatthew Dillon 			si = -1;
4570c3a8cd0SMatthew Dillon 			ai = -1;
4580c3a8cd0SMatthew Dillon 
4590c3a8cd0SMatthew Dillon 			/*
4600c3a8cd0SMatthew Dillon 			 * Always check the inter-thread pipe, e.g.
4610c3a8cd0SMatthew Dillon 			 * for iocom->txmsgq work.
4620c3a8cd0SMatthew Dillon 			 */
4630c3a8cd0SMatthew Dillon 			wi = count++;
4640c3a8cd0SMatthew Dillon 			fds[wi].fd = iocom->wakeupfds[0];
4650c3a8cd0SMatthew Dillon 			fds[wi].events = POLLIN;
4660c3a8cd0SMatthew Dillon 			fds[wi].revents = 0;
4670c3a8cd0SMatthew Dillon 
4680c3a8cd0SMatthew Dillon 			/*
4690c3a8cd0SMatthew Dillon 			 * Check the socket input/output direction as
4700c3a8cd0SMatthew Dillon 			 * requested
4710c3a8cd0SMatthew Dillon 			 */
4720c3a8cd0SMatthew Dillon 			if (iocom->flags & (DMSG_IOCOMF_RREQ |
4730c3a8cd0SMatthew Dillon 					    DMSG_IOCOMF_WREQ)) {
4740c3a8cd0SMatthew Dillon 				si = count++;
4750c3a8cd0SMatthew Dillon 				fds[si].fd = iocom->sock_fd;
4760c3a8cd0SMatthew Dillon 				fds[si].events = 0;
4770c3a8cd0SMatthew Dillon 				fds[si].revents = 0;
4780c3a8cd0SMatthew Dillon 
4790c3a8cd0SMatthew Dillon 				if (iocom->flags & DMSG_IOCOMF_RREQ)
4800c3a8cd0SMatthew Dillon 					fds[si].events |= POLLIN;
4810c3a8cd0SMatthew Dillon 				if (iocom->flags & DMSG_IOCOMF_WREQ)
4820c3a8cd0SMatthew Dillon 					fds[si].events |= POLLOUT;
4830c3a8cd0SMatthew Dillon 			}
4840c3a8cd0SMatthew Dillon 
4850c3a8cd0SMatthew Dillon 			/*
4860c3a8cd0SMatthew Dillon 			 * Check the alternative fd for work.
4870c3a8cd0SMatthew Dillon 			 */
4880c3a8cd0SMatthew Dillon 			if (iocom->alt_fd >= 0) {
4890c3a8cd0SMatthew Dillon 				ai = count++;
4900c3a8cd0SMatthew Dillon 				fds[ai].fd = iocom->alt_fd;
4910c3a8cd0SMatthew Dillon 				fds[ai].events = POLLIN;
4920c3a8cd0SMatthew Dillon 				fds[ai].revents = 0;
4930c3a8cd0SMatthew Dillon 			}
4940c3a8cd0SMatthew Dillon 			poll(fds, count, timeout);
4950c3a8cd0SMatthew Dillon 
4960c3a8cd0SMatthew Dillon 			if (wi >= 0 && (fds[wi].revents & POLLIN))
497a2179323SMatthew Dillon 				atomic_set_int(&iocom->flags,
498a2179323SMatthew Dillon 					       DMSG_IOCOMF_PWORK);
4990c3a8cd0SMatthew Dillon 			if (si >= 0 && (fds[si].revents & POLLIN))
500a2179323SMatthew Dillon 				atomic_set_int(&iocom->flags,
501a2179323SMatthew Dillon 					       DMSG_IOCOMF_RWORK);
5020c3a8cd0SMatthew Dillon 			if (si >= 0 && (fds[si].revents & POLLOUT))
503a2179323SMatthew Dillon 				atomic_set_int(&iocom->flags,
504a2179323SMatthew Dillon 					       DMSG_IOCOMF_WWORK);
5050c3a8cd0SMatthew Dillon 			if (wi >= 0 && (fds[wi].revents & POLLOUT))
506a2179323SMatthew Dillon 				atomic_set_int(&iocom->flags,
507a2179323SMatthew Dillon 					       DMSG_IOCOMF_WWORK);
5080c3a8cd0SMatthew Dillon 			if (ai >= 0 && (fds[ai].revents & POLLIN))
509a2179323SMatthew Dillon 				atomic_set_int(&iocom->flags,
510a2179323SMatthew Dillon 					       DMSG_IOCOMF_ARWORK);
5110c3a8cd0SMatthew Dillon 		} else {
5120c3a8cd0SMatthew Dillon 			/*
5130c3a8cd0SMatthew Dillon 			 * Always check the pipe
5140c3a8cd0SMatthew Dillon 			 */
515a2179323SMatthew Dillon 			atomic_set_int(&iocom->flags, DMSG_IOCOMF_PWORK);
5160c3a8cd0SMatthew Dillon 		}
5170c3a8cd0SMatthew Dillon 
5180c3a8cd0SMatthew Dillon 		if (iocom->flags & DMSG_IOCOMF_SWORK) {
519a2179323SMatthew Dillon 			atomic_clear_int(&iocom->flags, DMSG_IOCOMF_SWORK);
5200d20ec8aSMatthew Dillon 			iocom->signal_callback(iocom);
5210c3a8cd0SMatthew Dillon 		}
5220c3a8cd0SMatthew Dillon 
5230c3a8cd0SMatthew Dillon 		/*
5240c3a8cd0SMatthew Dillon 		 * Pending message queues from other threads wake us up
5250c3a8cd0SMatthew Dillon 		 * with a write to the wakeupfds[] pipe.  We have to clear
5260c3a8cd0SMatthew Dillon 		 * the pipe with a dummy read.
5270c3a8cd0SMatthew Dillon 		 */
5280c3a8cd0SMatthew Dillon 		if (iocom->flags & DMSG_IOCOMF_PWORK) {
529a2179323SMatthew Dillon 			atomic_clear_int(&iocom->flags, DMSG_IOCOMF_PWORK);
5300c3a8cd0SMatthew Dillon 			read(iocom->wakeupfds[0], dummybuf, sizeof(dummybuf));
531a2179323SMatthew Dillon 			atomic_set_int(&iocom->flags, DMSG_IOCOMF_RWORK);
532a2179323SMatthew Dillon 			atomic_set_int(&iocom->flags, DMSG_IOCOMF_WWORK);
5330c3a8cd0SMatthew Dillon 		}
5340c3a8cd0SMatthew Dillon 
5350c3a8cd0SMatthew Dillon 		/*
5360c3a8cd0SMatthew Dillon 		 * Message write sequencing
5370c3a8cd0SMatthew Dillon 		 */
5380c3a8cd0SMatthew Dillon 		if (iocom->flags & DMSG_IOCOMF_WWORK)
5390c3a8cd0SMatthew Dillon 			dmsg_iocom_flush1(iocom);
5400c3a8cd0SMatthew Dillon 
5410c3a8cd0SMatthew Dillon 		/*
5420c3a8cd0SMatthew Dillon 		 * Message read sequencing.  Run this after the write
5430c3a8cd0SMatthew Dillon 		 * sequencing in case the write sequencing allowed another
5440c3a8cd0SMatthew Dillon 		 * auto-DELETE to occur on the read side.
5450c3a8cd0SMatthew Dillon 		 */
5460c3a8cd0SMatthew Dillon 		if (iocom->flags & DMSG_IOCOMF_RWORK) {
5470c3a8cd0SMatthew Dillon 			while ((iocom->flags & DMSG_IOCOMF_EOF) == 0 &&
5480c3a8cd0SMatthew Dillon 			       (msg = dmsg_ioq_read(iocom)) != NULL) {
5495ab1caedSMatthew Dillon 				dmio_printf(iocom, 4, "receive %s\n",
5500c3a8cd0SMatthew Dillon 					    dmsg_msg_str(msg));
5510d20ec8aSMatthew Dillon 				iocom->rcvmsg_callback(msg);
5520a9eefcaSMatthew Dillon 				pthread_mutex_lock(&iocom->mtx);
5530c3a8cd0SMatthew Dillon 				dmsg_state_cleanuprx(iocom, msg);
5540a9eefcaSMatthew Dillon 				pthread_mutex_unlock(&iocom->mtx);
5550c3a8cd0SMatthew Dillon 			}
5560c3a8cd0SMatthew Dillon 		}
5570c3a8cd0SMatthew Dillon 
5580c3a8cd0SMatthew Dillon 		if (iocom->flags & DMSG_IOCOMF_ARWORK) {
559a2179323SMatthew Dillon 			atomic_clear_int(&iocom->flags, DMSG_IOCOMF_ARWORK);
5600d20ec8aSMatthew Dillon 			iocom->altmsg_callback(iocom);
5610c3a8cd0SMatthew Dillon 		}
5620c3a8cd0SMatthew Dillon 	}
5630c3a8cd0SMatthew Dillon }
5640c3a8cd0SMatthew Dillon 
5650c3a8cd0SMatthew Dillon /*
5660c3a8cd0SMatthew Dillon  * Make sure there's enough room in the FIFO to hold the
5670c3a8cd0SMatthew Dillon  * needed data.
5680c3a8cd0SMatthew Dillon  *
5690c3a8cd0SMatthew Dillon  * Assume worst case encrypted form is 2x the size of the
5700c3a8cd0SMatthew Dillon  * plaintext equivalent.
5710c3a8cd0SMatthew Dillon  */
5720c3a8cd0SMatthew Dillon static
5730c3a8cd0SMatthew Dillon size_t
dmsg_ioq_makeroom(dmsg_ioq_t * ioq,size_t needed)5740c3a8cd0SMatthew Dillon dmsg_ioq_makeroom(dmsg_ioq_t *ioq, size_t needed)
5750c3a8cd0SMatthew Dillon {
5760c3a8cd0SMatthew Dillon 	size_t bytes;
5770c3a8cd0SMatthew Dillon 	size_t nmax;
5780c3a8cd0SMatthew Dillon 
5790c3a8cd0SMatthew Dillon 	bytes = ioq->fifo_cdx - ioq->fifo_beg;
5800c3a8cd0SMatthew Dillon 	nmax = sizeof(ioq->buf) - ioq->fifo_end;
5810c3a8cd0SMatthew Dillon 	if (bytes + nmax / 2 < needed) {
5820c3a8cd0SMatthew Dillon 		if (bytes) {
5830c3a8cd0SMatthew Dillon 			bcopy(ioq->buf + ioq->fifo_beg,
5840c3a8cd0SMatthew Dillon 			      ioq->buf,
5850c3a8cd0SMatthew Dillon 			      bytes);
5860c3a8cd0SMatthew Dillon 		}
5870c3a8cd0SMatthew Dillon 		ioq->fifo_cdx -= ioq->fifo_beg;
5880c3a8cd0SMatthew Dillon 		ioq->fifo_beg = 0;
5890c3a8cd0SMatthew Dillon 		if (ioq->fifo_cdn < ioq->fifo_end) {
5900c3a8cd0SMatthew Dillon 			bcopy(ioq->buf + ioq->fifo_cdn,
5910c3a8cd0SMatthew Dillon 			      ioq->buf + ioq->fifo_cdx,
5920c3a8cd0SMatthew Dillon 			      ioq->fifo_end - ioq->fifo_cdn);
5930c3a8cd0SMatthew Dillon 		}
5940c3a8cd0SMatthew Dillon 		ioq->fifo_end -= ioq->fifo_cdn - ioq->fifo_cdx;
5950c3a8cd0SMatthew Dillon 		ioq->fifo_cdn = ioq->fifo_cdx;
5960c3a8cd0SMatthew Dillon 		nmax = sizeof(ioq->buf) - ioq->fifo_end;
5970c3a8cd0SMatthew Dillon 	}
5980c3a8cd0SMatthew Dillon 	return(nmax);
5990c3a8cd0SMatthew Dillon }
6000c3a8cd0SMatthew Dillon 
6010c3a8cd0SMatthew Dillon /*
6020c3a8cd0SMatthew Dillon  * Read the next ready message from the ioq, issuing I/O if needed.
6030c3a8cd0SMatthew Dillon  * Caller should retry on a read-event when NULL is returned.
6040c3a8cd0SMatthew Dillon  *
6050c3a8cd0SMatthew Dillon  * If an error occurs during reception a DMSG_LNK_ERROR msg will
6060c3a8cd0SMatthew Dillon  * be returned for each open transaction, then the ioq and iocom
6070c3a8cd0SMatthew Dillon  * will be errored out and a non-transactional DMSG_LNK_ERROR
6080c3a8cd0SMatthew Dillon  * msg will be returned as the final message.  The caller should not call
6090c3a8cd0SMatthew Dillon  * us again after the final message is returned.
6100c3a8cd0SMatthew Dillon  *
6110c3a8cd0SMatthew Dillon  * Thread localized, iocom->mtx not held.
6120c3a8cd0SMatthew Dillon  */
6130c3a8cd0SMatthew Dillon dmsg_msg_t *
dmsg_ioq_read(dmsg_iocom_t * iocom)6140c3a8cd0SMatthew Dillon dmsg_ioq_read(dmsg_iocom_t *iocom)
6150c3a8cd0SMatthew Dillon {
6160c3a8cd0SMatthew Dillon 	dmsg_ioq_t *ioq = &iocom->ioq_rx;
6170c3a8cd0SMatthew Dillon 	dmsg_msg_t *msg;
6180c3a8cd0SMatthew Dillon 	dmsg_hdr_t *head;
6190c3a8cd0SMatthew Dillon 	ssize_t n;
6200c3a8cd0SMatthew Dillon 	size_t bytes;
6210c3a8cd0SMatthew Dillon 	size_t nmax;
622f306de83SMatthew Dillon 	uint32_t aux_size;
6230c3a8cd0SMatthew Dillon 	uint32_t xcrc32;
6240c3a8cd0SMatthew Dillon 	int error;
6250c3a8cd0SMatthew Dillon 
6260c3a8cd0SMatthew Dillon again:
6270c3a8cd0SMatthew Dillon 	/*
6280c3a8cd0SMatthew Dillon 	 * If a message is already pending we can just remove and
6290c3a8cd0SMatthew Dillon 	 * return it.  Message state has already been processed.
6300c3a8cd0SMatthew Dillon 	 * (currently not implemented)
6310c3a8cd0SMatthew Dillon 	 */
6320c3a8cd0SMatthew Dillon 	if ((msg = TAILQ_FIRST(&ioq->msgq)) != NULL) {
6330c3a8cd0SMatthew Dillon 		TAILQ_REMOVE(&ioq->msgq, msg, qentry);
634a06d536bSMatthew Dillon 
635a06d536bSMatthew Dillon 		if (msg->state == &iocom->state0) {
636a06d536bSMatthew Dillon 			atomic_set_int(&iocom->flags, DMSG_IOCOMF_EOF);
6375ab1caedSMatthew Dillon 			dmio_printf(iocom, 1,
6385ab1caedSMatthew Dillon 				    "EOF ON SOCKET %d\n",
6395ab1caedSMatthew Dillon 				    iocom->sock_fd);
640a06d536bSMatthew Dillon 		}
6410c3a8cd0SMatthew Dillon 		return (msg);
6420c3a8cd0SMatthew Dillon 	}
643a2179323SMatthew Dillon 	atomic_clear_int(&iocom->flags, DMSG_IOCOMF_RREQ | DMSG_IOCOMF_RWORK);
6440c3a8cd0SMatthew Dillon 
6450c3a8cd0SMatthew Dillon 	/*
6460c3a8cd0SMatthew Dillon 	 * If the stream is errored out we stop processing it.
6470c3a8cd0SMatthew Dillon 	 */
6480c3a8cd0SMatthew Dillon 	if (ioq->error)
6490c3a8cd0SMatthew Dillon 		goto skip;
6500c3a8cd0SMatthew Dillon 
6510c3a8cd0SMatthew Dillon 	/*
6520c3a8cd0SMatthew Dillon 	 * Message read in-progress (msg is NULL at the moment).  We don't
6530c3a8cd0SMatthew Dillon 	 * allocate a msg until we have its core header.
6540c3a8cd0SMatthew Dillon 	 */
6550c3a8cd0SMatthew Dillon 	nmax = sizeof(ioq->buf) - ioq->fifo_end;
6560c3a8cd0SMatthew Dillon 	bytes = ioq->fifo_cdx - ioq->fifo_beg;		/* already decrypted */
6570c3a8cd0SMatthew Dillon 	msg = ioq->msg;
6580c3a8cd0SMatthew Dillon 
6590c3a8cd0SMatthew Dillon 	switch(ioq->state) {
6600c3a8cd0SMatthew Dillon 	case DMSG_MSGQ_STATE_HEADER1:
6610c3a8cd0SMatthew Dillon 		/*
6620c3a8cd0SMatthew Dillon 		 * Load the primary header, fail on any non-trivial read
6630c3a8cd0SMatthew Dillon 		 * error or on EOF.  Since the primary header is the same
6640c3a8cd0SMatthew Dillon 		 * size is the message alignment it will never straddle
6650c3a8cd0SMatthew Dillon 		 * the end of the buffer.
6660c3a8cd0SMatthew Dillon 		 */
6670c3a8cd0SMatthew Dillon 		nmax = dmsg_ioq_makeroom(ioq, sizeof(msg->any.head));
6680c3a8cd0SMatthew Dillon 		if (bytes < sizeof(msg->any.head)) {
6690c3a8cd0SMatthew Dillon 			n = read(iocom->sock_fd,
6700c3a8cd0SMatthew Dillon 				 ioq->buf + ioq->fifo_end,
6710c3a8cd0SMatthew Dillon 				 nmax);
6720c3a8cd0SMatthew Dillon 			if (n <= 0) {
6730c3a8cd0SMatthew Dillon 				if (n == 0) {
6740c3a8cd0SMatthew Dillon 					ioq->error = DMSG_IOQ_ERROR_EOF;
6750c3a8cd0SMatthew Dillon 					break;
6760c3a8cd0SMatthew Dillon 				}
6770c3a8cd0SMatthew Dillon 				if (errno != EINTR &&
6780c3a8cd0SMatthew Dillon 				    errno != EINPROGRESS &&
6790c3a8cd0SMatthew Dillon 				    errno != EAGAIN) {
6800c3a8cd0SMatthew Dillon 					ioq->error = DMSG_IOQ_ERROR_SOCK;
6810c3a8cd0SMatthew Dillon 					break;
6820c3a8cd0SMatthew Dillon 				}
6830c3a8cd0SMatthew Dillon 				n = 0;
6840c3a8cd0SMatthew Dillon 				/* fall through */
6850c3a8cd0SMatthew Dillon 			}
6860c3a8cd0SMatthew Dillon 			ioq->fifo_end += (size_t)n;
6870c3a8cd0SMatthew Dillon 			nmax -= (size_t)n;
6880c3a8cd0SMatthew Dillon 		}
6890c3a8cd0SMatthew Dillon 
6900c3a8cd0SMatthew Dillon 		/*
6910c3a8cd0SMatthew Dillon 		 * Decrypt data received so far.  Data will be decrypted
6920c3a8cd0SMatthew Dillon 		 * in-place but might create gaps in the FIFO.  Partial
6930c3a8cd0SMatthew Dillon 		 * blocks are not immediately decrypted.
6940c3a8cd0SMatthew Dillon 		 *
6950c3a8cd0SMatthew Dillon 		 * WARNING!  The header might be in the wrong endian, we
6960c3a8cd0SMatthew Dillon 		 *	     do not fix it up until we get the entire
6970c3a8cd0SMatthew Dillon 		 *	     extended header.
6980c3a8cd0SMatthew Dillon 		 */
6990c3a8cd0SMatthew Dillon 		if (iocom->flags & DMSG_IOCOMF_CRYPTED) {
7000c3a8cd0SMatthew Dillon 			dmsg_crypto_decrypt(iocom, ioq);
7010c3a8cd0SMatthew Dillon 		} else {
7020c3a8cd0SMatthew Dillon 			ioq->fifo_cdx = ioq->fifo_end;
7030c3a8cd0SMatthew Dillon 			ioq->fifo_cdn = ioq->fifo_end;
7040c3a8cd0SMatthew Dillon 		}
7050c3a8cd0SMatthew Dillon 		bytes = ioq->fifo_cdx - ioq->fifo_beg;
7060c3a8cd0SMatthew Dillon 
7070c3a8cd0SMatthew Dillon 		/*
7080c3a8cd0SMatthew Dillon 		 * Insufficient data accumulated (msg is NULL, caller will
7090c3a8cd0SMatthew Dillon 		 * retry on event).
7100c3a8cd0SMatthew Dillon 		 */
7110c3a8cd0SMatthew Dillon 		assert(msg == NULL);
7120c3a8cd0SMatthew Dillon 		if (bytes < sizeof(msg->any.head))
7130c3a8cd0SMatthew Dillon 			break;
7140c3a8cd0SMatthew Dillon 
7150c3a8cd0SMatthew Dillon 		/*
7160c3a8cd0SMatthew Dillon 		 * Check and fixup the core header.  Note that the icrc
7170c3a8cd0SMatthew Dillon 		 * has to be calculated before any fixups, but the crc
7180c3a8cd0SMatthew Dillon 		 * fields in the msg may have to be swapped like everything
7190c3a8cd0SMatthew Dillon 		 * else.
7200c3a8cd0SMatthew Dillon 		 */
7210c3a8cd0SMatthew Dillon 		head = (void *)(ioq->buf + ioq->fifo_beg);
7220c3a8cd0SMatthew Dillon 		if (head->magic != DMSG_HDR_MAGIC &&
7230c3a8cd0SMatthew Dillon 		    head->magic != DMSG_HDR_MAGIC_REV) {
7245ab1caedSMatthew Dillon 			dmio_printf(iocom, 1,
7255ab1caedSMatthew Dillon 				    "%s: head->magic is bad %02x\n",
726f306de83SMatthew Dillon 				    iocom->label, head->magic);
727f306de83SMatthew Dillon 			if (iocom->flags & DMSG_IOCOMF_CRYPTED)
7285ab1caedSMatthew Dillon 				dmio_printf(iocom, 1, "%s\n",
7295ab1caedSMatthew Dillon 					    "(on encrypted link)");
7300c3a8cd0SMatthew Dillon 			ioq->error = DMSG_IOQ_ERROR_SYNC;
7310c3a8cd0SMatthew Dillon 			break;
7320c3a8cd0SMatthew Dillon 		}
7330c3a8cd0SMatthew Dillon 
7340c3a8cd0SMatthew Dillon 		/*
7350c3a8cd0SMatthew Dillon 		 * Calculate the full header size and aux data size
7360c3a8cd0SMatthew Dillon 		 */
7370c3a8cd0SMatthew Dillon 		if (head->magic == DMSG_HDR_MAGIC_REV) {
7380c3a8cd0SMatthew Dillon 			ioq->hbytes = (bswap32(head->cmd) & DMSGF_SIZE) *
7390c3a8cd0SMatthew Dillon 				      DMSG_ALIGN;
740f306de83SMatthew Dillon 			aux_size = bswap32(head->aux_bytes);
7410c3a8cd0SMatthew Dillon 		} else {
7420c3a8cd0SMatthew Dillon 			ioq->hbytes = (head->cmd & DMSGF_SIZE) *
7430c3a8cd0SMatthew Dillon 				      DMSG_ALIGN;
744f306de83SMatthew Dillon 			aux_size = head->aux_bytes;
7450c3a8cd0SMatthew Dillon 		}
746f306de83SMatthew Dillon 		ioq->abytes = DMSG_DOALIGN(aux_size);
747f306de83SMatthew Dillon 		ioq->unaligned_aux_size = aux_size;
7480c3a8cd0SMatthew Dillon 		if (ioq->hbytes < sizeof(msg->any.head) ||
7490c3a8cd0SMatthew Dillon 		    ioq->hbytes > sizeof(msg->any) ||
7500c3a8cd0SMatthew Dillon 		    ioq->abytes > DMSG_AUX_MAX) {
7510c3a8cd0SMatthew Dillon 			ioq->error = DMSG_IOQ_ERROR_FIELD;
7520c3a8cd0SMatthew Dillon 			break;
7530c3a8cd0SMatthew Dillon 		}
7540c3a8cd0SMatthew Dillon 
7550c3a8cd0SMatthew Dillon 		/*
7560c3a8cd0SMatthew Dillon 		 * Allocate the message, the next state will fill it in.
7571b8eded1SMatthew Dillon 		 *
7581b8eded1SMatthew Dillon 		 * NOTE: The aux_data buffer will be sized to an aligned
7591b8eded1SMatthew Dillon 		 *	 value and the aligned remainder zero'd for
7601b8eded1SMatthew Dillon 		 *	 convenience.
7611b8eded1SMatthew Dillon 		 *
7621b8eded1SMatthew Dillon 		 * NOTE: Supply dummy state and a degenerate cmd without
7631b8eded1SMatthew Dillon 		 *	 CREATE set.  The message will temporarily be
7641b8eded1SMatthew Dillon 		 *	 associated with state0 until later post-processing.
7650c3a8cd0SMatthew Dillon 		 */
7661b8eded1SMatthew Dillon 		msg = dmsg_msg_alloc(&iocom->state0, aux_size,
767a2179323SMatthew Dillon 				     ioq->hbytes / DMSG_ALIGN,
7680c3a8cd0SMatthew Dillon 				     NULL, NULL);
7690c3a8cd0SMatthew Dillon 		ioq->msg = msg;
7700c3a8cd0SMatthew Dillon 
7710c3a8cd0SMatthew Dillon 		/*
7720c3a8cd0SMatthew Dillon 		 * Fall through to the next state.  Make sure that the
7730c3a8cd0SMatthew Dillon 		 * extended header does not straddle the end of the buffer.
7740c3a8cd0SMatthew Dillon 		 * We still want to issue larger reads into our buffer,
7750c3a8cd0SMatthew Dillon 		 * book-keeping is easier if we don't bcopy() yet.
7760c3a8cd0SMatthew Dillon 		 *
7770c3a8cd0SMatthew Dillon 		 * Make sure there is enough room for bloated encrypt data.
7780c3a8cd0SMatthew Dillon 		 */
7790c3a8cd0SMatthew Dillon 		nmax = dmsg_ioq_makeroom(ioq, ioq->hbytes);
7800c3a8cd0SMatthew Dillon 		ioq->state = DMSG_MSGQ_STATE_HEADER2;
7810c3a8cd0SMatthew Dillon 		/* fall through */
7820c3a8cd0SMatthew Dillon 	case DMSG_MSGQ_STATE_HEADER2:
7830c3a8cd0SMatthew Dillon 		/*
7840c3a8cd0SMatthew Dillon 		 * Fill out the extended header.
7850c3a8cd0SMatthew Dillon 		 */
7860c3a8cd0SMatthew Dillon 		assert(msg != NULL);
7870c3a8cd0SMatthew Dillon 		if (bytes < ioq->hbytes) {
7880a9eefcaSMatthew Dillon 			assert(nmax > 0);
7890c3a8cd0SMatthew Dillon 			n = read(iocom->sock_fd,
7900c3a8cd0SMatthew Dillon 				 ioq->buf + ioq->fifo_end,
7910c3a8cd0SMatthew Dillon 				 nmax);
7920c3a8cd0SMatthew Dillon 			if (n <= 0) {
7930c3a8cd0SMatthew Dillon 				if (n == 0) {
7940c3a8cd0SMatthew Dillon 					ioq->error = DMSG_IOQ_ERROR_EOF;
7950c3a8cd0SMatthew Dillon 					break;
7960c3a8cd0SMatthew Dillon 				}
7970c3a8cd0SMatthew Dillon 				if (errno != EINTR &&
7980c3a8cd0SMatthew Dillon 				    errno != EINPROGRESS &&
7990c3a8cd0SMatthew Dillon 				    errno != EAGAIN) {
8000c3a8cd0SMatthew Dillon 					ioq->error = DMSG_IOQ_ERROR_SOCK;
8010c3a8cd0SMatthew Dillon 					break;
8020c3a8cd0SMatthew Dillon 				}
8030c3a8cd0SMatthew Dillon 				n = 0;
8040c3a8cd0SMatthew Dillon 				/* fall through */
8050c3a8cd0SMatthew Dillon 			}
8060c3a8cd0SMatthew Dillon 			ioq->fifo_end += (size_t)n;
8070c3a8cd0SMatthew Dillon 			nmax -= (size_t)n;
8080c3a8cd0SMatthew Dillon 		}
8090c3a8cd0SMatthew Dillon 
8100c3a8cd0SMatthew Dillon 		if (iocom->flags & DMSG_IOCOMF_CRYPTED) {
8110c3a8cd0SMatthew Dillon 			dmsg_crypto_decrypt(iocom, ioq);
8120c3a8cd0SMatthew Dillon 		} else {
8130c3a8cd0SMatthew Dillon 			ioq->fifo_cdx = ioq->fifo_end;
8140c3a8cd0SMatthew Dillon 			ioq->fifo_cdn = ioq->fifo_end;
8150c3a8cd0SMatthew Dillon 		}
8160c3a8cd0SMatthew Dillon 		bytes = ioq->fifo_cdx - ioq->fifo_beg;
8170c3a8cd0SMatthew Dillon 
8180c3a8cd0SMatthew Dillon 		/*
8190c3a8cd0SMatthew Dillon 		 * Insufficient data accumulated (set msg NULL so caller will
8200c3a8cd0SMatthew Dillon 		 * retry on event).
8210c3a8cd0SMatthew Dillon 		 */
8220c3a8cd0SMatthew Dillon 		if (bytes < ioq->hbytes) {
8230c3a8cd0SMatthew Dillon 			msg = NULL;
8240c3a8cd0SMatthew Dillon 			break;
8250c3a8cd0SMatthew Dillon 		}
8260c3a8cd0SMatthew Dillon 
8270c3a8cd0SMatthew Dillon 		/*
8280c3a8cd0SMatthew Dillon 		 * Calculate the extended header, decrypt data received
8290c3a8cd0SMatthew Dillon 		 * so far.  Handle endian-conversion for the entire extended
8300c3a8cd0SMatthew Dillon 		 * header.
8310c3a8cd0SMatthew Dillon 		 */
8320c3a8cd0SMatthew Dillon 		head = (void *)(ioq->buf + ioq->fifo_beg);
8330c3a8cd0SMatthew Dillon 
8340c3a8cd0SMatthew Dillon 		/*
8350c3a8cd0SMatthew Dillon 		 * Check the CRC.
8360c3a8cd0SMatthew Dillon 		 */
8370c3a8cd0SMatthew Dillon 		if (head->magic == DMSG_HDR_MAGIC_REV)
8380c3a8cd0SMatthew Dillon 			xcrc32 = bswap32(head->hdr_crc);
8390c3a8cd0SMatthew Dillon 		else
8400c3a8cd0SMatthew Dillon 			xcrc32 = head->hdr_crc;
8410c3a8cd0SMatthew Dillon 		head->hdr_crc = 0;
8420c3a8cd0SMatthew Dillon 		if (dmsg_icrc32(head, ioq->hbytes) != xcrc32) {
8430c3a8cd0SMatthew Dillon 			ioq->error = DMSG_IOQ_ERROR_XCRC;
8445ab1caedSMatthew Dillon 			dmio_printf(iocom, 1, "BAD-XCRC(%08x,%08x) %s\n",
8450c3a8cd0SMatthew Dillon 				    xcrc32, dmsg_icrc32(head, ioq->hbytes),
8460c3a8cd0SMatthew Dillon 				    dmsg_msg_str(msg));
8470c3a8cd0SMatthew Dillon 			assert(0);
8480c3a8cd0SMatthew Dillon 			break;
8490c3a8cd0SMatthew Dillon 		}
8500c3a8cd0SMatthew Dillon 		head->hdr_crc = xcrc32;
8510c3a8cd0SMatthew Dillon 
8520c3a8cd0SMatthew Dillon 		if (head->magic == DMSG_HDR_MAGIC_REV) {
8530c3a8cd0SMatthew Dillon 			dmsg_bswap_head(head);
8540c3a8cd0SMatthew Dillon 		}
8550c3a8cd0SMatthew Dillon 
8560c3a8cd0SMatthew Dillon 		/*
8570c3a8cd0SMatthew Dillon 		 * Copy the extended header into the msg and adjust the
8580c3a8cd0SMatthew Dillon 		 * FIFO.
8590c3a8cd0SMatthew Dillon 		 */
8600c3a8cd0SMatthew Dillon 		bcopy(head, &msg->any, ioq->hbytes);
8610c3a8cd0SMatthew Dillon 
8620c3a8cd0SMatthew Dillon 		/*
8630c3a8cd0SMatthew Dillon 		 * We are either done or we fall-through.
8640c3a8cd0SMatthew Dillon 		 */
8650c3a8cd0SMatthew Dillon 		if (ioq->abytes == 0) {
8660c3a8cd0SMatthew Dillon 			ioq->fifo_beg += ioq->hbytes;
8670c3a8cd0SMatthew Dillon 			break;
8680c3a8cd0SMatthew Dillon 		}
8690c3a8cd0SMatthew Dillon 
8700c3a8cd0SMatthew Dillon 		/*
8710c3a8cd0SMatthew Dillon 		 * Must adjust bytes (and the state) when falling through.
8720c3a8cd0SMatthew Dillon 		 * nmax doesn't change.
8730c3a8cd0SMatthew Dillon 		 */
8740c3a8cd0SMatthew Dillon 		ioq->fifo_beg += ioq->hbytes;
8750c3a8cd0SMatthew Dillon 		bytes -= ioq->hbytes;
8760c3a8cd0SMatthew Dillon 		ioq->state = DMSG_MSGQ_STATE_AUXDATA1;
8770c3a8cd0SMatthew Dillon 		/* fall through */
8780c3a8cd0SMatthew Dillon 	case DMSG_MSGQ_STATE_AUXDATA1:
8790c3a8cd0SMatthew Dillon 		/*
880a2179323SMatthew Dillon 		 * Copy the partial or complete [decrypted] payload from
881a2179323SMatthew Dillon 		 * remaining bytes in the FIFO in order to optimize the
882a2179323SMatthew Dillon 		 * makeroom call in the AUXDATA2 state.  We have to
883a2179323SMatthew Dillon 		 * fall-through either way so we can check the crc.
8840c3a8cd0SMatthew Dillon 		 *
8850c3a8cd0SMatthew Dillon 		 * msg->aux_size tracks our aux data.
886a2179323SMatthew Dillon 		 *
887a2179323SMatthew Dillon 		 * (Lets not complicate matters if the data is encrypted,
888a2179323SMatthew Dillon 		 *  since the data in-stream is not the same size as the
889a2179323SMatthew Dillon 		 *  data decrypted).
8900c3a8cd0SMatthew Dillon 		 */
8910c3a8cd0SMatthew Dillon 		if (bytes >= ioq->abytes) {
8920c3a8cd0SMatthew Dillon 			bcopy(ioq->buf + ioq->fifo_beg, msg->aux_data,
8930c3a8cd0SMatthew Dillon 			      ioq->abytes);
8940c3a8cd0SMatthew Dillon 			msg->aux_size = ioq->abytes;
8950c3a8cd0SMatthew Dillon 			ioq->fifo_beg += ioq->abytes;
8960c3a8cd0SMatthew Dillon 			assert(ioq->fifo_beg <= ioq->fifo_cdx);
8970c3a8cd0SMatthew Dillon 			assert(ioq->fifo_cdx <= ioq->fifo_cdn);
8980c3a8cd0SMatthew Dillon 			bytes -= ioq->abytes;
8990c3a8cd0SMatthew Dillon 		} else if (bytes) {
9000c3a8cd0SMatthew Dillon 			bcopy(ioq->buf + ioq->fifo_beg, msg->aux_data,
9010c3a8cd0SMatthew Dillon 			      bytes);
9020c3a8cd0SMatthew Dillon 			msg->aux_size = bytes;
9030c3a8cd0SMatthew Dillon 			ioq->fifo_beg += bytes;
9040c3a8cd0SMatthew Dillon 			if (ioq->fifo_cdx < ioq->fifo_beg)
9050c3a8cd0SMatthew Dillon 				ioq->fifo_cdx = ioq->fifo_beg;
9060c3a8cd0SMatthew Dillon 			assert(ioq->fifo_beg <= ioq->fifo_cdx);
9070c3a8cd0SMatthew Dillon 			assert(ioq->fifo_cdx <= ioq->fifo_cdn);
9080c3a8cd0SMatthew Dillon 			bytes = 0;
9090c3a8cd0SMatthew Dillon 		} else {
9100c3a8cd0SMatthew Dillon 			msg->aux_size = 0;
9110c3a8cd0SMatthew Dillon 		}
9120c3a8cd0SMatthew Dillon 		ioq->state = DMSG_MSGQ_STATE_AUXDATA2;
9130c3a8cd0SMatthew Dillon 		/* fall through */
9140c3a8cd0SMatthew Dillon 	case DMSG_MSGQ_STATE_AUXDATA2:
9150c3a8cd0SMatthew Dillon 		/*
9160c3a8cd0SMatthew Dillon 		 * Make sure there is enough room for more data.
9170c3a8cd0SMatthew Dillon 		 */
9180c3a8cd0SMatthew Dillon 		assert(msg);
9190c3a8cd0SMatthew Dillon 		nmax = dmsg_ioq_makeroom(ioq, ioq->abytes - msg->aux_size);
9200c3a8cd0SMatthew Dillon 
9210c3a8cd0SMatthew Dillon 		/*
9220c3a8cd0SMatthew Dillon 		 * Read and decrypt more of the payload.
9230c3a8cd0SMatthew Dillon 		 */
9240c3a8cd0SMatthew Dillon 		if (msg->aux_size < ioq->abytes) {
9250a9eefcaSMatthew Dillon 			assert(nmax > 0);
9260c3a8cd0SMatthew Dillon 			assert(bytes == 0);
9270c3a8cd0SMatthew Dillon 			n = read(iocom->sock_fd,
9280c3a8cd0SMatthew Dillon 				 ioq->buf + ioq->fifo_end,
9290c3a8cd0SMatthew Dillon 				 nmax);
9300c3a8cd0SMatthew Dillon 			if (n <= 0) {
9310c3a8cd0SMatthew Dillon 				if (n == 0) {
9320c3a8cd0SMatthew Dillon 					ioq->error = DMSG_IOQ_ERROR_EOF;
9330c3a8cd0SMatthew Dillon 					break;
9340c3a8cd0SMatthew Dillon 				}
9350c3a8cd0SMatthew Dillon 				if (errno != EINTR &&
9360c3a8cd0SMatthew Dillon 				    errno != EINPROGRESS &&
9370c3a8cd0SMatthew Dillon 				    errno != EAGAIN) {
9380c3a8cd0SMatthew Dillon 					ioq->error = DMSG_IOQ_ERROR_SOCK;
9390c3a8cd0SMatthew Dillon 					break;
9400c3a8cd0SMatthew Dillon 				}
9410c3a8cd0SMatthew Dillon 				n = 0;
9420c3a8cd0SMatthew Dillon 				/* fall through */
9430c3a8cd0SMatthew Dillon 			}
9440c3a8cd0SMatthew Dillon 			ioq->fifo_end += (size_t)n;
9450c3a8cd0SMatthew Dillon 			nmax -= (size_t)n;
9460c3a8cd0SMatthew Dillon 		}
9470c3a8cd0SMatthew Dillon 
9480c3a8cd0SMatthew Dillon 		if (iocom->flags & DMSG_IOCOMF_CRYPTED) {
9490c3a8cd0SMatthew Dillon 			dmsg_crypto_decrypt(iocom, ioq);
9500c3a8cd0SMatthew Dillon 		} else {
9510c3a8cd0SMatthew Dillon 			ioq->fifo_cdx = ioq->fifo_end;
9520c3a8cd0SMatthew Dillon 			ioq->fifo_cdn = ioq->fifo_end;
9530c3a8cd0SMatthew Dillon 		}
9540c3a8cd0SMatthew Dillon 		bytes = ioq->fifo_cdx - ioq->fifo_beg;
9550c3a8cd0SMatthew Dillon 
9560c3a8cd0SMatthew Dillon 		if (bytes > ioq->abytes - msg->aux_size)
9570c3a8cd0SMatthew Dillon 			bytes = ioq->abytes - msg->aux_size;
9580c3a8cd0SMatthew Dillon 
9590c3a8cd0SMatthew Dillon 		if (bytes) {
9600c3a8cd0SMatthew Dillon 			bcopy(ioq->buf + ioq->fifo_beg,
9610c3a8cd0SMatthew Dillon 			      msg->aux_data + msg->aux_size,
9620c3a8cd0SMatthew Dillon 			      bytes);
9630c3a8cd0SMatthew Dillon 			msg->aux_size += bytes;
9640c3a8cd0SMatthew Dillon 			ioq->fifo_beg += bytes;
9650c3a8cd0SMatthew Dillon 		}
9660c3a8cd0SMatthew Dillon 
9670c3a8cd0SMatthew Dillon 		/*
9680c3a8cd0SMatthew Dillon 		 * Insufficient data accumulated (set msg NULL so caller will
9690c3a8cd0SMatthew Dillon 		 * retry on event).
970f306de83SMatthew Dillon 		 *
971f306de83SMatthew Dillon 		 * Assert the auxillary data size is correct, then record the
972f306de83SMatthew Dillon 		 * original unaligned size from the message header.
9730c3a8cd0SMatthew Dillon 		 */
9740c3a8cd0SMatthew Dillon 		if (msg->aux_size < ioq->abytes) {
9750c3a8cd0SMatthew Dillon 			msg = NULL;
9760c3a8cd0SMatthew Dillon 			break;
9770c3a8cd0SMatthew Dillon 		}
9780c3a8cd0SMatthew Dillon 		assert(msg->aux_size == ioq->abytes);
979f306de83SMatthew Dillon 		msg->aux_size = ioq->unaligned_aux_size;
9800c3a8cd0SMatthew Dillon 
9810c3a8cd0SMatthew Dillon 		/*
982f306de83SMatthew Dillon 		 * Check aux_crc, then we are done.  Note that the crc
983f306de83SMatthew Dillon 		 * is calculated over the aligned size, not the actual
984f306de83SMatthew Dillon 		 * size.
9850c3a8cd0SMatthew Dillon 		 */
986f306de83SMatthew Dillon 		xcrc32 = dmsg_icrc32(msg->aux_data, ioq->abytes);
9870c3a8cd0SMatthew Dillon 		if (xcrc32 != msg->any.head.aux_crc) {
9880c3a8cd0SMatthew Dillon 			ioq->error = DMSG_IOQ_ERROR_ACRC;
9895ab1caedSMatthew Dillon 			dmio_printf(iocom, 1,
990d30cab67SMatthew Dillon 				    "iocom: ACRC error %08x vs %08x "
991d30cab67SMatthew Dillon 				    "msgid %016jx msgcmd %08x auxsize %d\n",
992d30cab67SMatthew Dillon 				    xcrc32,
993d30cab67SMatthew Dillon 				    msg->any.head.aux_crc,
994d30cab67SMatthew Dillon 				    (intmax_t)msg->any.head.msgid,
995d30cab67SMatthew Dillon 				    msg->any.head.cmd,
996d30cab67SMatthew Dillon 				    msg->any.head.aux_bytes);
9970c3a8cd0SMatthew Dillon 			break;
9980c3a8cd0SMatthew Dillon 		}
9990c3a8cd0SMatthew Dillon 		break;
10000c3a8cd0SMatthew Dillon 	case DMSG_MSGQ_STATE_ERROR:
10010c3a8cd0SMatthew Dillon 		/*
10020c3a8cd0SMatthew Dillon 		 * Continued calls to drain recorded transactions (returning
10030c3a8cd0SMatthew Dillon 		 * a LNK_ERROR for each one), before we return the final
10040c3a8cd0SMatthew Dillon 		 * LNK_ERROR.
10050c3a8cd0SMatthew Dillon 		 */
10060c3a8cd0SMatthew Dillon 		assert(msg == NULL);
10070c3a8cd0SMatthew Dillon 		break;
10080c3a8cd0SMatthew Dillon 	default:
10090c3a8cd0SMatthew Dillon 		/*
10100c3a8cd0SMatthew Dillon 		 * We don't double-return errors, the caller should not
10110c3a8cd0SMatthew Dillon 		 * have called us again after getting an error msg.
10120c3a8cd0SMatthew Dillon 		 */
10130c3a8cd0SMatthew Dillon 		assert(0);
10140c3a8cd0SMatthew Dillon 		break;
10150c3a8cd0SMatthew Dillon 	}
10160c3a8cd0SMatthew Dillon 
10170c3a8cd0SMatthew Dillon 	/*
10180c3a8cd0SMatthew Dillon 	 * Check the message sequence.  The iv[] should prevent any
10190c3a8cd0SMatthew Dillon 	 * possibility of a replay but we add this check anyway.
10200c3a8cd0SMatthew Dillon 	 */
10210c3a8cd0SMatthew Dillon 	if (msg && ioq->error == 0) {
10220c3a8cd0SMatthew Dillon 		if ((msg->any.head.salt & 255) != (ioq->seq & 255)) {
10230c3a8cd0SMatthew Dillon 			ioq->error = DMSG_IOQ_ERROR_MSGSEQ;
10240c3a8cd0SMatthew Dillon 		} else {
10250c3a8cd0SMatthew Dillon 			++ioq->seq;
10260c3a8cd0SMatthew Dillon 		}
10270c3a8cd0SMatthew Dillon 	}
10280c3a8cd0SMatthew Dillon 
10290c3a8cd0SMatthew Dillon 	/*
10300c3a8cd0SMatthew Dillon 	 * Handle error, RREQ, or completion
10310c3a8cd0SMatthew Dillon 	 *
10320c3a8cd0SMatthew Dillon 	 * NOTE: nmax and bytes are invalid at this point, we don't bother
10330c3a8cd0SMatthew Dillon 	 *	 to update them when breaking out.
10340c3a8cd0SMatthew Dillon 	 */
10350c3a8cd0SMatthew Dillon 	if (ioq->error) {
10360c3a8cd0SMatthew Dillon skip:
10370c3a8cd0SMatthew Dillon 		/*
10380c3a8cd0SMatthew Dillon 		 * An unrecoverable error causes all active receive
10390c3a8cd0SMatthew Dillon 		 * transactions to be terminated with a LNK_ERROR message.
10400c3a8cd0SMatthew Dillon 		 *
10410c3a8cd0SMatthew Dillon 		 * Once all active transactions are exhausted we set the
10420c3a8cd0SMatthew Dillon 		 * iocom ERROR flag and return a non-transactional LNK_ERROR
10430c3a8cd0SMatthew Dillon 		 * message, which should cause master processing loops to
10440c3a8cd0SMatthew Dillon 		 * terminate.
10450c3a8cd0SMatthew Dillon 		 */
10465ab1caedSMatthew Dillon 		dmio_printf(iocom, 1, "IOQ ERROR %d\n", ioq->error);
10470c3a8cd0SMatthew Dillon 		assert(ioq->msg == msg);
10480c3a8cd0SMatthew Dillon 		if (msg) {
10490c3a8cd0SMatthew Dillon 			dmsg_msg_free(msg);
10500c3a8cd0SMatthew Dillon 			ioq->msg = NULL;
1051323c0947SMatthew Dillon 			msg = NULL;
10520c3a8cd0SMatthew Dillon 		}
10530c3a8cd0SMatthew Dillon 
10540c3a8cd0SMatthew Dillon 		/*
10550c3a8cd0SMatthew Dillon 		 * No more I/O read processing
10560c3a8cd0SMatthew Dillon 		 */
10570c3a8cd0SMatthew Dillon 		ioq->state = DMSG_MSGQ_STATE_ERROR;
10580c3a8cd0SMatthew Dillon 
10590c3a8cd0SMatthew Dillon 		/*
10600c3a8cd0SMatthew Dillon 		 * Simulate a remote LNK_ERROR DELETE msg for any open
10610c3a8cd0SMatthew Dillon 		 * transactions, ending with a final non-transactional
10620c3a8cd0SMatthew Dillon 		 * LNK_ERROR (that the session can detect) when no
10630c3a8cd0SMatthew Dillon 		 * transactions remain.
10640d20ec8aSMatthew Dillon 		 *
10651b8eded1SMatthew Dillon 		 * NOTE: Temporarily supply state0 and a degenerate cmd
10661b8eded1SMatthew Dillon 		 *	 without CREATE set.  The real state will be
10671b8eded1SMatthew Dillon 		 *	 assigned in the loop.
10681b8eded1SMatthew Dillon 		 *
10691b8eded1SMatthew Dillon 		 * NOTE: We are simulating a received message using our
10701b8eded1SMatthew Dillon 		 *	 side of the state, so the DMSGF_REV* bits have
10711b8eded1SMatthew Dillon 		 *	 to be reversed.
10720c3a8cd0SMatthew Dillon 		 */
10730c3a8cd0SMatthew Dillon 		pthread_mutex_lock(&iocom->mtx);
10740c3a8cd0SMatthew Dillon 		dmsg_iocom_drain(iocom);
10750a9eefcaSMatthew Dillon 		dmsg_simulate_failure(&iocom->state0, 0, ioq->error);
10760c3a8cd0SMatthew Dillon 		pthread_mutex_unlock(&iocom->mtx);
1077323c0947SMatthew Dillon 		if (TAILQ_FIRST(&ioq->msgq))
1078323c0947SMatthew Dillon 			goto again;
10790c3a8cd0SMatthew Dillon 
1080323c0947SMatthew Dillon #if 0
10810c3a8cd0SMatthew Dillon 		/*
10820c3a8cd0SMatthew Dillon 		 * For the iocom error case we want to set RWORK to indicate
10830c3a8cd0SMatthew Dillon 		 * that more messages might be pending.
10840c3a8cd0SMatthew Dillon 		 *
10850c3a8cd0SMatthew Dillon 		 * It is possible to return NULL when there is more work to
10860c3a8cd0SMatthew Dillon 		 * do because each message has to be DELETEd in both
10870c3a8cd0SMatthew Dillon 		 * directions before we continue on with the next (though
10880c3a8cd0SMatthew Dillon 		 * this could be optimized).  The transmit direction will
10890c3a8cd0SMatthew Dillon 		 * re-set RWORK.
10900c3a8cd0SMatthew Dillon 		 */
10910c3a8cd0SMatthew Dillon 		if (msg)
1092a2179323SMatthew Dillon 			atomic_set_int(&iocom->flags, DMSG_IOCOMF_RWORK);
1093323c0947SMatthew Dillon #endif
10940c3a8cd0SMatthew Dillon 	} else if (msg == NULL) {
10950c3a8cd0SMatthew Dillon 		/*
10960c3a8cd0SMatthew Dillon 		 * Insufficient data received to finish building the message,
10970c3a8cd0SMatthew Dillon 		 * set RREQ and return NULL.
10980c3a8cd0SMatthew Dillon 		 *
10990c3a8cd0SMatthew Dillon 		 * Leave ioq->msg intact.
11000c3a8cd0SMatthew Dillon 		 * Leave the FIFO intact.
11010c3a8cd0SMatthew Dillon 		 */
1102a2179323SMatthew Dillon 		atomic_set_int(&iocom->flags, DMSG_IOCOMF_RREQ);
11030c3a8cd0SMatthew Dillon 	} else {
11040c3a8cd0SMatthew Dillon 		/*
11050d20ec8aSMatthew Dillon 		 * Continue processing msg.
11060c3a8cd0SMatthew Dillon 		 *
11070c3a8cd0SMatthew Dillon 		 * The fifo has already been advanced past the message.
11080c3a8cd0SMatthew Dillon 		 * Trivially reset the FIFO indices if possible.
11090c3a8cd0SMatthew Dillon 		 *
11100c3a8cd0SMatthew Dillon 		 * clear the FIFO if it is now empty and set RREQ to wait
11110c3a8cd0SMatthew Dillon 		 * for more from the socket.  If the FIFO is not empty set
11120c3a8cd0SMatthew Dillon 		 * TWORK to bypass the poll so we loop immediately.
11130c3a8cd0SMatthew Dillon 		 */
11140c3a8cd0SMatthew Dillon 		if (ioq->fifo_beg == ioq->fifo_cdx &&
11150c3a8cd0SMatthew Dillon 		    ioq->fifo_cdn == ioq->fifo_end) {
1116a2179323SMatthew Dillon 			atomic_set_int(&iocom->flags, DMSG_IOCOMF_RREQ);
11170c3a8cd0SMatthew Dillon 			ioq->fifo_cdx = 0;
11180c3a8cd0SMatthew Dillon 			ioq->fifo_cdn = 0;
11190c3a8cd0SMatthew Dillon 			ioq->fifo_beg = 0;
11200c3a8cd0SMatthew Dillon 			ioq->fifo_end = 0;
11210c3a8cd0SMatthew Dillon 		} else {
1122a2179323SMatthew Dillon 			atomic_set_int(&iocom->flags, DMSG_IOCOMF_RWORK);
11230c3a8cd0SMatthew Dillon 		}
11240c3a8cd0SMatthew Dillon 		ioq->state = DMSG_MSGQ_STATE_HEADER1;
11250c3a8cd0SMatthew Dillon 		ioq->msg = NULL;
11260d20ec8aSMatthew Dillon 
11270d20ec8aSMatthew Dillon 		/*
11280d20ec8aSMatthew Dillon 		 * Handle message routing.  Validates non-zero sources
11290d20ec8aSMatthew Dillon 		 * and routes message.  Error will be 0 if the message is
11300d20ec8aSMatthew Dillon 		 * destined for us.
11310d20ec8aSMatthew Dillon 		 *
11320d20ec8aSMatthew Dillon 		 * State processing only occurs for messages destined for us.
11330d20ec8aSMatthew Dillon 		 */
11345ab1caedSMatthew Dillon 		dmio_printf(iocom, 5,
11350a9eefcaSMatthew Dillon 			    "rxmsg cmd=%08x circ=%016jx\n",
1136a2179323SMatthew Dillon 			    msg->any.head.cmd,
1137a2179323SMatthew Dillon 			    (intmax_t)msg->any.head.circuit);
11387adbba57SMatthew Dillon 
11390a9eefcaSMatthew Dillon 		error = dmsg_state_msgrx(msg, 0);
11400d20ec8aSMatthew Dillon 
11410d20ec8aSMatthew Dillon 		if (error) {
11420d20ec8aSMatthew Dillon 			/*
11430d20ec8aSMatthew Dillon 			 * Abort-after-closure, throw message away and
11440d20ec8aSMatthew Dillon 			 * start reading another.
11450d20ec8aSMatthew Dillon 			 */
11460d20ec8aSMatthew Dillon 			if (error == DMSG_IOQ_ERROR_EALREADY) {
11470d20ec8aSMatthew Dillon 				dmsg_msg_free(msg);
11480d20ec8aSMatthew Dillon 				goto again;
11490d20ec8aSMatthew Dillon 			}
11500d20ec8aSMatthew Dillon 
11510d20ec8aSMatthew Dillon 			/*
11520d20ec8aSMatthew Dillon 			 * Process real error and throw away message.
11530d20ec8aSMatthew Dillon 			 */
11540d20ec8aSMatthew Dillon 			ioq->error = error;
11550d20ec8aSMatthew Dillon 			goto skip;
11560d20ec8aSMatthew Dillon 		}
1157a06d536bSMatthew Dillon 
1158a06d536bSMatthew Dillon 		/*
1159a06d536bSMatthew Dillon 		 * No error and not routed
1160a06d536bSMatthew Dillon 		 */
11610d20ec8aSMatthew Dillon 		/* no error, not routed.  Fall through and return msg */
11620c3a8cd0SMatthew Dillon 	}
11630c3a8cd0SMatthew Dillon 	return (msg);
11640c3a8cd0SMatthew Dillon }
11650c3a8cd0SMatthew Dillon 
11660c3a8cd0SMatthew Dillon /*
11670c3a8cd0SMatthew Dillon  * Calculate the header and data crc's and write a low-level message to
11680c3a8cd0SMatthew Dillon  * the connection.  If aux_crc is non-zero the aux_data crc is already
11690c3a8cd0SMatthew Dillon  * assumed to have been set.
11700c3a8cd0SMatthew Dillon  *
11710c3a8cd0SMatthew Dillon  * A non-NULL msg is added to the queue but not necessarily flushed.
11720c3a8cd0SMatthew Dillon  * Calling this function with msg == NULL will get a flush going.
11730c3a8cd0SMatthew Dillon  *
1174a2179323SMatthew Dillon  * (called from iocom_core only)
11750c3a8cd0SMatthew Dillon  */
11760c3a8cd0SMatthew Dillon void
dmsg_iocom_flush1(dmsg_iocom_t * iocom)11770c3a8cd0SMatthew Dillon dmsg_iocom_flush1(dmsg_iocom_t *iocom)
11780c3a8cd0SMatthew Dillon {
11790c3a8cd0SMatthew Dillon 	dmsg_ioq_t *ioq = &iocom->ioq_tx;
11800c3a8cd0SMatthew Dillon 	dmsg_msg_t *msg;
11810c3a8cd0SMatthew Dillon 	uint32_t xcrc32;
1182f306de83SMatthew Dillon 	size_t hbytes;
1183f306de83SMatthew Dillon 	size_t abytes;
11840c3a8cd0SMatthew Dillon 	dmsg_msg_queue_t tmpq;
11850c3a8cd0SMatthew Dillon 
1186a2179323SMatthew Dillon 	atomic_clear_int(&iocom->flags, DMSG_IOCOMF_WREQ | DMSG_IOCOMF_WWORK);
11870c3a8cd0SMatthew Dillon 	TAILQ_INIT(&tmpq);
11880c3a8cd0SMatthew Dillon 	pthread_mutex_lock(&iocom->mtx);
11890d20ec8aSMatthew Dillon 	while ((msg = TAILQ_FIRST(&iocom->txmsgq)) != NULL) {
11900d20ec8aSMatthew Dillon 		TAILQ_REMOVE(&iocom->txmsgq, msg, qentry);
11910c3a8cd0SMatthew Dillon 		TAILQ_INSERT_TAIL(&tmpq, msg, qentry);
11920c3a8cd0SMatthew Dillon 	}
11930c3a8cd0SMatthew Dillon 	pthread_mutex_unlock(&iocom->mtx);
11940c3a8cd0SMatthew Dillon 
11950a9eefcaSMatthew Dillon 	/*
11960a9eefcaSMatthew Dillon 	 * Flush queue, doing all required encryption and CRC generation,
11970a9eefcaSMatthew Dillon 	 * with the mutex unlocked.
11980a9eefcaSMatthew Dillon 	 */
11990c3a8cd0SMatthew Dillon 	while ((msg = TAILQ_FIRST(&tmpq)) != NULL) {
12000c3a8cd0SMatthew Dillon 		/*
12010c3a8cd0SMatthew Dillon 		 * Process terminal connection errors.
12020c3a8cd0SMatthew Dillon 		 */
12030c3a8cd0SMatthew Dillon 		TAILQ_REMOVE(&tmpq, msg, qentry);
12040c3a8cd0SMatthew Dillon 		if (ioq->error) {
12050c3a8cd0SMatthew Dillon 			TAILQ_INSERT_TAIL(&ioq->msgq, msg, qentry);
12060c3a8cd0SMatthew Dillon 			++ioq->msgcount;
12070c3a8cd0SMatthew Dillon 			continue;
12080c3a8cd0SMatthew Dillon 		}
12090c3a8cd0SMatthew Dillon 
12100c3a8cd0SMatthew Dillon 		/*
12110c3a8cd0SMatthew Dillon 		 * Finish populating the msg fields.  The salt ensures that
12120c3a8cd0SMatthew Dillon 		 * the iv[] array is ridiculously randomized and we also
12130c3a8cd0SMatthew Dillon 		 * re-seed our PRNG every 32768 messages just to be sure.
12140c3a8cd0SMatthew Dillon 		 */
12150c3a8cd0SMatthew Dillon 		msg->any.head.magic = DMSG_HDR_MAGIC;
12160c3a8cd0SMatthew Dillon 		msg->any.head.salt = (random() << 8) | (ioq->seq & 255);
12170c3a8cd0SMatthew Dillon 		++ioq->seq;
12180a9eefcaSMatthew Dillon 		if ((ioq->seq & 32767) == 0) {
12190a9eefcaSMatthew Dillon 			pthread_mutex_lock(&iocom->mtx);
12200c3a8cd0SMatthew Dillon 			srandomdev();
12210a9eefcaSMatthew Dillon 			pthread_mutex_unlock(&iocom->mtx);
12220a9eefcaSMatthew Dillon 		}
12230c3a8cd0SMatthew Dillon 
12240c3a8cd0SMatthew Dillon 		/*
12250c3a8cd0SMatthew Dillon 		 * Calculate aux_crc if 0, then calculate hdr_crc.
12260c3a8cd0SMatthew Dillon 		 */
12270c3a8cd0SMatthew Dillon 		if (msg->aux_size && msg->any.head.aux_crc == 0) {
1228f306de83SMatthew Dillon 			abytes = DMSG_DOALIGN(msg->aux_size);
1229f306de83SMatthew Dillon 			xcrc32 = dmsg_icrc32(msg->aux_data, abytes);
12300c3a8cd0SMatthew Dillon 			msg->any.head.aux_crc = xcrc32;
12310c3a8cd0SMatthew Dillon 		}
1232f306de83SMatthew Dillon 		msg->any.head.aux_bytes = msg->aux_size;
12330c3a8cd0SMatthew Dillon 
12340c3a8cd0SMatthew Dillon 		hbytes = (msg->any.head.cmd & DMSGF_SIZE) *
12350c3a8cd0SMatthew Dillon 			 DMSG_ALIGN;
12360c3a8cd0SMatthew Dillon 		msg->any.head.hdr_crc = 0;
12370c3a8cd0SMatthew Dillon 		msg->any.head.hdr_crc = dmsg_icrc32(&msg->any.head, hbytes);
12380c3a8cd0SMatthew Dillon 
12390c3a8cd0SMatthew Dillon 		/*
12400c3a8cd0SMatthew Dillon 		 * Enqueue the message (the flush codes handles stream
12410c3a8cd0SMatthew Dillon 		 * encryption).
12420c3a8cd0SMatthew Dillon 		 */
12430c3a8cd0SMatthew Dillon 		TAILQ_INSERT_TAIL(&ioq->msgq, msg, qentry);
12440c3a8cd0SMatthew Dillon 		++ioq->msgcount;
12450c3a8cd0SMatthew Dillon 	}
12460c3a8cd0SMatthew Dillon 	dmsg_iocom_flush2(iocom);
12470c3a8cd0SMatthew Dillon }
12480c3a8cd0SMatthew Dillon 
12490c3a8cd0SMatthew Dillon /*
12500c3a8cd0SMatthew Dillon  * Thread localized, iocom->mtx not held by caller.
1251a2179323SMatthew Dillon  *
1252a2179323SMatthew Dillon  * (called from iocom_core via iocom_flush1 only)
12530c3a8cd0SMatthew Dillon  */
12540c3a8cd0SMatthew Dillon void
dmsg_iocom_flush2(dmsg_iocom_t * iocom)12550c3a8cd0SMatthew Dillon dmsg_iocom_flush2(dmsg_iocom_t *iocom)
12560c3a8cd0SMatthew Dillon {
12570c3a8cd0SMatthew Dillon 	dmsg_ioq_t *ioq = &iocom->ioq_tx;
12580c3a8cd0SMatthew Dillon 	dmsg_msg_t *msg;
12590c3a8cd0SMatthew Dillon 	ssize_t n;
12600c3a8cd0SMatthew Dillon 	struct iovec iov[DMSG_IOQ_MAXIOVEC];
12610c3a8cd0SMatthew Dillon 	size_t nact;
12620c3a8cd0SMatthew Dillon 	size_t hbytes;
12630c3a8cd0SMatthew Dillon 	size_t abytes;
12640c3a8cd0SMatthew Dillon 	size_t hoff;
12650c3a8cd0SMatthew Dillon 	size_t aoff;
12660c3a8cd0SMatthew Dillon 	int iovcnt;
12677adbba57SMatthew Dillon 	int save_errno;
12680c3a8cd0SMatthew Dillon 
12690c3a8cd0SMatthew Dillon 	if (ioq->error) {
12700c3a8cd0SMatthew Dillon 		dmsg_iocom_drain(iocom);
12710c3a8cd0SMatthew Dillon 		return;
12720c3a8cd0SMatthew Dillon 	}
12730c3a8cd0SMatthew Dillon 
12740c3a8cd0SMatthew Dillon 	/*
12750c3a8cd0SMatthew Dillon 	 * Pump messages out the connection by building an iovec.
12760c3a8cd0SMatthew Dillon 	 *
12770c3a8cd0SMatthew Dillon 	 * ioq->hbytes/ioq->abytes tracks how much of the first message
12780c3a8cd0SMatthew Dillon 	 * in the queue has been successfully written out, so we can
12790c3a8cd0SMatthew Dillon 	 * resume writing.
12800c3a8cd0SMatthew Dillon 	 */
12810c3a8cd0SMatthew Dillon 	iovcnt = 0;
12820c3a8cd0SMatthew Dillon 	nact = 0;
12830c3a8cd0SMatthew Dillon 	hoff = ioq->hbytes;
12840c3a8cd0SMatthew Dillon 	aoff = ioq->abytes;
12850c3a8cd0SMatthew Dillon 
12860c3a8cd0SMatthew Dillon 	TAILQ_FOREACH(msg, &ioq->msgq, qentry) {
12870c3a8cd0SMatthew Dillon 		hbytes = (msg->any.head.cmd & DMSGF_SIZE) *
12880c3a8cd0SMatthew Dillon 			 DMSG_ALIGN;
12898d6d37b8SMatthew Dillon 		abytes = DMSG_DOALIGN(msg->aux_size);
12900c3a8cd0SMatthew Dillon 		assert(hoff <= hbytes && aoff <= abytes);
12910c3a8cd0SMatthew Dillon 
12920c3a8cd0SMatthew Dillon 		if (hoff < hbytes) {
1293024de405SMatthew Dillon 			size_t maxlen = hbytes - hoff;
1294024de405SMatthew Dillon 			if (maxlen > sizeof(ioq->buf) / 2)
1295024de405SMatthew Dillon 				maxlen = sizeof(ioq->buf) / 2;
12960c3a8cd0SMatthew Dillon 			iov[iovcnt].iov_base = (char *)&msg->any.head + hoff;
1297024de405SMatthew Dillon 			iov[iovcnt].iov_len = maxlen;
1298024de405SMatthew Dillon 			nact += maxlen;
12990c3a8cd0SMatthew Dillon 			++iovcnt;
1300024de405SMatthew Dillon 			if (iovcnt == DMSG_IOQ_MAXIOVEC ||
1301024de405SMatthew Dillon 			    maxlen != hbytes - hoff) {
13020c3a8cd0SMatthew Dillon 				break;
13030c3a8cd0SMatthew Dillon 			}
1304024de405SMatthew Dillon 		}
13050c3a8cd0SMatthew Dillon 		if (aoff < abytes) {
1306024de405SMatthew Dillon 			size_t maxlen = abytes - aoff;
1307024de405SMatthew Dillon 			if (maxlen > sizeof(ioq->buf) / 2)
1308024de405SMatthew Dillon 				maxlen = sizeof(ioq->buf) / 2;
1309024de405SMatthew Dillon 
13100c3a8cd0SMatthew Dillon 			assert(msg->aux_data != NULL);
13110c3a8cd0SMatthew Dillon 			iov[iovcnt].iov_base = (char *)msg->aux_data + aoff;
1312024de405SMatthew Dillon 			iov[iovcnt].iov_len = maxlen;
1313024de405SMatthew Dillon 			nact += maxlen;
13140c3a8cd0SMatthew Dillon 			++iovcnt;
1315024de405SMatthew Dillon 			if (iovcnt == DMSG_IOQ_MAXIOVEC ||
1316024de405SMatthew Dillon 			    maxlen != abytes - aoff) {
13170c3a8cd0SMatthew Dillon 				break;
13180c3a8cd0SMatthew Dillon 			}
1319024de405SMatthew Dillon 		}
13200c3a8cd0SMatthew Dillon 		hoff = 0;
13210c3a8cd0SMatthew Dillon 		aoff = 0;
13220c3a8cd0SMatthew Dillon 	}
13237adbba57SMatthew Dillon 
13247adbba57SMatthew Dillon 	/*
13257adbba57SMatthew Dillon 	 * Shortcut if no work to do.  Be sure to check for old work still
13267adbba57SMatthew Dillon 	 * pending in the FIFO.
13277adbba57SMatthew Dillon 	 */
13287adbba57SMatthew Dillon 	if (iovcnt == 0 && ioq->fifo_beg == ioq->fifo_cdx)
13290c3a8cd0SMatthew Dillon 		return;
13300c3a8cd0SMatthew Dillon 
13310c3a8cd0SMatthew Dillon 	/*
13320c3a8cd0SMatthew Dillon 	 * Encrypt and write the data.  The crypto code will move the
13330c3a8cd0SMatthew Dillon 	 * data into the fifo and adjust the iov as necessary.  If
13340c3a8cd0SMatthew Dillon 	 * encryption is disabled the iov is left alone.
13350c3a8cd0SMatthew Dillon 	 *
13360c3a8cd0SMatthew Dillon 	 * May return a smaller iov (thus a smaller n), with aggregated
13370c3a8cd0SMatthew Dillon 	 * chunks.  May reduce nmax to what fits in the FIFO.
13380c3a8cd0SMatthew Dillon 	 *
13390c3a8cd0SMatthew Dillon 	 * This function sets nact to the number of original bytes now
13400c3a8cd0SMatthew Dillon 	 * encrypted, adding to the FIFO some number of bytes that might
13410c3a8cd0SMatthew Dillon 	 * be greater depending on the crypto mechanic.  iov[] is adjusted
13420c3a8cd0SMatthew Dillon 	 * to point at the FIFO if necessary.
13430c3a8cd0SMatthew Dillon 	 *
1344f2239a4eSMatthew Dillon 	 * NOTE: nact is the number of bytes eaten from the message.  For
1345f2239a4eSMatthew Dillon 	 *	 encrypted data this is the number of bytes processed for
1346f2239a4eSMatthew Dillon 	 *	 encryption and not necessarily the number of bytes writable.
1347f2239a4eSMatthew Dillon 	 *	 The return value from the writev() is the post-encrypted
1348f2239a4eSMatthew Dillon 	 *	 byte count which might be larger.
1349f2239a4eSMatthew Dillon 	 *
1350f2239a4eSMatthew Dillon 	 * NOTE: For direct writes, nact is the return value from the writev().
13510c3a8cd0SMatthew Dillon 	 */
13520c3a8cd0SMatthew Dillon 	if (iocom->flags & DMSG_IOCOMF_CRYPTED) {
13530c3a8cd0SMatthew Dillon 		/*
13540c3a8cd0SMatthew Dillon 		 * Make sure the FIFO has a reasonable amount of space
13550c3a8cd0SMatthew Dillon 		 * left (if not completely full).
1356a2179323SMatthew Dillon 		 *
1357a2179323SMatthew Dillon 		 * In this situation we are staging the encrypted message
1358a2179323SMatthew Dillon 		 * data in the FIFO.  (nact) represents how much plaintext
1359a2179323SMatthew Dillon 		 * has been staged, (n) represents how much encrypted data
1360a2179323SMatthew Dillon 		 * has been flushed.  The two are independent of each other.
13610c3a8cd0SMatthew Dillon 		 */
13620c3a8cd0SMatthew Dillon 		if (ioq->fifo_beg > sizeof(ioq->buf) / 2 &&
1363a2179323SMatthew Dillon 		    sizeof(ioq->buf) - ioq->fifo_end < DMSG_ALIGN * 2) {
13640c3a8cd0SMatthew Dillon 			bcopy(ioq->buf + ioq->fifo_beg, ioq->buf,
13650c3a8cd0SMatthew Dillon 			      ioq->fifo_end - ioq->fifo_beg);
13660c3a8cd0SMatthew Dillon 			ioq->fifo_cdx -= ioq->fifo_beg;
13670c3a8cd0SMatthew Dillon 			ioq->fifo_cdn -= ioq->fifo_beg;
13680c3a8cd0SMatthew Dillon 			ioq->fifo_end -= ioq->fifo_beg;
13690c3a8cd0SMatthew Dillon 			ioq->fifo_beg = 0;
13700c3a8cd0SMatthew Dillon 		}
13710c3a8cd0SMatthew Dillon 
1372f2239a4eSMatthew Dillon 		/*
1373f2239a4eSMatthew Dillon 		 * beg .... cdx ............ cdn ............. end
1374f2239a4eSMatthew Dillon 		 * [WRITABLE] [PARTIALENCRYPT] [NOTYETENCRYPTED]
1375f2239a4eSMatthew Dillon 		 *
1376e96cef49SMatthew Dillon 		 * Advance fifo_beg on a successful write.
1377f2239a4eSMatthew Dillon 		 */
13780c3a8cd0SMatthew Dillon 		iovcnt = dmsg_crypto_encrypt(iocom, ioq, iov, iovcnt, &nact);
13790c3a8cd0SMatthew Dillon 		n = writev(iocom->sock_fd, iov, iovcnt);
13807adbba57SMatthew Dillon 		save_errno = errno;
13810c3a8cd0SMatthew Dillon 		if (n > 0) {
13820c3a8cd0SMatthew Dillon 			ioq->fifo_beg += n;
13830c3a8cd0SMatthew Dillon 			if (ioq->fifo_beg == ioq->fifo_end) {
13840c3a8cd0SMatthew Dillon 				ioq->fifo_beg = 0;
13850c3a8cd0SMatthew Dillon 				ioq->fifo_cdn = 0;
13860c3a8cd0SMatthew Dillon 				ioq->fifo_cdx = 0;
13870c3a8cd0SMatthew Dillon 				ioq->fifo_end = 0;
13880c3a8cd0SMatthew Dillon 			}
13890c3a8cd0SMatthew Dillon 		}
13907adbba57SMatthew Dillon 
1391a2179323SMatthew Dillon 		/*
1392a2179323SMatthew Dillon 		 * We don't mess with the nact returned by the crypto_encrypt
1393a2179323SMatthew Dillon 		 * call, which represents the filling of the FIFO.  (n) tells
1394a2179323SMatthew Dillon 		 * us how much we were able to write from the FIFO.  The two
1395a2179323SMatthew Dillon 		 * are different beasts when encrypting.
1396a2179323SMatthew Dillon 		 */
13970c3a8cd0SMatthew Dillon 	} else {
1398a2179323SMatthew Dillon 		/*
1399a2179323SMatthew Dillon 		 * In this situation we are not staging the messages to the
1400a2179323SMatthew Dillon 		 * FIFO but instead writing them directly from the msg
1401f2239a4eSMatthew Dillon 		 * structure(s) unencrypted, so (nact) is basically (n).
1402a2179323SMatthew Dillon 		 */
14030c3a8cd0SMatthew Dillon 		n = writev(iocom->sock_fd, iov, iovcnt);
14047adbba57SMatthew Dillon 		save_errno = errno;
14050c3a8cd0SMatthew Dillon 		if (n > 0)
14060c3a8cd0SMatthew Dillon 			nact = n;
14070c3a8cd0SMatthew Dillon 		else
14080c3a8cd0SMatthew Dillon 			nact = 0;
14090c3a8cd0SMatthew Dillon 	}
14100c3a8cd0SMatthew Dillon 
14110c3a8cd0SMatthew Dillon 	/*
14120c3a8cd0SMatthew Dillon 	 * Clean out the transmit queue based on what we successfully
14137adbba57SMatthew Dillon 	 * encrypted (nact is the plaintext count) and is now in the FIFO.
14147adbba57SMatthew Dillon 	 * ioq->hbytes/abytes represents the portion of the first message
14157adbba57SMatthew Dillon 	 * previously sent.
14160c3a8cd0SMatthew Dillon 	 */
14170c3a8cd0SMatthew Dillon 	while ((msg = TAILQ_FIRST(&ioq->msgq)) != NULL) {
14180c3a8cd0SMatthew Dillon 		hbytes = (msg->any.head.cmd & DMSGF_SIZE) *
14190c3a8cd0SMatthew Dillon 			 DMSG_ALIGN;
14208d6d37b8SMatthew Dillon 		abytes = DMSG_DOALIGN(msg->aux_size);
14210c3a8cd0SMatthew Dillon 
14220c3a8cd0SMatthew Dillon 		if ((size_t)nact < hbytes - ioq->hbytes) {
14230c3a8cd0SMatthew Dillon 			ioq->hbytes += nact;
14240c3a8cd0SMatthew Dillon 			nact = 0;
14250c3a8cd0SMatthew Dillon 			break;
14260c3a8cd0SMatthew Dillon 		}
14270c3a8cd0SMatthew Dillon 		nact -= hbytes - ioq->hbytes;
14280c3a8cd0SMatthew Dillon 		ioq->hbytes = hbytes;
14290c3a8cd0SMatthew Dillon 		if ((size_t)nact < abytes - ioq->abytes) {
14300c3a8cd0SMatthew Dillon 			ioq->abytes += nact;
14310c3a8cd0SMatthew Dillon 			nact = 0;
14320c3a8cd0SMatthew Dillon 			break;
14330c3a8cd0SMatthew Dillon 		}
14340c3a8cd0SMatthew Dillon 		nact -= abytes - ioq->abytes;
1435a2179323SMatthew Dillon 		/* ioq->abytes = abytes; optimized out */
1436a2179323SMatthew Dillon 
14375ab1caedSMatthew Dillon 		dmio_printf(iocom, 5,
14380a9eefcaSMatthew Dillon 			    "txmsg cmd=%08x circ=%016jx\n",
1439a2179323SMatthew Dillon 			    msg->any.head.cmd,
1440a2179323SMatthew Dillon 			    (intmax_t)msg->any.head.circuit);
14410c3a8cd0SMatthew Dillon 
14427adbba57SMatthew Dillon #ifdef DMSG_BLOCK_DEBUG
14437adbba57SMatthew Dillon 		uint32_t tcmd;
14447adbba57SMatthew Dillon 
14457adbba57SMatthew Dillon 		if (msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE)) {
14467adbba57SMatthew Dillon 			if ((msg->state->flags & DMSG_STATE_ROOT) == 0) {
14477adbba57SMatthew Dillon 				tcmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
14487adbba57SMatthew Dillon 					    (msg->any.head.cmd & (DMSGF_CREATE |
14497adbba57SMatthew Dillon 								  DMSGF_DELETE |
14507adbba57SMatthew Dillon 								  DMSGF_REPLY));
14517adbba57SMatthew Dillon 			} else {
14527adbba57SMatthew Dillon 				tcmd = 0;
14537adbba57SMatthew Dillon 			}
14547adbba57SMatthew Dillon 		} else {
14557adbba57SMatthew Dillon 			tcmd = msg->any.head.cmd & DMSGF_CMDSWMASK;
14567adbba57SMatthew Dillon 		}
14577adbba57SMatthew Dillon 
14587adbba57SMatthew Dillon 		switch (tcmd) {
14597adbba57SMatthew Dillon 		case DMSG_BLK_READ | DMSGF_CREATE | DMSGF_DELETE:
14607adbba57SMatthew Dillon 		case DMSG_BLK_WRITE | DMSGF_CREATE | DMSGF_DELETE:
14615ab1caedSMatthew Dillon 			dmio_printf(iocom, 4,
14625ab1caedSMatthew Dillon 				    "write BIO %-3d %016jx %d@%016jx\n",
14637adbba57SMatthew Dillon 				    biocount, msg->any.head.msgid,
14647adbba57SMatthew Dillon 				    msg->any.blk_read.bytes,
14657adbba57SMatthew Dillon 				    msg->any.blk_read.offset);
14667adbba57SMatthew Dillon 			break;
14677adbba57SMatthew Dillon 		case DMSG_BLK_READ | DMSGF_CREATE | DMSGF_DELETE | DMSGF_REPLY:
14687adbba57SMatthew Dillon 		case DMSG_BLK_WRITE | DMSGF_CREATE | DMSGF_DELETE | DMSGF_REPLY:
14695ab1caedSMatthew Dillon 			dmio_printf(iocom, 4,
14705ab1caedSMatthew Dillon 				    "wretr BIO %-3d %016jx %d@%016jx\n",
14717adbba57SMatthew Dillon 				    biocount, msg->any.head.msgid,
14727adbba57SMatthew Dillon 				    msg->any.blk_read.bytes,
14737adbba57SMatthew Dillon 				    msg->any.blk_read.offset);
14747adbba57SMatthew Dillon 			break;
14757adbba57SMatthew Dillon 		default:
14767adbba57SMatthew Dillon 			break;
14777adbba57SMatthew Dillon 		}
14787adbba57SMatthew Dillon #endif
14797adbba57SMatthew Dillon 
14800c3a8cd0SMatthew Dillon 		TAILQ_REMOVE(&ioq->msgq, msg, qentry);
14810c3a8cd0SMatthew Dillon 		--ioq->msgcount;
14820c3a8cd0SMatthew Dillon 		ioq->hbytes = 0;
14830c3a8cd0SMatthew Dillon 		ioq->abytes = 0;
1484323c0947SMatthew Dillon 		dmsg_msg_free(msg);
14850c3a8cd0SMatthew Dillon 	}
14860c3a8cd0SMatthew Dillon 	assert(nact == 0);
14870c3a8cd0SMatthew Dillon 
14880c3a8cd0SMatthew Dillon 	/*
14890c3a8cd0SMatthew Dillon 	 * Process the return value from the write w/regards to blocking.
14900c3a8cd0SMatthew Dillon 	 */
14910c3a8cd0SMatthew Dillon 	if (n < 0) {
14927adbba57SMatthew Dillon 		if (save_errno != EINTR &&
14937adbba57SMatthew Dillon 		    save_errno != EINPROGRESS &&
14947adbba57SMatthew Dillon 		    save_errno != EAGAIN) {
14950c3a8cd0SMatthew Dillon 			/*
14960c3a8cd0SMatthew Dillon 			 * Fatal write error
14970c3a8cd0SMatthew Dillon 			 */
14980c3a8cd0SMatthew Dillon 			ioq->error = DMSG_IOQ_ERROR_SOCK;
14990c3a8cd0SMatthew Dillon 			dmsg_iocom_drain(iocom);
15000c3a8cd0SMatthew Dillon 		} else {
15010c3a8cd0SMatthew Dillon 			/*
15027adbba57SMatthew Dillon 			 * Wait for socket buffer space, do not try to
15037adbba57SMatthew Dillon 			 * process more packets for transmit until space
15047adbba57SMatthew Dillon 			 * is available.
15050c3a8cd0SMatthew Dillon 			 */
1506a2179323SMatthew Dillon 			atomic_set_int(&iocom->flags, DMSG_IOCOMF_WREQ);
15070c3a8cd0SMatthew Dillon 		}
15087adbba57SMatthew Dillon 	} else if (TAILQ_FIRST(&ioq->msgq) ||
15097adbba57SMatthew Dillon 		   TAILQ_FIRST(&iocom->txmsgq) ||
15107adbba57SMatthew Dillon 		   ioq->fifo_beg != ioq->fifo_cdx) {
15117adbba57SMatthew Dillon 		/*
15127adbba57SMatthew Dillon 		 * If the write succeeded and more messages are pending
15137adbba57SMatthew Dillon 		 * in either msgq, or the FIFO WWORK must remain set.
15147adbba57SMatthew Dillon 		 */
15157adbba57SMatthew Dillon 		atomic_set_int(&iocom->flags, DMSG_IOCOMF_WWORK);
15160c3a8cd0SMatthew Dillon 	}
15177adbba57SMatthew Dillon 	/* else no transmit-side work remains */
15187adbba57SMatthew Dillon 
15190c3a8cd0SMatthew Dillon 	if (ioq->error) {
15200c3a8cd0SMatthew Dillon 		dmsg_iocom_drain(iocom);
15210c3a8cd0SMatthew Dillon 	}
15220c3a8cd0SMatthew Dillon }
15230c3a8cd0SMatthew Dillon 
15240c3a8cd0SMatthew Dillon /*
15250c3a8cd0SMatthew Dillon  * Kill pending msgs on ioq_tx and adjust the flags such that no more
15260c3a8cd0SMatthew Dillon  * write events will occur.  We don't kill read msgs because we want
15270c3a8cd0SMatthew Dillon  * the caller to pull off our contrived terminal error msg to detect
15280c3a8cd0SMatthew Dillon  * the connection failure.
15290c3a8cd0SMatthew Dillon  *
1530a2179323SMatthew Dillon  * Localized to iocom_core thread, iocom->mtx not held by caller.
15310c3a8cd0SMatthew Dillon  */
15320c3a8cd0SMatthew Dillon void
dmsg_iocom_drain(dmsg_iocom_t * iocom)15330c3a8cd0SMatthew Dillon dmsg_iocom_drain(dmsg_iocom_t *iocom)
15340c3a8cd0SMatthew Dillon {
15350c3a8cd0SMatthew Dillon 	dmsg_ioq_t *ioq = &iocom->ioq_tx;
15360c3a8cd0SMatthew Dillon 	dmsg_msg_t *msg;
15370c3a8cd0SMatthew Dillon 
1538a2179323SMatthew Dillon 	atomic_clear_int(&iocom->flags, DMSG_IOCOMF_WREQ | DMSG_IOCOMF_WWORK);
15390c3a8cd0SMatthew Dillon 	ioq->hbytes = 0;
15400c3a8cd0SMatthew Dillon 	ioq->abytes = 0;
15410c3a8cd0SMatthew Dillon 
15420c3a8cd0SMatthew Dillon 	while ((msg = TAILQ_FIRST(&ioq->msgq)) != NULL) {
15430c3a8cd0SMatthew Dillon 		TAILQ_REMOVE(&ioq->msgq, msg, qentry);
15440c3a8cd0SMatthew Dillon 		--ioq->msgcount;
1545323c0947SMatthew Dillon 		dmsg_msg_free(msg);
15460c3a8cd0SMatthew Dillon 	}
15470c3a8cd0SMatthew Dillon }
15480c3a8cd0SMatthew Dillon 
15490c3a8cd0SMatthew Dillon /*
15500c3a8cd0SMatthew Dillon  * Write a message to an iocom, with additional state processing.
15510c3a8cd0SMatthew Dillon  */
15520c3a8cd0SMatthew Dillon void
dmsg_msg_write(dmsg_msg_t * msg)15530c3a8cd0SMatthew Dillon dmsg_msg_write(dmsg_msg_t *msg)
15540c3a8cd0SMatthew Dillon {
15551b8eded1SMatthew Dillon 	dmsg_iocom_t *iocom = msg->state->iocom;
15560c3a8cd0SMatthew Dillon 	dmsg_state_t *state;
15570c3a8cd0SMatthew Dillon 	char dummy;
15580c3a8cd0SMatthew Dillon 
15590c3a8cd0SMatthew Dillon 	pthread_mutex_lock(&iocom->mtx);
15601b8eded1SMatthew Dillon 	state = msg->state;
1561d30cab67SMatthew Dillon 
15625ab1caedSMatthew Dillon 	dmio_printf(iocom, 5,
15630a9eefcaSMatthew Dillon 		    "msgtx: cmd=%08x msgid=%016jx "
15640a9eefcaSMatthew Dillon 		    "state %p(%08x) error=%d\n",
15650a9eefcaSMatthew Dillon 		    msg->any.head.cmd, msg->any.head.msgid,
15660a9eefcaSMatthew Dillon 		    state, (state ? state->icmd : 0),
15670a9eefcaSMatthew Dillon 		    msg->any.head.error);
15680a9eefcaSMatthew Dillon 
15690a9eefcaSMatthew Dillon 
1570a06d536bSMatthew Dillon #if 0
1571323c0947SMatthew Dillon 	/*
1572323c0947SMatthew Dillon 	 * Make sure the parent transaction is still open in the transmit
1573323c0947SMatthew Dillon 	 * direction.  If it isn't the message is dead and we have to
1574323c0947SMatthew Dillon 	 * potentially simulate a rxmsg terminating the transaction.
1575323c0947SMatthew Dillon 	 */
1576a06d536bSMatthew Dillon 	if ((state->parent->txcmd & DMSGF_DELETE) ||
1577a06d536bSMatthew Dillon 	    (state->parent->rxcmd & DMSGF_DELETE)) {
15785ab1caedSMatthew Dillon 		dmio_printf(iocom, 4, "dmsg_msg_write: EARLY TERMINATION\n");
1579a06d536bSMatthew Dillon 		dmsg_simulate_failure(state, DMSG_ERR_LOSTLINK);
1580323c0947SMatthew Dillon 		dmsg_state_cleanuptx(iocom, msg);
1581323c0947SMatthew Dillon 		dmsg_msg_free(msg);
1582323c0947SMatthew Dillon 		pthread_mutex_unlock(&iocom->mtx);
1583323c0947SMatthew Dillon 		return;
1584323c0947SMatthew Dillon 	}
1585a06d536bSMatthew Dillon #endif
1586323c0947SMatthew Dillon 	/*
1587323c0947SMatthew Dillon 	 * Process state data into the message as needed, then update the
1588323c0947SMatthew Dillon 	 * state based on the message.
1589323c0947SMatthew Dillon 	 */
1590d30cab67SMatthew Dillon 	if ((state->flags & DMSG_STATE_ROOT) == 0) {
15910c3a8cd0SMatthew Dillon 		/*
15920c3a8cd0SMatthew Dillon 		 * Existing transaction (could be reply).  It is also
15930c3a8cd0SMatthew Dillon 		 * possible for this to be the first reply (CREATE is set),
15940c3a8cd0SMatthew Dillon 		 * in which case we populate state->txcmd.
15950c3a8cd0SMatthew Dillon 		 *
15960c3a8cd0SMatthew Dillon 		 * state->txcmd is adjusted to hold the final message cmd,
15970c3a8cd0SMatthew Dillon 		 * and we also be sure to set the CREATE bit here.  We did
15980c3a8cd0SMatthew Dillon 		 * not set it in dmsg_msg_alloc() because that would have
15990c3a8cd0SMatthew Dillon 		 * not been serialized (state could have gotten ripped out
16000c3a8cd0SMatthew Dillon 		 * from under the message prior to it being transmitted).
16010c3a8cd0SMatthew Dillon 		 */
16020c3a8cd0SMatthew Dillon 		if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_REPLY)) ==
16030c3a8cd0SMatthew Dillon 		    DMSGF_CREATE) {
16040c3a8cd0SMatthew Dillon 			state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
16050d20ec8aSMatthew Dillon 			state->icmd = state->txcmd & DMSGF_BASECMDMASK;
16060a9eefcaSMatthew Dillon 			state->flags &= ~DMSG_STATE_NEW;
16070c3a8cd0SMatthew Dillon 		}
16080c3a8cd0SMatthew Dillon 		msg->any.head.msgid = state->msgid;
16091b8eded1SMatthew Dillon 
16100d20ec8aSMatthew Dillon 		if (msg->any.head.cmd & DMSGF_CREATE) {
16110c3a8cd0SMatthew Dillon 			state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
16120c3a8cd0SMatthew Dillon 		}
16130d20ec8aSMatthew Dillon 	}
16141b8eded1SMatthew Dillon 
16150c3a8cd0SMatthew Dillon 	/*
16160a9eefcaSMatthew Dillon 	 * Discard messages sent to transactions which are already dead.
16170c3a8cd0SMatthew Dillon 	 */
16180a9eefcaSMatthew Dillon 	if (state && (state->txcmd & DMSGF_DELETE)) {
16195ab1caedSMatthew Dillon 		dmio_printf(iocom, 4,
16205ab1caedSMatthew Dillon 			    "dmsg_msg_write: drop msg %08x to dead "
16210a9eefcaSMatthew Dillon 			    "circuit state=%p\n",
16220a9eefcaSMatthew Dillon 			    msg->any.head.cmd, state);
16230a9eefcaSMatthew Dillon 		dmsg_msg_free(msg);
16240a9eefcaSMatthew Dillon 		return;
16250a9eefcaSMatthew Dillon 	}
16260a9eefcaSMatthew Dillon 
16270a9eefcaSMatthew Dillon 	/*
16280a9eefcaSMatthew Dillon 	 * Normally we queue the msg for output.  However, if the circuit is
16290a9eefcaSMatthew Dillon 	 * dead or dying we must simulate a failure in the return direction
16300a9eefcaSMatthew Dillon 	 * and throw the message away.  The other end is not expecting any
16310a9eefcaSMatthew Dillon 	 * further messages from us on this state.
16320a9eefcaSMatthew Dillon 	 *
16330a9eefcaSMatthew Dillon 	 * Note that the I/O thread is responsible for generating the CRCs
16340a9eefcaSMatthew Dillon 	 * and encryption.
16350a9eefcaSMatthew Dillon 	 */
16360a9eefcaSMatthew Dillon 	if (state->flags & DMSG_STATE_DYING) {
16370a9eefcaSMatthew Dillon #if 0
16380a9eefcaSMatthew Dillon 	if ((state->parent->txcmd & DMSGF_DELETE) ||
16390a9eefcaSMatthew Dillon 	    (state->parent->flags & DMSG_STATE_DYING) ||
16400a9eefcaSMatthew Dillon 	    (state->flags & DMSG_STATE_DYING)) {
16410a9eefcaSMatthew Dillon #endif
16420a9eefcaSMatthew Dillon 		/*
16430a9eefcaSMatthew Dillon 		 * Illegal message, kill state and related sub-state.
16440a9eefcaSMatthew Dillon 		 * Cannot transmit if state is already dying.
16450a9eefcaSMatthew Dillon 		 */
16465ab1caedSMatthew Dillon 		dmio_printf(iocom, 4,
16475ab1caedSMatthew Dillon 			    "dmsg_msg_write: Write to dying circuit "
16480a9eefcaSMatthew Dillon 			    "ptxcmd=%08x prxcmd=%08x flags=%08x\n",
16490a9eefcaSMatthew Dillon 			    state->parent->rxcmd,
16500a9eefcaSMatthew Dillon 			    state->parent->txcmd,
16510a9eefcaSMatthew Dillon 			    state->parent->flags);
16520a9eefcaSMatthew Dillon 		dmsg_state_hold(state);
16530a9eefcaSMatthew Dillon 		dmsg_state_cleanuptx(iocom, msg);
16540a9eefcaSMatthew Dillon 		if ((state->flags & DMSG_STATE_ABORTING) == 0) {
16550a9eefcaSMatthew Dillon 			dmsg_simulate_failure(state, 1, DMSG_ERR_LOSTLINK);
16560a9eefcaSMatthew Dillon 		}
16570a9eefcaSMatthew Dillon 		dmsg_state_drop(state);
16580a9eefcaSMatthew Dillon 		dmsg_msg_free(msg);
16590a9eefcaSMatthew Dillon 	} else {
16600a9eefcaSMatthew Dillon 		/*
16610a9eefcaSMatthew Dillon 		 * Queue the message, clean up transmit state prior to queueing
16620a9eefcaSMatthew Dillon 		 * to avoid SMP races.
16630a9eefcaSMatthew Dillon 		 */
16645ab1caedSMatthew Dillon 		dmio_printf(iocom, 5,
16655ab1caedSMatthew Dillon 			    "dmsg_msg_write: commit msg state=%p to txkmsgq\n",
16665ab1caedSMatthew Dillon 			    state);
16670a9eefcaSMatthew Dillon 		dmsg_state_cleanuptx(iocom, msg);
16680d20ec8aSMatthew Dillon 		TAILQ_INSERT_TAIL(&iocom->txmsgq, msg, qentry);
16690c3a8cd0SMatthew Dillon 		dummy = 0;
16700c3a8cd0SMatthew Dillon 		write(iocom->wakeupfds[1], &dummy, 1);	/* XXX optimize me */
16710a9eefcaSMatthew Dillon 	}
16720c3a8cd0SMatthew Dillon 	pthread_mutex_unlock(&iocom->mtx);
16730c3a8cd0SMatthew Dillon }
16740c3a8cd0SMatthew Dillon 
16750c3a8cd0SMatthew Dillon /*
16760a9eefcaSMatthew Dillon  * Remove state from its parent's subq.  This can wind up recursively
16770a9eefcaSMatthew Dillon  * dropping the parent upward.
16780a9eefcaSMatthew Dillon  *
16790a9eefcaSMatthew Dillon  * NOTE: iocom must be locked.
16800a9eefcaSMatthew Dillon  *
16810a9eefcaSMatthew Dillon  * NOTE: Once we drop the parent, our pstate pointer may become invalid.
16820a9eefcaSMatthew Dillon  */
16830a9eefcaSMatthew Dillon static
16840a9eefcaSMatthew Dillon void
16850a9eefcaSMatthew Dillon dmsg_subq_delete(dmsg_state_t *state)
16860a9eefcaSMatthew Dillon {
16870a9eefcaSMatthew Dillon 	dmsg_state_t *pstate;
16880a9eefcaSMatthew Dillon 
16890a9eefcaSMatthew Dillon 	if (state->flags & DMSG_STATE_SUBINSERTED) {
16900a9eefcaSMatthew Dillon 		pstate = state->parent;
16910a9eefcaSMatthew Dillon 		assert(pstate);
16920a9eefcaSMatthew Dillon 		if (pstate->scan == state)
16930a9eefcaSMatthew Dillon 			pstate->scan = NULL;
16940a9eefcaSMatthew Dillon 		TAILQ_REMOVE(&pstate->subq, state, entry);
16950a9eefcaSMatthew Dillon 		state->flags &= ~DMSG_STATE_SUBINSERTED;
16960a9eefcaSMatthew Dillon 		state->parent = NULL;
16970a9eefcaSMatthew Dillon 		if (TAILQ_EMPTY(&pstate->subq))
16980a9eefcaSMatthew Dillon 			dmsg_state_drop(pstate);/* pstate->subq */
16990a9eefcaSMatthew Dillon 		pstate = NULL;			/* safety */
17000a9eefcaSMatthew Dillon 		dmsg_state_drop(state);         /* pstate->subq */
17010a9eefcaSMatthew Dillon 	} else {
17020a9eefcaSMatthew Dillon 		assert(state->parent == NULL);
17030a9eefcaSMatthew Dillon 	}
17040a9eefcaSMatthew Dillon }
17050a9eefcaSMatthew Dillon 
17060a9eefcaSMatthew Dillon /*
1707a06d536bSMatthew Dillon  * Simulate reception of a transaction DELETE message when the link goes
1708a06d536bSMatthew Dillon  * bad.  This routine must recurse through state->subq and generate messages
1709a06d536bSMatthew Dillon  * and callbacks bottom-up.
1710a06d536bSMatthew Dillon  *
1711323c0947SMatthew Dillon  * iocom->mtx must be held by caller.
1712323c0947SMatthew Dillon  */
1713323c0947SMatthew Dillon static
1714323c0947SMatthew Dillon void
17150a9eefcaSMatthew Dillon dmsg_simulate_failure(dmsg_state_t *state, int meto, int error)
1716323c0947SMatthew Dillon {
1717a06d536bSMatthew Dillon 	dmsg_state_t *substate;
17180a9eefcaSMatthew Dillon 
17190a9eefcaSMatthew Dillon 	dmsg_state_hold(state);
17200a9eefcaSMatthew Dillon 	if (meto)
17210a9eefcaSMatthew Dillon 		dmsg_state_abort(state);
17220a9eefcaSMatthew Dillon 
17230a9eefcaSMatthew Dillon 	/*
17240a9eefcaSMatthew Dillon 	 * Recurse through sub-states.
17250a9eefcaSMatthew Dillon 	 */
17260a9eefcaSMatthew Dillon again:
17270a9eefcaSMatthew Dillon 	TAILQ_FOREACH(substate, &state->subq, entry) {
17280a9eefcaSMatthew Dillon 		if (substate->flags & DMSG_STATE_ABORTING)
17290a9eefcaSMatthew Dillon 			continue;
17300a9eefcaSMatthew Dillon 		state->scan = substate;
17310a9eefcaSMatthew Dillon 		dmsg_simulate_failure(substate, 1, error);
17320a9eefcaSMatthew Dillon 		if (state->scan != substate)
17330a9eefcaSMatthew Dillon 			goto again;
17340a9eefcaSMatthew Dillon 	}
17350a9eefcaSMatthew Dillon 
17360a9eefcaSMatthew Dillon 	dmsg_state_drop(state);
17370a9eefcaSMatthew Dillon }
17380a9eefcaSMatthew Dillon 
17390a9eefcaSMatthew Dillon static
17400a9eefcaSMatthew Dillon void
17410a9eefcaSMatthew Dillon dmsg_state_abort(dmsg_state_t *state)
17420a9eefcaSMatthew Dillon {
1743a06d536bSMatthew Dillon 	dmsg_iocom_t *iocom;
1744323c0947SMatthew Dillon 	dmsg_msg_t *msg;
1745323c0947SMatthew Dillon 
17460a9eefcaSMatthew Dillon 	/*
17470a9eefcaSMatthew Dillon 	 * Set ABORTING and DYING, return if already set.  If the state was
17480a9eefcaSMatthew Dillon 	 * just allocated we defer the abort operation until the related
17490a9eefcaSMatthew Dillon 	 * message is processed.
17500a9eefcaSMatthew Dillon 	 */
17510a9eefcaSMatthew Dillon 	if (state->flags & DMSG_STATE_ABORTING)
17520a9eefcaSMatthew Dillon 		return;
17530a9eefcaSMatthew Dillon 	state->flags |= DMSG_STATE_ABORTING;
17540a9eefcaSMatthew Dillon 	dmsg_state_dying(state);
17550a9eefcaSMatthew Dillon 	if (state->flags & DMSG_STATE_NEW) {
17565ab1caedSMatthew Dillon 		dmio_printf(iocom, 4,
17575ab1caedSMatthew Dillon 			    "dmsg_state_abort(0): state %p rxcmd %08x "
17585ab1caedSMatthew Dillon 			    "txcmd %08x flags %08x - in NEW state\n",
17595ab1caedSMatthew Dillon 			    state, state->rxcmd,
17605ab1caedSMatthew Dillon 			    state->txcmd, state->flags);
17610a9eefcaSMatthew Dillon 		return;
1762a06d536bSMatthew Dillon 	}
1763323c0947SMatthew Dillon 
1764323c0947SMatthew Dillon 	/*
17650a9eefcaSMatthew Dillon 	 * Simulate parent state failure before child states.  Device
17660a9eefcaSMatthew Dillon 	 * drivers need to understand this and flag the situation but might
17670a9eefcaSMatthew Dillon 	 * have asynchronous operations in progress that they cannot stop.
17680a9eefcaSMatthew Dillon 	 * To make things easier, parent states will not actually disappear
17690a9eefcaSMatthew Dillon 	 * until the children are all gone.
1770323c0947SMatthew Dillon 	 */
1771a06d536bSMatthew Dillon 	if ((state->rxcmd & DMSGF_DELETE) == 0) {
17725ab1caedSMatthew Dillon 		dmio_printf(iocom, 5,
17735ab1caedSMatthew Dillon 			    "dmsg_state_abort() on state %p\n",
17745ab1caedSMatthew Dillon 			    state);
17750a9eefcaSMatthew Dillon 		msg = dmsg_msg_alloc_locked(state, 0, DMSG_LNK_ERROR,
1776323c0947SMatthew Dillon 					    NULL, NULL);
1777323c0947SMatthew Dillon 		if ((state->rxcmd & DMSGF_CREATE) == 0)
1778323c0947SMatthew Dillon 			msg->any.head.cmd |= DMSGF_CREATE;
17790a9eefcaSMatthew Dillon 		msg->any.head.cmd |= DMSGF_DELETE |
17800a9eefcaSMatthew Dillon 				     (state->rxcmd & DMSGF_REPLY);
17810a9eefcaSMatthew Dillon 		msg->any.head.cmd ^= (DMSGF_REVTRANS | DMSGF_REVCIRC);
17820a9eefcaSMatthew Dillon 		msg->any.head.error = DMSG_ERR_LOSTLINK;
17830a9eefcaSMatthew Dillon 		msg->any.head.cmd |= DMSGF_ABORT;
17840a9eefcaSMatthew Dillon 
17850a9eefcaSMatthew Dillon 		/*
17860a9eefcaSMatthew Dillon 		 * Issue callback synchronously even though this isn't
17870a9eefcaSMatthew Dillon 		 * the receiver thread.  We need to issue the callback
17880a9eefcaSMatthew Dillon 		 * before removing state from the subq in order to allow
17890a9eefcaSMatthew Dillon 		 * the callback to reply.
17900a9eefcaSMatthew Dillon 		 */
17910a9eefcaSMatthew Dillon 		iocom = state->iocom;
17920a9eefcaSMatthew Dillon 		dmsg_state_msgrx(msg, 1);
17930a9eefcaSMatthew Dillon 		pthread_mutex_unlock(&iocom->mtx);
17940a9eefcaSMatthew Dillon 		iocom->rcvmsg_callback(msg);
17950a9eefcaSMatthew Dillon 		pthread_mutex_lock(&iocom->mtx);
17960a9eefcaSMatthew Dillon 		dmsg_state_cleanuprx(iocom, msg);
17970a9eefcaSMatthew Dillon #if 0
1798323c0947SMatthew Dillon 		TAILQ_INSERT_TAIL(&iocom->ioq_rx.msgq, msg, qentry);
1799323c0947SMatthew Dillon 		atomic_set_int(&iocom->flags, DMSG_IOCOMF_RWORK);
18000a9eefcaSMatthew Dillon #endif
18010a9eefcaSMatthew Dillon 	}
18020a9eefcaSMatthew Dillon }
18030a9eefcaSMatthew Dillon 
18040a9eefcaSMatthew Dillon 
18050a9eefcaSMatthew Dillon /*
18060a9eefcaSMatthew Dillon  * Recursively sets DMSG_STATE_DYING on state and all sub-states, preventing
18070a9eefcaSMatthew Dillon  * the transmission of any new messages on these states.  This is done
18080a9eefcaSMatthew Dillon  * atomically when parent state is terminating, whereas setting ABORTING is
18090a9eefcaSMatthew Dillon  * not atomic and can leak races.
18100a9eefcaSMatthew Dillon  */
18110a9eefcaSMatthew Dillon static
18120a9eefcaSMatthew Dillon void
18130a9eefcaSMatthew Dillon dmsg_state_dying(dmsg_state_t *state)
18140a9eefcaSMatthew Dillon {
18150a9eefcaSMatthew Dillon 	dmsg_state_t *scan;
18160a9eefcaSMatthew Dillon 
18170a9eefcaSMatthew Dillon 	if ((state->flags & DMSG_STATE_DYING) == 0) {
18180a9eefcaSMatthew Dillon 		state->flags |= DMSG_STATE_DYING;
18190a9eefcaSMatthew Dillon 		TAILQ_FOREACH(scan, &state->subq, entry)
18200a9eefcaSMatthew Dillon 			dmsg_state_dying(scan);
1821323c0947SMatthew Dillon 	}
1822323c0947SMatthew Dillon }
1823323c0947SMatthew Dillon 
1824323c0947SMatthew Dillon /*
18250c3a8cd0SMatthew Dillon  * This is a shortcut to formulate a reply to msg with a simple error code,
18260c3a8cd0SMatthew Dillon  * It can reply to and terminate a transaction, or it can reply to a one-way
18270c3a8cd0SMatthew Dillon  * messages.  A DMSG_LNK_ERROR command code is utilized to encode
18280c3a8cd0SMatthew Dillon  * the error code (which can be 0).  Not all transactions are terminated
18290c3a8cd0SMatthew Dillon  * with DMSG_LNK_ERROR status (the low level only cares about the
18300c3a8cd0SMatthew Dillon  * MSGF_DELETE flag), but most are.
18310c3a8cd0SMatthew Dillon  *
18320c3a8cd0SMatthew Dillon  * Replies to one-way messages are a bit of an oxymoron but the feature
18330c3a8cd0SMatthew Dillon  * is used by the debug (DBG) protocol.
18340c3a8cd0SMatthew Dillon  *
18350c3a8cd0SMatthew Dillon  * The reply contains no extended data.
18360c3a8cd0SMatthew Dillon  */
18370c3a8cd0SMatthew Dillon void
18380c3a8cd0SMatthew Dillon dmsg_msg_reply(dmsg_msg_t *msg, uint32_t error)
18390c3a8cd0SMatthew Dillon {
18400c3a8cd0SMatthew Dillon 	dmsg_state_t *state = msg->state;
18410c3a8cd0SMatthew Dillon 	dmsg_msg_t *nmsg;
18420c3a8cd0SMatthew Dillon 	uint32_t cmd;
18430c3a8cd0SMatthew Dillon 
18440c3a8cd0SMatthew Dillon 	/*
18450c3a8cd0SMatthew Dillon 	 * Reply with a simple error code and terminate the transaction.
18460c3a8cd0SMatthew Dillon 	 */
18470c3a8cd0SMatthew Dillon 	cmd = DMSG_LNK_ERROR;
18480c3a8cd0SMatthew Dillon 
18490c3a8cd0SMatthew Dillon 	/*
18500c3a8cd0SMatthew Dillon 	 * Check if our direction has even been initiated yet, set CREATE.
18510c3a8cd0SMatthew Dillon 	 *
18520c3a8cd0SMatthew Dillon 	 * Check what direction this is (command or reply direction).  Note
18530c3a8cd0SMatthew Dillon 	 * that txcmd might not have been initiated yet.
18540c3a8cd0SMatthew Dillon 	 *
18550c3a8cd0SMatthew Dillon 	 * If our direction has already been closed we just return without
18560c3a8cd0SMatthew Dillon 	 * doing anything.
18570c3a8cd0SMatthew Dillon 	 */
1858d30cab67SMatthew Dillon 	if ((state->flags & DMSG_STATE_ROOT) == 0) {
18590c3a8cd0SMatthew Dillon 		if (state->txcmd & DMSGF_DELETE)
18600c3a8cd0SMatthew Dillon 			return;
18610c3a8cd0SMatthew Dillon 		if (state->txcmd & DMSGF_REPLY)
18620c3a8cd0SMatthew Dillon 			cmd |= DMSGF_REPLY;
18630c3a8cd0SMatthew Dillon 		cmd |= DMSGF_DELETE;
18640c3a8cd0SMatthew Dillon 	} else {
18650c3a8cd0SMatthew Dillon 		if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
18660c3a8cd0SMatthew Dillon 			cmd |= DMSGF_REPLY;
18670c3a8cd0SMatthew Dillon 	}
18680c3a8cd0SMatthew Dillon 
18690c3a8cd0SMatthew Dillon 	/*
18700c3a8cd0SMatthew Dillon 	 * Allocate the message and associate it with the existing state.
18710d20ec8aSMatthew Dillon 	 * We cannot pass DMSGF_CREATE to msg_alloc() because that may
18720c3a8cd0SMatthew Dillon 	 * allocate new state.  We have our state already.
18730c3a8cd0SMatthew Dillon 	 */
18741b8eded1SMatthew Dillon 	nmsg = dmsg_msg_alloc(state, 0, cmd, NULL, NULL);
1875d30cab67SMatthew Dillon 	if ((state->flags & DMSG_STATE_ROOT) == 0) {
18760c3a8cd0SMatthew Dillon 		if ((state->txcmd & DMSGF_CREATE) == 0)
18770c3a8cd0SMatthew Dillon 			nmsg->any.head.cmd |= DMSGF_CREATE;
18780c3a8cd0SMatthew Dillon 	}
18790c3a8cd0SMatthew Dillon 	nmsg->any.head.error = error;
18801b8eded1SMatthew Dillon 
18810c3a8cd0SMatthew Dillon 	dmsg_msg_write(nmsg);
18820c3a8cd0SMatthew Dillon }
18830c3a8cd0SMatthew Dillon 
18840c3a8cd0SMatthew Dillon /*
18850c3a8cd0SMatthew Dillon  * Similar to dmsg_msg_reply() but leave the transaction open.  That is,
18860c3a8cd0SMatthew Dillon  * we are generating a streaming reply or an intermediate acknowledgement
18870c3a8cd0SMatthew Dillon  * of some sort as part of the higher level protocol, with more to come
18880c3a8cd0SMatthew Dillon  * later.
18890c3a8cd0SMatthew Dillon  */
18900c3a8cd0SMatthew Dillon void
18910c3a8cd0SMatthew Dillon dmsg_msg_result(dmsg_msg_t *msg, uint32_t error)
18920c3a8cd0SMatthew Dillon {
18930c3a8cd0SMatthew Dillon 	dmsg_state_t *state = msg->state;
18940c3a8cd0SMatthew Dillon 	dmsg_msg_t *nmsg;
18950c3a8cd0SMatthew Dillon 	uint32_t cmd;
18960c3a8cd0SMatthew Dillon 
18970c3a8cd0SMatthew Dillon 
18980c3a8cd0SMatthew Dillon 	/*
18990c3a8cd0SMatthew Dillon 	 * Reply with a simple error code and terminate the transaction.
19000c3a8cd0SMatthew Dillon 	 */
19010c3a8cd0SMatthew Dillon 	cmd = DMSG_LNK_ERROR;
19020c3a8cd0SMatthew Dillon 
19030c3a8cd0SMatthew Dillon 	/*
19040c3a8cd0SMatthew Dillon 	 * Check if our direction has even been initiated yet, set CREATE.
19050c3a8cd0SMatthew Dillon 	 *
19060c3a8cd0SMatthew Dillon 	 * Check what direction this is (command or reply direction).  Note
19070c3a8cd0SMatthew Dillon 	 * that txcmd might not have been initiated yet.
19080c3a8cd0SMatthew Dillon 	 *
19090c3a8cd0SMatthew Dillon 	 * If our direction has already been closed we just return without
19100c3a8cd0SMatthew Dillon 	 * doing anything.
19110c3a8cd0SMatthew Dillon 	 */
1912d30cab67SMatthew Dillon 	if ((state->flags & DMSG_STATE_ROOT) == 0) {
19130c3a8cd0SMatthew Dillon 		if (state->txcmd & DMSGF_DELETE)
19140c3a8cd0SMatthew Dillon 			return;
19150c3a8cd0SMatthew Dillon 		if (state->txcmd & DMSGF_REPLY)
19160c3a8cd0SMatthew Dillon 			cmd |= DMSGF_REPLY;
19170c3a8cd0SMatthew Dillon 		/* continuing transaction, do not set MSGF_DELETE */
19180c3a8cd0SMatthew Dillon 	} else {
19190c3a8cd0SMatthew Dillon 		if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
19200c3a8cd0SMatthew Dillon 			cmd |= DMSGF_REPLY;
19210c3a8cd0SMatthew Dillon 	}
19221b8eded1SMatthew Dillon 	nmsg = dmsg_msg_alloc(state, 0, cmd, NULL, NULL);
1923d30cab67SMatthew Dillon 	if ((state->flags & DMSG_STATE_ROOT) == 0) {
19240c3a8cd0SMatthew Dillon 		if ((state->txcmd & DMSGF_CREATE) == 0)
19250c3a8cd0SMatthew Dillon 			nmsg->any.head.cmd |= DMSGF_CREATE;
19260c3a8cd0SMatthew Dillon 	}
19270c3a8cd0SMatthew Dillon 	nmsg->any.head.error = error;
19281b8eded1SMatthew Dillon 
19290c3a8cd0SMatthew Dillon 	dmsg_msg_write(nmsg);
19300c3a8cd0SMatthew Dillon }
19310c3a8cd0SMatthew Dillon 
19320c3a8cd0SMatthew Dillon /*
19330c3a8cd0SMatthew Dillon  * Terminate a transaction given a state structure by issuing a DELETE.
19341b8eded1SMatthew Dillon  * (the state structure must not be &iocom->state0)
19350c3a8cd0SMatthew Dillon  */
19360c3a8cd0SMatthew Dillon void
19370c3a8cd0SMatthew Dillon dmsg_state_reply(dmsg_state_t *state, uint32_t error)
19380c3a8cd0SMatthew Dillon {
19390c3a8cd0SMatthew Dillon 	dmsg_msg_t *nmsg;
19400c3a8cd0SMatthew Dillon 	uint32_t cmd = DMSG_LNK_ERROR | DMSGF_DELETE;
19410c3a8cd0SMatthew Dillon 
19420c3a8cd0SMatthew Dillon 	/*
19430c3a8cd0SMatthew Dillon 	 * Nothing to do if we already transmitted a delete
19440c3a8cd0SMatthew Dillon 	 */
19450c3a8cd0SMatthew Dillon 	if (state->txcmd & DMSGF_DELETE)
19460c3a8cd0SMatthew Dillon 		return;
19470c3a8cd0SMatthew Dillon 
19480c3a8cd0SMatthew Dillon 	/*
19490c3a8cd0SMatthew Dillon 	 * Set REPLY if the other end initiated the command.  Otherwise
19500c3a8cd0SMatthew Dillon 	 * we are the command direction.
19510c3a8cd0SMatthew Dillon 	 */
19520c3a8cd0SMatthew Dillon 	if (state->txcmd & DMSGF_REPLY)
19530c3a8cd0SMatthew Dillon 		cmd |= DMSGF_REPLY;
19540c3a8cd0SMatthew Dillon 
19551b8eded1SMatthew Dillon 	nmsg = dmsg_msg_alloc(state, 0, cmd, NULL, NULL);
1956d30cab67SMatthew Dillon 	if ((state->flags & DMSG_STATE_ROOT) == 0) {
19570c3a8cd0SMatthew Dillon 		if ((state->txcmd & DMSGF_CREATE) == 0)
19580c3a8cd0SMatthew Dillon 			nmsg->any.head.cmd |= DMSGF_CREATE;
19590c3a8cd0SMatthew Dillon 	}
19600c3a8cd0SMatthew Dillon 	nmsg->any.head.error = error;
19610d20ec8aSMatthew Dillon 	dmsg_msg_write(nmsg);
19620d20ec8aSMatthew Dillon }
19630d20ec8aSMatthew Dillon 
19640d20ec8aSMatthew Dillon /*
19650d20ec8aSMatthew Dillon  * Terminate a transaction given a state structure by issuing a DELETE.
19661b8eded1SMatthew Dillon  * (the state structure must not be &iocom->state0)
19670d20ec8aSMatthew Dillon  */
19680d20ec8aSMatthew Dillon void
19690d20ec8aSMatthew Dillon dmsg_state_result(dmsg_state_t *state, uint32_t error)
19700d20ec8aSMatthew Dillon {
19710d20ec8aSMatthew Dillon 	dmsg_msg_t *nmsg;
19720d20ec8aSMatthew Dillon 	uint32_t cmd = DMSG_LNK_ERROR;
19730d20ec8aSMatthew Dillon 
19740d20ec8aSMatthew Dillon 	/*
19750d20ec8aSMatthew Dillon 	 * Nothing to do if we already transmitted a delete
19760d20ec8aSMatthew Dillon 	 */
19770d20ec8aSMatthew Dillon 	if (state->txcmd & DMSGF_DELETE)
19780d20ec8aSMatthew Dillon 		return;
19790d20ec8aSMatthew Dillon 
19800d20ec8aSMatthew Dillon 	/*
19810d20ec8aSMatthew Dillon 	 * Set REPLY if the other end initiated the command.  Otherwise
19820d20ec8aSMatthew Dillon 	 * we are the command direction.
19830d20ec8aSMatthew Dillon 	 */
19840d20ec8aSMatthew Dillon 	if (state->txcmd & DMSGF_REPLY)
19850d20ec8aSMatthew Dillon 		cmd |= DMSGF_REPLY;
19860d20ec8aSMatthew Dillon 
19871b8eded1SMatthew Dillon 	nmsg = dmsg_msg_alloc(state, 0, cmd, NULL, NULL);
1988d30cab67SMatthew Dillon 	if ((state->flags & DMSG_STATE_ROOT) == 0) {
19890d20ec8aSMatthew Dillon 		if ((state->txcmd & DMSGF_CREATE) == 0)
19900d20ec8aSMatthew Dillon 			nmsg->any.head.cmd |= DMSGF_CREATE;
19910d20ec8aSMatthew Dillon 	}
19920d20ec8aSMatthew Dillon 	nmsg->any.head.error = error;
19930c3a8cd0SMatthew Dillon 	dmsg_msg_write(nmsg);
19940c3a8cd0SMatthew Dillon }
19950c3a8cd0SMatthew Dillon 
19960c3a8cd0SMatthew Dillon /************************************************************************
19970c3a8cd0SMatthew Dillon  *			TRANSACTION STATE HANDLING			*
19980c3a8cd0SMatthew Dillon  ************************************************************************
19990c3a8cd0SMatthew Dillon  *
20000c3a8cd0SMatthew Dillon  */
20010c3a8cd0SMatthew Dillon 
20020c3a8cd0SMatthew Dillon /*
2003d30cab67SMatthew Dillon  * Process state tracking for a message after reception, prior to execution.
2004d30cab67SMatthew Dillon  * Possibly route the message (consuming it).
20050c3a8cd0SMatthew Dillon  *
20060c3a8cd0SMatthew Dillon  * Called with msglk held and the msg dequeued.
20070c3a8cd0SMatthew Dillon  *
20080c3a8cd0SMatthew Dillon  * All messages are called with dummy state and return actual state.
20090c3a8cd0SMatthew Dillon  * (One-off messages often just return the same dummy state).
20100c3a8cd0SMatthew Dillon  *
20110c3a8cd0SMatthew Dillon  * May request that caller discard the message by setting *discardp to 1.
20120c3a8cd0SMatthew Dillon  * The returned state is not used in this case and is allowed to be NULL.
20130c3a8cd0SMatthew Dillon  *
20140c3a8cd0SMatthew Dillon  * --
20150c3a8cd0SMatthew Dillon  *
20160c3a8cd0SMatthew Dillon  * These routines handle persistent and command/reply message state via the
20170c3a8cd0SMatthew Dillon  * CREATE and DELETE flags.  The first message in a command or reply sequence
20180c3a8cd0SMatthew Dillon  * sets CREATE, the last message in a command or reply sequence sets DELETE.
20190c3a8cd0SMatthew Dillon  *
 * There can be any number of intermediate messages belonging to the same
 * sequence sent in between the CREATE message and the DELETE message,
 * which set neither flag.  This represents a streaming command or reply.
20230c3a8cd0SMatthew Dillon  *
20240c3a8cd0SMatthew Dillon  * Any command message received with CREATE set expects a reply sequence to
20250c3a8cd0SMatthew Dillon  * be returned.  Reply sequences work the same as command sequences except the
20260c3a8cd0SMatthew Dillon  * REPLY bit is also sent.  Both the command side and reply side can
20270c3a8cd0SMatthew Dillon  * degenerate into a single message with both CREATE and DELETE set.  Note
20280c3a8cd0SMatthew Dillon  * that one side can be streaming and the other side not, or neither, or both.
20290c3a8cd0SMatthew Dillon  *
20300c3a8cd0SMatthew Dillon  * The msgid is unique for the initiator.  That is, two sides sending a new
20310c3a8cd0SMatthew Dillon  * message can use the same msgid without colliding.
20320c3a8cd0SMatthew Dillon  *
20330c3a8cd0SMatthew Dillon  * --
20340c3a8cd0SMatthew Dillon  *
 * The message may be running over a circuit.  If the circuit is half-deleted
 * the message is typically racing against a link failure and must be thrown
 * out.  As the circuit deletion propagates the library will automatically
 * generate terminations for sub-states.
2039a06d536bSMatthew Dillon  *
2040a06d536bSMatthew Dillon  * --
2041a06d536bSMatthew Dillon  *
20420c3a8cd0SMatthew Dillon  * ABORT sequences work by setting the ABORT flag along with normal message
20430c3a8cd0SMatthew Dillon  * state.  However, ABORTs can also be sent on half-closed messages, that is
20440c3a8cd0SMatthew Dillon  * even if the command or reply side has already sent a DELETE, as long as
20450c3a8cd0SMatthew Dillon  * the message has not been fully closed it can still send an ABORT+DELETE
20460c3a8cd0SMatthew Dillon  * to terminate the half-closed message state.
20470c3a8cd0SMatthew Dillon  *
20480c3a8cd0SMatthew Dillon  * Since ABORT+DELETEs can race we silently discard ABORT's for message
20490c3a8cd0SMatthew Dillon  * state which has already been fully closed.  REPLY+ABORT+DELETEs can
20500c3a8cd0SMatthew Dillon  * also race, and in this situation the other side might have already
20510c3a8cd0SMatthew Dillon  * initiated a new unrelated command with the same message id.  Since
20520c3a8cd0SMatthew Dillon  * the abort has not set the CREATE flag the situation can be detected
20530c3a8cd0SMatthew Dillon  * and the message will also be discarded.
20540c3a8cd0SMatthew Dillon  *
20550c3a8cd0SMatthew Dillon  * Non-blocking requests can be initiated with ABORT+CREATE[+DELETE].
20560c3a8cd0SMatthew Dillon  * The ABORT request is essentially integrated into the command instead
20570c3a8cd0SMatthew Dillon  * of being sent later on.  In this situation the command implementation
20580c3a8cd0SMatthew Dillon  * detects that CREATE and ABORT are both set (vs ABORT alone) and can
20590c3a8cd0SMatthew Dillon  * special-case non-blocking operation for the command.
20600c3a8cd0SMatthew Dillon  *
20610c3a8cd0SMatthew Dillon  * NOTE!  Messages with ABORT set without CREATE or DELETE are considered
20620c3a8cd0SMatthew Dillon  *	  to be mid-stream aborts for command/reply sequences.  ABORTs on
20630c3a8cd0SMatthew Dillon  *	  one-way messages are not supported.
20640c3a8cd0SMatthew Dillon  *
20650c3a8cd0SMatthew Dillon  * NOTE!  If a command sequence does not support aborts the ABORT flag is
20660c3a8cd0SMatthew Dillon  *	  simply ignored.
20670c3a8cd0SMatthew Dillon  *
20680c3a8cd0SMatthew Dillon  * --
20690c3a8cd0SMatthew Dillon  *
2070d30cab67SMatthew Dillon  * One-off messages (no reply expected) are sent without an established
2071d30cab67SMatthew Dillon  * transaction.  CREATE and DELETE are left clear and the msgid is usually 0.
2072d30cab67SMatthew Dillon  * For one-off messages sent over circuits msgid generally MUST be 0.
2073d30cab67SMatthew Dillon  *
2074d30cab67SMatthew Dillon  * One-off messages cannot be aborted and typically aren't processed
2075d30cab67SMatthew Dillon  * by these routines.  Order is still guaranteed for messages sent over
2076d30cab67SMatthew Dillon  * the same circuit.  The REPLY bit can be used to distinguish whether
2077d30cab67SMatthew Dillon  * a one-off message is a command or reply.  For example, one-off replies
20780c3a8cd0SMatthew Dillon  * will typically just contain status updates.
20790c3a8cd0SMatthew Dillon  */
20800c3a8cd0SMatthew Dillon static int
20810a9eefcaSMatthew Dillon dmsg_state_msgrx(dmsg_msg_t *msg, int mstate)
20820c3a8cd0SMatthew Dillon {
20831b8eded1SMatthew Dillon 	dmsg_iocom_t *iocom = msg->state->iocom;
20840c3a8cd0SMatthew Dillon 	dmsg_state_t *state;
20851b8eded1SMatthew Dillon 	dmsg_state_t *pstate;
20860d20ec8aSMatthew Dillon 	dmsg_state_t sdummy;
20870c3a8cd0SMatthew Dillon 	int error;
20880c3a8cd0SMatthew Dillon 
20890d20ec8aSMatthew Dillon 	pthread_mutex_lock(&iocom->mtx);
20900d20ec8aSMatthew Dillon 
20910a9eefcaSMatthew Dillon 	if (DMsgDebugOpt) {
20925ab1caedSMatthew Dillon 		dmio_printf(iocom, 5,
20930a9eefcaSMatthew Dillon 			    "msgrx: cmd=%08x msgid=%016jx "
20940a9eefcaSMatthew Dillon 			    "circuit=%016jx error=%d\n",
20950a9eefcaSMatthew Dillon 			    msg->any.head.cmd,
20960a9eefcaSMatthew Dillon 			    msg->any.head.msgid,
20970a9eefcaSMatthew Dillon 			    msg->any.head.circuit,
20980a9eefcaSMatthew Dillon 			    msg->any.head.error);
20990a9eefcaSMatthew Dillon 	}
21000a9eefcaSMatthew Dillon 
21010c3a8cd0SMatthew Dillon 	/*
2102d30cab67SMatthew Dillon 	 * Lookup the circuit (pstate).  The circuit will be an open
2103d30cab67SMatthew Dillon 	 * transaction.  The REVCIRC bit in the message tells us which side
2104d30cab67SMatthew Dillon 	 * initiated it.
21050a9eefcaSMatthew Dillon 	 *
21060a9eefcaSMatthew Dillon 	 * If mstate is non-zero the state has already been incorporated
21070a9eefcaSMatthew Dillon 	 * into the message as part of a simulated abort.  Note that in this
21080a9eefcaSMatthew Dillon 	 * situation the parent state may have already been removed from
21090a9eefcaSMatthew Dillon 	 * the RBTREE.
21101b8eded1SMatthew Dillon 	 */
21110a9eefcaSMatthew Dillon 	if (mstate) {
21120a9eefcaSMatthew Dillon 		pstate = msg->state->parent;
21130a9eefcaSMatthew Dillon 	} else if (msg->any.head.circuit) {
21141b8eded1SMatthew Dillon 		sdummy.msgid = msg->any.head.circuit;
21151b8eded1SMatthew Dillon 
21161b8eded1SMatthew Dillon 		if (msg->any.head.cmd & DMSGF_REVCIRC) {
21171b8eded1SMatthew Dillon 			pstate = RB_FIND(dmsg_state_tree,
21181b8eded1SMatthew Dillon 					 &iocom->statewr_tree,
21191b8eded1SMatthew Dillon 					 &sdummy);
21201b8eded1SMatthew Dillon 		} else {
21211b8eded1SMatthew Dillon 			pstate = RB_FIND(dmsg_state_tree,
21221b8eded1SMatthew Dillon 					 &iocom->staterd_tree,
21231b8eded1SMatthew Dillon 					 &sdummy);
21241b8eded1SMatthew Dillon 		}
21250a9eefcaSMatthew Dillon 
21260a9eefcaSMatthew Dillon 		/*
21270a9eefcaSMatthew Dillon 		 * If we cannot find the circuit throw the message away.
21280a9eefcaSMatthew Dillon 		 * The state will have already been taken care of by
21290a9eefcaSMatthew Dillon 		 * the simulated failure code.  This case can occur due
21300a9eefcaSMatthew Dillon 		 * to a failure propagating in one direction crossing a
21310a9eefcaSMatthew Dillon 		 * request on the failed circuit propagating in the other
21320a9eefcaSMatthew Dillon 		 * direction.
21330a9eefcaSMatthew Dillon 		 */
21341b8eded1SMatthew Dillon 		if (pstate == NULL) {
21355ab1caedSMatthew Dillon 			dmio_printf(iocom, 4,
21361b8eded1SMatthew Dillon 				    "missing parent in stacked trans %s\n",
21371b8eded1SMatthew Dillon 				    dmsg_msg_str(msg));
21381b8eded1SMatthew Dillon 			pthread_mutex_unlock(&iocom->mtx);
21390a9eefcaSMatthew Dillon 			error = DMSG_IOQ_ERROR_EALREADY;
21400a9eefcaSMatthew Dillon 
21410a9eefcaSMatthew Dillon 			return error;
21421b8eded1SMatthew Dillon 		}
21431b8eded1SMatthew Dillon 	} else {
21441b8eded1SMatthew Dillon 		pstate = &iocom->state0;
21451b8eded1SMatthew Dillon 	}
21460a9eefcaSMatthew Dillon 	/* WARNING: pstate not (yet) refd */
21471b8eded1SMatthew Dillon 
21481b8eded1SMatthew Dillon 	/*
2149d30cab67SMatthew Dillon 	 * Lookup the msgid.
2150d30cab67SMatthew Dillon 	 *
21510a9eefcaSMatthew Dillon 	 * If mstate is non-zero the state has already been incorporated
21520a9eefcaSMatthew Dillon 	 * into the message as part of a simulated abort.  Note that in this
21530a9eefcaSMatthew Dillon 	 * situation the state may have already been removed from the RBTREE.
21540a9eefcaSMatthew Dillon 	 *
2155d30cab67SMatthew Dillon 	 * If received msg is a command state is on staterd_tree.
2156d30cab67SMatthew Dillon 	 * If received msg is a reply state is on statewr_tree.
2157d30cab67SMatthew Dillon 	 * Otherwise there is no state (retain &iocom->state0)
2158d30cab67SMatthew Dillon 	 */
21590a9eefcaSMatthew Dillon 	if (mstate) {
21600a9eefcaSMatthew Dillon 		state = msg->state;
21610a9eefcaSMatthew Dillon 	} else {
2162d30cab67SMatthew Dillon 		sdummy.msgid = msg->any.head.msgid;
21630a9eefcaSMatthew Dillon 		if (msg->any.head.cmd & DMSGF_REVTRANS) {
21640a9eefcaSMatthew Dillon 			state = RB_FIND(dmsg_state_tree,
21650a9eefcaSMatthew Dillon 					&iocom->statewr_tree, &sdummy);
21660a9eefcaSMatthew Dillon 		} else {
21670a9eefcaSMatthew Dillon 			state = RB_FIND(dmsg_state_tree,
21680a9eefcaSMatthew Dillon 					&iocom->staterd_tree, &sdummy);
21690a9eefcaSMatthew Dillon 		}
21700a9eefcaSMatthew Dillon 	}
2171d30cab67SMatthew Dillon 
21720a9eefcaSMatthew Dillon 	if (DMsgDebugOpt) {
21735ab1caedSMatthew Dillon 		dmio_printf(iocom, 5, "msgrx:\tstate %p(%08x)",
21740a9eefcaSMatthew Dillon 			    state, (state ? state->icmd : 0));
21750a9eefcaSMatthew Dillon 		if (pstate != &iocom->state0) {
21765ab1caedSMatthew Dillon 			dmio_printf(iocom, 5,
21770a9eefcaSMatthew Dillon 				    " pstate %p(%08x)",
21780a9eefcaSMatthew Dillon 				    pstate, pstate->icmd);
21790a9eefcaSMatthew Dillon 		}
21805ab1caedSMatthew Dillon 		dmio_printf(iocom, 5, "%s\n", "");
21810a9eefcaSMatthew Dillon 	}
21820a9eefcaSMatthew Dillon 
21830a9eefcaSMatthew Dillon 	if (mstate) {
21840a9eefcaSMatthew Dillon 		/* state already assigned to msg */
21850a9eefcaSMatthew Dillon 	} else if (state) {
2186d30cab67SMatthew Dillon 		/*
2187d30cab67SMatthew Dillon 		 * Message over an existing transaction (CREATE should not
2188d30cab67SMatthew Dillon 		 * be set).
2189d30cab67SMatthew Dillon 		 */
21900a9eefcaSMatthew Dillon 		dmsg_state_drop(msg->state);
21910a9eefcaSMatthew Dillon 		dmsg_state_hold(state);
2192d30cab67SMatthew Dillon 		msg->state = state;
2193d30cab67SMatthew Dillon 		assert(pstate == state->parent);
2194d30cab67SMatthew Dillon 	} else {
2195d30cab67SMatthew Dillon 		/*
2196d30cab67SMatthew Dillon 		 * Either a new transaction (if CREATE set) or a one-off.
2197d30cab67SMatthew Dillon 		 */
2198d30cab67SMatthew Dillon 		state = pstate;
2199d30cab67SMatthew Dillon 	}
2200d30cab67SMatthew Dillon 
2201d30cab67SMatthew Dillon 	/*
2202d30cab67SMatthew Dillon 	 * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
2203d30cab67SMatthew Dillon 	 * inside the case statements.
2204d30cab67SMatthew Dillon 	 *
2205d30cab67SMatthew Dillon 	 * Construct new state as necessary.
2206d30cab67SMatthew Dillon 	 */
2207d30cab67SMatthew Dillon 	switch(msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
2208d30cab67SMatthew Dillon 				    DMSGF_REPLY)) {
2209d30cab67SMatthew Dillon 	case DMSGF_CREATE:
2210d30cab67SMatthew Dillon 	case DMSGF_CREATE | DMSGF_DELETE:
2211d30cab67SMatthew Dillon 		/*
2212d30cab67SMatthew Dillon 		 * Create new sub-transaction under pstate.
2213d30cab67SMatthew Dillon 		 * (any DELETE is handled in post-processing of msg).
2214d30cab67SMatthew Dillon 		 *
2215d30cab67SMatthew Dillon 		 * (During routing the msgid was made unique for this
2216d30cab67SMatthew Dillon 		 * direction over the comlink, so our RB trees can be
2217d30cab67SMatthew Dillon 		 * iocom-based instead of state-based).
2218d30cab67SMatthew Dillon 		 */
2219d30cab67SMatthew Dillon 		if (state != pstate) {
22205ab1caedSMatthew Dillon 			dmio_printf(iocom, 2,
2221d30cab67SMatthew Dillon 				    "duplicate transaction %s\n",
2222d30cab67SMatthew Dillon 				    dmsg_msg_str(msg));
2223d30cab67SMatthew Dillon 			error = DMSG_IOQ_ERROR_TRANS;
2224d30cab67SMatthew Dillon 			assert(0);
2225d30cab67SMatthew Dillon 			break;
2226d30cab67SMatthew Dillon 		}
2227d30cab67SMatthew Dillon 
2228d30cab67SMatthew Dillon 		/*
2229d30cab67SMatthew Dillon 		 * Allocate the new state.
22301b8eded1SMatthew Dillon 		 */
22310c3a8cd0SMatthew Dillon 		state = malloc(sizeof(*state));
22320c3a8cd0SMatthew Dillon 		bzero(state, sizeof(*state));
22330a9eefcaSMatthew Dillon 		atomic_add_int(&dmsg_state_count, 1);
22340a9eefcaSMatthew Dillon 
22351b8eded1SMatthew Dillon 		TAILQ_INIT(&state->subq);
2236323c0947SMatthew Dillon 		dmsg_state_hold(pstate);
22371b8eded1SMatthew Dillon 		state->parent = pstate;
22380c3a8cd0SMatthew Dillon 		state->iocom = iocom;
22391b8eded1SMatthew Dillon 		state->flags = DMSG_STATE_DYNAMIC |
2240d30cab67SMatthew Dillon 			       DMSG_STATE_OPPOSITE;
22411b8eded1SMatthew Dillon 		state->msgid = msg->any.head.msgid;
22420c3a8cd0SMatthew Dillon 		state->txcmd = DMSGF_REPLY;
22430c3a8cd0SMatthew Dillon 		state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
22440d20ec8aSMatthew Dillon 		state->icmd = state->rxcmd & DMSGF_BASECMDMASK;
22450a9eefcaSMatthew Dillon 		state->flags &= ~DMSG_STATE_NEW;
22460c3a8cd0SMatthew Dillon 		msg->state = state;
22470a9eefcaSMatthew Dillon 
22481b8eded1SMatthew Dillon 		RB_INSERT(dmsg_state_tree, &iocom->staterd_tree, state);
22490a9eefcaSMatthew Dillon 		if (TAILQ_EMPTY(&pstate->subq))
22500a9eefcaSMatthew Dillon 			dmsg_state_hold(pstate);/* pstate->subq */
22511b8eded1SMatthew Dillon 		TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
2252a06d536bSMatthew Dillon 		state->flags |= DMSG_STATE_SUBINSERTED |
2253a06d536bSMatthew Dillon 				DMSG_STATE_RBINSERTED;
22540a9eefcaSMatthew Dillon 		dmsg_state_hold(state);		/* pstate->subq */
22550a9eefcaSMatthew Dillon 		dmsg_state_hold(state);		/* state on rbtree */
22560a9eefcaSMatthew Dillon 		dmsg_state_hold(state);		/* msg->state */
2257d30cab67SMatthew Dillon 
2258d30cab67SMatthew Dillon 		/*
2259d30cab67SMatthew Dillon 		 * If the parent is a relay set up the state handler to
2260d30cab67SMatthew Dillon 		 * automatically route the message.  Local processing will
2261d30cab67SMatthew Dillon 		 * not occur if set.
2262d30cab67SMatthew Dillon 		 *
2263d30cab67SMatthew Dillon 		 * (state relays are seeded by SPAN processing)
2264d30cab67SMatthew Dillon 		 */
2265d30cab67SMatthew Dillon 		if (pstate->relay)
2266d30cab67SMatthew Dillon 			state->func = dmsg_state_relay;
22670c3a8cd0SMatthew Dillon 		error = 0;
22680c3a8cd0SMatthew Dillon 		break;
22690c3a8cd0SMatthew Dillon 	case DMSGF_DELETE:
22700c3a8cd0SMatthew Dillon 		/*
22710c3a8cd0SMatthew Dillon 		 * Persistent state is expected but might not exist if an
22720c3a8cd0SMatthew Dillon 		 * ABORT+DELETE races the close.
2273d30cab67SMatthew Dillon 		 *
2274d30cab67SMatthew Dillon 		 * (any DELETE is handled in post-processing of msg).
22750c3a8cd0SMatthew Dillon 		 */
2276d30cab67SMatthew Dillon 		if (state == pstate) {
22770c3a8cd0SMatthew Dillon 			if (msg->any.head.cmd & DMSGF_ABORT) {
22780c3a8cd0SMatthew Dillon 				error = DMSG_IOQ_ERROR_EALREADY;
22790c3a8cd0SMatthew Dillon 			} else {
22805ab1caedSMatthew Dillon 				dmio_printf(iocom, 2,
22815ab1caedSMatthew Dillon 					    "missing-state %s\n",
22820c3a8cd0SMatthew Dillon 					    dmsg_msg_str(msg));
22830c3a8cd0SMatthew Dillon 				error = DMSG_IOQ_ERROR_TRANS;
22840c3a8cd0SMatthew Dillon 				assert(0);
22850c3a8cd0SMatthew Dillon 			}
22860c3a8cd0SMatthew Dillon 			break;
22870c3a8cd0SMatthew Dillon 		}
22880c3a8cd0SMatthew Dillon 
22890c3a8cd0SMatthew Dillon 		/*
22900c3a8cd0SMatthew Dillon 		 * Handle another ABORT+DELETE case if the msgid has already
22910c3a8cd0SMatthew Dillon 		 * been reused.
22920c3a8cd0SMatthew Dillon 		 */
22930c3a8cd0SMatthew Dillon 		if ((state->rxcmd & DMSGF_CREATE) == 0) {
22940c3a8cd0SMatthew Dillon 			if (msg->any.head.cmd & DMSGF_ABORT) {
22950c3a8cd0SMatthew Dillon 				error = DMSG_IOQ_ERROR_EALREADY;
22960c3a8cd0SMatthew Dillon 			} else {
22975ab1caedSMatthew Dillon 				dmio_printf(iocom, 2,
22985ab1caedSMatthew Dillon 					    "reused-state %s\n",
22990c3a8cd0SMatthew Dillon 					    dmsg_msg_str(msg));
23000c3a8cd0SMatthew Dillon 				error = DMSG_IOQ_ERROR_TRANS;
23010c3a8cd0SMatthew Dillon 				assert(0);
23020c3a8cd0SMatthew Dillon 			}
23030c3a8cd0SMatthew Dillon 			break;
23040c3a8cd0SMatthew Dillon 		}
23050c3a8cd0SMatthew Dillon 		error = 0;
23060c3a8cd0SMatthew Dillon 		break;
23070c3a8cd0SMatthew Dillon 	default:
23080c3a8cd0SMatthew Dillon 		/*
23090c3a8cd0SMatthew Dillon 		 * Check for mid-stream ABORT command received, otherwise
23100c3a8cd0SMatthew Dillon 		 * allow.
23110c3a8cd0SMatthew Dillon 		 */
23120c3a8cd0SMatthew Dillon 		if (msg->any.head.cmd & DMSGF_ABORT) {
2313d30cab67SMatthew Dillon 			if ((state == pstate) ||
23140c3a8cd0SMatthew Dillon 			    (state->rxcmd & DMSGF_CREATE) == 0) {
23150c3a8cd0SMatthew Dillon 				error = DMSG_IOQ_ERROR_EALREADY;
23160c3a8cd0SMatthew Dillon 				break;
23170c3a8cd0SMatthew Dillon 			}
23180c3a8cd0SMatthew Dillon 		}
23190c3a8cd0SMatthew Dillon 		error = 0;
23200c3a8cd0SMatthew Dillon 		break;
23210c3a8cd0SMatthew Dillon 	case DMSGF_REPLY | DMSGF_CREATE:
23220c3a8cd0SMatthew Dillon 	case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
23230c3a8cd0SMatthew Dillon 		/*
23240c3a8cd0SMatthew Dillon 		 * When receiving a reply with CREATE set the original
23250c3a8cd0SMatthew Dillon 		 * persistent state message should already exist.
23260c3a8cd0SMatthew Dillon 		 */
2327d30cab67SMatthew Dillon 		if (state == pstate) {
23285ab1caedSMatthew Dillon 			dmio_printf(iocom, 2, "no-state(r) %s\n",
23290c3a8cd0SMatthew Dillon 				    dmsg_msg_str(msg));
23300c3a8cd0SMatthew Dillon 			error = DMSG_IOQ_ERROR_TRANS;
23310c3a8cd0SMatthew Dillon 			assert(0);
23320c3a8cd0SMatthew Dillon 			break;
23330c3a8cd0SMatthew Dillon 		}
2334d30cab67SMatthew Dillon 		assert(((state->rxcmd ^ msg->any.head.cmd) & DMSGF_REPLY) == 0);
23350c3a8cd0SMatthew Dillon 		state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
23360c3a8cd0SMatthew Dillon 		error = 0;
23370c3a8cd0SMatthew Dillon 		break;
23380c3a8cd0SMatthew Dillon 	case DMSGF_REPLY | DMSGF_DELETE:
23390c3a8cd0SMatthew Dillon 		/*
23400c3a8cd0SMatthew Dillon 		 * Received REPLY+ABORT+DELETE in case where msgid has
23410c3a8cd0SMatthew Dillon 		 * already been fully closed, ignore the message.
23420c3a8cd0SMatthew Dillon 		 */
2343d30cab67SMatthew Dillon 		if (state == pstate) {
23440c3a8cd0SMatthew Dillon 			if (msg->any.head.cmd & DMSGF_ABORT) {
23450c3a8cd0SMatthew Dillon 				error = DMSG_IOQ_ERROR_EALREADY;
23460c3a8cd0SMatthew Dillon 			} else {
23475ab1caedSMatthew Dillon 				dmio_printf(iocom, 2,
23485ab1caedSMatthew Dillon 					    "no-state(r,d) %s\n",
23490c3a8cd0SMatthew Dillon 					    dmsg_msg_str(msg));
23500c3a8cd0SMatthew Dillon 				error = DMSG_IOQ_ERROR_TRANS;
23510c3a8cd0SMatthew Dillon 				assert(0);
23520c3a8cd0SMatthew Dillon 			}
23530c3a8cd0SMatthew Dillon 			break;
23540c3a8cd0SMatthew Dillon 		}
23550c3a8cd0SMatthew Dillon 
23560c3a8cd0SMatthew Dillon 		/*
23570c3a8cd0SMatthew Dillon 		 * Received REPLY+ABORT+DELETE in case where msgid has
23580c3a8cd0SMatthew Dillon 		 * already been reused for an unrelated message,
23590c3a8cd0SMatthew Dillon 		 * ignore the message.
23600c3a8cd0SMatthew Dillon 		 */
23610c3a8cd0SMatthew Dillon 		if ((state->rxcmd & DMSGF_CREATE) == 0) {
23620c3a8cd0SMatthew Dillon 			if (msg->any.head.cmd & DMSGF_ABORT) {
23630c3a8cd0SMatthew Dillon 				error = DMSG_IOQ_ERROR_EALREADY;
23640c3a8cd0SMatthew Dillon 			} else {
23655ab1caedSMatthew Dillon 				dmio_printf(iocom, 2,
23665ab1caedSMatthew Dillon 					    "reused-state(r,d) %s\n",
23670c3a8cd0SMatthew Dillon 					    dmsg_msg_str(msg));
23680c3a8cd0SMatthew Dillon 				error = DMSG_IOQ_ERROR_TRANS;
23690c3a8cd0SMatthew Dillon 				assert(0);
23700c3a8cd0SMatthew Dillon 			}
23710c3a8cd0SMatthew Dillon 			break;
23720c3a8cd0SMatthew Dillon 		}
23730c3a8cd0SMatthew Dillon 		error = 0;
23740c3a8cd0SMatthew Dillon 		break;
23750c3a8cd0SMatthew Dillon 	case DMSGF_REPLY:
23760c3a8cd0SMatthew Dillon 		/*
23770c3a8cd0SMatthew Dillon 		 * Check for mid-stream ABORT reply received to sent command.
23780c3a8cd0SMatthew Dillon 		 */
23790c3a8cd0SMatthew Dillon 		if (msg->any.head.cmd & DMSGF_ABORT) {
2380d30cab67SMatthew Dillon 			if (state == pstate ||
23810c3a8cd0SMatthew Dillon 			    (state->rxcmd & DMSGF_CREATE) == 0) {
23820c3a8cd0SMatthew Dillon 				error = DMSG_IOQ_ERROR_EALREADY;
23830c3a8cd0SMatthew Dillon 				break;
23840c3a8cd0SMatthew Dillon 			}
23850c3a8cd0SMatthew Dillon 		}
23860c3a8cd0SMatthew Dillon 		error = 0;
23870c3a8cd0SMatthew Dillon 		break;
23880c3a8cd0SMatthew Dillon 	}
23898e226bc8SMatthew Dillon 
23908e226bc8SMatthew Dillon 	/*
23918e226bc8SMatthew Dillon 	 * Calculate the easy-switch() transactional command.  Represents
23928e226bc8SMatthew Dillon 	 * the outer-transaction command for any transaction-create or
23938e226bc8SMatthew Dillon 	 * transaction-delete, and the inner message command for any
23948e226bc8SMatthew Dillon 	 * non-transaction or inside-transaction command.  tcmd will be
23958e226bc8SMatthew Dillon 	 * set to 0 for any messaging error condition.
23968e226bc8SMatthew Dillon 	 *
23978e226bc8SMatthew Dillon 	 * The two can be told apart because outer-transaction commands
23988e226bc8SMatthew Dillon 	 * always have a DMSGF_CREATE and/or DMSGF_DELETE flag.
23998e226bc8SMatthew Dillon 	 */
24008e226bc8SMatthew Dillon 	if (msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE)) {
24017adbba57SMatthew Dillon 		if ((msg->state->flags & DMSG_STATE_ROOT) == 0) {
24020a9eefcaSMatthew Dillon 			msg->tcmd = (state->icmd & DMSGF_BASECMDMASK) |
24038e226bc8SMatthew Dillon 				    (msg->any.head.cmd & (DMSGF_CREATE |
24048e226bc8SMatthew Dillon 							  DMSGF_DELETE |
24058e226bc8SMatthew Dillon 							  DMSGF_REPLY));
24068e226bc8SMatthew Dillon 		} else {
24078e226bc8SMatthew Dillon 			msg->tcmd = 0;
24088e226bc8SMatthew Dillon 		}
24098e226bc8SMatthew Dillon 	} else {
24108e226bc8SMatthew Dillon 		msg->tcmd = msg->any.head.cmd & DMSGF_CMDSWMASK;
24118e226bc8SMatthew Dillon 	}
24127adbba57SMatthew Dillon 
24137adbba57SMatthew Dillon #ifdef DMSG_BLOCK_DEBUG
24147adbba57SMatthew Dillon 	switch (msg->tcmd) {
24157adbba57SMatthew Dillon 	case DMSG_BLK_READ | DMSGF_CREATE | DMSGF_DELETE:
24167adbba57SMatthew Dillon 	case DMSG_BLK_WRITE | DMSGF_CREATE | DMSGF_DELETE:
24175ab1caedSMatthew Dillon 		dmio_printf(iocom, 4,
24185ab1caedSMatthew Dillon 			    "read  BIO %-3d %016jx %d@%016jx\n",
24197adbba57SMatthew Dillon 			    biocount, msg->any.head.msgid,
24207adbba57SMatthew Dillon 			    msg->any.blk_read.bytes,
24217adbba57SMatthew Dillon 			    msg->any.blk_read.offset);
24227adbba57SMatthew Dillon 		break;
24237adbba57SMatthew Dillon 	case DMSG_BLK_READ | DMSGF_CREATE | DMSGF_DELETE | DMSGF_REPLY:
24247adbba57SMatthew Dillon 	case DMSG_BLK_WRITE | DMSGF_CREATE | DMSGF_DELETE | DMSGF_REPLY:
24255ab1caedSMatthew Dillon 		dmio_printf(iocom, 4,
24265ab1caedSMatthew Dillon 			    "rread BIO %-3d %016jx %d@%016jx\n",
24277adbba57SMatthew Dillon 			    biocount, msg->any.head.msgid,
24287adbba57SMatthew Dillon 			    msg->any.blk_read.bytes,
24297adbba57SMatthew Dillon 			    msg->any.blk_read.offset);
24307adbba57SMatthew Dillon 		break;
24317adbba57SMatthew Dillon 	default:
24327adbba57SMatthew Dillon 		break;
24337adbba57SMatthew Dillon 	}
24347adbba57SMatthew Dillon #endif
24357adbba57SMatthew Dillon 
24360a9eefcaSMatthew Dillon 	/*
24370a9eefcaSMatthew Dillon 	 * Adjust state, mark receive side as DELETED if appropriate and
24380a9eefcaSMatthew Dillon 	 * adjust RB tree if both sides are DELETED.  cleanuprx handles
24390a9eefcaSMatthew Dillon 	 * the rest after the state callback returns.
24400a9eefcaSMatthew Dillon 	 */
24410a9eefcaSMatthew Dillon 	assert(msg->state->iocom == iocom);
24420a9eefcaSMatthew Dillon 	assert(msg->state == state);
24430a9eefcaSMatthew Dillon 
24440a9eefcaSMatthew Dillon 	if (state->flags & DMSG_STATE_ROOT) {
24450a9eefcaSMatthew Dillon 		/*
24460a9eefcaSMatthew Dillon 		 * Nothing to do for non-transactional messages.
24470a9eefcaSMatthew Dillon 		 */
24480a9eefcaSMatthew Dillon 	} else if (msg->any.head.cmd & DMSGF_DELETE) {
24490a9eefcaSMatthew Dillon 		/*
24500a9eefcaSMatthew Dillon 		 * Message terminating transaction, remove the state from
24510a9eefcaSMatthew Dillon 		 * the RB tree if the full transaction is now complete.
24520a9eefcaSMatthew Dillon 		 * The related state, subq, and parent link is retained
24530a9eefcaSMatthew Dillon 		 * until after the state callback is complete.
24540a9eefcaSMatthew Dillon 		 */
24550a9eefcaSMatthew Dillon 		assert((state->rxcmd & DMSGF_DELETE) == 0);
24560a9eefcaSMatthew Dillon 		state->rxcmd |= DMSGF_DELETE;
24570a9eefcaSMatthew Dillon 		if (state->txcmd & DMSGF_DELETE) {
24580a9eefcaSMatthew Dillon 			assert(state->flags & DMSG_STATE_RBINSERTED);
24590a9eefcaSMatthew Dillon 			if (state->rxcmd & DMSGF_REPLY) {
24600a9eefcaSMatthew Dillon 				assert(msg->any.head.cmd & DMSGF_REPLY);
24610a9eefcaSMatthew Dillon 				RB_REMOVE(dmsg_state_tree,
24620a9eefcaSMatthew Dillon 					  &iocom->statewr_tree, state);
24630a9eefcaSMatthew Dillon 			} else {
24640a9eefcaSMatthew Dillon 				assert((msg->any.head.cmd & DMSGF_REPLY) == 0);
24650a9eefcaSMatthew Dillon 				RB_REMOVE(dmsg_state_tree,
24660a9eefcaSMatthew Dillon 					  &iocom->staterd_tree, state);
24670a9eefcaSMatthew Dillon 			}
24680a9eefcaSMatthew Dillon 			state->flags &= ~DMSG_STATE_RBINSERTED;
24690a9eefcaSMatthew Dillon 			dmsg_state_drop(state);
24700a9eefcaSMatthew Dillon 		}
24710a9eefcaSMatthew Dillon 	}
24720a9eefcaSMatthew Dillon 
24730a9eefcaSMatthew Dillon 	pthread_mutex_unlock(&iocom->mtx);
24740a9eefcaSMatthew Dillon 
24750a9eefcaSMatthew Dillon 	if (DMsgDebugOpt && error)
24765ab1caedSMatthew Dillon 		dmio_printf(iocom, 1, "msgrx: error %d\n", error);
24770a9eefcaSMatthew Dillon 
24780c3a8cd0SMatthew Dillon 	return (error);
24790c3a8cd0SMatthew Dillon }
24800c3a8cd0SMatthew Dillon 
24811b8eded1SMatthew Dillon /*
2482d30cab67SMatthew Dillon  * Route the message and handle pair-state processing.
24831b8eded1SMatthew Dillon  */
2484d30cab67SMatthew Dillon void
2485d30cab67SMatthew Dillon dmsg_state_relay(dmsg_msg_t *lmsg)
24861b8eded1SMatthew Dillon {
2487d30cab67SMatthew Dillon 	dmsg_state_t *lpstate;
2488d30cab67SMatthew Dillon 	dmsg_state_t *rpstate;
2489d30cab67SMatthew Dillon 	dmsg_state_t *lstate;
2490d30cab67SMatthew Dillon 	dmsg_state_t *rstate;
2491d30cab67SMatthew Dillon 	dmsg_msg_t *rmsg;
24921b8eded1SMatthew Dillon 
24937adbba57SMatthew Dillon #ifdef DMSG_BLOCK_DEBUG
24947adbba57SMatthew Dillon 	switch (lmsg->tcmd) {
24950a9eefcaSMatthew Dillon 	case DMSG_BLK_OPEN | DMSGF_CREATE:
24965ab1caedSMatthew Dillon 		dmio_printf(iocom, 4, "%s\n",
24975ab1caedSMatthew Dillon 			    "relay BIO_OPEN (CREATE)");
24980a9eefcaSMatthew Dillon 		break;
24990a9eefcaSMatthew Dillon 	case DMSG_BLK_OPEN | DMSGF_DELETE:
25005ab1caedSMatthew Dillon 		dmio_printf(iocom, 4, "%s\n",
25015ab1caedSMatthew Dillon 			    "relay BIO_OPEN (DELETE)");
25020a9eefcaSMatthew Dillon 		break;
25037adbba57SMatthew Dillon 	case DMSG_BLK_READ | DMSGF_CREATE | DMSGF_DELETE:
25047adbba57SMatthew Dillon 	case DMSG_BLK_WRITE | DMSGF_CREATE | DMSGF_DELETE:
25057adbba57SMatthew Dillon 		atomic_add_int(&biocount, 1);
25065ab1caedSMatthew Dillon 		dmio_printf(iocom, 4,
25075ab1caedSMatthew Dillon 			    "relay BIO %-3d %016jx %d@%016jx\n",
25087adbba57SMatthew Dillon 			    biocount, lmsg->any.head.msgid,
25097adbba57SMatthew Dillon 			    lmsg->any.blk_read.bytes,
25107adbba57SMatthew Dillon 			    lmsg->any.blk_read.offset);
25117adbba57SMatthew Dillon 		break;
25127adbba57SMatthew Dillon 	case DMSG_BLK_READ | DMSGF_CREATE | DMSGF_DELETE | DMSGF_REPLY:
25137adbba57SMatthew Dillon 	case DMSG_BLK_WRITE | DMSGF_CREATE | DMSGF_DELETE | DMSGF_REPLY:
25145ab1caedSMatthew Dillon 		dmio_printf(iocom, 4,
25155ab1caedSMatthew Dillon 			    "retrn BIO %-3d %016jx %d@%016jx\n",
25167adbba57SMatthew Dillon 			    biocount, lmsg->any.head.msgid,
25177adbba57SMatthew Dillon 			    lmsg->any.blk_read.bytes,
25187adbba57SMatthew Dillon 			    lmsg->any.blk_read.offset);
25197adbba57SMatthew Dillon 		atomic_add_int(&biocount, -1);
25207adbba57SMatthew Dillon 		break;
25217adbba57SMatthew Dillon 	default:
25227adbba57SMatthew Dillon 		break;
25237adbba57SMatthew Dillon 	}
25247adbba57SMatthew Dillon #endif
25257adbba57SMatthew Dillon 
2526d30cab67SMatthew Dillon 	if ((lmsg->any.head.cmd & (DMSGF_CREATE | DMSGF_REPLY)) ==
2527d30cab67SMatthew Dillon 	    DMSGF_CREATE) {
25281b8eded1SMatthew Dillon 		/*
2529d30cab67SMatthew Dillon 		 * New sub-transaction, establish new state and relay.
25301b8eded1SMatthew Dillon 		 */
2531d30cab67SMatthew Dillon 		lstate = lmsg->state;
2532d30cab67SMatthew Dillon 		lpstate = lstate->parent;
2533d30cab67SMatthew Dillon 		rpstate = lpstate->relay;
2534d30cab67SMatthew Dillon 		assert(lstate->relay == NULL);
2535d30cab67SMatthew Dillon 		assert(rpstate != NULL);
25361b8eded1SMatthew Dillon 
2537e96cef49SMatthew Dillon 		rmsg = dmsg_msg_alloc(rpstate, 0,
2538d30cab67SMatthew Dillon 				      lmsg->any.head.cmd,
2539d30cab67SMatthew Dillon 				      dmsg_state_relay, NULL);
2540d30cab67SMatthew Dillon 		rstate = rmsg->state;
2541d30cab67SMatthew Dillon 		rstate->relay = lstate;
2542d30cab67SMatthew Dillon 		lstate->relay = rstate;
2543323c0947SMatthew Dillon 		dmsg_state_hold(lstate);
2544323c0947SMatthew Dillon 		dmsg_state_hold(rstate);
25451b8eded1SMatthew Dillon 	} else {
25461b8eded1SMatthew Dillon 		/*
2547d30cab67SMatthew Dillon 		 * State & relay already established
25481b8eded1SMatthew Dillon 		 */
2549d30cab67SMatthew Dillon 		lstate = lmsg->state;
2550d30cab67SMatthew Dillon 		rstate = lstate->relay;
2551d30cab67SMatthew Dillon 		assert(rstate != NULL);
2552d30cab67SMatthew Dillon 
25530a9eefcaSMatthew Dillon 		assert((rstate->txcmd & DMSGF_DELETE) == 0);
25540a9eefcaSMatthew Dillon 
25550a9eefcaSMatthew Dillon #if 0
25560a9eefcaSMatthew Dillon 		if (lstate->flags & DMSG_STATE_ABORTING) {
25575ab1caedSMatthew Dillon 			dmio_printf(iocom, 4,
25580a9eefcaSMatthew Dillon 				    "relay: relay lost link l=%p r=%p\n",
25590a9eefcaSMatthew Dillon 				    lstate, rstate);
25600a9eefcaSMatthew Dillon 			dmsg_simulate_failure(rstate, 0, DMSG_ERR_LOSTLINK);
25610a9eefcaSMatthew Dillon 		}
25620a9eefcaSMatthew Dillon #endif
25630a9eefcaSMatthew Dillon 
2564e96cef49SMatthew Dillon 		rmsg = dmsg_msg_alloc(rstate, 0,
2565d30cab67SMatthew Dillon 				      lmsg->any.head.cmd,
2566d30cab67SMatthew Dillon 				      dmsg_state_relay, NULL);
25671b8eded1SMatthew Dillon 	}
2568d30cab67SMatthew Dillon 	if (lmsg->hdr_size > sizeof(lmsg->any.head)) {
2569d30cab67SMatthew Dillon 		bcopy(&lmsg->any.head + 1, &rmsg->any.head + 1,
2570d30cab67SMatthew Dillon 		      lmsg->hdr_size - sizeof(lmsg->any.head));
2571d30cab67SMatthew Dillon 	}
2572d30cab67SMatthew Dillon 	rmsg->any.head.error = lmsg->any.head.error;
2573d30cab67SMatthew Dillon 	rmsg->any.head.reserved02 = lmsg->any.head.reserved02;
2574*a988b43eSMatthew Dillon 	rmsg->any.head.link_verifier = lmsg->any.head.link_verifier;
2575e96cef49SMatthew Dillon 	rmsg->aux_size = lmsg->aux_size;
2576d30cab67SMatthew Dillon 	rmsg->aux_data = lmsg->aux_data;
2577d30cab67SMatthew Dillon 	lmsg->aux_data = NULL;
25780a9eefcaSMatthew Dillon 
2579d30cab67SMatthew Dillon 	dmsg_msg_write(rmsg);
25801b8eded1SMatthew Dillon }
25811b8eded1SMatthew Dillon 
2582d30cab67SMatthew Dillon /*
25830a9eefcaSMatthew Dillon  * Cleanup and retire msg after issuing the state callback.  The state
25840a9eefcaSMatthew Dillon  * has already been removed from the RB tree.  The subq and msg must be
25850a9eefcaSMatthew Dillon  * cleaned up.
25860a9eefcaSMatthew Dillon  *
25870a9eefcaSMatthew Dillon  * Called with the iocom mutex held (to handle subq disconnection).
2588d30cab67SMatthew Dillon  */
25890c3a8cd0SMatthew Dillon void
25900c3a8cd0SMatthew Dillon dmsg_state_cleanuprx(dmsg_iocom_t *iocom, dmsg_msg_t *msg)
25910c3a8cd0SMatthew Dillon {
25920c3a8cd0SMatthew Dillon 	dmsg_state_t *state;
25930c3a8cd0SMatthew Dillon 
25941b8eded1SMatthew Dillon 	assert(msg->state->iocom == iocom);
25951b8eded1SMatthew Dillon 	state = msg->state;
2596d30cab67SMatthew Dillon 	if (state->flags & DMSG_STATE_ROOT) {
25970c3a8cd0SMatthew Dillon 		/*
25980c3a8cd0SMatthew Dillon 		 * Free a non-transactional message, there is no state
25990c3a8cd0SMatthew Dillon 		 * to worry about.
26000c3a8cd0SMatthew Dillon 		 */
26010c3a8cd0SMatthew Dillon 		dmsg_msg_free(msg);
26020a9eefcaSMatthew Dillon 	} else if ((state->flags & DMSG_STATE_SUBINSERTED) &&
26030a9eefcaSMatthew Dillon 		   (state->rxcmd & DMSGF_DELETE) &&
26040a9eefcaSMatthew Dillon 		   (state->txcmd & DMSGF_DELETE)) {
26050c3a8cd0SMatthew Dillon 		/*
26060a9eefcaSMatthew Dillon 		 * Must disconnect from parent and drop relay.
26070c3a8cd0SMatthew Dillon 		 */
26080a9eefcaSMatthew Dillon 		dmsg_subq_delete(state);
2609d30cab67SMatthew Dillon 		if (state->relay) {
2610323c0947SMatthew Dillon 			dmsg_state_drop(state->relay);
2611d30cab67SMatthew Dillon 			state->relay = NULL;
2612d30cab67SMatthew Dillon 		}
26131b8eded1SMatthew Dillon 		dmsg_msg_free(msg);
26141b8eded1SMatthew Dillon 	} else {
26150c3a8cd0SMatthew Dillon 		/*
26160c3a8cd0SMatthew Dillon 		 * Message not terminating transaction, leave state intact
26170c3a8cd0SMatthew Dillon 		 * and free message if it isn't the CREATE message.
26180c3a8cd0SMatthew Dillon 		 */
26190c3a8cd0SMatthew Dillon 		dmsg_msg_free(msg);
26200c3a8cd0SMatthew Dillon 	}
26210c3a8cd0SMatthew Dillon }
26220c3a8cd0SMatthew Dillon 
2623323c0947SMatthew Dillon /*
2624323c0947SMatthew Dillon  * Clean up the state after pulling out needed fields and queueing the
2625323c0947SMatthew Dillon  * message for transmission.   This occurs in dmsg_msg_write().
26260a9eefcaSMatthew Dillon  *
26270a9eefcaSMatthew Dillon  * Called with the mutex locked.
2628323c0947SMatthew Dillon  */
26290c3a8cd0SMatthew Dillon static void
26301b8eded1SMatthew Dillon dmsg_state_cleanuptx(dmsg_iocom_t *iocom, dmsg_msg_t *msg)
26310c3a8cd0SMatthew Dillon {
26320c3a8cd0SMatthew Dillon 	dmsg_state_t *state;
26330c3a8cd0SMatthew Dillon 
26341b8eded1SMatthew Dillon 	assert(iocom == msg->state->iocom);
26351b8eded1SMatthew Dillon 	state = msg->state;
26360a9eefcaSMatthew Dillon 
26370a9eefcaSMatthew Dillon 	dmsg_state_hold(state);
26380a9eefcaSMatthew Dillon 
2639d30cab67SMatthew Dillon 	if (state->flags & DMSG_STATE_ROOT) {
2640323c0947SMatthew Dillon 		;
26410c3a8cd0SMatthew Dillon 	} else if (msg->any.head.cmd & DMSGF_DELETE) {
2642323c0947SMatthew Dillon 		/*
2643323c0947SMatthew Dillon 		 * Message terminating transaction, destroy the related
2644323c0947SMatthew Dillon 		 * state, the original message, and this message (if it
2645323c0947SMatthew Dillon 		 * isn't the original message due to a CREATE|DELETE).
2646323c0947SMatthew Dillon 		 *
2647323c0947SMatthew Dillon 		 * It's possible for governing state to terminate while
2648323c0947SMatthew Dillon 		 * sub-transactions still exist.  This is allowed but
2649323c0947SMatthew Dillon 		 * will cause sub-transactions to recursively fail.
2650323c0947SMatthew Dillon 		 * Further reception of sub-transaction messages will be
2651323c0947SMatthew Dillon 		 * impossible because the circuit will no longer exist.
2652323c0947SMatthew Dillon 		 * (XXX need code to make sure that happens properly).
26530a9eefcaSMatthew Dillon 		 *
26540a9eefcaSMatthew Dillon 		 * NOTE: It is possible for a fafilure to terminate the
26550a9eefcaSMatthew Dillon 		 *	 state after we have written the message but before
26560a9eefcaSMatthew Dillon 		 *	 we are able to call cleanuptx, so txcmd might already
26570a9eefcaSMatthew Dillon 		 *	 have DMSGF_DELETE set.
2658323c0947SMatthew Dillon 		 */
26590a9eefcaSMatthew Dillon 		if ((state->txcmd & DMSGF_DELETE) == 0 &&
26600a9eefcaSMatthew Dillon 		    (state->rxcmd & DMSGF_DELETE)) {
26610c3a8cd0SMatthew Dillon 			state->txcmd |= DMSGF_DELETE;
2662a06d536bSMatthew Dillon 			assert(state->flags & DMSG_STATE_RBINSERTED);
26630c3a8cd0SMatthew Dillon 			if (state->txcmd & DMSGF_REPLY) {
26640c3a8cd0SMatthew Dillon 				assert(msg->any.head.cmd & DMSGF_REPLY);
26650c3a8cd0SMatthew Dillon 				RB_REMOVE(dmsg_state_tree,
26661b8eded1SMatthew Dillon 					  &iocom->staterd_tree, state);
26670c3a8cd0SMatthew Dillon 			} else {
26680c3a8cd0SMatthew Dillon 				assert((msg->any.head.cmd & DMSGF_REPLY) == 0);
26690c3a8cd0SMatthew Dillon 				RB_REMOVE(dmsg_state_tree,
26701b8eded1SMatthew Dillon 					  &iocom->statewr_tree, state);
26711b8eded1SMatthew Dillon 			}
2672a06d536bSMatthew Dillon 			state->flags &= ~DMSG_STATE_RBINSERTED;
26730a9eefcaSMatthew Dillon 			dmsg_subq_delete(state);
2674d30cab67SMatthew Dillon 
2675d30cab67SMatthew Dillon 			if (state->relay) {
2676323c0947SMatthew Dillon 				dmsg_state_drop(state->relay);
2677d30cab67SMatthew Dillon 				state->relay = NULL;
2678d30cab67SMatthew Dillon 			}
26790a9eefcaSMatthew Dillon 			dmsg_state_drop(state);	/* state->rbtree */
26800a9eefcaSMatthew Dillon 		} else if ((state->txcmd & DMSGF_DELETE) == 0) {
26810a9eefcaSMatthew Dillon 			state->txcmd |= DMSGF_DELETE;
26820c3a8cd0SMatthew Dillon 		}
26830c3a8cd0SMatthew Dillon 	}
26840a9eefcaSMatthew Dillon 
26850a9eefcaSMatthew Dillon 	/*
26860a9eefcaSMatthew Dillon 	 * Deferred abort after transmission.
26870a9eefcaSMatthew Dillon 	 */
26880a9eefcaSMatthew Dillon 	if ((state->flags & (DMSG_STATE_ABORTING | DMSG_STATE_DYING)) &&
26890a9eefcaSMatthew Dillon 	    (state->rxcmd & DMSGF_DELETE) == 0) {
26905ab1caedSMatthew Dillon 		dmio_printf(iocom, 4,
26915ab1caedSMatthew Dillon 			    "cleanuptx: state=%p "
26920a9eefcaSMatthew Dillon 			    "executing deferred abort\n",
26930a9eefcaSMatthew Dillon 			    state);
26940a9eefcaSMatthew Dillon 		state->flags &= ~DMSG_STATE_ABORTING;
26950a9eefcaSMatthew Dillon 		dmsg_simulate_failure(state, 1, DMSG_ERR_LOSTLINK);
26960a9eefcaSMatthew Dillon 	}
26970a9eefcaSMatthew Dillon 
26980a9eefcaSMatthew Dillon 	dmsg_state_drop(state);
26990c3a8cd0SMatthew Dillon }
27000c3a8cd0SMatthew Dillon 
27010c3a8cd0SMatthew Dillon /*
2702323c0947SMatthew Dillon  * Called with or without locks
2703323c0947SMatthew Dillon  */
2704323c0947SMatthew Dillon void
2705323c0947SMatthew Dillon dmsg_state_hold(dmsg_state_t *state)
2706323c0947SMatthew Dillon {
2707323c0947SMatthew Dillon 	atomic_add_int(&state->refs, 1);
2708323c0947SMatthew Dillon }
2709323c0947SMatthew Dillon 
2710323c0947SMatthew Dillon void
2711323c0947SMatthew Dillon dmsg_state_drop(dmsg_state_t *state)
2712323c0947SMatthew Dillon {
27130a9eefcaSMatthew Dillon 	assert(state->refs > 0);
2714323c0947SMatthew Dillon 	if (atomic_fetchadd_int(&state->refs, -1) == 1)
2715323c0947SMatthew Dillon 		dmsg_state_free(state);
2716323c0947SMatthew Dillon }
2717323c0947SMatthew Dillon 
2718323c0947SMatthew Dillon /*
27190c3a8cd0SMatthew Dillon  * Called with iocom locked
27200c3a8cd0SMatthew Dillon  */
2721323c0947SMatthew Dillon static void
27220c3a8cd0SMatthew Dillon dmsg_state_free(dmsg_state_t *state)
27230c3a8cd0SMatthew Dillon {
2724323c0947SMatthew Dillon 	atomic_add_int(&dmsg_state_count, -1);
27255ab1caedSMatthew Dillon 	dmio_printf(state->iocom, 5, "terminate state %p\n", state);
2726a06d536bSMatthew Dillon 	assert((state->flags & (DMSG_STATE_ROOT |
2727a06d536bSMatthew Dillon 				DMSG_STATE_SUBINSERTED |
2728a06d536bSMatthew Dillon 				DMSG_STATE_RBINSERTED)) == 0);
2729323c0947SMatthew Dillon 	assert(TAILQ_EMPTY(&state->subq));
2730323c0947SMatthew Dillon 	assert(state->refs == 0);
2731f306de83SMatthew Dillon 	if (state->any.any != NULL)   /* XXX avoid deadlock w/exit & kernel */
2732f306de83SMatthew Dillon 		closefrom(3);
27330c3a8cd0SMatthew Dillon 	assert(state->any.any == NULL);
27340c3a8cd0SMatthew Dillon 	free(state);
27350d20ec8aSMatthew Dillon }
27360c3a8cd0SMatthew Dillon 
27370c3a8cd0SMatthew Dillon /*
27380c3a8cd0SMatthew Dillon  * This swaps endian for a hammer2_msg_hdr.  Note that the extended
27390c3a8cd0SMatthew Dillon  * header is not adjusted, just the core header.
27400c3a8cd0SMatthew Dillon  */
27410c3a8cd0SMatthew Dillon void
27420c3a8cd0SMatthew Dillon dmsg_bswap_head(dmsg_hdr_t *head)
27430c3a8cd0SMatthew Dillon {
27440c3a8cd0SMatthew Dillon 	head->magic	= bswap16(head->magic);
27450c3a8cd0SMatthew Dillon 	head->reserved02 = bswap16(head->reserved02);
27460c3a8cd0SMatthew Dillon 	head->salt	= bswap32(head->salt);
27470c3a8cd0SMatthew Dillon 
27480c3a8cd0SMatthew Dillon 	head->msgid	= bswap64(head->msgid);
27490d20ec8aSMatthew Dillon 	head->circuit	= bswap64(head->circuit);
2750*a988b43eSMatthew Dillon 	head->link_verifier= bswap64(head->link_verifier);
27510c3a8cd0SMatthew Dillon 
27520c3a8cd0SMatthew Dillon 	head->cmd	= bswap32(head->cmd);
27530c3a8cd0SMatthew Dillon 	head->aux_crc	= bswap32(head->aux_crc);
27540c3a8cd0SMatthew Dillon 	head->aux_bytes	= bswap32(head->aux_bytes);
27550c3a8cd0SMatthew Dillon 	head->error	= bswap32(head->error);
27560c3a8cd0SMatthew Dillon 	head->aux_descr = bswap64(head->aux_descr);
27570c3a8cd0SMatthew Dillon 	head->reserved38= bswap32(head->reserved38);
27580c3a8cd0SMatthew Dillon 	head->hdr_crc	= bswap32(head->hdr_crc);
27590c3a8cd0SMatthew Dillon }
2760