xref: /dragonfly/sys/kern/kern_dmsg.c (revision 38b720cd)
1 /*-
2  * Copyright (c) 2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * TODO: txcmd CREATE state is deferred by tx msgq, need to calculate
36  *	 a streaming response.  See subr_diskiocom()'s diskiodone().
37  */
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/kernel.h>
41 #include <sys/conf.h>
42 #include <sys/systm.h>
43 #include <sys/queue.h>
44 #include <sys/tree.h>
45 #include <sys/malloc.h>
46 #include <sys/mount.h>
47 #include <sys/socket.h>
48 #include <sys/vnode.h>
49 #include <sys/sysctl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/priv.h>
53 #include <sys/thread.h>
54 #include <sys/globaldata.h>
55 #include <sys/limits.h>
56 
57 #include <sys/dmsg.h>
58 
59 RB_GENERATE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp);
60 
61 SYSCTL_NODE(, OID_AUTO, kdmsg, CTLFLAG_RW, 0, "kdmsg");
62 static int kdmsg_debug = 1;
63 SYSCTL_INT(_kdmsg, OID_AUTO, debug, CTLFLAG_RW, &kdmsg_debug, 0,
64 	   "Set debug level for kernel dmsg layer");
65 
/*
 * Conditional debug output.  The message is handed to kprintf() with a
 * "kdmsg: " prefix, but only when kdmsg_debug is at least (level).
 *
 * Each macro expands to a single do { } while (0) statement so that it
 * can be used as the body of an unbraced if/else without the caller's
 * else attaching to the macro's internal if.
 *
 * ctl must be a string literal (it is concatenated with the prefix) and
 * at least one argument must follow it.  kdio_printf() takes the iocom
 * as its first argument but does not currently use it.
 */
#define kd_printf(level, ctl, ...)				\
	do {							\
		if (kdmsg_debug >= (level))			\
			kprintf("kdmsg: " ctl, __VA_ARGS__);	\
	} while (0)

#define kdio_printf(iocom, level, ctl, ...)			\
	do {							\
		if (kdmsg_debug >= (level))			\
			kprintf("kdmsg: " ctl, __VA_ARGS__);	\
	} while (0)
71 
72 static int kdmsg_msg_receive_handling(kdmsg_msg_t *msg);
73 static int kdmsg_state_msgrx(kdmsg_msg_t *msg);
74 static int kdmsg_state_msgtx(kdmsg_msg_t *msg);
75 static void kdmsg_msg_write_locked(kdmsg_iocom_t *iocom, kdmsg_msg_t *msg);
76 static void kdmsg_state_cleanuprx(kdmsg_msg_t *msg);
77 static void kdmsg_state_cleanuptx(kdmsg_msg_t *msg);
78 static void kdmsg_subq_delete(kdmsg_state_t *state);
79 static void kdmsg_simulate_failure(kdmsg_state_t *state, int meto, int error);
80 static void kdmsg_state_abort(kdmsg_state_t *state);
81 static void kdmsg_state_dying(kdmsg_state_t *state);
82 static void kdmsg_state_free(kdmsg_state_t *state);
83 
84 #ifdef KDMSG_DEBUG
85 #define KDMSG_DEBUG_ARGS	, const char *file, int line
86 #define kdmsg_state_hold(state)	_kdmsg_state_hold(state, __FILE__, __LINE__)
87 #define kdmsg_state_drop(state)	_kdmsg_state_drop(state, __FILE__, __LINE__)
88 #else
89 #define KDMSG_DEBUG_ARGS
90 #define kdmsg_state_hold(state)	_kdmsg_state_hold(state)
91 #define kdmsg_state_drop(state)	_kdmsg_state_drop(state)
92 #endif
93 static void _kdmsg_state_hold(kdmsg_state_t *state KDMSG_DEBUG_ARGS);
94 static void _kdmsg_state_drop(kdmsg_state_t *state KDMSG_DEBUG_ARGS);
95 
96 static void kdmsg_iocom_thread_rd(void *arg);
97 static void kdmsg_iocom_thread_wr(void *arg);
98 static int kdmsg_autorxmsg(kdmsg_msg_t *msg);
99 
100 /*static struct lwkt_token kdmsg_token = LWKT_TOKEN_INITIALIZER(kdmsg_token);*/
101 
102 /*
103  * Initialize the roll-up communications structure for a network
104  * messaging session.  This function does not install the socket.
105  */
106 void
107 kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, uint32_t flags,
108 		 struct malloc_type *mmsg,
109 		 int (*rcvmsg)(kdmsg_msg_t *msg))
110 {
111 	bzero(iocom, sizeof(*iocom));
112 	iocom->handle = handle;
113 	iocom->mmsg = mmsg;
114 	iocom->rcvmsg = rcvmsg;
115 	iocom->flags = flags;
116 	lockinit(&iocom->msglk, "h2msg", 0, 0);
117 	TAILQ_INIT(&iocom->msgq);
118 	RB_INIT(&iocom->staterd_tree);
119 	RB_INIT(&iocom->statewr_tree);
120 
121 	iocom->state0.iocom = iocom;
122 	iocom->state0.parent = &iocom->state0;
123 	TAILQ_INIT(&iocom->state0.subq);
124 }
125 
126 /*
127  * [Re]connect using the passed file pointer.  The caller must ref the
128  * fp for us.  We own that ref now.
129  */
130 void
131 kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp,
132 		      const char *subsysname)
133 {
134 	/*
135 	 * Destroy the current connection
136 	 */
137 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
138 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
139 	while (iocom->msgrd_td || iocom->msgwr_td) {
140 		wakeup(&iocom->msg_ctl);
141 		lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
142 	}
143 
144 	/*
145 	 * Drop communications descriptor
146 	 */
147 	if (iocom->msg_fp) {
148 		fdrop(iocom->msg_fp);
149 		iocom->msg_fp = NULL;
150 	}
151 
152 	/*
153 	 * Setup new communications descriptor
154 	 */
155 	iocom->msg_ctl = 0;
156 	iocom->msg_fp = fp;
157 	iocom->msg_seq = 0;
158 	iocom->flags &= ~KDMSG_IOCOMF_EXITNOACC;
159 
160 	lwkt_create(kdmsg_iocom_thread_rd, iocom, &iocom->msgrd_td,
161 		    NULL, 0, -1, "%s-msgrd", subsysname);
162 	lwkt_create(kdmsg_iocom_thread_wr, iocom, &iocom->msgwr_td,
163 		    NULL, 0, -1, "%s-msgwr", subsysname);
164 	lockmgr(&iocom->msglk, LK_RELEASE);
165 }
166 
167 /*
168  * Caller sets up iocom->auto_lnk_conn and iocom->auto_lnk_span, then calls
169  * this function to handle the state machine for LNK_CONN and LNK_SPAN.
170  */
171 static int kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
172 static int kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
173 
174 void
175 kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom,
176 			 void (*auto_callback)(kdmsg_msg_t *msg))
177 {
178 	kdmsg_msg_t *msg;
179 
180 	iocom->auto_callback = auto_callback;
181 
182 	msg = kdmsg_msg_alloc(&iocom->state0,
183 			      DMSG_LNK_CONN | DMSGF_CREATE,
184 			      kdmsg_lnk_conn_reply, NULL);
185 	iocom->auto_lnk_conn.head = msg->any.head;
186 	msg->any.lnk_conn = iocom->auto_lnk_conn;
187 	iocom->conn_state = msg->state;
188 	kdmsg_state_hold(msg->state);	/* iocom->conn_state */
189 	kdmsg_msg_write(msg);
190 }
191 
192 static
193 int
194 kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
195 {
196 	kdmsg_iocom_t *iocom = state->iocom;
197 	kdmsg_msg_t *rmsg;
198 
199 	/*
200 	 * Upon receipt of the LNK_CONN acknowledgement initiate an
201 	 * automatic SPAN if we were asked to.  Used by e.g. xdisk, but
202 	 * not used by HAMMER2 which must manage more than one transmitted
203 	 * SPAN.
204 	 */
205 	if ((msg->any.head.cmd & DMSGF_CREATE) &&
206 	    (iocom->flags & KDMSG_IOCOMF_AUTOTXSPAN)) {
207 		rmsg = kdmsg_msg_alloc(&iocom->state0,
208 				       DMSG_LNK_SPAN | DMSGF_CREATE,
209 				       kdmsg_lnk_span_reply, NULL);
210 		iocom->auto_lnk_span.head = rmsg->any.head;
211 		rmsg->any.lnk_span = iocom->auto_lnk_span;
212 		kdmsg_msg_write(rmsg);
213 	}
214 
215 	/*
216 	 * Process shim after the CONN is acknowledged and before the CONN
217 	 * transaction is deleted.  For deletions this gives device drivers
218 	 * the ability to interlock new operations on the circuit before
219 	 * it becomes illegal and panics.
220 	 */
221 	if (iocom->auto_callback)
222 		iocom->auto_callback(msg);
223 
224 	if ((state->txcmd & DMSGF_DELETE) == 0 &&
225 	    (msg->any.head.cmd & DMSGF_DELETE)) {
226 		/*
227 		 * iocom->conn_state has a state ref, drop it when clearing.
228 		 */
229 		if (iocom->conn_state)
230 			kdmsg_state_drop(iocom->conn_state);
231 		iocom->conn_state = NULL;
232 		kdmsg_msg_reply(msg, 0);
233 	}
234 
235 	return (0);
236 }
237 
238 static
239 int
240 kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
241 {
242 	/*
243 	 * Be sure to process shim before terminating the SPAN
244 	 * transaction.  Gives device drivers the ability to
245 	 * interlock new operations on the circuit before it
246 	 * becomes illegal and panics.
247 	 */
248 	if (state->iocom->auto_callback)
249 		state->iocom->auto_callback(msg);
250 
251 	if ((state->txcmd & DMSGF_DELETE) == 0 &&
252 	    (msg->any.head.cmd & DMSGF_DELETE)) {
253 		kdmsg_msg_reply(msg, 0);
254 	}
255 	return (0);
256 }
257 
258 /*
259  * Disconnect and clean up
260  */
261 void
262 kdmsg_iocom_uninit(kdmsg_iocom_t *iocom)
263 {
264 	kdmsg_state_t *state;
265 	kdmsg_msg_t *msg;
266 	int retries;
267 
268 	/*
269 	 * Ask the cluster controller to go away by setting
270 	 * KILLRX.  Send a PING to get a response to unstick reading
271 	 * from the pipe.
272 	 *
273 	 * After 10 seconds shitcan the pipe and do an unclean shutdown.
274 	 */
275 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
276 
277 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
278 	msg = kdmsg_msg_alloc(&iocom->state0, DMSG_LNK_PING, NULL, NULL);
279 	kdmsg_msg_write_locked(iocom, msg);
280 
281 	retries = 10;
282 	while (iocom->msgrd_td || iocom->msgwr_td) {
283 		wakeup(&iocom->msg_ctl);
284 		lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
285 		if (--retries == 0 && iocom->msg_fp) {
286 			kdio_printf(iocom, 0, "%s\n",
287 				    "iocom_uninit: "
288 				    "shitcanning unresponsive pipe");
289 			fp_shutdown(iocom->msg_fp, SHUT_RDWR);
290 			/* retries allowed to go negative, keep looping */
291 		}
292 	}
293 
294 	/*
295 	 * Cleanup caches
296 	 */
297 	if ((state = iocom->freerd_state) != NULL) {
298 		iocom->freerd_state = NULL;
299 		kdmsg_state_drop(state);
300 	}
301 
302 	if ((state = iocom->freewr_state) != NULL) {
303 		iocom->freewr_state = NULL;
304 		kdmsg_state_drop(state);
305 	}
306 
307 	/*
308 	 * Drop communications descriptor
309 	 */
310 	if (iocom->msg_fp) {
311 		fdrop(iocom->msg_fp);
312 		iocom->msg_fp = NULL;
313 	}
314 	lockmgr(&iocom->msglk, LK_RELEASE);
315 }
316 
317 /*
318  * Cluster controller thread.  Perform messaging functions.  We have one
319  * thread for the reader and one for the writer.  The writer handles
320  * shutdown requests (which should break the reader thread).
321  */
322 static
323 void
324 kdmsg_iocom_thread_rd(void *arg)
325 {
326 	kdmsg_iocom_t *iocom = arg;
327 	dmsg_hdr_t hdr;
328 	kdmsg_msg_t *msg = NULL;
329 	size_t hbytes;
330 	size_t abytes;
331 	int error = 0;
332 
333 	while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLRX) == 0) {
334 		/*
335 		 * Retrieve the message from the pipe or socket.
336 		 */
337 		error = fp_read(iocom->msg_fp, &hdr, sizeof(hdr),
338 				NULL, 1, UIO_SYSSPACE);
339 		if (error)
340 			break;
341 		if (hdr.magic != DMSG_HDR_MAGIC) {
342 			kdio_printf(iocom, 1, "bad magic: %04x\n", hdr.magic);
343 			error = EINVAL;
344 			break;
345 		}
346 		hbytes = (hdr.cmd & DMSGF_SIZE) * DMSG_ALIGN;
347 		if (hbytes < sizeof(hdr) || hbytes > DMSG_HDR_MAX) {
348 			kdio_printf(iocom, 1, "bad header size %zd\n", hbytes);
349 			error = EINVAL;
350 			break;
351 		}
352 
353 		/* XXX messy: mask cmd to avoid allocating state */
354 		msg = kdmsg_msg_alloc(&iocom->state0,
355 				      hdr.cmd & DMSGF_BASECMDMASK,
356 				      NULL, NULL);
357 		msg->any.head = hdr;
358 		msg->hdr_size = hbytes;
359 		if (hbytes > sizeof(hdr)) {
360 			error = fp_read(iocom->msg_fp, &msg->any.head + 1,
361 					hbytes - sizeof(hdr),
362 					NULL, 1, UIO_SYSSPACE);
363 			if (error) {
364 				kdio_printf(iocom, 1, "%s\n",
365 					    "short msg received");
366 				error = EINVAL;
367 				break;
368 			}
369 		}
370 		msg->aux_size = hdr.aux_bytes;
371 		if (msg->aux_size > DMSG_AUX_MAX) {
372 			kdio_printf(iocom, 1,
373 				    "illegal msg payload size %zd\n",
374 				    msg->aux_size);
375 			error = EINVAL;
376 			break;
377 		}
378 		if (msg->aux_size) {
379 			abytes = DMSG_DOALIGN(msg->aux_size);
380 			msg->aux_data = kmalloc(abytes, iocom->mmsg, M_WAITOK);
381 			msg->flags |= KDMSG_FLAG_AUXALLOC;
382 			error = fp_read(iocom->msg_fp, msg->aux_data,
383 					abytes, NULL, 1, UIO_SYSSPACE);
384 			if (error) {
385 				kdio_printf(iocom, 1, "%s\n",
386 					    "short msg payload received");
387 				break;
388 			}
389 		}
390 
391 		error = kdmsg_msg_receive_handling(msg);
392 		msg = NULL;
393 	}
394 
395 	kdio_printf(iocom, 1, "read thread terminating error=%d\n", error);
396 
397 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
398 	if (msg)
399 		kdmsg_msg_free(msg);
400 
401 	/*
402 	 * Shutdown the socket and set KILLRX for consistency in case the
403 	 * shutdown was not commanded.  Signal the transmit side to shutdown
404 	 * by setting KILLTX and waking it up.
405 	 */
406 	fp_shutdown(iocom->msg_fp, SHUT_RDWR);
407 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX |
408 					KDMSG_CLUSTERCTL_KILLTX);
409 	iocom->msgrd_td = NULL;
410 	lockmgr(&iocom->msglk, LK_RELEASE);
411 	wakeup(&iocom->msg_ctl);
412 
413 	/*
414 	 * iocom can be ripped out at any time once the lock is
415 	 * released with msgrd_td set to NULL.  The wakeup()s are safe but
416 	 * that is all.
417 	 */
418 	wakeup(iocom);
419 	lwkt_exit();
420 }
421 
422 static
423 void
424 kdmsg_iocom_thread_wr(void *arg)
425 {
426 	kdmsg_iocom_t *iocom = arg;
427 	kdmsg_msg_t *msg;
428 	ssize_t res;
429 	size_t abytes;
430 	int error = 0;
431 	int save_ticks;
432 	int didwarn;
433 
434 	/*
435 	 * Transmit loop
436 	 */
437 	msg = NULL;
438 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
439 
440 	while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLTX) == 0 && error == 0) {
441 		/*
442 		 * Sleep if no messages pending.  Interlock with flag while
443 		 * holding msglk.
444 		 */
445 		if (TAILQ_EMPTY(&iocom->msgq)) {
446 			atomic_set_int(&iocom->msg_ctl,
447 				       KDMSG_CLUSTERCTL_SLEEPING);
448 			lksleep(&iocom->msg_ctl, &iocom->msglk, 0, "msgwr", hz);
449 			atomic_clear_int(&iocom->msg_ctl,
450 					 KDMSG_CLUSTERCTL_SLEEPING);
451 		}
452 
453 		while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
454 			/*
455 			 * Remove msg from the transmit queue and do
456 			 * persist and half-closed state handling.
457 			 */
458 			TAILQ_REMOVE(&iocom->msgq, msg, qentry);
459 
460 			error = kdmsg_state_msgtx(msg);
461 			if (error == EALREADY) {
462 				error = 0;
463 				kdmsg_msg_free(msg);
464 				continue;
465 			}
466 			if (error) {
467 				kdmsg_msg_free(msg);
468 				break;
469 			}
470 
471 			/*
472 			 * Dump the message to the pipe or socket.
473 			 *
474 			 * We have to clean up the message as if the transmit
475 			 * succeeded even if it failed.
476 			 */
477 			lockmgr(&iocom->msglk, LK_RELEASE);
478 			error = fp_write(iocom->msg_fp, &msg->any,
479 					 msg->hdr_size, &res, UIO_SYSSPACE);
480 			if (error || res != msg->hdr_size) {
481 				if (error == 0)
482 					error = EINVAL;
483 				lockmgr(&iocom->msglk, LK_EXCLUSIVE);
484 				kdmsg_state_cleanuptx(msg);
485 				break;
486 			}
487 			if (msg->aux_size) {
488 				abytes = DMSG_DOALIGN(msg->aux_size);
489 				error = fp_write(iocom->msg_fp,
490 						 msg->aux_data, abytes,
491 						 &res, UIO_SYSSPACE);
492 				if (error || res != abytes) {
493 					if (error == 0)
494 						error = EINVAL;
495 					lockmgr(&iocom->msglk, LK_EXCLUSIVE);
496 					kdmsg_state_cleanuptx(msg);
497 					break;
498 				}
499 			}
500 			lockmgr(&iocom->msglk, LK_EXCLUSIVE);
501 			kdmsg_state_cleanuptx(msg);
502 		}
503 	}
504 
505 	kdio_printf(iocom, 1, "write thread terminating error=%d\n", error);
506 
507 	/*
508 	 * Shutdown the socket and set KILLTX for consistency in case the
509 	 * shutdown was not commanded.  Signal the receive side to shutdown
510 	 * by setting KILLRX and waking it up.
511 	 */
512 	fp_shutdown(iocom->msg_fp, SHUT_RDWR);
513 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX |
514 					KDMSG_CLUSTERCTL_KILLTX);
515 	wakeup(&iocom->msg_ctl);
516 
517 	/*
518 	 * The transmit thread is responsible for final cleanups, wait
519 	 * for the receive side to terminate to prevent new received
520 	 * states from interfering with our cleanup.
521 	 *
522 	 * Do not set msgwr_td to NULL until we actually exit.
523 	 */
524 	while (iocom->msgrd_td) {
525 		wakeup(&iocom->msg_ctl);
526 		lksleep(iocom, &iocom->msglk, 0, "clstrkt", hz);
527 	}
528 
529 	/*
530 	 * We can no longer receive new messages.  We must drain the transmit
531 	 * message queue and simulate received messages to close anay remaining
532 	 * states.
533 	 *
534 	 * Loop until all the states are gone and there are no messages
535 	 * pending transmit.
536 	 */
537 	save_ticks = ticks;
538 	didwarn = 0;
539 
540 	while (TAILQ_FIRST(&iocom->msgq) ||
541 	       RB_ROOT(&iocom->staterd_tree) ||
542 	       RB_ROOT(&iocom->statewr_tree)) {
543 		/*
544 		 * Simulate failure for all sub-states of state0.
545 		 */
546 		kdmsg_drain_msgq(iocom);
547 		kdio_printf(iocom, 2, "%s\n",
548 			    "simulate failure for all substates of state0");
549 		kdmsg_simulate_failure(&iocom->state0, 0, DMSG_ERR_LOSTLINK);
550 
551 		lksleep(iocom, &iocom->msglk, 0, "clstrtk", hz / 2);
552 
553 		if ((int)(ticks - save_ticks) > hz*2 && didwarn == 0) {
554 			didwarn = 1;
555 			kdio_printf(iocom, 0,
556 				    "Warning, write thread on %p "
557 				    "still terminating\n",
558 				    iocom);
559 		}
560 		if ((int)(ticks - save_ticks) > hz*15 && didwarn == 1) {
561 			didwarn = 2;
562 			kdio_printf(iocom, 0,
563 				    "Warning, write thread on %p "
564 				    "still terminating\n",
565 				    iocom);
566 		}
567 		if ((int)(ticks - save_ticks) > hz*60) {
568 			kdio_printf(iocom, 0,
569 				    "Can't terminate: msgq %p "
570 				    "rd_tree %p wr_tree %p\n",
571 				    TAILQ_FIRST(&iocom->msgq),
572 				    RB_ROOT(&iocom->staterd_tree),
573 				    RB_ROOT(&iocom->statewr_tree));
574 			lksleep(iocom, &iocom->msglk, 0, "clstrtk", hz * 10);
575 		}
576 	}
577 
578 	/*
579 	 * Exit handling is done by the write thread.
580 	 */
581 	iocom->flags |= KDMSG_IOCOMF_EXITNOACC;
582 	lockmgr(&iocom->msglk, LK_RELEASE);
583 
584 	/*
585 	 * The state trees had better be empty now
586 	 */
587 	KKASSERT(RB_EMPTY(&iocom->staterd_tree));
588 	KKASSERT(RB_EMPTY(&iocom->statewr_tree));
589 	KKASSERT(iocom->conn_state == NULL);
590 
591 	if (iocom->exit_func) {
592 		/*
593 		 * iocom is invalid after we call the exit function.
594 		 */
595 		iocom->msgwr_td = NULL;
596 		iocom->exit_func(iocom);
597 	} else {
598 		/*
599 		 * iocom can be ripped out from under us once msgwr_td is
600 		 * set to NULL.  The wakeup is safe.
601 		 */
602 		iocom->msgwr_td = NULL;
603 		wakeup(iocom);
604 	}
605 	lwkt_exit();
606 }
607 
608 /*
609  * This cleans out the pending transmit message queue, adjusting any
610  * persistent states properly in the process.
611  *
612  * Called with iocom locked.
613  */
614 void
615 kdmsg_drain_msgq(kdmsg_iocom_t *iocom)
616 {
617 	kdmsg_msg_t *msg;
618 
619 	/*
620 	 * Clean out our pending transmit queue, executing the
621 	 * appropriate state adjustments.  If this tries to open
622 	 * any new outgoing transactions we have to loop up and
623 	 * clean them out.
624 	 */
625 	while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
626 		TAILQ_REMOVE(&iocom->msgq, msg, qentry);
627 		if (kdmsg_state_msgtx(msg))
628 			kdmsg_msg_free(msg);
629 		else
630 			kdmsg_state_cleanuptx(msg);
631 	}
632 }
633 
634 /*
635  * Do all processing required to handle a freshly received message
636  * after its low level header has been validated.
637  *
638  * iocom is not locked.
639  */
640 static
641 int
642 kdmsg_msg_receive_handling(kdmsg_msg_t *msg)
643 {
644 	kdmsg_iocom_t *iocom = msg->state->iocom;
645 	int error;
646 
647 	/*
648 	 * State machine tracking, state assignment for msg,
649 	 * returns error and discard status.  Errors are fatal
650 	 * to the connection except for EALREADY which forces
651 	 * a discard without execution.
652 	 */
653 	error = kdmsg_state_msgrx(msg);
654 	if (msg->state->flags & KDMSG_STATE_ABORTING) {
655 		kdio_printf(iocom, 5,
656 			    "kdmsg_state_abort(b): state %p rxcmd=%08x "
657 			    "txcmd=%08x msgrx error %d\n",
658 			    msg->state, msg->state->rxcmd,
659 			    msg->state->txcmd, error);
660 	}
661 	if (error) {
662 		/*
663 		 * Raw protocol or connection error
664 		 */
665 		if (msg->state->flags & KDMSG_STATE_ABORTING)
666 			kdio_printf(iocom, 5,
667 				    "X1 state %p error %d\n",
668 				    msg->state, error);
669 		kdmsg_msg_free(msg);
670 		if (error == EALREADY)
671 			error = 0;
672 	} else if (msg->state && msg->state->func) {
673 		/*
674 		 * Message related to state which already has a
675 		 * handling function installed for it.
676 		 */
677 		if (msg->state->flags & KDMSG_STATE_ABORTING)
678 			kdio_printf(iocom, 5,
679 				    "X2 state %p func %p\n",
680 				    msg->state, msg->state->func);
681 		error = msg->state->func(msg->state, msg);
682 		kdmsg_state_cleanuprx(msg);
683 	} else if (iocom->flags & KDMSG_IOCOMF_AUTOANY) {
684 		if (msg->state->flags & KDMSG_STATE_ABORTING)
685 			kdio_printf(iocom, 5,
686 				    "X3 state %p\n", msg->state);
687 		error = kdmsg_autorxmsg(msg);
688 		kdmsg_state_cleanuprx(msg);
689 	} else {
690 		if (msg->state->flags & KDMSG_STATE_ABORTING)
691 			kdio_printf(iocom, 5,
692 				    "X4 state %p\n", msg->state);
693 		error = iocom->rcvmsg(msg);
694 		kdmsg_state_cleanuprx(msg);
695 	}
696 	return error;
697 }
698 
699 /*
700  * Process state tracking for a message after reception and dequeueing,
701  * prior to execution of the state callback.  The state is updated and
702  * will be removed from the RBTREE if completely closed, but the state->parent
703  * and subq linkage is not cleaned up until after the callback (see
704  * cleanuprx()).
705  *
706  * msglk is not held.
707  *
708  * NOTE: A message transaction can consist of several messages in either
709  *	 direction.
710  *
711  * NOTE: The msgid is unique to the initiator, not necessarily unique for
712  *	 us or for any relay or for the return direction for that matter.
713  *	 That is, two sides sending a new message can use the same msgid
714  *	 without colliding.
715  *
716  * --
717  *
718  * ABORT sequences work by setting the ABORT flag along with normal message
719  * state.  However, ABORTs can also be sent on half-closed messages, that is
720  * even if the command or reply side has already sent a DELETE, as long as
721  * the message has not been fully closed it can still send an ABORT+DELETE
722  * to terminate the half-closed message state.
723  *
724  * Since ABORT+DELETEs can race we silently discard ABORT's for message
725  * state which has already been fully closed.  REPLY+ABORT+DELETEs can
726  * also race, and in this situation the other side might have already
727  * initiated a new unrelated command with the same message id.  Since
728  * the abort has not set the CREATE flag the situation can be detected
729   * and the message will also be discarded.
730  *
731  * Non-blocking requests can be initiated with ABORT+CREATE[+DELETE].
732  * The ABORT request is essentially integrated into the command instead
733  * of being sent later on.  In this situation the command implementation
734  * detects that CREATE and ABORT are both set (vs ABORT alone) and can
735  * special-case non-blocking operation for the command.
736  *
737  * NOTE!  Messages with ABORT set without CREATE or DELETE are considered
738  *	  to be mid-stream aborts for command/reply sequences.  ABORTs on
739  *	  one-way messages are not supported.
740  *
741  * NOTE!  If a command sequence does not support aborts the ABORT flag is
742  *	  simply ignored.
743  *
744  * --
745  *
746  * One-off messages (no reply expected) are sent with neither CREATE or DELETE
747  * set.  One-off messages cannot be aborted and typically aren't processed
748  * by these routines.  The REPLY bit can be used to distinguish whether a
749  * one-off message is a command or reply.  For example, one-off replies
750  * will typically just contain status updates.
751  */
752 static
753 int
754 kdmsg_state_msgrx(kdmsg_msg_t *msg)
755 {
756 	kdmsg_iocom_t *iocom = msg->state->iocom;
757 	kdmsg_state_t *state;
758 	kdmsg_state_t *pstate;
759 	kdmsg_state_t sdummy;
760 	int error;
761 
762 	bzero(&sdummy, sizeof(sdummy));	/* avoid gcc warnings */
763 
764 	/*
765 	 * Make sure a state structure is ready to go in case we need a new
766 	 * one.  This is the only routine which uses freerd_state so no
767 	 * races are possible.
768 	 */
769 	if ((state = iocom->freerd_state) == NULL) {
770 		state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
771 		state->flags = KDMSG_STATE_DYNAMIC;
772 		state->iocom = iocom;
773 		state->refs = 1;
774 		TAILQ_INIT(&state->subq);
775 		iocom->freerd_state = state;
776 	}
777 	state = NULL;	/* safety */
778 
779 	/*
780 	 * Lock RB tree and locate existing persistent state, if any.
781 	 *
782 	 * If received msg is a command state is on staterd_tree.
783 	 * If received msg is a reply state is on statewr_tree.
784 	 */
785 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
786 
787 again:
788 	if (msg->state == &iocom->state0) {
789 		sdummy.msgid = msg->any.head.msgid;
790 		sdummy.iocom = iocom;
791 		if (msg->any.head.cmd & DMSGF_REVTRANS) {
792 			state = RB_FIND(kdmsg_state_tree, &iocom->statewr_tree,
793 					&sdummy);
794 		} else {
795 			state = RB_FIND(kdmsg_state_tree, &iocom->staterd_tree,
796 					&sdummy);
797 		}
798 
799 		/*
800 		 * Set message state unconditionally.  If this is a CREATE
801 		 * message this state will become the parent state and new
802 		 * state will be allocated for the message state.
803 		 */
804 		if (state == NULL)
805 			state = &iocom->state0;
806 		if (state->flags & KDMSG_STATE_INTERLOCK) {
807 			state->flags |= KDMSG_STATE_SIGNAL;
808 			lksleep(state, &iocom->msglk, 0, "dmrace", hz);
809 			goto again;
810 		}
811 		kdmsg_state_hold(state);
812 		kdmsg_state_drop(msg->state);	/* iocom->state0 */
813 		msg->state = state;
814 	} else {
815 		state = msg->state;
816 	}
817 
818 	/*
819 	 * Short-cut one-off or mid-stream messages.
820 	 */
821 	if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
822 				  DMSGF_ABORT)) == 0) {
823 		error = 0;
824 		goto done;
825 	}
826 
827 	/*
828 	 * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
829 	 * inside the case statements.
830 	 */
831 	switch(msg->any.head.cmd & (DMSGF_CREATE|DMSGF_DELETE|DMSGF_REPLY)) {
832 	case DMSGF_CREATE:
833 	case DMSGF_CREATE | DMSGF_DELETE:
834 		/*
835 		 * New persistant command received.
836 		 */
837 		if (state != &iocom->state0) {
838 			kdio_printf(iocom, 1, "%s\n",
839 				    "duplicate transaction");
840 			error = EINVAL;
841 			break;
842 		}
843 
844 		/*
845 		 * Lookup the circuit.  The circuit is an open transaction.
846 		 * the REVCIRC bit in the message tells us which side
847 		 * initiated the transaction representing the circuit.
848 		 */
849 		if (msg->any.head.circuit) {
850 			sdummy.msgid = msg->any.head.circuit;
851 
852 			if (msg->any.head.cmd & DMSGF_REVCIRC) {
853 				pstate = RB_FIND(kdmsg_state_tree,
854 						 &iocom->statewr_tree,
855 						 &sdummy);
856 			} else {
857 				pstate = RB_FIND(kdmsg_state_tree,
858 						 &iocom->staterd_tree,
859 						 &sdummy);
860 			}
861 			if (pstate == NULL) {
862 				kdio_printf(iocom, 1, "%s\n",
863 					    "missing parent in "
864 					    "stacked trans");
865 				error = EINVAL;
866 				break;
867 			}
868 		} else {
869 			pstate = &iocom->state0;
870 		}
871 
872 		/*
873 		 * Allocate new state.
874 		 *
875 		 * msg->state becomes the owner of the ref we inherit from
876 		 * freerd_stae.
877 		 */
878 		kdmsg_state_drop(state);
879 		state = iocom->freerd_state;
880 		iocom->freerd_state = NULL;
881 
882 		msg->state = state;		/* inherits freerd ref */
883 		state->parent = pstate;
884 		KKASSERT(state->iocom == iocom);
885 		state->flags |= KDMSG_STATE_RBINSERTED |
886 				KDMSG_STATE_SUBINSERTED |
887 			        KDMSG_STATE_OPPOSITE;
888 		if (TAILQ_EMPTY(&pstate->subq))
889 			kdmsg_state_hold(pstate);/* states on pstate->subq */
890 		kdmsg_state_hold(state);	/* state on pstate->subq */
891 		kdmsg_state_hold(state);	/* state on rbtree */
892 		state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
893 		state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
894 		state->txcmd = DMSGF_REPLY;
895 		state->msgid = msg->any.head.msgid;
896 		state->flags &= ~KDMSG_STATE_NEW;
897 		RB_INSERT(kdmsg_state_tree, &iocom->staterd_tree, state);
898 		TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
899 		error = 0;
900 		break;
901 	case DMSGF_DELETE:
902 		/*
903 		 * Persistent state is expected but might not exist if an
904 		 * ABORT+DELETE races the close.
905 		 */
906 		if (state == &iocom->state0) {
907 			if (msg->any.head.cmd & DMSGF_ABORT) {
908 				kdio_printf(iocom, 1, "%s\n",
909 					    "msgrx: "
910 					    "state already A");
911 				error = EALREADY;
912 			} else {
913 				kdio_printf(iocom, 1, "%s\n",
914 					    "msgrx: no state for DELETE");
915 				error = EINVAL;
916 			}
917 			break;
918 		}
919 
920 		/*
921 		 * Handle another ABORT+DELETE case if the msgid has already
922 		 * been reused.
923 		 */
924 		if ((state->rxcmd & DMSGF_CREATE) == 0) {
925 			if (msg->any.head.cmd & DMSGF_ABORT) {
926 				kdio_printf(iocom, 1, "%s\n",
927 					    "msgrx: state already B");
928 				error = EALREADY;
929 			} else {
930 				kdio_printf(iocom, 1, "%s\n",
931 					    "msgrx: state reused for DELETE");
932 				error = EINVAL;
933 			}
934 			break;
935 		}
936 		error = 0;
937 		break;
938 	default:
939 		/*
940 		 * Check for mid-stream ABORT command received, otherwise
941 		 * allow.
942 		 */
943 		if (msg->any.head.cmd & DMSGF_ABORT) {
944 			if (state == &iocom->state0 ||
945 			    (state->rxcmd & DMSGF_CREATE) == 0) {
946 				error = EALREADY;
947 				break;
948 			}
949 		}
950 		error = 0;
951 		break;
952 	case DMSGF_REPLY | DMSGF_CREATE:
953 	case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
954 		/*
955 		 * When receiving a reply with CREATE set the original
956 		 * persistent state message should already exist.
957 		 */
958 		if (state == &iocom->state0) {
959 			kdio_printf(iocom, 1,
960 				    "msgrx: no state match for "
961 				    "REPLY cmd=%08x msgid=%016jx\n",
962 				    msg->any.head.cmd,
963 				    (intmax_t)msg->any.head.msgid);
964 			error = EINVAL;
965 			break;
966 		}
967 		state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
968 		error = 0;
969 		break;
970 	case DMSGF_REPLY | DMSGF_DELETE:
971 		/*
972 		 * Received REPLY+ABORT+DELETE in case where msgid has
973 		 * already been fully closed, ignore the message.
974 		 */
975 		if (state == &iocom->state0) {
976 			if (msg->any.head.cmd & DMSGF_ABORT) {
977 				error = EALREADY;
978 			} else {
979 				kdio_printf(iocom, 1, "%s\n",
980 					    "msgrx: no state match "
981 					    "for REPLY|DELETE");
982 				error = EINVAL;
983 			}
984 			break;
985 		}
986 
987 		/*
988 		 * Received REPLY+ABORT+DELETE in case where msgid has
989 		 * already been reused for an unrelated message,
990 		 * ignore the message.
991 		 */
992 		if ((state->rxcmd & DMSGF_CREATE) == 0) {
993 			if (msg->any.head.cmd & DMSGF_ABORT) {
994 				error = EALREADY;
995 			} else {
996 				kdio_printf(iocom, 1, "%s\n",
997 					    "msgrx: state reused "
998 					    "for REPLY|DELETE");
999 				error = EINVAL;
1000 			}
1001 			break;
1002 		}
1003 		error = 0;
1004 		break;
1005 	case DMSGF_REPLY:
1006 		/*
1007 		 * Check for mid-stream ABORT reply received to sent command.
1008 		 */
1009 		if (msg->any.head.cmd & DMSGF_ABORT) {
1010 			if (state == &iocom->state0 ||
1011 			    (state->rxcmd & DMSGF_CREATE) == 0) {
1012 				error = EALREADY;
1013 				break;
1014 			}
1015 		}
1016 		error = 0;
1017 		break;
1018 	}
1019 
1020 	/*
1021 	 * Calculate the easy-switch() transactional command.  Represents
1022 	 * the outer-transaction command for any transaction-create or
1023 	 * transaction-delete, and the inner message command for any
1024 	 * non-transaction or inside-transaction command.  tcmd will be
1025 	 * set to 0 if the message state is illegal.
1026 	 *
1027 	 * The two can be told apart because outer-transaction commands
1028 	 * always have a DMSGF_CREATE and/or DMSGF_DELETE flag.
1029 	 */
1030 done:
1031 	if (msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE)) {
1032 		if (state != &iocom->state0) {
1033 			msg->tcmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
1034 				    (msg->any.head.cmd & (DMSGF_CREATE |
1035 							  DMSGF_DELETE |
1036 							  DMSGF_REPLY));
1037 		} else {
1038 			msg->tcmd = 0;
1039 		}
1040 	} else {
1041 		msg->tcmd = msg->any.head.cmd & DMSGF_CMDSWMASK;
1042 	}
1043 
1044 	/*
1045 	 * Adjust the state for DELETE handling now, before making the
1046 	 * callback so we are atomic with other state updates.
1047 	 *
1048 	 * Subq/parent linkages are cleaned up after the callback.
1049 	 * If an error occurred the message is ignored and state is not
1050 	 * updated.
1051 	 */
1052 	if ((state = msg->state) == NULL || error != 0) {
1053 		kdio_printf(iocom, 1,
1054 			    "msgrx: state=%p error %d\n",
1055 			    state, error);
1056 	} else if (msg->any.head.cmd & DMSGF_DELETE) {
1057 		KKASSERT((state->rxcmd & DMSGF_DELETE) == 0);
1058 		state->rxcmd |= DMSGF_DELETE;
1059 		if (state->txcmd & DMSGF_DELETE) {
1060 			KKASSERT(state->flags & KDMSG_STATE_RBINSERTED);
1061 			if (state->rxcmd & DMSGF_REPLY) {
1062 				KKASSERT(msg->any.head.cmd &
1063 					 DMSGF_REPLY);
1064 				RB_REMOVE(kdmsg_state_tree,
1065 					  &iocom->statewr_tree, state);
1066 			} else {
1067 				KKASSERT((msg->any.head.cmd &
1068 					  DMSGF_REPLY) == 0);
1069 				RB_REMOVE(kdmsg_state_tree,
1070 					  &iocom->staterd_tree, state);
1071 			}
1072 			state->flags &= ~KDMSG_STATE_RBINSERTED;
1073 			kdmsg_state_drop(state);	/* state on rbtree */
1074 		}
1075 	}
1076 	lockmgr(&iocom->msglk, LK_RELEASE);
1077 
1078 	return (error);
1079 }
1080 
1081 /*
1082  * Called instead of iocom->rcvmsg() if any of the AUTO flags are set.
1083  * This routine must call iocom->rcvmsg() for anything not automatically
1084  * handled.
1085  */
1086 static int
1087 kdmsg_autorxmsg(kdmsg_msg_t *msg)
1088 {
1089 	kdmsg_iocom_t *iocom = msg->state->iocom;
1090 	kdmsg_msg_t *rep;
1091 	int error = 0;
1092 	uint32_t cmd;
1093 
1094 	/*
1095 	 * Main switch processes transaction create/delete sequences only.
1096 	 * Use icmd (DELETEs use DMSG_LNK_ERROR
1097 	 *
1098 	 * NOTE: If processing in-transaction messages you generally want
1099 	 *	 an inner switch on msg->any.head.cmd.
1100 	 */
1101 	if (msg->state) {
1102 		cmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
1103 		      (msg->any.head.cmd & (DMSGF_CREATE |
1104 					    DMSGF_DELETE |
1105 					    DMSGF_REPLY));
1106 	} else {
1107 		cmd = 0;
1108 	}
1109 
1110 	switch(cmd) {
1111 	case DMSG_LNK_PING:
1112 		/*
1113 		 * Received ping, send reply
1114 		 */
1115 		rep = kdmsg_msg_alloc(msg->state, DMSG_LNK_PING | DMSGF_REPLY,
1116 				      NULL, NULL);
1117 		kdmsg_msg_write(rep);
1118 		break;
1119 	case DMSG_LNK_PING | DMSGF_REPLY:
1120 		/* ignore replies */
1121 		break;
1122 	case DMSG_LNK_CONN | DMSGF_CREATE:
1123 	case DMSG_LNK_CONN | DMSGF_CREATE | DMSGF_DELETE:
1124 		/*
1125 		 * Received LNK_CONN transaction.  Transmit response and
1126 		 * leave transaction open, which allows the other end to
1127 		 * start to the SPAN protocol.
1128 		 *
1129 		 * Handle shim after acknowledging the CONN.
1130 		 */
1131 		if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1132 			if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1133 				kdmsg_msg_result(msg, 0);
1134 				if (iocom->auto_callback)
1135 					iocom->auto_callback(msg);
1136 			} else {
1137 				error = iocom->rcvmsg(msg);
1138 			}
1139 			break;
1140 		}
1141 		/* fall through */
1142 	case DMSG_LNK_CONN | DMSGF_DELETE:
1143 		/*
1144 		 * This message is usually simulated after a link is lost
1145 		 * to clean up the transaction.
1146 		 */
1147 		if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1148 			if (iocom->auto_callback)
1149 				iocom->auto_callback(msg);
1150 			kdmsg_msg_reply(msg, 0);
1151 		} else {
1152 			error = iocom->rcvmsg(msg);
1153 		}
1154 		break;
1155 	case DMSG_LNK_SPAN | DMSGF_CREATE:
1156 	case DMSG_LNK_SPAN | DMSGF_CREATE | DMSGF_DELETE:
1157 		/*
1158 		 * Received LNK_SPAN transaction.  We do not have to respond
1159 		 * (except on termination), but we must leave the transaction
1160 		 * open.
1161 		 *
1162 		 * Handle shim after acknowledging the SPAN.
1163 		 */
1164 		if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1165 			if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1166 				if (iocom->auto_callback)
1167 					iocom->auto_callback(msg);
1168 				break;
1169 			}
1170 			/* fall through */
1171 		} else {
1172 			error = iocom->rcvmsg(msg);
1173 			break;
1174 		}
1175 		/* fall through */
1176 	case DMSG_LNK_SPAN | DMSGF_DELETE:
1177 		/*
1178 		 * Process shims (auto_callback) before cleaning up the
1179 		 * circuit structure and closing the transactions.  Device
1180 		 * driver should ensure that the circuit is not used after
1181 		 * the auto_callback() returns.
1182 		 *
1183 		 * Handle shim before closing the SPAN transaction.
1184 		 */
1185 		if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1186 			if (iocom->auto_callback)
1187 				iocom->auto_callback(msg);
1188 			kdmsg_msg_reply(msg, 0);
1189 		} else {
1190 			error = iocom->rcvmsg(msg);
1191 		}
1192 		break;
1193 	default:
1194 		/*
1195 		 * Anything unhandled goes into rcvmsg.
1196 		 *
1197 		 * NOTE: Replies to link-level messages initiated by our side
1198 		 *	 are handled by the state callback, they are NOT
1199 		 *	 handled here.
1200 		 */
1201 		error = iocom->rcvmsg(msg);
1202 		break;
1203 	}
1204 	return (error);
1205 }
1206 
1207 /*
1208  * Post-receive-handling message and state cleanup.  This routine is called
1209  * after the state function handling/callback to properly dispose of the
1210  * message and unlink the state's parent/subq linkage if the state is
1211  * completely closed.
1212  *
1213  * msglk is not held.
1214  */
1215 static
1216 void
1217 kdmsg_state_cleanuprx(kdmsg_msg_t *msg)
1218 {
1219 	kdmsg_state_t *state = msg->state;
1220 	kdmsg_iocom_t *iocom = state->iocom;
1221 
1222 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1223 	if (state != &iocom->state0) {
1224 		/*
1225 		 * When terminating a transaction (in either direction), all
1226 		 * sub-states are aborted.
1227 		 */
1228 		if ((msg->any.head.cmd & DMSGF_DELETE) &&
1229 		    TAILQ_FIRST(&msg->state->subq)) {
1230 			kdio_printf(iocom, 2,
1231 				    "simulate failure for substates of "
1232 				    "state %p cmd %08x/%08x\n",
1233 				    msg->state,
1234 				    msg->state->rxcmd,
1235 				    msg->state->txcmd);
1236 			kdmsg_simulate_failure(msg->state,
1237 					       0, DMSG_ERR_LOSTLINK);
1238 		}
1239 
1240 		/*
1241 		 * Once the state is fully closed we can (try to) remove it
1242 		 * from the subq topology.
1243 		 */
1244 		if ((state->flags & KDMSG_STATE_SUBINSERTED) &&
1245 		    (state->rxcmd & DMSGF_DELETE) &&
1246 		    (state->txcmd & DMSGF_DELETE)) {
1247 			/*
1248 			 * Remove parent linkage if state is completely closed.
1249 			 */
1250 			kdmsg_subq_delete(state);
1251 		}
1252 	}
1253 	kdmsg_msg_free(msg);
1254 
1255 	lockmgr(&iocom->msglk, LK_RELEASE);
1256 }
1257 
1258 /*
1259  * Remove state from its parent's subq.  This can wind up recursively
1260  * dropping the parent upward.
1261  *
1262  * NOTE: Once we drop the parent, our pstate pointer may become invalid.
1263  */
1264 static
1265 void
1266 kdmsg_subq_delete(kdmsg_state_t *state)
1267 {
1268 	kdmsg_state_t *pstate;
1269 
1270 	if (state->flags & KDMSG_STATE_SUBINSERTED) {
1271 		pstate = state->parent;
1272 		KKASSERT(pstate);
1273 		if (pstate->scan == state)
1274 			pstate->scan = NULL;
1275 		TAILQ_REMOVE(&pstate->subq, state, entry);
1276 		state->flags &= ~KDMSG_STATE_SUBINSERTED;
1277 		state->parent = NULL;
1278 		if (TAILQ_EMPTY(&pstate->subq)) {
1279 			kdmsg_state_drop(pstate);/* pstate->subq */
1280 		}
1281 		pstate = NULL;			 /* safety */
1282 		kdmsg_state_drop(state);  	 /* pstate->subq */
1283 	} else {
1284 		KKASSERT(state->parent == NULL);
1285 	}
1286 }
1287 
1288 /*
1289  * Simulate receiving a message which terminates an active transaction
1290  * state.  Our simulated received message must set DELETE and may also
1291  * have to set CREATE.  It must also ensure that all fields are set such
1292  * that the receive handling code can find the state (kdmsg_state_msgrx())
1293  * or an endless loop will ensue.
1294  *
1295  * This is used when the other end of the link is dead so the device driver
1296  * gets a completed transaction for all pending states.
1297  *
1298  * Called with iocom locked.
1299  */
1300 static
1301 void
1302 kdmsg_simulate_failure(kdmsg_state_t *state, int meto, int error)
1303 {
1304 	kdmsg_state_t *substate;
1305 
1306 	kdmsg_state_hold(state);		/* aborting */
1307 
1308 	/*
1309 	 * Abort parent state first. Parent will not actually disappear
1310 	 * until children are gone.  Device drivers must handle the situation.
1311 	 * The advantage of this is that device drivers can flag the situation
1312 	 * as an interlock against new operations on dying states.  And since
1313 	 * device operations are often asynchronous anyway, this sequence of
1314 	 * events works out better.
1315 	 */
1316 	if (meto)
1317 		kdmsg_state_abort(state);
1318 
1319 	/*
1320 	 * Recurse through any children.
1321 	 */
1322 again:
1323 	TAILQ_FOREACH(substate, &state->subq, entry) {
1324 		if (substate->flags & KDMSG_STATE_ABORTING)
1325 			continue;
1326 		state->scan = substate;
1327 		kdmsg_simulate_failure(substate, 1, error);
1328 		if (state->scan != substate)
1329 			goto again;
1330 	}
1331 	kdmsg_state_drop(state);		/* aborting */
1332 }
1333 
1334 static
1335 void
1336 kdmsg_state_abort(kdmsg_state_t *state)
1337 {
1338 	kdmsg_msg_t *msg;
1339 
1340 	/*
1341 	 * Set ABORTING and DYING, return if already set.  If the state was
1342 	 * just allocated we defer the abort operation until the related
1343 	 * message is processed.
1344 	 */
1345 	KKASSERT((state->flags & KDMSG_STATE_ABORTING) == 0);
1346 	if (state->flags & KDMSG_STATE_ABORTING)
1347 		return;
1348 	state->flags |= KDMSG_STATE_ABORTING;
1349 	kdmsg_state_dying(state);
1350 	if (state->flags & KDMSG_STATE_NEW) {
1351 		kdio_printf(iocom, 5,
1352 			    "kdmsg_state_abort(0): state %p rxcmd %08x "
1353 			    "txcmd %08x flags %08x - in NEW state\n",
1354 			    state, state->rxcmd,
1355 			    state->txcmd, state->flags);
1356 		return;
1357 	}
1358 
1359 	/*
1360 	 * NOTE: The DELETE flag might already be set due to an early
1361 	 *	 termination.
1362 	 *
1363 	 * NOTE: Args to kdmsg_msg_alloc() to avoid dynamic state allocation.
1364 	 *
1365 	 * NOTE: We are simulating a received message using our state
1366 	 *	 (vs a message generated by the other side using its state),
1367 	 *	 so we must invert DMSGF_REVTRANS and DMSGF_REVCIRC.
1368 	 */
1369 	kdio_printf(iocom, 5,
1370 		    "kdmsg_state_abort(1): state %p rxcmd %08x txcmd %08x\n",
1371 		    state, state->rxcmd, state->txcmd);
1372 	if ((state->rxcmd & DMSGF_DELETE) == 0) {
1373 		msg = kdmsg_msg_alloc(state, DMSG_LNK_ERROR, NULL, NULL);
1374 		if ((state->rxcmd & DMSGF_CREATE) == 0)
1375 			msg->any.head.cmd |= DMSGF_CREATE;
1376 		msg->any.head.cmd |= DMSGF_DELETE |
1377 				     (state->rxcmd & DMSGF_REPLY);
1378 		msg->any.head.cmd ^= (DMSGF_REVTRANS | DMSGF_REVCIRC);
1379 		msg->any.head.error = DMSG_ERR_LOSTLINK;
1380 		kdio_printf(iocom, 5,
1381 			    "kdmsg_state_abort(a): state %p msgcmd %08x\n",
1382 			    state, msg->any.head.cmd);
1383 		/* circuit not initialized */
1384 		lockmgr(&state->iocom->msglk, LK_RELEASE);
1385 		kdmsg_msg_receive_handling(msg);
1386 		lockmgr(&state->iocom->msglk, LK_EXCLUSIVE);
1387 		msg = NULL;
1388 	}
1389 	kdio_printf(iocom, 5,
1390 		    "kdmsg_state_abort(2): state %p rxcmd %08x txcmd %08x\n",
1391 		    state, state->rxcmd, state->txcmd);
1392 }
1393 
1394 /*
1395  * Recursively sets KDMSG_STATE_DYING on state and all sub-states, preventing
1396  * the transmission of any new messages on these states.  This is done
1397  * atomically when parent state is terminating, whereas setting ABORTING is
1398  * not atomic and can leak races.
1399  */
1400 static
1401 void
1402 kdmsg_state_dying(kdmsg_state_t *state)
1403 {
1404 	kdmsg_state_t *scan;
1405 
1406 	if ((state->flags & KDMSG_STATE_DYING) == 0) {
1407 		state->flags |= KDMSG_STATE_DYING;
1408 		TAILQ_FOREACH(scan, &state->subq, entry)
1409 			kdmsg_state_dying(scan);
1410 	}
1411 }
1412 
1413 /*
1414  * Process state tracking for a message prior to transmission.
1415  *
1416  * Called with msglk held and the msg dequeued.  Returns non-zero if
1417  * the message is bad and should be deleted by the caller.
1418  *
1419  * One-off messages are usually with dummy state and msg->state may be NULL
1420  * in this situation.
1421  *
1422  * New transactions (when CREATE is set) will insert the state.
1423  *
1424  * May request that caller discard the message by setting *discardp to 1.
1425  * A NULL state may be returned in this case.
1426  */
1427 static
1428 int
1429 kdmsg_state_msgtx(kdmsg_msg_t *msg)
1430 {
1431 	kdmsg_iocom_t *iocom = msg->state->iocom;
1432 	kdmsg_state_t *state;
1433 	int error;
1434 
1435 	/*
1436 	 * Make sure a state structure is ready to go in case we need a new
1437 	 * one.  This is the only routine which uses freewr_state so no
1438 	 * races are possible.
1439 	 */
1440 	if ((state = iocom->freewr_state) == NULL) {
1441 		state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1442 		state->flags = KDMSG_STATE_DYNAMIC;
1443 		state->iocom = iocom;
1444 		state->refs = 1;
1445 		TAILQ_INIT(&state->subq);
1446 		iocom->freewr_state = state;
1447 	}
1448 
1449 	/*
1450 	 * Lock RB tree.  If persistent state is present it will have already
1451 	 * been assigned to msg.
1452 	 */
1453 	state = msg->state;
1454 
1455 	/*
1456 	 * Short-cut one-off or mid-stream messages (state may be NULL).
1457 	 */
1458 	if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1459 				  DMSGF_ABORT)) == 0) {
1460 		return(0);
1461 	}
1462 
1463 
1464 	/*
1465 	 * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
1466 	 * inside the case statements.
1467 	 */
1468 	switch(msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1469 				    DMSGF_REPLY)) {
1470 	case DMSGF_CREATE:
1471 	case DMSGF_CREATE | DMSGF_DELETE:
1472 		/*
1473 		 * Insert the new persistent message state and mark
1474 		 * half-closed if DELETE is set.  Since this is a new
1475 		 * message it isn't possible to transition into the fully
1476 		 * closed state here.
1477 		 *
1478 		 * XXX state must be assigned and inserted by
1479 		 *     kdmsg_msg_write().  txcmd is assigned by us
1480 		 *     on-transmit.
1481 		 */
1482 		KKASSERT(state != NULL);
1483 		state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
1484 		state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1485 		state->rxcmd = DMSGF_REPLY;
1486 		state->flags &= ~KDMSG_STATE_NEW;
1487 		error = 0;
1488 		break;
1489 	case DMSGF_DELETE:
1490 		/*
1491 		 * Sent ABORT+DELETE in case where msgid has already
1492 		 * been fully closed, ignore the message.
1493 		 */
1494 		if (state == &iocom->state0) {
1495 			if (msg->any.head.cmd & DMSGF_ABORT) {
1496 				error = EALREADY;
1497 			} else {
1498 				kdio_printf(iocom, 1,
1499 					"msgtx: no state match "
1500 					"for DELETE cmd=%08x msgid=%016jx\n",
1501 					msg->any.head.cmd,
1502 					(intmax_t)msg->any.head.msgid);
1503 				error = EINVAL;
1504 			}
1505 			break;
1506 		}
1507 
1508 		/*
1509 		 * Sent ABORT+DELETE in case where msgid has
1510 		 * already been reused for an unrelated message,
1511 		 * ignore the message.
1512 		 */
1513 		if ((state->txcmd & DMSGF_CREATE) == 0) {
1514 			if (msg->any.head.cmd & DMSGF_ABORT) {
1515 				error = EALREADY;
1516 			} else {
1517 				kdio_printf(iocom, 1, "%s\n",
1518 					    "msgtx: state reused "
1519 					    "for DELETE");
1520 				error = EINVAL;
1521 			}
1522 			break;
1523 		}
1524 		error = 0;
1525 		break;
1526 	default:
1527 		/*
1528 		 * Check for mid-stream ABORT command sent
1529 		 */
1530 		if (msg->any.head.cmd & DMSGF_ABORT) {
1531 			if (state == &state->iocom->state0 ||
1532 			    (state->txcmd & DMSGF_CREATE) == 0) {
1533 				error = EALREADY;
1534 				break;
1535 			}
1536 		}
1537 		error = 0;
1538 		break;
1539 	case DMSGF_REPLY | DMSGF_CREATE:
1540 	case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
1541 		/*
1542 		 * When transmitting a reply with CREATE set the original
1543 		 * persistent state message should already exist.
1544 		 */
1545 		if (state == &state->iocom->state0) {
1546 			kdio_printf(iocom, 1, "%s\n",
1547 				    "msgtx: no state match "
1548 				    "for REPLY | CREATE");
1549 			error = EINVAL;
1550 			break;
1551 		}
1552 		state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1553 		error = 0;
1554 		break;
1555 	case DMSGF_REPLY | DMSGF_DELETE:
1556 		/*
1557 		 * When transmitting a reply with DELETE set the original
1558 		 * persistent state message should already exist.
1559 		 *
1560 		 * This is very similar to the REPLY|CREATE|* case except
1561 		 * txcmd is already stored, so we just add the DELETE flag.
1562 		 *
1563 		 * Sent REPLY+ABORT+DELETE in case where msgid has
1564 		 * already been fully closed, ignore the message.
1565 		 */
1566 		if (state == &state->iocom->state0) {
1567 			if (msg->any.head.cmd & DMSGF_ABORT) {
1568 				error = EALREADY;
1569 			} else {
1570 				kdio_printf(iocom, 1, "%s\n",
1571 					    "msgtx: no state match "
1572 					    "for REPLY | DELETE");
1573 				error = EINVAL;
1574 			}
1575 			break;
1576 		}
1577 
1578 		/*
1579 		 * Sent REPLY+ABORT+DELETE in case where msgid has already
1580 		 * been reused for an unrelated message, ignore the message.
1581 		 */
1582 		if ((state->txcmd & DMSGF_CREATE) == 0) {
1583 			if (msg->any.head.cmd & DMSGF_ABORT) {
1584 				error = EALREADY;
1585 			} else {
1586 				kdio_printf(iocom, 1, "%s\n",
1587 					    "msgtx: state reused "
1588 					    "for REPLY | DELETE");
1589 				error = EINVAL;
1590 			}
1591 			break;
1592 		}
1593 		error = 0;
1594 		break;
1595 	case DMSGF_REPLY:
1596 		/*
1597 		 * Check for mid-stream ABORT reply sent.
1598 		 *
1599 		 * One-off REPLY messages are allowed for e.g. status updates.
1600 		 */
1601 		if (msg->any.head.cmd & DMSGF_ABORT) {
1602 			if (state == &state->iocom->state0 ||
1603 			    (state->txcmd & DMSGF_CREATE) == 0) {
1604 				error = EALREADY;
1605 				break;
1606 			}
1607 		}
1608 		error = 0;
1609 		break;
1610 	}
1611 
1612 	/*
1613 	 * Set interlock (XXX hack) in case the send side blocks and a
1614 	 * response is returned before kdmsg_state_cleanuptx() can be
1615 	 * run.
1616 	 */
1617 	if (state && error == 0)
1618 		state->flags |= KDMSG_STATE_INTERLOCK;
1619 
1620 	return (error);
1621 }
1622 
1623 /*
1624  * Called with iocom locked.
1625  */
1626 static
1627 void
1628 kdmsg_state_cleanuptx(kdmsg_msg_t *msg)
1629 {
1630 	kdmsg_iocom_t *iocom = msg->state->iocom;
1631 	kdmsg_state_t *state;
1632 
1633 	if ((state = msg->state) == NULL) {
1634 		kdmsg_msg_free(msg);
1635 		return;
1636 	}
1637 
1638 	/*
1639 	 * Clear interlock (XXX hack) in case the send side blocks and a
1640 	 * response is returned in the other thread before
1641 	 * kdmsg_state_cleanuptx() can be run.  We maintain our hold on
1642 	 * iocom->msglk so we can do this before completing our task.
1643 	 */
1644 	if (state->flags & KDMSG_STATE_SIGNAL) {
1645 		kdio_printf(iocom, 1, "state %p interlock!\n", state);
1646 		wakeup(state);
1647 	}
1648 	state->flags &= ~(KDMSG_STATE_INTERLOCK | KDMSG_STATE_SIGNAL);
1649 	kdmsg_state_hold(state);
1650 
1651 	if (msg->any.head.cmd & DMSGF_DELETE) {
1652 		KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1653 		state->txcmd |= DMSGF_DELETE;
1654 		if (state->rxcmd & DMSGF_DELETE) {
1655 			KKASSERT(state->flags & KDMSG_STATE_RBINSERTED);
1656 			if (state->txcmd & DMSGF_REPLY) {
1657 				KKASSERT(msg->any.head.cmd &
1658 					 DMSGF_REPLY);
1659 				RB_REMOVE(kdmsg_state_tree,
1660 					  &iocom->staterd_tree, state);
1661 			} else {
1662 				KKASSERT((msg->any.head.cmd &
1663 					  DMSGF_REPLY) == 0);
1664 				RB_REMOVE(kdmsg_state_tree,
1665 					  &iocom->statewr_tree, state);
1666 			}
1667 			state->flags &= ~KDMSG_STATE_RBINSERTED;
1668 
1669 			/*
1670 			 * The subq recursion is used for parent linking and
1671 			 * scanning the topology for aborts, we can only
1672 			 * remove leafs.  The circuit is effectively dead now,
1673 			 * but topology won't be torn down until all of its
1674 			 * children have finished/aborted.
1675 			 *
1676 			 * This is particularly important for end-point
1677 			 * devices which might need to access private data
1678 			 * in parent states.  Out of order disconnects can
1679 			 * occur if an end-point device is processing a
1680 			 * message transaction asynchronously because abort
1681 			 * requests are basically synchronous and it probably
1682 			 * isn't convenient (or possible) for the end-point
1683 			 * to abort an asynchronous operation.
1684 			 */
1685 			if (TAILQ_EMPTY(&state->subq))
1686 				kdmsg_subq_delete(state);
1687 			kdmsg_msg_free(msg);
1688 			kdmsg_state_drop(state);   /* state on rbtree */
1689 		} else {
1690 			kdmsg_msg_free(msg);
1691 		}
1692 	} else {
1693 		kdmsg_msg_free(msg);
1694 	}
1695 
1696 	/*
1697 	 * Deferred abort after transmission.
1698 	 */
1699 	if ((state->flags & (KDMSG_STATE_ABORTING | KDMSG_STATE_DYING)) &&
1700 	    (state->rxcmd & DMSGF_DELETE) == 0) {
1701 		kdio_printf(iocom, 5,
1702 			    "kdmsg_state_cleanuptx: state=%p "
1703 			    "executing deferred abort\n",
1704 			    state);
1705 		state->flags &= ~KDMSG_STATE_ABORTING;
1706 		kdmsg_state_abort(state);
1707 	}
1708 	kdmsg_state_drop(state);
1709 }
1710 
/*
 * Add one reference to state.  The increment is done with
 * atomic_add_int().  When KDMSG_DEBUG is enabled the new count is logged
 * together with the file/line supplied via KDMSG_DEBUG_ARGS.
 */
static
void
_kdmsg_state_hold(kdmsg_state_t *state KDMSG_DEBUG_ARGS)
{
	atomic_add_int(&state->refs, 1);
#if KDMSG_DEBUG
	kd_printf(4, "state %p +%d\t%s:%d\n", state, state->refs, file, line);
#endif
}
1720 
1721 static
1722 void
1723 _kdmsg_state_drop(kdmsg_state_t *state KDMSG_DEBUG_ARGS)
1724 {
1725 	KKASSERT(state->refs > 0);
1726 #if KDMSG_DEBUG
1727 	kd_printf(4, "state %p -%d\t%s:%d\n", state, state->refs, file, line);
1728 #endif
1729 	if (atomic_fetchadd_int(&state->refs, -1) == 1)
1730 		kdmsg_state_free(state);
1731 }
1732 
1733 static
1734 void
1735 kdmsg_state_free(kdmsg_state_t *state)
1736 {
1737 	kdmsg_iocom_t *iocom = state->iocom;
1738 
1739 	KKASSERT((state->flags & KDMSG_STATE_RBINSERTED) == 0);
1740 	KKASSERT((state->flags & KDMSG_STATE_SUBINSERTED) == 0);
1741 	KKASSERT(TAILQ_EMPTY(&state->subq));
1742 
1743 	if (state != &state->iocom->state0)
1744 		kfree(state, iocom->mmsg);
1745 }
1746 
/*
 * Allocate and initialize a message for transmission.
 *
 * state - for a new transaction (CREATE set, REPLY clear) this is the
 *	   parent state under which a new tracking state is created;
 *	   otherwise it is the existing state the message belongs to and
 *	   it must have a parent.
 * cmd   - command and flags.  The header size is derived from the
 *	   DMSGF_SIZE field of cmd.
 * func  - callback stored in the new state (new transactions only)
 * data  - opaque pointer stored in the new state (new transactions only)
 *
 * Returns the zeroed message with magic, cmd, msgid, circuit and state
 * filled in.  A reference on the state is held for msg->state.
 */
kdmsg_msg_t *
kdmsg_msg_alloc(kdmsg_state_t *state, uint32_t cmd,
		int (*func)(kdmsg_state_t *, kdmsg_msg_t *), void *data)
{
	kdmsg_iocom_t *iocom = state->iocom;
	kdmsg_state_t *pstate;
	kdmsg_msg_t *msg;
	size_t hbytes;

	KKASSERT(iocom != NULL);
	/* header bytes: size field of cmd scaled by the alignment unit */
	hbytes = (cmd & DMSGF_SIZE) * DMSG_ALIGN;
	msg = kmalloc(offsetof(struct kdmsg_msg, any) + hbytes,
		      iocom->mmsg, M_WAITOK | M_ZERO);
	msg->hdr_size = hbytes;

	if ((cmd & (DMSGF_CREATE | DMSGF_REPLY)) == DMSGF_CREATE) {
		/*
		 * New transaction, requires tracking state and a unique
		 * msgid to be allocated.
		 *
		 * It is possible to race a circuit failure, inherit the
		 * parent's STATE_DYING flag to trigger an abort sequence
		 * in the transmit path.  By not inheriting ABORTING the
		 * abort sequence can recurse.
		 *
		 * NOTE: The transaction has not yet been initiated so we
		 *	 cannot set DMSGF_CREATE/DELETE bits in txcmd or rxcmd.
		 *	 We have to properly setup DMSGF_REPLY, however.
		 */
		pstate = state;
		state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
		TAILQ_INIT(&state->subq);
		state->iocom = iocom;
		state->parent = pstate;
		state->flags = KDMSG_STATE_DYNAMIC |
			       KDMSG_STATE_NEW;
		state->func = func;
		state->any.any = data;
		/* the state's own address doubles as its msgid */
		state->msgid = (uint64_t)(uintptr_t)state;
		/*msg->any.head.msgid = state->msgid;XXX*/

		/*
		 * Index the new state and link it under its parent while
		 * holding msglk.
		 */
		lockmgr(&iocom->msglk, LK_EXCLUSIVE);
		if (RB_INSERT(kdmsg_state_tree, &iocom->statewr_tree, state))
			panic("duplicate msgid allocated");
		/* the parent gains a ref when its subq becomes non-empty */
		if (TAILQ_EMPTY(&pstate->subq))
			kdmsg_state_hold(pstate);/* pstate->subq */
		TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
		state->flags |= KDMSG_STATE_RBINSERTED |
				KDMSG_STATE_SUBINSERTED;
		state->flags |= pstate->flags & KDMSG_STATE_DYING;
		kdmsg_state_hold(state);	/* pstate->subq */
		kdmsg_state_hold(state);	/* state on rbtree */
		kdmsg_state_hold(state);	/* msg->state */
		lockmgr(&iocom->msglk, LK_RELEASE);
	} else {
		/*
		 * Message on an existing state.  The circuit is taken from
		 * the state's parent, which must exist.
		 */
		pstate = state->parent;
		KKASSERT(pstate != NULL);
		kdmsg_state_hold(state);	/* msg->state */
	}

	/*
	 * Flag the transaction and/or circuit direction as reversed when
	 * the respective state is marked OPPOSITE.
	 */
	if (state->flags & KDMSG_STATE_OPPOSITE)
		cmd |= DMSGF_REVTRANS;
	if (pstate->flags & KDMSG_STATE_OPPOSITE)
		cmd |= DMSGF_REVCIRC;

	msg->any.head.magic = DMSG_HDR_MAGIC;
	msg->any.head.cmd = cmd;
	msg->any.head.msgid = state->msgid;
	msg->any.head.circuit = pstate->msgid;
	msg->state = state;

	return (msg);
}
1820 
1821 void
1822 kdmsg_msg_free(kdmsg_msg_t *msg)
1823 {
1824 	kdmsg_iocom_t *iocom = msg->state->iocom;
1825 	kdmsg_state_t *state;
1826 
1827 	if ((msg->flags & KDMSG_FLAG_AUXALLOC) &&
1828 	    msg->aux_data && msg->aux_size) {
1829 		kfree(msg->aux_data, iocom->mmsg);
1830 		msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1831 	}
1832 	if ((state = msg->state) != NULL) {
1833 		msg->state = NULL;
1834 		kdmsg_state_drop(state);	/* msg->state */
1835 	}
1836 	msg->aux_data = NULL;
1837 	msg->aux_size = 0;
1838 
1839 	kfree(msg, iocom->mmsg);
1840 }
1841 
1842 void
1843 kdmsg_detach_aux_data(kdmsg_msg_t *msg, kdmsg_data_t *data)
1844 {
1845 	if (msg->flags & KDMSG_FLAG_AUXALLOC) {
1846 		data->aux_data = msg->aux_data;
1847 		data->aux_size = msg->aux_size;
1848 		data->iocom = msg->state->iocom;
1849 		msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1850 	} else {
1851 		data->aux_data = NULL;
1852 		data->aux_size = 0;
1853 		data->iocom = msg->state->iocom;
1854 	}
1855 }
1856 
1857 void
1858 kdmsg_free_aux_data(kdmsg_data_t *data)
1859 {
1860 	if (data->aux_data)
1861 		kfree(data->aux_data, data->iocom->mmsg);
1862 }
1863 
1864 /*
1865  * Indexed messages are stored in a red-black tree indexed by their
1866  * msgid.  Only persistent messages are indexed.
1867  */
1868 int
1869 kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2)
1870 {
1871 	if (state1->iocom < state2->iocom)
1872 		return(-1);
1873 	if (state1->iocom > state2->iocom)
1874 		return(1);
1875 	if (state1->msgid < state2->msgid)
1876 		return(-1);
1877 	if (state1->msgid > state2->msgid)
1878 		return(1);
1879 	return(0);
1880 }
1881 
/*
 * Write a message.  All requisite command flags have been set.
 *
 * If msg->state is non-NULL the message is written to the existing
 * transaction.  msgid will be set accordingly.
 *
 * If msg->state is NULL and CREATE is set new state is allocated and
 * (func, data) is installed.  A msgid is assigned.
 *
 * If msg->state is NULL and CREATE is not set the message is assumed
 * to be a one-way message.  The originator must assign the msgid
 * (or leave it 0, which is typical).
 *
 * This function merely queues the message to the management thread, it
 * does not write to the message socket/pipe.
 *
 * NOTE(review): this wrapper dereferences msg->state unconditionally to
 *	 locate the iocom, so the NULL msg->state cases described above
 *	 cannot be reached through this entry point - confirm whether
 *	 those paragraphs are still current.
 *
 * msglk is acquired exclusively around kdmsg_msg_write_locked().
 */
void
kdmsg_msg_write(kdmsg_msg_t *msg)
{
	kdmsg_iocom_t *iocom = msg->state->iocom;

	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
	kdmsg_msg_write_locked(iocom, msg);
	lockmgr(&iocom->msglk, LK_RELEASE);
}
1907 
1908 static void
1909 kdmsg_msg_write_locked(kdmsg_iocom_t *iocom, kdmsg_msg_t *msg)
1910 {
1911 	kdmsg_state_t *state;
1912 
1913 	if (msg->state) {
1914 		/*
1915 		 * Continuance or termination of existing transaction.
1916 		 * The transaction could have been initiated by either end.
1917 		 *
1918 		 * (Function callback and aux data for the receive side can
1919 		 * be replaced or left alone).
1920 		 */
1921 		state = msg->state;
1922 		msg->any.head.msgid = state->msgid;
1923 	} else {
1924 		/*
1925 		 * One-off message (always uses msgid 0 to distinguish
1926 		 * between a possibly lost in-transaction message due to
1927 		 * competing aborts and a real one-off message?)
1928 		 */
1929 		state = NULL;
1930 		msg->any.head.msgid = 0;
1931 	}
1932 
1933 #if 0
1934 	/*
1935 	 * XXX removed - don't make this a panic, allow the state checks
1936 	 *     below to catch the situation.
1937 	 *
1938 	 * This flag is not set until after the tx thread has drained
1939 	 * the tx msgq and simulated responses.  After that point the
1940 	 * txthread is dead and can no longer simulate responses.
1941 	 *
1942 	 * Device drivers should never try to send a message once this
1943 	 * flag is set.  They should have detected (through the state
1944 	 * closures) that the link is in trouble.
1945 	 */
1946 	if (iocom->flags & KDMSG_IOCOMF_EXITNOACC) {
1947 		lockmgr(&iocom->msglk, LK_RELEASE);
1948 		panic("kdmsg_msg_write: Attempt to write message to "
1949 		      "terminated iocom\n");
1950 	}
1951 #endif
1952 
1953 	/*
1954 	 * For stateful messages, if the circuit is dead or dying we have
1955 	 * to abort the potentially newly-created state and discard the
1956 	 * message.
1957 	 *
1958 	 * - We must discard the message because the other end will not
1959 	 *   be expecting any more messages over the dead or dying circuit
1960 	 *   and might not be able to receive them.
1961 	 *
1962 	 * - We abort the state by simulating a failure to generate a fake
1963 	 *   incoming DELETE.  This will trigger the state callback and allow
1964 	 *   the device to clean things up and reply, closing the outgoing
1965 	 *   direction and allowing the state to be freed.
1966 	 *
1967 	 * This situation occurs quite often, particularly as SPANs stabilize.
1968 	 * End-points must do the right thing.
1969 	 */
1970 	if (state) {
1971 		KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1972 		if (state->flags & KDMSG_STATE_DYING) {
1973 #if 0
1974 		if ((state->flags & KDMSG_STATE_DYING) ||
1975 		    (state->parent->txcmd & DMSGF_DELETE) ||
1976 		    (state->parent->flags & KDMSG_STATE_DYING)) {
1977 #endif
1978 			kdio_printf(iocom, 4,
1979 				    "kdmsg_msg_write: Write to dying circuit "
1980 				    "state=%p "
1981 				    "ptxcmd=%08x prxcmd=%08x flags=%08x\n",
1982 				    state,
1983 				    state->parent->rxcmd,
1984 				    state->parent->txcmd,
1985 				    state->parent->flags);
1986 			kdmsg_state_hold(state);
1987 			kdmsg_state_msgtx(msg);
1988 			kdmsg_state_cleanuptx(msg);
1989 			kdmsg_state_drop(state);
1990 			return;
1991 		}
1992 	}
1993 
1994 	/*
1995 	 * Finish up the msg fields.  Note that msg->aux_size and the
1996 	 * aux_bytes stored in the message header represent the unaligned
1997 	 * (actual) bytes of data, but the buffer is sized to an aligned
1998 	 * size and the CRC is generated over the aligned length.
1999 	 */
2000 	msg->any.head.salt = /* (random << 8) | */ (iocom->msg_seq & 255);
2001 	++iocom->msg_seq;
2002 
2003 	if (msg->aux_data && msg->aux_size) {
2004 		uint32_t abytes = DMSG_DOALIGN(msg->aux_size);
2005 
2006 		msg->any.head.aux_bytes = msg->aux_size;
2007 		msg->any.head.aux_crc = iscsi_crc32(msg->aux_data, abytes);
2008 	}
2009 	msg->any.head.hdr_crc = 0;
2010 	msg->any.head.hdr_crc = iscsi_crc32(msg->any.buf, msg->hdr_size);
2011 
2012 	TAILQ_INSERT_TAIL(&iocom->msgq, msg, qentry);
2013 
2014 	if (iocom->msg_ctl & KDMSG_CLUSTERCTL_SLEEPING) {
2015 		atomic_clear_int(&iocom->msg_ctl,
2016 				 KDMSG_CLUSTERCTL_SLEEPING);
2017 		wakeup(&iocom->msg_ctl);
2018 	}
2019 }
2020 
2021 /*
2022  * Reply to a message and terminate our side of the transaction.
2023  *
2024  * If msg->state is non-NULL we are replying to a one-way message.
2025  */
2026 void
2027 kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error)
2028 {
2029 	kdmsg_state_t *state = msg->state;
2030 	kdmsg_msg_t *nmsg;
2031 	uint32_t cmd;
2032 
2033 	/*
2034 	 * Reply with a simple error code and terminate the transaction.
2035 	 */
2036 	cmd = DMSG_LNK_ERROR;
2037 
2038 	/*
2039 	 * Check if our direction has even been initiated yet, set CREATE.
2040 	 *
2041 	 * Check what direction this is (command or reply direction).  Note
2042 	 * that txcmd might not have been initiated yet.
2043 	 *
2044 	 * If our direction has already been closed we just return without
2045 	 * doing anything.
2046 	 */
2047 	if (state != &state->iocom->state0) {
2048 		if (state->txcmd & DMSGF_DELETE)
2049 			return;
2050 		if ((state->txcmd & DMSGF_CREATE) == 0)
2051 			cmd |= DMSGF_CREATE;
2052 		if (state->txcmd & DMSGF_REPLY)
2053 			cmd |= DMSGF_REPLY;
2054 		cmd |= DMSGF_DELETE;
2055 	} else {
2056 		if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
2057 			cmd |= DMSGF_REPLY;
2058 	}
2059 
2060 	nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2061 	nmsg->any.head.error = error;
2062 	kdmsg_msg_write(nmsg);
2063 }
2064 
2065 /*
2066  * Reply to a message and continue our side of the transaction.
2067  *
2068  * If msg->state is non-NULL we are replying to a one-way message and this
2069  * function degenerates into the same as kdmsg_msg_reply().
2070  */
2071 void
2072 kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error)
2073 {
2074 	kdmsg_state_t *state = msg->state;
2075 	kdmsg_msg_t *nmsg;
2076 	uint32_t cmd;
2077 
2078 	/*
2079 	 * Return a simple result code, do NOT terminate the transaction.
2080 	 */
2081 	cmd = DMSG_LNK_ERROR;
2082 
2083 	/*
2084 	 * Check if our direction has even been initiated yet, set CREATE.
2085 	 *
2086 	 * Check what direction this is (command or reply direction).  Note
2087 	 * that txcmd might not have been initiated yet.
2088 	 *
2089 	 * If our direction has already been closed we just return without
2090 	 * doing anything.
2091 	 */
2092 	if (state != &state->iocom->state0) {
2093 		if (state->txcmd & DMSGF_DELETE)
2094 			return;
2095 		if ((state->txcmd & DMSGF_CREATE) == 0)
2096 			cmd |= DMSGF_CREATE;
2097 		if (state->txcmd & DMSGF_REPLY)
2098 			cmd |= DMSGF_REPLY;
2099 		/* continuing transaction, do not set MSGF_DELETE */
2100 	} else {
2101 		if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
2102 			cmd |= DMSGF_REPLY;
2103 	}
2104 
2105 	nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2106 	nmsg->any.head.error = error;
2107 	kdmsg_msg_write(nmsg);
2108 }
2109 
2110 /*
2111  * Reply to a message and terminate our side of the transaction.
2112  *
2113  * If msg->state is non-NULL we are replying to a one-way message.
2114  */
2115 void
2116 kdmsg_state_reply(kdmsg_state_t *state, uint32_t error)
2117 {
2118 	kdmsg_msg_t *nmsg;
2119 	uint32_t cmd;
2120 
2121 	/*
2122 	 * Reply with a simple error code and terminate the transaction.
2123 	 */
2124 	cmd = DMSG_LNK_ERROR;
2125 
2126 	/*
2127 	 * Check if our direction has even been initiated yet, set CREATE.
2128 	 *
2129 	 * Check what direction this is (command or reply direction).  Note
2130 	 * that txcmd might not have been initiated yet.
2131 	 *
2132 	 * If our direction has already been closed we just return without
2133 	 * doing anything.
2134 	 */
2135 	KKASSERT(state);
2136 	if (state->txcmd & DMSGF_DELETE)
2137 		return;
2138 	if ((state->txcmd & DMSGF_CREATE) == 0)
2139 		cmd |= DMSGF_CREATE;
2140 	if (state->txcmd & DMSGF_REPLY)
2141 		cmd |= DMSGF_REPLY;
2142 	cmd |= DMSGF_DELETE;
2143 
2144 	nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2145 	nmsg->any.head.error = error;
2146 	kdmsg_msg_write(nmsg);
2147 }
2148 
2149 /*
2150  * Reply to a message and continue our side of the transaction.
2151  *
2152  * If msg->state is non-NULL we are replying to a one-way message and this
2153  * function degenerates into the same as kdmsg_msg_reply().
2154  */
2155 void
2156 kdmsg_state_result(kdmsg_state_t *state, uint32_t error)
2157 {
2158 	kdmsg_msg_t *nmsg;
2159 	uint32_t cmd;
2160 
2161 	/*
2162 	 * Return a simple result code, do NOT terminate the transaction.
2163 	 */
2164 	cmd = DMSG_LNK_ERROR;
2165 
2166 	/*
2167 	 * Check if our direction has even been initiated yet, set CREATE.
2168 	 *
2169 	 * Check what direction this is (command or reply direction).  Note
2170 	 * that txcmd might not have been initiated yet.
2171 	 *
2172 	 * If our direction has already been closed we just return without
2173 	 * doing anything.
2174 	 */
2175 	KKASSERT(state);
2176 	if (state->txcmd & DMSGF_DELETE)
2177 		return;
2178 	if ((state->txcmd & DMSGF_CREATE) == 0)
2179 		cmd |= DMSGF_CREATE;
2180 	if (state->txcmd & DMSGF_REPLY)
2181 		cmd |= DMSGF_REPLY;
2182 	/* continuing transaction, do not set MSGF_DELETE */
2183 
2184 	nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2185 	nmsg->any.head.error = error;
2186 	kdmsg_msg_write(nmsg);
2187 }
2188