xref: /dragonfly/sys/kern/kern_dmsg.c (revision a4c31683)
1 /*-
2  * Copyright (c) 2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * TODO: txcmd CREATE state is deferred by tx msgq, need to calculate
36  *	 a streaming response.  See subr_diskiocom()'s diskiodone().
37  */
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/kernel.h>
41 #include <sys/conf.h>
42 #include <sys/systm.h>
43 #include <sys/queue.h>
44 #include <sys/tree.h>
45 #include <sys/malloc.h>
46 #include <sys/mount.h>
47 #include <sys/socket.h>
48 #include <sys/vnode.h>
49 #include <sys/sysctl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/priv.h>
53 #include <sys/thread.h>
54 #include <sys/globaldata.h>
55 #include <sys/limits.h>
56 
57 #include <sys/dmsg.h>
58 
/*
 * Generate the kdmsg_state_tree red-black tree routines for
 * struct kdmsg_state, linked through its rbnode field and ordered by
 * kdmsg_state_cmp.
 */
RB_GENERATE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp);

/*
 * "kdmsg" sysctl node and its read/write "debug" integer.
 *
 * kdmsg_debug is the verbosity threshold consulted by kd_printf() and
 * kdio_printf(): a message is printed when kdmsg_debug >= the message's
 * level.  The compiled-in default is 1.
 */
SYSCTL_NODE(, OID_AUTO, kdmsg, CTLFLAG_RW, 0, "kdmsg");
static int kdmsg_debug = 1;
SYSCTL_INT(_kdmsg, OID_AUTO, debug, CTLFLAG_RW, &kdmsg_debug, 0,
	   "Set debug level for kernel dmsg layer");
/*
 * Debug printing helpers.  The message is emitted through kprintf() with a
 * "kdmsg: " prefix, and only when kdmsg_debug is at least (level).
 *
 * kd_printf(level, ctl, ...)
 * kdio_printf(iocom, level, ctl, ...)
 *
 *	level - minimum kdmsg_debug value required for the message to print
 *	ctl   - kprintf() format; must be a string literal because it is
 *		pasted onto the "kdmsg: " prefix
 *	...   - format arguments; at least one is required
 *	iocom - accepted for call-site symmetry, not used by the expansion
 *
 * Both macros are wrapped in do { } while (0) so they expand to exactly
 * one statement.  A bare `if' expansion would capture the `else' of an
 * enclosing un-braced if/else at the call site.
 */
#define kd_printf(level, ctl, ...)					\
	do {								\
		if (kdmsg_debug >= (level))				\
			kprintf("kdmsg: " ctl, __VA_ARGS__);		\
	} while (0)

#define kdio_printf(iocom, level, ctl, ...)				\
	do {								\
		if (kdmsg_debug >= (level))				\
			kprintf("kdmsg: " ctl, __VA_ARGS__);		\
	} while (0)
71 
/*
 * Forward declarations for file-local helpers: receive/transmit state
 * tracking, locked message queueing, post-processing cleanup and state
 * teardown.
 */
static int kdmsg_msg_receive_handling(kdmsg_msg_t *msg);
static int kdmsg_state_msgrx(kdmsg_msg_t *msg);
static int kdmsg_state_msgtx(kdmsg_msg_t *msg);
static void kdmsg_msg_write_locked(kdmsg_iocom_t *iocom, kdmsg_msg_t *msg);
static void kdmsg_state_cleanuprx(kdmsg_msg_t *msg);
static void kdmsg_state_cleanuptx(kdmsg_msg_t *msg);
static void kdmsg_subq_delete(kdmsg_state_t *state);
static void kdmsg_simulate_failure(kdmsg_state_t *state, int meto, int error);
static void kdmsg_state_abort(kdmsg_state_t *state);
static void kdmsg_state_dying(kdmsg_state_t *state);
static void kdmsg_state_free(kdmsg_state_t *state);

/*
 * State reference hold/drop wrappers.
 *
 * When the build defines KDMSG_DEBUG, kdmsg_state_hold()/kdmsg_state_drop()
 * also pass the caller's __FILE__ and __LINE__ to the underlying functions.
 * Otherwise KDMSG_DEBUG is defined as 0 here, KDMSG_DEBUG_ARGS expands to
 * nothing, and the functions take only the state pointer.
 */
#ifdef KDMSG_DEBUG
#define KDMSG_DEBUG_ARGS	, const char *file, int line
#define kdmsg_state_hold(state)	_kdmsg_state_hold(state, __FILE__, __LINE__)
#define kdmsg_state_drop(state)	_kdmsg_state_drop(state, __FILE__, __LINE__)
#else
#define KDMSG_DEBUG 0
#define KDMSG_DEBUG_ARGS
#define kdmsg_state_hold(state)	_kdmsg_state_hold(state)
#define kdmsg_state_drop(state)	_kdmsg_state_drop(state)
#endif
static void _kdmsg_state_hold(kdmsg_state_t *state KDMSG_DEBUG_ARGS);
static void _kdmsg_state_drop(kdmsg_state_t *state KDMSG_DEBUG_ARGS);

/* Reader/writer service thread entry points and the automatic rx handler. */
static void kdmsg_iocom_thread_rd(void *arg);
static void kdmsg_iocom_thread_wr(void *arg);
static int kdmsg_autorxmsg(kdmsg_msg_t *msg);
100 
101 /*static struct lwkt_token kdmsg_token = LWKT_TOKEN_INITIALIZER(kdmsg_token);*/
102 
103 /*
104  * Initialize the roll-up communications structure for a network
105  * messaging session.  This function does not install the socket.
106  */
107 void
108 kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, uint32_t flags,
109 		 struct malloc_type *mmsg,
110 		 int (*rcvmsg)(kdmsg_msg_t *msg))
111 {
112 	bzero(iocom, sizeof(*iocom));
113 	iocom->handle = handle;
114 	iocom->mmsg = mmsg;
115 	iocom->rcvmsg = rcvmsg;
116 	iocom->flags = flags;
117 	lockinit(&iocom->msglk, "h2msg", 0, 0);
118 	TAILQ_INIT(&iocom->msgq);
119 	RB_INIT(&iocom->staterd_tree);
120 	RB_INIT(&iocom->statewr_tree);
121 
122 	iocom->state0.iocom = iocom;
123 	iocom->state0.parent = &iocom->state0;
124 	TAILQ_INIT(&iocom->state0.subq);
125 }
126 
127 /*
128  * [Re]connect using the passed file pointer.  The caller must ref the
129  * fp for us.  We own that ref now.
130  */
131 void
132 kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp,
133 		      const char *subsysname)
134 {
135 	/*
136 	 * Destroy the current connection
137 	 */
138 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
139 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
140 	while (iocom->msgrd_td || iocom->msgwr_td) {
141 		wakeup(&iocom->msg_ctl);
142 		lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
143 	}
144 
145 	/*
146 	 * Drop communications descriptor
147 	 */
148 	if (iocom->msg_fp) {
149 		fdrop(iocom->msg_fp);
150 		iocom->msg_fp = NULL;
151 	}
152 
153 	/*
154 	 * Setup new communications descriptor
155 	 */
156 	iocom->msg_ctl = 0;
157 	iocom->msg_fp = fp;
158 	iocom->msg_seq = 0;
159 	iocom->flags &= ~KDMSG_IOCOMF_EXITNOACC;
160 
161 	lwkt_create(kdmsg_iocom_thread_rd, iocom, &iocom->msgrd_td,
162 		    NULL, 0, -1, "%s-msgrd", subsysname);
163 	lwkt_create(kdmsg_iocom_thread_wr, iocom, &iocom->msgwr_td,
164 		    NULL, 0, -1, "%s-msgwr", subsysname);
165 	lockmgr(&iocom->msglk, LK_RELEASE);
166 }
167 
168 /*
169  * Caller sets up iocom->auto_lnk_conn and iocom->auto_lnk_span, then calls
170  * this function to handle the state machine for LNK_CONN and LNK_SPAN.
171  */
172 static int kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
173 static int kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
174 
175 void
176 kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom,
177 			 void (*auto_callback)(kdmsg_msg_t *msg))
178 {
179 	kdmsg_msg_t *msg;
180 
181 	iocom->auto_callback = auto_callback;
182 
183 	msg = kdmsg_msg_alloc(&iocom->state0,
184 			      DMSG_LNK_CONN | DMSGF_CREATE,
185 			      kdmsg_lnk_conn_reply, NULL);
186 	iocom->auto_lnk_conn.head = msg->any.head;
187 	msg->any.lnk_conn = iocom->auto_lnk_conn;
188 	iocom->conn_state = msg->state;
189 	kdmsg_state_hold(msg->state);	/* iocom->conn_state */
190 	kdmsg_msg_write(msg);
191 }
192 
193 static
194 int
195 kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
196 {
197 	kdmsg_iocom_t *iocom = state->iocom;
198 	kdmsg_msg_t *rmsg;
199 
200 	/*
201 	 * Upon receipt of the LNK_CONN acknowledgement initiate an
202 	 * automatic SPAN if we were asked to.  Used by e.g. xdisk, but
203 	 * not used by HAMMER2 which must manage more than one transmitted
204 	 * SPAN.
205 	 */
206 	if ((msg->any.head.cmd & DMSGF_CREATE) &&
207 	    (iocom->flags & KDMSG_IOCOMF_AUTOTXSPAN)) {
208 		rmsg = kdmsg_msg_alloc(&iocom->state0,
209 				       DMSG_LNK_SPAN | DMSGF_CREATE,
210 				       kdmsg_lnk_span_reply, NULL);
211 		iocom->auto_lnk_span.head = rmsg->any.head;
212 		rmsg->any.lnk_span = iocom->auto_lnk_span;
213 		kdmsg_msg_write(rmsg);
214 	}
215 
216 	/*
217 	 * Process shim after the CONN is acknowledged and before the CONN
218 	 * transaction is deleted.  For deletions this gives device drivers
219 	 * the ability to interlock new operations on the circuit before
220 	 * it becomes illegal and panics.
221 	 */
222 	if (iocom->auto_callback)
223 		iocom->auto_callback(msg);
224 
225 	if ((state->txcmd & DMSGF_DELETE) == 0 &&
226 	    (msg->any.head.cmd & DMSGF_DELETE)) {
227 		/*
228 		 * iocom->conn_state has a state ref, drop it when clearing.
229 		 */
230 		if (iocom->conn_state)
231 			kdmsg_state_drop(iocom->conn_state);
232 		iocom->conn_state = NULL;
233 		kdmsg_msg_reply(msg, 0);
234 	}
235 
236 	return (0);
237 }
238 
239 static
240 int
241 kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
242 {
243 	/*
244 	 * Be sure to process shim before terminating the SPAN
245 	 * transaction.  Gives device drivers the ability to
246 	 * interlock new operations on the circuit before it
247 	 * becomes illegal and panics.
248 	 */
249 	if (state->iocom->auto_callback)
250 		state->iocom->auto_callback(msg);
251 
252 	if ((state->txcmd & DMSGF_DELETE) == 0 &&
253 	    (msg->any.head.cmd & DMSGF_DELETE)) {
254 		kdmsg_msg_reply(msg, 0);
255 	}
256 	return (0);
257 }
258 
259 /*
260  * Disconnect and clean up
261  */
262 void
263 kdmsg_iocom_uninit(kdmsg_iocom_t *iocom)
264 {
265 	kdmsg_state_t *state;
266 	kdmsg_msg_t *msg;
267 	int retries;
268 
269 	/*
270 	 * Ask the cluster controller to go away by setting
271 	 * KILLRX.  Send a PING to get a response to unstick reading
272 	 * from the pipe.
273 	 *
274 	 * After 10 seconds shitcan the pipe and do an unclean shutdown.
275 	 */
276 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
277 
278 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
279 	msg = kdmsg_msg_alloc(&iocom->state0, DMSG_LNK_PING, NULL, NULL);
280 	kdmsg_msg_write_locked(iocom, msg);
281 
282 	retries = 10;
283 	while (iocom->msgrd_td || iocom->msgwr_td) {
284 		wakeup(&iocom->msg_ctl);
285 		lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
286 		if (--retries == 0 && iocom->msg_fp) {
287 			kdio_printf(iocom, 0, "%s\n",
288 				    "iocom_uninit: "
289 				    "shitcanning unresponsive pipe");
290 			fp_shutdown(iocom->msg_fp, SHUT_RDWR);
291 			/* retries allowed to go negative, keep looping */
292 		}
293 	}
294 
295 	/*
296 	 * Cleanup caches
297 	 */
298 	if ((state = iocom->freerd_state) != NULL) {
299 		iocom->freerd_state = NULL;
300 		kdmsg_state_drop(state);
301 	}
302 
303 	if ((state = iocom->freewr_state) != NULL) {
304 		iocom->freewr_state = NULL;
305 		kdmsg_state_drop(state);
306 	}
307 
308 	/*
309 	 * Drop communications descriptor
310 	 */
311 	if (iocom->msg_fp) {
312 		fdrop(iocom->msg_fp);
313 		iocom->msg_fp = NULL;
314 	}
315 	lockmgr(&iocom->msglk, LK_RELEASE);
316 }
317 
318 /*
319  * Cluster controller thread.  Perform messaging functions.  We have one
320  * thread for the reader and one for the writer.  The writer handles
321  * shutdown requests (which should break the reader thread).
322  */
323 static
324 void
325 kdmsg_iocom_thread_rd(void *arg)
326 {
327 	kdmsg_iocom_t *iocom = arg;
328 	dmsg_hdr_t hdr;
329 	kdmsg_msg_t *msg = NULL;
330 	size_t hbytes;
331 	size_t abytes;
332 	int error = 0;
333 
334 	while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLRX) == 0) {
335 		/*
336 		 * Retrieve the message from the pipe or socket.
337 		 */
338 		error = fp_read(iocom->msg_fp, &hdr, sizeof(hdr),
339 				NULL, 1, UIO_SYSSPACE);
340 		if (error)
341 			break;
342 		if (hdr.magic != DMSG_HDR_MAGIC) {
343 			kdio_printf(iocom, 1, "bad magic: %04x\n", hdr.magic);
344 			error = EINVAL;
345 			break;
346 		}
347 		hbytes = (hdr.cmd & DMSGF_SIZE) * DMSG_ALIGN;
348 		if (hbytes < sizeof(hdr) || hbytes > DMSG_HDR_MAX) {
349 			kdio_printf(iocom, 1, "bad header size %zd\n", hbytes);
350 			error = EINVAL;
351 			break;
352 		}
353 
354 		/* XXX messy: mask cmd to avoid allocating state */
355 		msg = kdmsg_msg_alloc(&iocom->state0,
356 				      hdr.cmd & DMSGF_BASECMDMASK,
357 				      NULL, NULL);
358 		msg->any.head = hdr;
359 		msg->hdr_size = hbytes;
360 		if (hbytes > sizeof(hdr)) {
361 			error = fp_read(iocom->msg_fp, &msg->any.head + 1,
362 					hbytes - sizeof(hdr),
363 					NULL, 1, UIO_SYSSPACE);
364 			if (error) {
365 				kdio_printf(iocom, 1, "%s\n",
366 					    "short msg received");
367 				error = EINVAL;
368 				break;
369 			}
370 		}
371 		msg->aux_size = hdr.aux_bytes;
372 		if (msg->aux_size > DMSG_AUX_MAX) {
373 			kdio_printf(iocom, 1,
374 				    "illegal msg payload size %zd\n",
375 				    msg->aux_size);
376 			error = EINVAL;
377 			break;
378 		}
379 		if (msg->aux_size) {
380 			abytes = DMSG_DOALIGN(msg->aux_size);
381 			msg->aux_data = kmalloc(abytes, iocom->mmsg, M_WAITOK);
382 			msg->flags |= KDMSG_FLAG_AUXALLOC;
383 			error = fp_read(iocom->msg_fp, msg->aux_data,
384 					abytes, NULL, 1, UIO_SYSSPACE);
385 			if (error) {
386 				kdio_printf(iocom, 1, "%s\n",
387 					    "short msg payload received");
388 				break;
389 			}
390 		}
391 
392 		error = kdmsg_msg_receive_handling(msg);
393 		msg = NULL;
394 	}
395 
396 	kdio_printf(iocom, 1, "read thread terminating error=%d\n", error);
397 
398 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
399 	if (msg)
400 		kdmsg_msg_free(msg);
401 
402 	/*
403 	 * Shutdown the socket and set KILLRX for consistency in case the
404 	 * shutdown was not commanded.  Signal the transmit side to shutdown
405 	 * by setting KILLTX and waking it up.
406 	 */
407 	fp_shutdown(iocom->msg_fp, SHUT_RDWR);
408 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX |
409 					KDMSG_CLUSTERCTL_KILLTX);
410 	iocom->msgrd_td = NULL;
411 	lockmgr(&iocom->msglk, LK_RELEASE);
412 	wakeup(&iocom->msg_ctl);
413 
414 	/*
415 	 * iocom can be ripped out at any time once the lock is
416 	 * released with msgrd_td set to NULL.  The wakeup()s are safe but
417 	 * that is all.
418 	 */
419 	wakeup(iocom);
420 	lwkt_exit();
421 }
422 
423 static
424 void
425 kdmsg_iocom_thread_wr(void *arg)
426 {
427 	kdmsg_iocom_t *iocom = arg;
428 	kdmsg_msg_t *msg;
429 	ssize_t res;
430 	size_t abytes;
431 	int error = 0;
432 	int save_ticks;
433 	int didwarn;
434 
435 	/*
436 	 * Transmit loop
437 	 */
438 	msg = NULL;
439 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
440 
441 	while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLTX) == 0 && error == 0) {
442 		/*
443 		 * Sleep if no messages pending.  Interlock with flag while
444 		 * holding msglk.
445 		 */
446 		if (TAILQ_EMPTY(&iocom->msgq)) {
447 			atomic_set_int(&iocom->msg_ctl,
448 				       KDMSG_CLUSTERCTL_SLEEPING);
449 			lksleep(&iocom->msg_ctl, &iocom->msglk, 0, "msgwr", hz);
450 			atomic_clear_int(&iocom->msg_ctl,
451 					 KDMSG_CLUSTERCTL_SLEEPING);
452 		}
453 
454 		while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
455 			/*
456 			 * Remove msg from the transmit queue and do
457 			 * persist and half-closed state handling.
458 			 */
459 			TAILQ_REMOVE(&iocom->msgq, msg, qentry);
460 
461 			error = kdmsg_state_msgtx(msg);
462 			if (error == EALREADY) {
463 				error = 0;
464 				kdmsg_msg_free(msg);
465 				continue;
466 			}
467 			if (error) {
468 				kdmsg_msg_free(msg);
469 				break;
470 			}
471 
472 			/*
473 			 * Dump the message to the pipe or socket.
474 			 *
475 			 * We have to clean up the message as if the transmit
476 			 * succeeded even if it failed.
477 			 */
478 			lockmgr(&iocom->msglk, LK_RELEASE);
479 			error = fp_write(iocom->msg_fp, &msg->any,
480 					 msg->hdr_size, &res, UIO_SYSSPACE);
481 			if (error || res != msg->hdr_size) {
482 				if (error == 0)
483 					error = EINVAL;
484 				lockmgr(&iocom->msglk, LK_EXCLUSIVE);
485 				kdmsg_state_cleanuptx(msg);
486 				break;
487 			}
488 			if (msg->aux_size) {
489 				abytes = DMSG_DOALIGN(msg->aux_size);
490 				error = fp_write(iocom->msg_fp,
491 						 msg->aux_data, abytes,
492 						 &res, UIO_SYSSPACE);
493 				if (error || res != abytes) {
494 					if (error == 0)
495 						error = EINVAL;
496 					lockmgr(&iocom->msglk, LK_EXCLUSIVE);
497 					kdmsg_state_cleanuptx(msg);
498 					break;
499 				}
500 			}
501 			lockmgr(&iocom->msglk, LK_EXCLUSIVE);
502 			kdmsg_state_cleanuptx(msg);
503 		}
504 	}
505 
506 	kdio_printf(iocom, 1, "write thread terminating error=%d\n", error);
507 
508 	/*
509 	 * Shutdown the socket and set KILLTX for consistency in case the
510 	 * shutdown was not commanded.  Signal the receive side to shutdown
511 	 * by setting KILLRX and waking it up.
512 	 */
513 	fp_shutdown(iocom->msg_fp, SHUT_RDWR);
514 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX |
515 					KDMSG_CLUSTERCTL_KILLTX);
516 	wakeup(&iocom->msg_ctl);
517 
518 	/*
519 	 * The transmit thread is responsible for final cleanups, wait
520 	 * for the receive side to terminate to prevent new received
521 	 * states from interfering with our cleanup.
522 	 *
523 	 * Do not set msgwr_td to NULL until we actually exit.
524 	 */
525 	while (iocom->msgrd_td) {
526 		wakeup(&iocom->msg_ctl);
527 		lksleep(iocom, &iocom->msglk, 0, "clstrkt", hz);
528 	}
529 
530 	/*
531 	 * We can no longer receive new messages.  We must drain the transmit
532 	 * message queue and simulate received messages to close anay remaining
533 	 * states.
534 	 *
535 	 * Loop until all the states are gone and there are no messages
536 	 * pending transmit.
537 	 */
538 	save_ticks = ticks;
539 	didwarn = 0;
540 
541 	while (TAILQ_FIRST(&iocom->msgq) ||
542 	       RB_ROOT(&iocom->staterd_tree) ||
543 	       RB_ROOT(&iocom->statewr_tree)) {
544 		/*
545 		 * Simulate failure for all sub-states of state0.
546 		 */
547 		kdmsg_drain_msgq(iocom);
548 		kdio_printf(iocom, 2, "%s\n",
549 			    "simulate failure for all substates of state0");
550 		kdmsg_simulate_failure(&iocom->state0, 0, DMSG_ERR_LOSTLINK);
551 
552 		lksleep(iocom, &iocom->msglk, 0, "clstrtk", hz / 2);
553 
554 		if ((int)(ticks - save_ticks) > hz*2 && didwarn == 0) {
555 			didwarn = 1;
556 			kdio_printf(iocom, 0,
557 				    "Warning, write thread on %p "
558 				    "still terminating\n",
559 				    iocom);
560 		}
561 		if ((int)(ticks - save_ticks) > hz*15 && didwarn == 1) {
562 			didwarn = 2;
563 			kdio_printf(iocom, 0,
564 				    "Warning, write thread on %p "
565 				    "still terminating\n",
566 				    iocom);
567 		}
568 		if ((int)(ticks - save_ticks) > hz*60) {
569 			kdio_printf(iocom, 0,
570 				    "Can't terminate: msgq %p "
571 				    "rd_tree %p wr_tree %p\n",
572 				    TAILQ_FIRST(&iocom->msgq),
573 				    RB_ROOT(&iocom->staterd_tree),
574 				    RB_ROOT(&iocom->statewr_tree));
575 			lksleep(iocom, &iocom->msglk, 0, "clstrtk", hz * 10);
576 		}
577 	}
578 
579 	/*
580 	 * Exit handling is done by the write thread.
581 	 */
582 	iocom->flags |= KDMSG_IOCOMF_EXITNOACC;
583 	lockmgr(&iocom->msglk, LK_RELEASE);
584 
585 	/*
586 	 * The state trees had better be empty now
587 	 */
588 	KKASSERT(RB_EMPTY(&iocom->staterd_tree));
589 	KKASSERT(RB_EMPTY(&iocom->statewr_tree));
590 	KKASSERT(iocom->conn_state == NULL);
591 
592 	if (iocom->exit_func) {
593 		/*
594 		 * iocom is invalid after we call the exit function.
595 		 */
596 		iocom->msgwr_td = NULL;
597 		iocom->exit_func(iocom);
598 	} else {
599 		/*
600 		 * iocom can be ripped out from under us once msgwr_td is
601 		 * set to NULL.  The wakeup is safe.
602 		 */
603 		iocom->msgwr_td = NULL;
604 		wakeup(iocom);
605 	}
606 	lwkt_exit();
607 }
608 
609 /*
610  * This cleans out the pending transmit message queue, adjusting any
611  * persistent states properly in the process.
612  *
613  * Called with iocom locked.
614  */
615 void
616 kdmsg_drain_msgq(kdmsg_iocom_t *iocom)
617 {
618 	kdmsg_msg_t *msg;
619 
620 	/*
621 	 * Clean out our pending transmit queue, executing the
622 	 * appropriate state adjustments.  If this tries to open
623 	 * any new outgoing transactions we have to loop up and
624 	 * clean them out.
625 	 */
626 	while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
627 		TAILQ_REMOVE(&iocom->msgq, msg, qentry);
628 		if (kdmsg_state_msgtx(msg))
629 			kdmsg_msg_free(msg);
630 		else
631 			kdmsg_state_cleanuptx(msg);
632 	}
633 }
634 
635 /*
636  * Do all processing required to handle a freshly received message
637  * after its low level header has been validated.
638  *
639  * iocom is not locked.
640  */
641 static
642 int
643 kdmsg_msg_receive_handling(kdmsg_msg_t *msg)
644 {
645 	kdmsg_iocom_t *iocom = msg->state->iocom;
646 	int error;
647 
648 	/*
649 	 * State machine tracking, state assignment for msg,
650 	 * returns error and discard status.  Errors are fatal
651 	 * to the connection except for EALREADY which forces
652 	 * a discard without execution.
653 	 */
654 	error = kdmsg_state_msgrx(msg);
655 	if (msg->state->flags & KDMSG_STATE_ABORTING) {
656 		kdio_printf(iocom, 5,
657 			    "kdmsg_state_abort(b): state %p rxcmd=%08x "
658 			    "txcmd=%08x msgrx error %d\n",
659 			    msg->state, msg->state->rxcmd,
660 			    msg->state->txcmd, error);
661 	}
662 	if (error) {
663 		/*
664 		 * Raw protocol or connection error
665 		 */
666 		if (msg->state->flags & KDMSG_STATE_ABORTING)
667 			kdio_printf(iocom, 5,
668 				    "X1 state %p error %d\n",
669 				    msg->state, error);
670 		kdmsg_msg_free(msg);
671 		if (error == EALREADY)
672 			error = 0;
673 	} else if (msg->state && msg->state->func) {
674 		/*
675 		 * Message related to state which already has a
676 		 * handling function installed for it.
677 		 */
678 		if (msg->state->flags & KDMSG_STATE_ABORTING)
679 			kdio_printf(iocom, 5,
680 				    "X2 state %p func %p\n",
681 				    msg->state, msg->state->func);
682 		error = msg->state->func(msg->state, msg);
683 		kdmsg_state_cleanuprx(msg);
684 	} else if (iocom->flags & KDMSG_IOCOMF_AUTOANY) {
685 		if (msg->state->flags & KDMSG_STATE_ABORTING)
686 			kdio_printf(iocom, 5,
687 				    "X3 state %p\n", msg->state);
688 		error = kdmsg_autorxmsg(msg);
689 		kdmsg_state_cleanuprx(msg);
690 	} else {
691 		if (msg->state->flags & KDMSG_STATE_ABORTING)
692 			kdio_printf(iocom, 5,
693 				    "X4 state %p\n", msg->state);
694 		error = iocom->rcvmsg(msg);
695 		kdmsg_state_cleanuprx(msg);
696 	}
697 	return error;
698 }
699 
700 /*
701  * Process state tracking for a message after reception and dequeueing,
702  * prior to execution of the state callback.  The state is updated and
703  * will be removed from the RBTREE if completely closed, but the state->parent
704  * and subq linkage is not cleaned up until after the callback (see
705  * cleanuprx()).
706  *
707  * msglk is not held.
708  *
709  * NOTE: A message transaction can consist of several messages in either
710  *	 direction.
711  *
712  * NOTE: The msgid is unique to the initiator, not necessarily unique for
713  *	 us or for any relay or for the return direction for that matter.
714  *	 That is, two sides sending a new message can use the same msgid
715  *	 without colliding.
716  *
717  * --
718  *
719  * ABORT sequences work by setting the ABORT flag along with normal message
720  * state.  However, ABORTs can also be sent on half-closed messages, that is
721  * even if the command or reply side has already sent a DELETE, as long as
722  * the message has not been fully closed it can still send an ABORT+DELETE
723  * to terminate the half-closed message state.
724  *
725  * Since ABORT+DELETEs can race we silently discard ABORT's for message
726  * state which has already been fully closed.  REPLY+ABORT+DELETEs can
727  * also race, and in this situation the other side might have already
728  * initiated a new unrelated command with the same message id.  Since
729  * the abort has not set the CREATE flag the situation can be detected
730   * and the message will also be discarded.
731  *
732  * Non-blocking requests can be initiated with ABORT+CREATE[+DELETE].
733  * The ABORT request is essentially integrated into the command instead
734  * of being sent later on.  In this situation the command implementation
735  * detects that CREATE and ABORT are both set (vs ABORT alone) and can
736  * special-case non-blocking operation for the command.
737  *
738  * NOTE!  Messages with ABORT set without CREATE or DELETE are considered
739  *	  to be mid-stream aborts for command/reply sequences.  ABORTs on
740  *	  one-way messages are not supported.
741  *
742  * NOTE!  If a command sequence does not support aborts the ABORT flag is
743  *	  simply ignored.
744  *
745  * --
746  *
 * One-off messages (no reply expected) are sent with neither CREATE nor DELETE
748  * set.  One-off messages cannot be aborted and typically aren't processed
749  * by these routines.  The REPLY bit can be used to distinguish whether a
750  * one-off message is a command or reply.  For example, one-off replies
751  * will typically just contain status updates.
752  */
753 static
754 int
755 kdmsg_state_msgrx(kdmsg_msg_t *msg)
756 {
757 	kdmsg_iocom_t *iocom = msg->state->iocom;
758 	kdmsg_state_t *state;
759 	kdmsg_state_t *pstate;
760 	kdmsg_state_t sdummy;
761 	int error;
762 
763 	bzero(&sdummy, sizeof(sdummy));	/* avoid gcc warnings */
764 
765 	/*
766 	 * Make sure a state structure is ready to go in case we need a new
767 	 * one.  This is the only routine which uses freerd_state so no
768 	 * races are possible.
769 	 */
770 	if ((state = iocom->freerd_state) == NULL) {
771 		state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
772 		state->flags = KDMSG_STATE_DYNAMIC;
773 		state->iocom = iocom;
774 		state->refs = 1;
775 		TAILQ_INIT(&state->subq);
776 		iocom->freerd_state = state;
777 	}
778 	state = NULL;	/* safety */
779 
780 	/*
781 	 * Lock RB tree and locate existing persistent state, if any.
782 	 *
783 	 * If received msg is a command state is on staterd_tree.
784 	 * If received msg is a reply state is on statewr_tree.
785 	 */
786 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
787 
788 again:
789 	if (msg->state == &iocom->state0) {
790 		sdummy.msgid = msg->any.head.msgid;
791 		sdummy.iocom = iocom;
792 		if (msg->any.head.cmd & DMSGF_REVTRANS) {
793 			state = RB_FIND(kdmsg_state_tree, &iocom->statewr_tree,
794 					&sdummy);
795 		} else {
796 			state = RB_FIND(kdmsg_state_tree, &iocom->staterd_tree,
797 					&sdummy);
798 		}
799 
800 		/*
801 		 * Set message state unconditionally.  If this is a CREATE
802 		 * message this state will become the parent state and new
803 		 * state will be allocated for the message state.
804 		 */
805 		if (state == NULL)
806 			state = &iocom->state0;
807 		if (state->flags & KDMSG_STATE_INTERLOCK) {
808 			state->flags |= KDMSG_STATE_SIGNAL;
809 			lksleep(state, &iocom->msglk, 0, "dmrace", hz);
810 			goto again;
811 		}
812 		kdmsg_state_hold(state);
813 		kdmsg_state_drop(msg->state);	/* iocom->state0 */
814 		msg->state = state;
815 	} else {
816 		state = msg->state;
817 	}
818 
819 	/*
820 	 * Short-cut one-off or mid-stream messages.
821 	 */
822 	if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
823 				  DMSGF_ABORT)) == 0) {
824 		error = 0;
825 		goto done;
826 	}
827 
828 	/*
829 	 * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
830 	 * inside the case statements.
831 	 */
832 	switch(msg->any.head.cmd & (DMSGF_CREATE|DMSGF_DELETE|DMSGF_REPLY)) {
833 	case DMSGF_CREATE:
834 	case DMSGF_CREATE | DMSGF_DELETE:
835 		/*
		 * New persistent command received.
837 		 */
838 		if (state != &iocom->state0) {
839 			kdio_printf(iocom, 1, "%s\n",
840 				    "duplicate transaction");
841 			error = EINVAL;
842 			break;
843 		}
844 
845 		/*
846 		 * Lookup the circuit.  The circuit is an open transaction.
847 		 * the REVCIRC bit in the message tells us which side
848 		 * initiated the transaction representing the circuit.
849 		 */
850 		if (msg->any.head.circuit) {
851 			sdummy.msgid = msg->any.head.circuit;
852 
853 			if (msg->any.head.cmd & DMSGF_REVCIRC) {
854 				pstate = RB_FIND(kdmsg_state_tree,
855 						 &iocom->statewr_tree,
856 						 &sdummy);
857 			} else {
858 				pstate = RB_FIND(kdmsg_state_tree,
859 						 &iocom->staterd_tree,
860 						 &sdummy);
861 			}
862 			if (pstate == NULL) {
863 				kdio_printf(iocom, 1, "%s\n",
864 					    "missing parent in "
865 					    "stacked trans");
866 				error = EINVAL;
867 				break;
868 			}
869 		} else {
870 			pstate = &iocom->state0;
871 		}
872 
873 		/*
874 		 * Allocate new state.
875 		 *
876 		 * msg->state becomes the owner of the ref we inherit from
		 * freerd_state.
878 		 */
879 		kdmsg_state_drop(state);
880 		state = iocom->freerd_state;
881 		iocom->freerd_state = NULL;
882 
883 		msg->state = state;		/* inherits freerd ref */
884 		state->parent = pstate;
885 		KKASSERT(state->iocom == iocom);
886 		state->flags |= KDMSG_STATE_RBINSERTED |
887 				KDMSG_STATE_SUBINSERTED |
888 			        KDMSG_STATE_OPPOSITE;
889 		if (TAILQ_EMPTY(&pstate->subq))
890 			kdmsg_state_hold(pstate);/* states on pstate->subq */
891 		kdmsg_state_hold(state);	/* state on pstate->subq */
892 		kdmsg_state_hold(state);	/* state on rbtree */
893 		state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
894 		state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
895 		state->txcmd = DMSGF_REPLY;
896 		state->msgid = msg->any.head.msgid;
897 		state->flags &= ~KDMSG_STATE_NEW;
898 		RB_INSERT(kdmsg_state_tree, &iocom->staterd_tree, state);
899 		TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
900 		error = 0;
901 		break;
902 	case DMSGF_DELETE:
903 		/*
904 		 * Persistent state is expected but might not exist if an
905 		 * ABORT+DELETE races the close.
906 		 */
907 		if (state == &iocom->state0) {
908 			if (msg->any.head.cmd & DMSGF_ABORT) {
909 				kdio_printf(iocom, 1, "%s\n",
910 					    "msgrx: "
911 					    "state already A");
912 				error = EALREADY;
913 			} else {
914 				kdio_printf(iocom, 1, "%s\n",
915 					    "msgrx: no state for DELETE");
916 				error = EINVAL;
917 			}
918 			break;
919 		}
920 
921 		/*
922 		 * Handle another ABORT+DELETE case if the msgid has already
923 		 * been reused.
924 		 */
925 		if ((state->rxcmd & DMSGF_CREATE) == 0) {
926 			if (msg->any.head.cmd & DMSGF_ABORT) {
927 				kdio_printf(iocom, 1, "%s\n",
928 					    "msgrx: state already B");
929 				error = EALREADY;
930 			} else {
931 				kdio_printf(iocom, 1, "%s\n",
932 					    "msgrx: state reused for DELETE");
933 				error = EINVAL;
934 			}
935 			break;
936 		}
937 		error = 0;
938 		break;
939 	default:
940 		/*
941 		 * Check for mid-stream ABORT command received, otherwise
942 		 * allow.
943 		 */
944 		if (msg->any.head.cmd & DMSGF_ABORT) {
945 			if (state == &iocom->state0 ||
946 			    (state->rxcmd & DMSGF_CREATE) == 0) {
947 				error = EALREADY;
948 				break;
949 			}
950 		}
951 		error = 0;
952 		break;
953 	case DMSGF_REPLY | DMSGF_CREATE:
954 	case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
955 		/*
956 		 * When receiving a reply with CREATE set the original
957 		 * persistent state message should already exist.
958 		 */
959 		if (state == &iocom->state0) {
960 			kdio_printf(iocom, 1,
961 				    "msgrx: no state match for "
962 				    "REPLY cmd=%08x msgid=%016jx\n",
963 				    msg->any.head.cmd,
964 				    (intmax_t)msg->any.head.msgid);
965 			error = EINVAL;
966 			break;
967 		}
968 		state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
969 		error = 0;
970 		break;
971 	case DMSGF_REPLY | DMSGF_DELETE:
972 		/*
973 		 * Received REPLY+ABORT+DELETE in case where msgid has
974 		 * already been fully closed, ignore the message.
975 		 */
976 		if (state == &iocom->state0) {
977 			if (msg->any.head.cmd & DMSGF_ABORT) {
978 				error = EALREADY;
979 			} else {
980 				kdio_printf(iocom, 1, "%s\n",
981 					    "msgrx: no state match "
982 					    "for REPLY|DELETE");
983 				error = EINVAL;
984 			}
985 			break;
986 		}
987 
988 		/*
989 		 * Received REPLY+ABORT+DELETE in case where msgid has
990 		 * already been reused for an unrelated message,
991 		 * ignore the message.
992 		 */
993 		if ((state->rxcmd & DMSGF_CREATE) == 0) {
994 			if (msg->any.head.cmd & DMSGF_ABORT) {
995 				error = EALREADY;
996 			} else {
997 				kdio_printf(iocom, 1, "%s\n",
998 					    "msgrx: state reused "
999 					    "for REPLY|DELETE");
1000 				error = EINVAL;
1001 			}
1002 			break;
1003 		}
1004 		error = 0;
1005 		break;
1006 	case DMSGF_REPLY:
1007 		/*
1008 		 * Check for mid-stream ABORT reply received to sent command.
1009 		 */
1010 		if (msg->any.head.cmd & DMSGF_ABORT) {
1011 			if (state == &iocom->state0 ||
1012 			    (state->rxcmd & DMSGF_CREATE) == 0) {
1013 				error = EALREADY;
1014 				break;
1015 			}
1016 		}
1017 		error = 0;
1018 		break;
1019 	}
1020 
1021 	/*
1022 	 * Calculate the easy-switch() transactional command.  Represents
1023 	 * the outer-transaction command for any transaction-create or
1024 	 * transaction-delete, and the inner message command for any
1025 	 * non-transaction or inside-transaction command.  tcmd will be
1026 	 * set to 0 if the message state is illegal.
1027 	 *
1028 	 * The two can be told apart because outer-transaction commands
1029 	 * always have a DMSGF_CREATE and/or DMSGF_DELETE flag.
1030 	 */
1031 done:
1032 	if (msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE)) {
1033 		if (state != &iocom->state0) {
1034 			msg->tcmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
1035 				    (msg->any.head.cmd & (DMSGF_CREATE |
1036 							  DMSGF_DELETE |
1037 							  DMSGF_REPLY));
1038 		} else {
1039 			msg->tcmd = 0;
1040 		}
1041 	} else {
1042 		msg->tcmd = msg->any.head.cmd & DMSGF_CMDSWMASK;
1043 	}
1044 
1045 	/*
1046 	 * Adjust the state for DELETE handling now, before making the
1047 	 * callback so we are atomic with other state updates.
1048 	 *
1049 	 * Subq/parent linkages are cleaned up after the callback.
1050 	 * If an error occurred the message is ignored and state is not
1051 	 * updated.
1052 	 */
1053 	if ((state = msg->state) == NULL || error != 0) {
1054 		kdio_printf(iocom, 1,
1055 			    "msgrx: state=%p error %d\n",
1056 			    state, error);
1057 	} else if (msg->any.head.cmd & DMSGF_DELETE) {
1058 		KKASSERT((state->rxcmd & DMSGF_DELETE) == 0);
1059 		state->rxcmd |= DMSGF_DELETE;
1060 		if (state->txcmd & DMSGF_DELETE) {
1061 			KKASSERT(state->flags & KDMSG_STATE_RBINSERTED);
1062 			if (state->rxcmd & DMSGF_REPLY) {
1063 				KKASSERT(msg->any.head.cmd &
1064 					 DMSGF_REPLY);
1065 				RB_REMOVE(kdmsg_state_tree,
1066 					  &iocom->statewr_tree, state);
1067 			} else {
1068 				KKASSERT((msg->any.head.cmd &
1069 					  DMSGF_REPLY) == 0);
1070 				RB_REMOVE(kdmsg_state_tree,
1071 					  &iocom->staterd_tree, state);
1072 			}
1073 			state->flags &= ~KDMSG_STATE_RBINSERTED;
1074 			kdmsg_state_drop(state);	/* state on rbtree */
1075 		}
1076 	}
1077 	lockmgr(&iocom->msglk, LK_RELEASE);
1078 
1079 	return (error);
1080 }
1081 
1082 /*
1083  * Called instead of iocom->rcvmsg() if any of the AUTO flags are set.
1084  * This routine must call iocom->rcvmsg() for anything not automatically
1085  * handled.
1086  */
1087 static int
1088 kdmsg_autorxmsg(kdmsg_msg_t *msg)
1089 {
1090 	kdmsg_iocom_t *iocom = msg->state->iocom;
1091 	kdmsg_msg_t *rep;
1092 	int error = 0;
1093 	uint32_t cmd;
1094 
1095 	/*
1096 	 * Main switch processes transaction create/delete sequences only.
1097 	 * Use icmd (DELETEs use DMSG_LNK_ERROR
1098 	 *
1099 	 * NOTE: If processing in-transaction messages you generally want
1100 	 *	 an inner switch on msg->any.head.cmd.
1101 	 */
1102 	if (msg->state) {
1103 		cmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
1104 		      (msg->any.head.cmd & (DMSGF_CREATE |
1105 					    DMSGF_DELETE |
1106 					    DMSGF_REPLY));
1107 	} else {
1108 		cmd = 0;
1109 	}
1110 
1111 	switch(cmd) {
1112 	case DMSG_LNK_PING:
1113 		/*
1114 		 * Received ping, send reply
1115 		 */
1116 		rep = kdmsg_msg_alloc(msg->state, DMSG_LNK_PING | DMSGF_REPLY,
1117 				      NULL, NULL);
1118 		kdmsg_msg_write(rep);
1119 		break;
1120 	case DMSG_LNK_PING | DMSGF_REPLY:
1121 		/* ignore replies */
1122 		break;
1123 	case DMSG_LNK_CONN | DMSGF_CREATE:
1124 	case DMSG_LNK_CONN | DMSGF_CREATE | DMSGF_DELETE:
1125 		/*
1126 		 * Received LNK_CONN transaction.  Transmit response and
1127 		 * leave transaction open, which allows the other end to
1128 		 * start to the SPAN protocol.
1129 		 *
1130 		 * Handle shim after acknowledging the CONN.
1131 		 */
1132 		if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1133 			if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1134 				kdmsg_msg_result(msg, 0);
1135 				if (iocom->auto_callback)
1136 					iocom->auto_callback(msg);
1137 			} else {
1138 				error = iocom->rcvmsg(msg);
1139 			}
1140 			break;
1141 		}
1142 		/* fall through */
1143 	case DMSG_LNK_CONN | DMSGF_DELETE:
1144 		/*
1145 		 * This message is usually simulated after a link is lost
1146 		 * to clean up the transaction.
1147 		 */
1148 		if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1149 			if (iocom->auto_callback)
1150 				iocom->auto_callback(msg);
1151 			kdmsg_msg_reply(msg, 0);
1152 		} else {
1153 			error = iocom->rcvmsg(msg);
1154 		}
1155 		break;
1156 	case DMSG_LNK_SPAN | DMSGF_CREATE:
1157 	case DMSG_LNK_SPAN | DMSGF_CREATE | DMSGF_DELETE:
1158 		/*
1159 		 * Received LNK_SPAN transaction.  We do not have to respond
1160 		 * (except on termination), but we must leave the transaction
1161 		 * open.
1162 		 *
1163 		 * Handle shim after acknowledging the SPAN.
1164 		 */
1165 		if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1166 			if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1167 				if (iocom->auto_callback)
1168 					iocom->auto_callback(msg);
1169 				break;
1170 			}
1171 			/* fall through */
1172 		} else {
1173 			error = iocom->rcvmsg(msg);
1174 			break;
1175 		}
1176 		/* fall through */
1177 	case DMSG_LNK_SPAN | DMSGF_DELETE:
1178 		/*
1179 		 * Process shims (auto_callback) before cleaning up the
1180 		 * circuit structure and closing the transactions.  Device
1181 		 * driver should ensure that the circuit is not used after
1182 		 * the auto_callback() returns.
1183 		 *
1184 		 * Handle shim before closing the SPAN transaction.
1185 		 */
1186 		if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1187 			if (iocom->auto_callback)
1188 				iocom->auto_callback(msg);
1189 			kdmsg_msg_reply(msg, 0);
1190 		} else {
1191 			error = iocom->rcvmsg(msg);
1192 		}
1193 		break;
1194 	default:
1195 		/*
1196 		 * Anything unhandled goes into rcvmsg.
1197 		 *
1198 		 * NOTE: Replies to link-level messages initiated by our side
1199 		 *	 are handled by the state callback, they are NOT
1200 		 *	 handled here.
1201 		 */
1202 		error = iocom->rcvmsg(msg);
1203 		break;
1204 	}
1205 	return (error);
1206 }
1207 
1208 /*
1209  * Post-receive-handling message and state cleanup.  This routine is called
1210  * after the state function handling/callback to properly dispose of the
1211  * message and unlink the state's parent/subq linkage if the state is
1212  * completely closed.
1213  *
1214  * msglk is not held.
1215  */
1216 static
1217 void
1218 kdmsg_state_cleanuprx(kdmsg_msg_t *msg)
1219 {
1220 	kdmsg_state_t *state = msg->state;
1221 	kdmsg_iocom_t *iocom = state->iocom;
1222 
1223 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1224 	if (state != &iocom->state0) {
1225 		/*
1226 		 * When terminating a transaction (in either direction), all
1227 		 * sub-states are aborted.
1228 		 */
1229 		if ((msg->any.head.cmd & DMSGF_DELETE) &&
1230 		    TAILQ_FIRST(&msg->state->subq)) {
1231 			kdio_printf(iocom, 2,
1232 				    "simulate failure for substates of "
1233 				    "state %p cmd %08x/%08x\n",
1234 				    msg->state,
1235 				    msg->state->rxcmd,
1236 				    msg->state->txcmd);
1237 			kdmsg_simulate_failure(msg->state,
1238 					       0, DMSG_ERR_LOSTLINK);
1239 		}
1240 
1241 		/*
1242 		 * Once the state is fully closed we can (try to) remove it
1243 		 * from the subq topology.
1244 		 */
1245 		if ((state->flags & KDMSG_STATE_SUBINSERTED) &&
1246 		    (state->rxcmd & DMSGF_DELETE) &&
1247 		    (state->txcmd & DMSGF_DELETE)) {
1248 			/*
1249 			 * Remove parent linkage if state is completely closed.
1250 			 */
1251 			kdmsg_subq_delete(state);
1252 		}
1253 	}
1254 	kdmsg_msg_free(msg);
1255 
1256 	lockmgr(&iocom->msglk, LK_RELEASE);
1257 }
1258 
1259 /*
1260  * Remove state from its parent's subq.  This can wind up recursively
1261  * dropping the parent upward.
1262  *
1263  * NOTE: Once we drop the parent, our pstate pointer may become invalid.
1264  */
1265 static
1266 void
1267 kdmsg_subq_delete(kdmsg_state_t *state)
1268 {
1269 	kdmsg_state_t *pstate;
1270 
1271 	if (state->flags & KDMSG_STATE_SUBINSERTED) {
1272 		pstate = state->parent;
1273 		KKASSERT(pstate);
1274 		if (pstate->scan == state)
1275 			pstate->scan = NULL;
1276 		TAILQ_REMOVE(&pstate->subq, state, entry);
1277 		state->flags &= ~KDMSG_STATE_SUBINSERTED;
1278 		state->parent = NULL;
1279 		if (TAILQ_EMPTY(&pstate->subq)) {
1280 			kdmsg_state_drop(pstate);/* pstate->subq */
1281 		}
1282 		pstate = NULL;			 /* safety */
1283 		kdmsg_state_drop(state);  	 /* pstate->subq */
1284 	} else {
1285 		KKASSERT(state->parent == NULL);
1286 	}
1287 }
1288 
1289 /*
1290  * Simulate receiving a message which terminates an active transaction
1291  * state.  Our simulated received message must set DELETE and may also
1292  * have to set CREATE.  It must also ensure that all fields are set such
1293  * that the receive handling code can find the state (kdmsg_state_msgrx())
1294  * or an endless loop will ensue.
1295  *
1296  * This is used when the other end of the link is dead so the device driver
1297  * gets a completed transaction for all pending states.
1298  *
1299  * Called with iocom locked.
1300  */
1301 static
1302 void
1303 kdmsg_simulate_failure(kdmsg_state_t *state, int meto, int error)
1304 {
1305 	kdmsg_state_t *substate;
1306 
1307 	kdmsg_state_hold(state);		/* aborting */
1308 
1309 	/*
1310 	 * Abort parent state first. Parent will not actually disappear
1311 	 * until children are gone.  Device drivers must handle the situation.
1312 	 * The advantage of this is that device drivers can flag the situation
1313 	 * as an interlock against new operations on dying states.  And since
1314 	 * device operations are often asynchronous anyway, this sequence of
1315 	 * events works out better.
1316 	 */
1317 	if (meto)
1318 		kdmsg_state_abort(state);
1319 
1320 	/*
1321 	 * Recurse through any children.
1322 	 */
1323 again:
1324 	TAILQ_FOREACH(substate, &state->subq, entry) {
1325 		if (substate->flags & KDMSG_STATE_ABORTING)
1326 			continue;
1327 		state->scan = substate;
1328 		kdmsg_simulate_failure(substate, 1, error);
1329 		if (state->scan != substate)
1330 			goto again;
1331 	}
1332 	kdmsg_state_drop(state);		/* aborting */
1333 }
1334 
1335 static
1336 void
1337 kdmsg_state_abort(kdmsg_state_t *state)
1338 {
1339 	kdmsg_msg_t *msg;
1340 
1341 	/*
1342 	 * Set ABORTING and DYING, return if already set.  If the state was
1343 	 * just allocated we defer the abort operation until the related
1344 	 * message is processed.
1345 	 */
1346 	KKASSERT((state->flags & KDMSG_STATE_ABORTING) == 0);
1347 	if (state->flags & KDMSG_STATE_ABORTING)
1348 		return;
1349 	state->flags |= KDMSG_STATE_ABORTING;
1350 	kdmsg_state_dying(state);
1351 	if (state->flags & KDMSG_STATE_NEW) {
1352 		kdio_printf(iocom, 5,
1353 			    "kdmsg_state_abort(0): state %p rxcmd %08x "
1354 			    "txcmd %08x flags %08x - in NEW state\n",
1355 			    state, state->rxcmd,
1356 			    state->txcmd, state->flags);
1357 		return;
1358 	}
1359 
1360 	/*
1361 	 * NOTE: The DELETE flag might already be set due to an early
1362 	 *	 termination.
1363 	 *
1364 	 * NOTE: Args to kdmsg_msg_alloc() to avoid dynamic state allocation.
1365 	 *
1366 	 * NOTE: We are simulating a received message using our state
1367 	 *	 (vs a message generated by the other side using its state),
1368 	 *	 so we must invert DMSGF_REVTRANS and DMSGF_REVCIRC.
1369 	 */
1370 	kdio_printf(iocom, 5,
1371 		    "kdmsg_state_abort(1): state %p rxcmd %08x txcmd %08x\n",
1372 		    state, state->rxcmd, state->txcmd);
1373 	if ((state->rxcmd & DMSGF_DELETE) == 0) {
1374 		msg = kdmsg_msg_alloc(state, DMSG_LNK_ERROR, NULL, NULL);
1375 		if ((state->rxcmd & DMSGF_CREATE) == 0)
1376 			msg->any.head.cmd |= DMSGF_CREATE;
1377 		msg->any.head.cmd |= DMSGF_DELETE |
1378 				     (state->rxcmd & DMSGF_REPLY);
1379 		msg->any.head.cmd ^= (DMSGF_REVTRANS | DMSGF_REVCIRC);
1380 		msg->any.head.error = DMSG_ERR_LOSTLINK;
1381 		kdio_printf(iocom, 5,
1382 			    "kdmsg_state_abort(a): state %p msgcmd %08x\n",
1383 			    state, msg->any.head.cmd);
1384 		/* circuit not initialized */
1385 		lockmgr(&state->iocom->msglk, LK_RELEASE);
1386 		kdmsg_msg_receive_handling(msg);
1387 		lockmgr(&state->iocom->msglk, LK_EXCLUSIVE);
1388 		msg = NULL;
1389 	}
1390 	kdio_printf(iocom, 5,
1391 		    "kdmsg_state_abort(2): state %p rxcmd %08x txcmd %08x\n",
1392 		    state, state->rxcmd, state->txcmd);
1393 }
1394 
1395 /*
1396  * Recursively sets KDMSG_STATE_DYING on state and all sub-states, preventing
1397  * the transmission of any new messages on these states.  This is done
1398  * atomically when parent state is terminating, whereas setting ABORTING is
1399  * not atomic and can leak races.
1400  */
1401 static
1402 void
1403 kdmsg_state_dying(kdmsg_state_t *state)
1404 {
1405 	kdmsg_state_t *scan;
1406 
1407 	if ((state->flags & KDMSG_STATE_DYING) == 0) {
1408 		state->flags |= KDMSG_STATE_DYING;
1409 		TAILQ_FOREACH(scan, &state->subq, entry)
1410 			kdmsg_state_dying(scan);
1411 	}
1412 }
1413 
1414 /*
1415  * Process state tracking for a message prior to transmission.
1416  *
1417  * Called with msglk held and the msg dequeued.  Returns non-zero if
1418  * the message is bad and should be deleted by the caller.
1419  *
1420  * One-off messages are usually with dummy state and msg->state may be NULL
1421  * in this situation.
1422  *
1423  * New transactions (when CREATE is set) will insert the state.
1424  *
1425  * May request that caller discard the message by setting *discardp to 1.
1426  * A NULL state may be returned in this case.
1427  */
1428 static
1429 int
1430 kdmsg_state_msgtx(kdmsg_msg_t *msg)
1431 {
1432 	kdmsg_iocom_t *iocom = msg->state->iocom;
1433 	kdmsg_state_t *state;
1434 	int error;
1435 
1436 	/*
1437 	 * Make sure a state structure is ready to go in case we need a new
1438 	 * one.  This is the only routine which uses freewr_state so no
1439 	 * races are possible.
1440 	 */
1441 	if ((state = iocom->freewr_state) == NULL) {
1442 		state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1443 		state->flags = KDMSG_STATE_DYNAMIC;
1444 		state->iocom = iocom;
1445 		state->refs = 1;
1446 		TAILQ_INIT(&state->subq);
1447 		iocom->freewr_state = state;
1448 	}
1449 
1450 	/*
1451 	 * Lock RB tree.  If persistent state is present it will have already
1452 	 * been assigned to msg.
1453 	 */
1454 	state = msg->state;
1455 
1456 	/*
1457 	 * Short-cut one-off or mid-stream messages (state may be NULL).
1458 	 */
1459 	if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1460 				  DMSGF_ABORT)) == 0) {
1461 		return(0);
1462 	}
1463 
1464 
1465 	/*
1466 	 * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
1467 	 * inside the case statements.
1468 	 */
1469 	switch(msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1470 				    DMSGF_REPLY)) {
1471 	case DMSGF_CREATE:
1472 	case DMSGF_CREATE | DMSGF_DELETE:
1473 		/*
1474 		 * Insert the new persistent message state and mark
1475 		 * half-closed if DELETE is set.  Since this is a new
1476 		 * message it isn't possible to transition into the fully
1477 		 * closed state here.
1478 		 *
1479 		 * XXX state must be assigned and inserted by
1480 		 *     kdmsg_msg_write().  txcmd is assigned by us
1481 		 *     on-transmit.
1482 		 */
1483 		KKASSERT(state != NULL);
1484 		state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
1485 		state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1486 		state->rxcmd = DMSGF_REPLY;
1487 		state->flags &= ~KDMSG_STATE_NEW;
1488 		error = 0;
1489 		break;
1490 	case DMSGF_DELETE:
1491 		/*
1492 		 * Sent ABORT+DELETE in case where msgid has already
1493 		 * been fully closed, ignore the message.
1494 		 */
1495 		if (state == &iocom->state0) {
1496 			if (msg->any.head.cmd & DMSGF_ABORT) {
1497 				error = EALREADY;
1498 			} else {
1499 				kdio_printf(iocom, 1,
1500 					"msgtx: no state match "
1501 					"for DELETE cmd=%08x msgid=%016jx\n",
1502 					msg->any.head.cmd,
1503 					(intmax_t)msg->any.head.msgid);
1504 				error = EINVAL;
1505 			}
1506 			break;
1507 		}
1508 
1509 		/*
1510 		 * Sent ABORT+DELETE in case where msgid has
1511 		 * already been reused for an unrelated message,
1512 		 * ignore the message.
1513 		 */
1514 		if ((state->txcmd & DMSGF_CREATE) == 0) {
1515 			if (msg->any.head.cmd & DMSGF_ABORT) {
1516 				error = EALREADY;
1517 			} else {
1518 				kdio_printf(iocom, 1, "%s\n",
1519 					    "msgtx: state reused "
1520 					    "for DELETE");
1521 				error = EINVAL;
1522 			}
1523 			break;
1524 		}
1525 		error = 0;
1526 		break;
1527 	default:
1528 		/*
1529 		 * Check for mid-stream ABORT command sent
1530 		 */
1531 		if (msg->any.head.cmd & DMSGF_ABORT) {
1532 			if (state == &state->iocom->state0 ||
1533 			    (state->txcmd & DMSGF_CREATE) == 0) {
1534 				error = EALREADY;
1535 				break;
1536 			}
1537 		}
1538 		error = 0;
1539 		break;
1540 	case DMSGF_REPLY | DMSGF_CREATE:
1541 	case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
1542 		/*
1543 		 * When transmitting a reply with CREATE set the original
1544 		 * persistent state message should already exist.
1545 		 */
1546 		if (state == &state->iocom->state0) {
1547 			kdio_printf(iocom, 1, "%s\n",
1548 				    "msgtx: no state match "
1549 				    "for REPLY | CREATE");
1550 			error = EINVAL;
1551 			break;
1552 		}
1553 		state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1554 		error = 0;
1555 		break;
1556 	case DMSGF_REPLY | DMSGF_DELETE:
1557 		/*
1558 		 * When transmitting a reply with DELETE set the original
1559 		 * persistent state message should already exist.
1560 		 *
1561 		 * This is very similar to the REPLY|CREATE|* case except
1562 		 * txcmd is already stored, so we just add the DELETE flag.
1563 		 *
1564 		 * Sent REPLY+ABORT+DELETE in case where msgid has
1565 		 * already been fully closed, ignore the message.
1566 		 */
1567 		if (state == &state->iocom->state0) {
1568 			if (msg->any.head.cmd & DMSGF_ABORT) {
1569 				error = EALREADY;
1570 			} else {
1571 				kdio_printf(iocom, 1, "%s\n",
1572 					    "msgtx: no state match "
1573 					    "for REPLY | DELETE");
1574 				error = EINVAL;
1575 			}
1576 			break;
1577 		}
1578 
1579 		/*
1580 		 * Sent REPLY+ABORT+DELETE in case where msgid has already
1581 		 * been reused for an unrelated message, ignore the message.
1582 		 */
1583 		if ((state->txcmd & DMSGF_CREATE) == 0) {
1584 			if (msg->any.head.cmd & DMSGF_ABORT) {
1585 				error = EALREADY;
1586 			} else {
1587 				kdio_printf(iocom, 1, "%s\n",
1588 					    "msgtx: state reused "
1589 					    "for REPLY | DELETE");
1590 				error = EINVAL;
1591 			}
1592 			break;
1593 		}
1594 		error = 0;
1595 		break;
1596 	case DMSGF_REPLY:
1597 		/*
1598 		 * Check for mid-stream ABORT reply sent.
1599 		 *
1600 		 * One-off REPLY messages are allowed for e.g. status updates.
1601 		 */
1602 		if (msg->any.head.cmd & DMSGF_ABORT) {
1603 			if (state == &state->iocom->state0 ||
1604 			    (state->txcmd & DMSGF_CREATE) == 0) {
1605 				error = EALREADY;
1606 				break;
1607 			}
1608 		}
1609 		error = 0;
1610 		break;
1611 	}
1612 
1613 	/*
1614 	 * Set interlock (XXX hack) in case the send side blocks and a
1615 	 * response is returned before kdmsg_state_cleanuptx() can be
1616 	 * run.
1617 	 */
1618 	if (state && error == 0)
1619 		state->flags |= KDMSG_STATE_INTERLOCK;
1620 
1621 	return (error);
1622 }
1623 
1624 /*
1625  * Called with iocom locked.
1626  */
1627 static
1628 void
1629 kdmsg_state_cleanuptx(kdmsg_msg_t *msg)
1630 {
1631 	kdmsg_iocom_t *iocom = msg->state->iocom;
1632 	kdmsg_state_t *state;
1633 
1634 	if ((state = msg->state) == NULL) {
1635 		kdmsg_msg_free(msg);
1636 		return;
1637 	}
1638 
1639 	/*
1640 	 * Clear interlock (XXX hack) in case the send side blocks and a
1641 	 * response is returned in the other thread before
1642 	 * kdmsg_state_cleanuptx() can be run.  We maintain our hold on
1643 	 * iocom->msglk so we can do this before completing our task.
1644 	 */
1645 	if (state->flags & KDMSG_STATE_SIGNAL) {
1646 		kdio_printf(iocom, 1, "state %p interlock!\n", state);
1647 		wakeup(state);
1648 	}
1649 	state->flags &= ~(KDMSG_STATE_INTERLOCK | KDMSG_STATE_SIGNAL);
1650 	kdmsg_state_hold(state);
1651 
1652 	if (msg->any.head.cmd & DMSGF_DELETE) {
1653 		KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1654 		state->txcmd |= DMSGF_DELETE;
1655 		if (state->rxcmd & DMSGF_DELETE) {
1656 			KKASSERT(state->flags & KDMSG_STATE_RBINSERTED);
1657 			if (state->txcmd & DMSGF_REPLY) {
1658 				KKASSERT(msg->any.head.cmd &
1659 					 DMSGF_REPLY);
1660 				RB_REMOVE(kdmsg_state_tree,
1661 					  &iocom->staterd_tree, state);
1662 			} else {
1663 				KKASSERT((msg->any.head.cmd &
1664 					  DMSGF_REPLY) == 0);
1665 				RB_REMOVE(kdmsg_state_tree,
1666 					  &iocom->statewr_tree, state);
1667 			}
1668 			state->flags &= ~KDMSG_STATE_RBINSERTED;
1669 
1670 			/*
1671 			 * The subq recursion is used for parent linking and
1672 			 * scanning the topology for aborts, we can only
1673 			 * remove leafs.  The circuit is effectively dead now,
1674 			 * but topology won't be torn down until all of its
1675 			 * children have finished/aborted.
1676 			 *
1677 			 * This is particularly important for end-point
1678 			 * devices which might need to access private data
1679 			 * in parent states.  Out of order disconnects can
1680 			 * occur if an end-point device is processing a
1681 			 * message transaction asynchronously because abort
1682 			 * requests are basically synchronous and it probably
1683 			 * isn't convenient (or possible) for the end-point
1684 			 * to abort an asynchronous operation.
1685 			 */
1686 			if (TAILQ_EMPTY(&state->subq))
1687 				kdmsg_subq_delete(state);
1688 			kdmsg_msg_free(msg);
1689 			kdmsg_state_drop(state);   /* state on rbtree */
1690 		} else {
1691 			kdmsg_msg_free(msg);
1692 		}
1693 	} else {
1694 		kdmsg_msg_free(msg);
1695 	}
1696 
1697 	/*
1698 	 * Deferred abort after transmission.
1699 	 */
1700 	if ((state->flags & (KDMSG_STATE_ABORTING | KDMSG_STATE_DYING)) &&
1701 	    (state->rxcmd & DMSGF_DELETE) == 0) {
1702 		kdio_printf(iocom, 5,
1703 			    "kdmsg_state_cleanuptx: state=%p "
1704 			    "executing deferred abort\n",
1705 			    state);
1706 		state->flags &= ~KDMSG_STATE_ABORTING;
1707 		kdmsg_state_abort(state);
1708 	}
1709 	kdmsg_state_drop(state);
1710 }
1711 
1712 static
1713 void
1714 _kdmsg_state_hold(kdmsg_state_t *state KDMSG_DEBUG_ARGS)
1715 {
1716 	atomic_add_int(&state->refs, 1);
1717 #if KDMSG_DEBUG
1718 	kd_printf(4, "state %p +%d\t%s:%d\n", state, state->refs, file, line);
1719 #endif
1720 }
1721 
1722 static
1723 void
1724 _kdmsg_state_drop(kdmsg_state_t *state KDMSG_DEBUG_ARGS)
1725 {
1726 	KKASSERT(state->refs > 0);
1727 #if KDMSG_DEBUG
1728 	kd_printf(4, "state %p -%d\t%s:%d\n", state, state->refs, file, line);
1729 #endif
1730 	if (atomic_fetchadd_int(&state->refs, -1) == 1)
1731 		kdmsg_state_free(state);
1732 }
1733 
1734 static
1735 void
1736 kdmsg_state_free(kdmsg_state_t *state)
1737 {
1738 	kdmsg_iocom_t *iocom = state->iocom;
1739 
1740 	KKASSERT((state->flags & KDMSG_STATE_RBINSERTED) == 0);
1741 	KKASSERT((state->flags & KDMSG_STATE_SUBINSERTED) == 0);
1742 	KKASSERT(TAILQ_EMPTY(&state->subq));
1743 
1744 	if (state != &state->iocom->state0)
1745 		kfree(state, iocom->mmsg);
1746 }
1747 
1748 kdmsg_msg_t *
1749 kdmsg_msg_alloc(kdmsg_state_t *state, uint32_t cmd,
1750 		int (*func)(kdmsg_state_t *, kdmsg_msg_t *), void *data)
1751 {
1752 	kdmsg_iocom_t *iocom = state->iocom;
1753 	kdmsg_state_t *pstate;
1754 	kdmsg_msg_t *msg;
1755 	size_t hbytes;
1756 
1757 	KKASSERT(iocom != NULL);
1758 	hbytes = (cmd & DMSGF_SIZE) * DMSG_ALIGN;
1759 	msg = kmalloc(offsetof(struct kdmsg_msg, any) + hbytes,
1760 		      iocom->mmsg, M_WAITOK | M_ZERO);
1761 	msg->hdr_size = hbytes;
1762 
1763 	if ((cmd & (DMSGF_CREATE | DMSGF_REPLY)) == DMSGF_CREATE) {
1764 		/*
1765 		 * New transaction, requires tracking state and a unique
1766 		 * msgid to be allocated.
1767 		 *
1768 		 * It is possible to race a circuit failure, inherit the
1769 		 * parent's STATE_DYING flag to trigger an abort sequence
1770 		 * in the transmit path.  By not inheriting ABORTING the
1771 		 * abort sequence can recurse.
1772 		 *
1773 		 * NOTE: The transactions has not yet been initiated so we
1774 		 *	 cannot set DMSGF_CREATE/DELETE bits in txcmd or rxcmd.
1775 		 *	 We have to properly setup DMSGF_REPLY, however.
1776 		 */
1777 		pstate = state;
1778 		state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1779 		TAILQ_INIT(&state->subq);
1780 		state->iocom = iocom;
1781 		state->parent = pstate;
1782 		state->flags = KDMSG_STATE_DYNAMIC |
1783 			       KDMSG_STATE_NEW;
1784 		state->func = func;
1785 		state->any.any = data;
1786 		state->msgid = (uint64_t)(uintptr_t)state;
1787 		/*msg->any.head.msgid = state->msgid;XXX*/
1788 
1789 		lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1790 		if (RB_INSERT(kdmsg_state_tree, &iocom->statewr_tree, state))
1791 			panic("duplicate msgid allocated");
1792 		if (TAILQ_EMPTY(&pstate->subq))
1793 			kdmsg_state_hold(pstate);/* pstate->subq */
1794 		TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
1795 		state->flags |= KDMSG_STATE_RBINSERTED |
1796 				KDMSG_STATE_SUBINSERTED;
1797 		state->flags |= pstate->flags & KDMSG_STATE_DYING;
1798 		kdmsg_state_hold(state);	/* pstate->subq */
1799 		kdmsg_state_hold(state);	/* state on rbtree */
1800 		kdmsg_state_hold(state);	/* msg->state */
1801 		lockmgr(&iocom->msglk, LK_RELEASE);
1802 	} else {
1803 		pstate = state->parent;
1804 		KKASSERT(pstate != NULL);
1805 		kdmsg_state_hold(state);	/* msg->state */
1806 	}
1807 
1808 	if (state->flags & KDMSG_STATE_OPPOSITE)
1809 		cmd |= DMSGF_REVTRANS;
1810 	if (pstate->flags & KDMSG_STATE_OPPOSITE)
1811 		cmd |= DMSGF_REVCIRC;
1812 
1813 	msg->any.head.magic = DMSG_HDR_MAGIC;
1814 	msg->any.head.cmd = cmd;
1815 	msg->any.head.msgid = state->msgid;
1816 	msg->any.head.circuit = pstate->msgid;
1817 	msg->state = state;
1818 
1819 	return (msg);
1820 }
1821 
1822 void
1823 kdmsg_msg_free(kdmsg_msg_t *msg)
1824 {
1825 	kdmsg_iocom_t *iocom = msg->state->iocom;
1826 	kdmsg_state_t *state;
1827 
1828 	if ((msg->flags & KDMSG_FLAG_AUXALLOC) &&
1829 	    msg->aux_data && msg->aux_size) {
1830 		kfree(msg->aux_data, iocom->mmsg);
1831 		msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1832 	}
1833 	if ((state = msg->state) != NULL) {
1834 		msg->state = NULL;
1835 		kdmsg_state_drop(state);	/* msg->state */
1836 	}
1837 	msg->aux_data = NULL;
1838 	msg->aux_size = 0;
1839 
1840 	kfree(msg, iocom->mmsg);
1841 }
1842 
1843 void
1844 kdmsg_detach_aux_data(kdmsg_msg_t *msg, kdmsg_data_t *data)
1845 {
1846 	if (msg->flags & KDMSG_FLAG_AUXALLOC) {
1847 		data->aux_data = msg->aux_data;
1848 		data->aux_size = msg->aux_size;
1849 		data->iocom = msg->state->iocom;
1850 		msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1851 	} else {
1852 		data->aux_data = NULL;
1853 		data->aux_size = 0;
1854 		data->iocom = msg->state->iocom;
1855 	}
1856 }
1857 
1858 void
1859 kdmsg_free_aux_data(kdmsg_data_t *data)
1860 {
1861 	if (data->aux_data)
1862 		kfree(data->aux_data, data->iocom->mmsg);
1863 }
1864 
1865 /*
1866  * Indexed messages are stored in a red-black tree indexed by their
1867  * msgid.  Only persistent messages are indexed.
1868  */
1869 int
1870 kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2)
1871 {
1872 	if (state1->iocom < state2->iocom)
1873 		return(-1);
1874 	if (state1->iocom > state2->iocom)
1875 		return(1);
1876 	if (state1->msgid < state2->msgid)
1877 		return(-1);
1878 	if (state1->msgid > state2->msgid)
1879 		return(1);
1880 	return(0);
1881 }
1882 
1883 /*
1884  * Write a message.  All requisit command flags have been set.
1885  *
1886  * If msg->state is non-NULL the message is written to the existing
1887  * transaction.  msgid will be set accordingly.
1888  *
1889  * If msg->state is NULL and CREATE is set new state is allocated and
1890  * (func, data) is installed.  A msgid is assigned.
1891  *
1892  * If msg->state is NULL and CREATE is not set the message is assumed
1893  * to be a one-way message.  The originator must assign the msgid
1894  * (or leave it 0, which is typical.
1895  *
1896  * This function merely queues the message to the management thread, it
1897  * does not write to the message socket/pipe.
1898  */
1899 void
1900 kdmsg_msg_write(kdmsg_msg_t *msg)
1901 {
1902 	kdmsg_iocom_t *iocom = msg->state->iocom;
1903 
1904 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1905 	kdmsg_msg_write_locked(iocom, msg);
1906 	lockmgr(&iocom->msglk, LK_RELEASE);
1907 }
1908 
1909 static void
1910 kdmsg_msg_write_locked(kdmsg_iocom_t *iocom, kdmsg_msg_t *msg)
1911 {
1912 	kdmsg_state_t *state;
1913 
1914 	if (msg->state) {
1915 		/*
1916 		 * Continuance or termination of existing transaction.
1917 		 * The transaction could have been initiated by either end.
1918 		 *
1919 		 * (Function callback and aux data for the receive side can
1920 		 * be replaced or left alone).
1921 		 */
1922 		state = msg->state;
1923 		msg->any.head.msgid = state->msgid;
1924 	} else {
1925 		/*
1926 		 * One-off message (always uses msgid 0 to distinguish
1927 		 * between a possibly lost in-transaction message due to
1928 		 * competing aborts and a real one-off message?)
1929 		 */
1930 		state = NULL;
1931 		msg->any.head.msgid = 0;
1932 	}
1933 
1934 #if 0
1935 	/*
1936 	 * XXX removed - don't make this a panic, allow the state checks
1937 	 *     below to catch the situation.
1938 	 *
1939 	 * This flag is not set until after the tx thread has drained
1940 	 * the tx msgq and simulated responses.  After that point the
1941 	 * txthread is dead and can no longer simulate responses.
1942 	 *
1943 	 * Device drivers should never try to send a message once this
1944 	 * flag is set.  They should have detected (through the state
1945 	 * closures) that the link is in trouble.
1946 	 */
1947 	if (iocom->flags & KDMSG_IOCOMF_EXITNOACC) {
1948 		lockmgr(&iocom->msglk, LK_RELEASE);
1949 		panic("kdmsg_msg_write: Attempt to write message to "
1950 		      "terminated iocom\n");
1951 	}
1952 #endif
1953 
1954 	/*
1955 	 * For stateful messages, if the circuit is dead or dying we have
1956 	 * to abort the potentially newly-created state and discard the
1957 	 * message.
1958 	 *
1959 	 * - We must discard the message because the other end will not
1960 	 *   be expecting any more messages over the dead or dying circuit
1961 	 *   and might not be able to receive them.
1962 	 *
1963 	 * - We abort the state by simulating a failure to generate a fake
1964 	 *   incoming DELETE.  This will trigger the state callback and allow
1965 	 *   the device to clean things up and reply, closing the outgoing
1966 	 *   direction and allowing the state to be freed.
1967 	 *
1968 	 * This situation occurs quite often, particularly as SPANs stabilize.
1969 	 * End-points must do the right thing.
1970 	 */
1971 	if (state) {
1972 		KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1973 		if (state->flags & KDMSG_STATE_DYING) {
1974 #if 0
1975 		if ((state->flags & KDMSG_STATE_DYING) ||
1976 		    (state->parent->txcmd & DMSGF_DELETE) ||
1977 		    (state->parent->flags & KDMSG_STATE_DYING)) {
1978 #endif
1979 			kdio_printf(iocom, 4,
1980 				    "kdmsg_msg_write: Write to dying circuit "
1981 				    "state=%p "
1982 				    "ptxcmd=%08x prxcmd=%08x flags=%08x\n",
1983 				    state,
1984 				    state->parent->rxcmd,
1985 				    state->parent->txcmd,
1986 				    state->parent->flags);
1987 			kdmsg_state_hold(state);
1988 			kdmsg_state_msgtx(msg);
1989 			kdmsg_state_cleanuptx(msg);
1990 			kdmsg_state_drop(state);
1991 			return;
1992 		}
1993 	}
1994 
1995 	/*
1996 	 * Finish up the msg fields.  Note that msg->aux_size and the
1997 	 * aux_bytes stored in the message header represent the unaligned
1998 	 * (actual) bytes of data, but the buffer is sized to an aligned
1999 	 * size and the CRC is generated over the aligned length.
2000 	 */
2001 	msg->any.head.salt = /* (random << 8) | */ (iocom->msg_seq & 255);
2002 	++iocom->msg_seq;
2003 
2004 	if (msg->aux_data && msg->aux_size) {
2005 		uint32_t abytes = DMSG_DOALIGN(msg->aux_size);
2006 
2007 		msg->any.head.aux_bytes = msg->aux_size;
2008 		msg->any.head.aux_crc = iscsi_crc32(msg->aux_data, abytes);
2009 	}
2010 	msg->any.head.hdr_crc = 0;
2011 	msg->any.head.hdr_crc = iscsi_crc32(msg->any.buf, msg->hdr_size);
2012 
2013 	TAILQ_INSERT_TAIL(&iocom->msgq, msg, qentry);
2014 
2015 	if (iocom->msg_ctl & KDMSG_CLUSTERCTL_SLEEPING) {
2016 		atomic_clear_int(&iocom->msg_ctl,
2017 				 KDMSG_CLUSTERCTL_SLEEPING);
2018 		wakeup(&iocom->msg_ctl);
2019 	}
2020 }
2021 
2022 /*
2023  * Reply to a message and terminate our side of the transaction.
2024  *
2025  * If msg->state is non-NULL we are replying to a one-way message.
2026  */
2027 void
2028 kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error)
2029 {
2030 	kdmsg_state_t *state = msg->state;
2031 	kdmsg_msg_t *nmsg;
2032 	uint32_t cmd;
2033 
2034 	/*
2035 	 * Reply with a simple error code and terminate the transaction.
2036 	 */
2037 	cmd = DMSG_LNK_ERROR;
2038 
2039 	/*
2040 	 * Check if our direction has even been initiated yet, set CREATE.
2041 	 *
2042 	 * Check what direction this is (command or reply direction).  Note
2043 	 * that txcmd might not have been initiated yet.
2044 	 *
2045 	 * If our direction has already been closed we just return without
2046 	 * doing anything.
2047 	 */
2048 	if (state != &state->iocom->state0) {
2049 		if (state->txcmd & DMSGF_DELETE)
2050 			return;
2051 		if ((state->txcmd & DMSGF_CREATE) == 0)
2052 			cmd |= DMSGF_CREATE;
2053 		if (state->txcmd & DMSGF_REPLY)
2054 			cmd |= DMSGF_REPLY;
2055 		cmd |= DMSGF_DELETE;
2056 	} else {
2057 		if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
2058 			cmd |= DMSGF_REPLY;
2059 	}
2060 
2061 	nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2062 	nmsg->any.head.error = error;
2063 	kdmsg_msg_write(nmsg);
2064 }
2065 
2066 /*
2067  * Reply to a message and continue our side of the transaction.
2068  *
2069  * If msg->state is non-NULL we are replying to a one-way message and this
2070  * function degenerates into the same as kdmsg_msg_reply().
2071  */
2072 void
2073 kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error)
2074 {
2075 	kdmsg_state_t *state = msg->state;
2076 	kdmsg_msg_t *nmsg;
2077 	uint32_t cmd;
2078 
2079 	/*
2080 	 * Return a simple result code, do NOT terminate the transaction.
2081 	 */
2082 	cmd = DMSG_LNK_ERROR;
2083 
2084 	/*
2085 	 * Check if our direction has even been initiated yet, set CREATE.
2086 	 *
2087 	 * Check what direction this is (command or reply direction).  Note
2088 	 * that txcmd might not have been initiated yet.
2089 	 *
2090 	 * If our direction has already been closed we just return without
2091 	 * doing anything.
2092 	 */
2093 	if (state != &state->iocom->state0) {
2094 		if (state->txcmd & DMSGF_DELETE)
2095 			return;
2096 		if ((state->txcmd & DMSGF_CREATE) == 0)
2097 			cmd |= DMSGF_CREATE;
2098 		if (state->txcmd & DMSGF_REPLY)
2099 			cmd |= DMSGF_REPLY;
2100 		/* continuing transaction, do not set MSGF_DELETE */
2101 	} else {
2102 		if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
2103 			cmd |= DMSGF_REPLY;
2104 	}
2105 
2106 	nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2107 	nmsg->any.head.error = error;
2108 	kdmsg_msg_write(nmsg);
2109 }
2110 
2111 /*
2112  * Reply to a message and terminate our side of the transaction.
2113  *
2114  * If msg->state is non-NULL we are replying to a one-way message.
2115  */
2116 void
2117 kdmsg_state_reply(kdmsg_state_t *state, uint32_t error)
2118 {
2119 	kdmsg_msg_t *nmsg;
2120 	uint32_t cmd;
2121 
2122 	/*
2123 	 * Reply with a simple error code and terminate the transaction.
2124 	 */
2125 	cmd = DMSG_LNK_ERROR;
2126 
2127 	/*
2128 	 * Check if our direction has even been initiated yet, set CREATE.
2129 	 *
2130 	 * Check what direction this is (command or reply direction).  Note
2131 	 * that txcmd might not have been initiated yet.
2132 	 *
2133 	 * If our direction has already been closed we just return without
2134 	 * doing anything.
2135 	 */
2136 	KKASSERT(state);
2137 	if (state->txcmd & DMSGF_DELETE)
2138 		return;
2139 	if ((state->txcmd & DMSGF_CREATE) == 0)
2140 		cmd |= DMSGF_CREATE;
2141 	if (state->txcmd & DMSGF_REPLY)
2142 		cmd |= DMSGF_REPLY;
2143 	cmd |= DMSGF_DELETE;
2144 
2145 	nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2146 	nmsg->any.head.error = error;
2147 	kdmsg_msg_write(nmsg);
2148 }
2149 
2150 /*
2151  * Reply to a message and continue our side of the transaction.
2152  *
2153  * If msg->state is non-NULL we are replying to a one-way message and this
2154  * function degenerates into the same as kdmsg_msg_reply().
2155  */
2156 void
2157 kdmsg_state_result(kdmsg_state_t *state, uint32_t error)
2158 {
2159 	kdmsg_msg_t *nmsg;
2160 	uint32_t cmd;
2161 
2162 	/*
2163 	 * Return a simple result code, do NOT terminate the transaction.
2164 	 */
2165 	cmd = DMSG_LNK_ERROR;
2166 
2167 	/*
2168 	 * Check if our direction has even been initiated yet, set CREATE.
2169 	 *
2170 	 * Check what direction this is (command or reply direction).  Note
2171 	 * that txcmd might not have been initiated yet.
2172 	 *
2173 	 * If our direction has already been closed we just return without
2174 	 * doing anything.
2175 	 */
2176 	KKASSERT(state);
2177 	if (state->txcmd & DMSGF_DELETE)
2178 		return;
2179 	if ((state->txcmd & DMSGF_CREATE) == 0)
2180 		cmd |= DMSGF_CREATE;
2181 	if (state->txcmd & DMSGF_REPLY)
2182 		cmd |= DMSGF_REPLY;
2183 	/* continuing transaction, do not set MSGF_DELETE */
2184 
2185 	nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2186 	nmsg->any.head.error = error;
2187 	kdmsg_msg_write(nmsg);
2188 }
2189