xref: /dragonfly/sys/kern/kern_dmsg.c (revision ef3ac1d1)
1 /*-
2  * Copyright (c) 2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * TODO: txcmd CREATE state is deferred by txmsgq, need to calculate
36  *	 a streaming response.  See subr_diskiocom()'s diskiodone().
37  */
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/kernel.h>
41 #include <sys/conf.h>
42 #include <sys/systm.h>
43 #include <sys/queue.h>
44 #include <sys/tree.h>
45 #include <sys/malloc.h>
46 #include <sys/mount.h>
47 #include <sys/socket.h>
48 #include <sys/vnode.h>
49 #include <sys/file.h>
50 #include <sys/proc.h>
51 #include <sys/priv.h>
52 #include <sys/thread.h>
53 #include <sys/globaldata.h>
54 #include <sys/limits.h>
55 
56 #include <sys/dmsg.h>
57 
58 RB_GENERATE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp);
59 
60 static int kdmsg_msg_receive_handling(kdmsg_msg_t *msg);
61 static int kdmsg_state_msgrx(kdmsg_msg_t *msg);
62 static int kdmsg_state_msgtx(kdmsg_msg_t *msg);
63 static void kdmsg_state_cleanuprx(kdmsg_msg_t *msg);
64 static void kdmsg_state_cleanuptx(kdmsg_msg_t *msg);
65 static void kdmsg_state_abort(kdmsg_state_t *state);
66 static void kdmsg_state_free(kdmsg_state_t *state);
67 
68 static void kdmsg_iocom_thread_rd(void *arg);
69 static void kdmsg_iocom_thread_wr(void *arg);
70 static int kdmsg_autorxmsg(kdmsg_msg_t *msg);
71 
72 /*static struct lwkt_token kdmsg_token = LWKT_TOKEN_INITIALIZER(kdmsg_token);*/
73 
74 /*
75  * Initialize the roll-up communications structure for a network
76  * messaging session.  This function does not install the socket.
77  */
78 void
79 kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, uint32_t flags,
80 		 struct malloc_type *mmsg,
81 		 int (*rcvmsg)(kdmsg_msg_t *msg))
82 {
83 	bzero(iocom, sizeof(*iocom));
84 	iocom->handle = handle;
85 	iocom->mmsg = mmsg;
86 	iocom->rcvmsg = rcvmsg;
87 	iocom->flags = flags;
88 	lockinit(&iocom->msglk, "h2msg", 0, 0);
89 	TAILQ_INIT(&iocom->msgq);
90 	RB_INIT(&iocom->staterd_tree);
91 	RB_INIT(&iocom->statewr_tree);
92 
93 	iocom->state0.iocom = iocom;
94 	iocom->state0.parent = &iocom->state0;
95 	TAILQ_INIT(&iocom->state0.subq);
96 }
97 
98 /*
99  * [Re]connect using the passed file pointer.  The caller must ref the
100  * fp for us.  We own that ref now.
101  */
102 void
103 kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp,
104 		      const char *subsysname)
105 {
106 	/*
107 	 * Destroy the current connection
108 	 */
109 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
110 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILL);
111 	while (iocom->msgrd_td || iocom->msgwr_td) {
112 		wakeup(&iocom->msg_ctl);
113 		lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
114 	}
115 
116 	/*
117 	 * Drop communications descriptor
118 	 */
119 	if (iocom->msg_fp) {
120 		fdrop(iocom->msg_fp);
121 		iocom->msg_fp = NULL;
122 	}
123 
124 	/*
125 	 * Setup new communications descriptor
126 	 */
127 	iocom->msg_ctl = 0;
128 	iocom->msg_fp = fp;
129 	iocom->msg_seq = 0;
130 	iocom->flags &= ~KDMSG_IOCOMF_EXITNOACC;
131 
132 	lwkt_create(kdmsg_iocom_thread_rd, iocom, &iocom->msgrd_td,
133 		    NULL, 0, -1, "%s-msgrd", subsysname);
134 	lwkt_create(kdmsg_iocom_thread_wr, iocom, &iocom->msgwr_td,
135 		    NULL, 0, -1, "%s-msgwr", subsysname);
136 	lockmgr(&iocom->msglk, LK_RELEASE);
137 }
138 
139 /*
140  * Caller sets up iocom->auto_lnk_conn and iocom->auto_lnk_span, then calls
141  * this function to handle the state machine for LNK_CONN and LNK_SPAN.
142  */
143 static int kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
144 static int kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
145 
146 void
147 kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom,
148 			 void (*auto_callback)(kdmsg_msg_t *msg))
149 {
150 	kdmsg_msg_t *msg;
151 
152 	iocom->auto_callback = auto_callback;
153 
154 	msg = kdmsg_msg_alloc(&iocom->state0,
155 			      DMSG_LNK_CONN | DMSGF_CREATE,
156 			      kdmsg_lnk_conn_reply, NULL);
157 	iocom->auto_lnk_conn.head = msg->any.head;
158 	msg->any.lnk_conn = iocom->auto_lnk_conn;
159 	iocom->conn_state = msg->state;
160 	kdmsg_msg_write(msg);
161 }
162 
163 static
164 int
165 kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
166 {
167 	kdmsg_iocom_t *iocom = state->iocom;
168 	kdmsg_msg_t *rmsg;
169 
170 	/*
171 	 * Upon receipt of the LNK_CONN acknowledgement initiate an
172 	 * automatic SPAN if we were asked to.  Used by e.g. xdisk, but
173 	 * not used by HAMMER2 which must manage more than one transmitted
174 	 * SPAN.
175 	 */
176 	if ((msg->any.head.cmd & DMSGF_CREATE) &&
177 	    (iocom->flags & KDMSG_IOCOMF_AUTOTXSPAN)) {
178 		rmsg = kdmsg_msg_alloc(&iocom->state0,
179 				       DMSG_LNK_SPAN | DMSGF_CREATE,
180 				       kdmsg_lnk_span_reply, NULL);
181 		iocom->auto_lnk_span.head = rmsg->any.head;
182 		rmsg->any.lnk_span = iocom->auto_lnk_span;
183 		kdmsg_msg_write(rmsg);
184 	}
185 
186 	/*
187 	 * Process shim after the CONN is acknowledged and before the CONN
188 	 * transaction is deleted.  For deletions this gives device drivers
189 	 * the ability to interlock new operations on the circuit before
190 	 * it becomes illegal and panics.
191 	 */
192 	if (iocom->auto_callback)
193 		iocom->auto_callback(msg);
194 
195 	if ((state->txcmd & DMSGF_DELETE) == 0 &&
196 	    (msg->any.head.cmd & DMSGF_DELETE)) {
197 		iocom->conn_state = NULL;
198 		kdmsg_msg_reply(msg, 0);
199 	}
200 
201 	return (0);
202 }
203 
204 static
205 int
206 kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
207 {
208 	/*
209 	 * Be sure to process shim before terminating the SPAN
210 	 * transaction.  Gives device drivers the ability to
211 	 * interlock new operations on the circuit before it
212 	 * becomes illegal and panics.
213 	 */
214 	if (state->iocom->auto_callback)
215 		state->iocom->auto_callback(msg);
216 
217 	if ((state->txcmd & DMSGF_DELETE) == 0 &&
218 	    (msg->any.head.cmd & DMSGF_DELETE)) {
219 		kdmsg_msg_reply(msg, 0);
220 	}
221 	return (0);
222 }
223 
224 /*
225  * Disconnect and clean up
226  */
227 void
228 kdmsg_iocom_uninit(kdmsg_iocom_t *iocom)
229 {
230 	kdmsg_state_t *state;
231 
232 	/*
233 	 * Ask the cluster controller to go away
234 	 */
235 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
236 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILL);
237 
238 	while (iocom->msgrd_td || iocom->msgwr_td) {
239 		wakeup(&iocom->msg_ctl);
240 		lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
241 	}
242 
243 	/*
244 	 * Cleanup caches
245 	 */
246 	if ((state = iocom->freerd_state) != NULL) {
247 		iocom->freerd_state = NULL;
248 		kdmsg_state_free(state);
249 	}
250 
251 	if ((state = iocom->freewr_state) != NULL) {
252 		iocom->freewr_state = NULL;
253 		kdmsg_state_free(state);
254 	}
255 
256 	/*
257 	 * Drop communications descriptor
258 	 */
259 	if (iocom->msg_fp) {
260 		fdrop(iocom->msg_fp);
261 		iocom->msg_fp = NULL;
262 	}
263 	lockmgr(&iocom->msglk, LK_RELEASE);
264 }
265 
266 /*
267  * Cluster controller thread.  Perform messaging functions.  We have one
268  * thread for the reader and one for the writer.  The writer handles
269  * shutdown requests (which should break the reader thread).
270  */
271 static
272 void
273 kdmsg_iocom_thread_rd(void *arg)
274 {
275 	kdmsg_iocom_t *iocom = arg;
276 	dmsg_hdr_t hdr;
277 	kdmsg_msg_t *msg = NULL;
278 	size_t hbytes;
279 	size_t abytes;
280 	int error = 0;
281 
282 	while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILL) == 0) {
283 		/*
284 		 * Retrieve the message from the pipe or socket.
285 		 */
286 		error = fp_read(iocom->msg_fp, &hdr, sizeof(hdr),
287 				NULL, 1, UIO_SYSSPACE);
288 		if (error)
289 			break;
290 		if (hdr.magic != DMSG_HDR_MAGIC) {
291 			kprintf("kdmsg: bad magic: %04x\n", hdr.magic);
292 			error = EINVAL;
293 			break;
294 		}
295 		hbytes = (hdr.cmd & DMSGF_SIZE) * DMSG_ALIGN;
296 		if (hbytes < sizeof(hdr) || hbytes > DMSG_AUX_MAX) {
297 			kprintf("kdmsg: bad header size %zd\n", hbytes);
298 			error = EINVAL;
299 			break;
300 		}
301 
302 		/* XXX messy: mask cmd to avoid allocating state */
303 		msg = kdmsg_msg_alloc(&iocom->state0,
304 				      hdr.cmd & DMSGF_BASECMDMASK,
305 				      NULL, NULL);
306 		msg->any.head = hdr;
307 		msg->hdr_size = hbytes;
308 		if (hbytes > sizeof(hdr)) {
309 			error = fp_read(iocom->msg_fp, &msg->any.head + 1,
310 					hbytes - sizeof(hdr),
311 					NULL, 1, UIO_SYSSPACE);
312 			if (error) {
313 				kprintf("kdmsg: short msg received\n");
314 				error = EINVAL;
315 				break;
316 			}
317 		}
318 		msg->aux_size = hdr.aux_bytes;
319 		if (msg->aux_size > DMSG_AUX_MAX) {
320 			kprintf("kdmsg: illegal msg payload size %zd\n",
321 				msg->aux_size);
322 			error = EINVAL;
323 			break;
324 		}
325 		if (msg->aux_size) {
326 			abytes = DMSG_DOALIGN(msg->aux_size);
327 			msg->aux_data = kmalloc(abytes, iocom->mmsg, M_WAITOK);
328 			msg->flags |= KDMSG_FLAG_AUXALLOC;
329 			error = fp_read(iocom->msg_fp, msg->aux_data,
330 					abytes, NULL, 1, UIO_SYSSPACE);
331 			if (error) {
332 				kprintf("kdmsg: short msg payload received\n");
333 				break;
334 			}
335 		}
336 
337 		error = kdmsg_msg_receive_handling(msg);
338 		msg = NULL;
339 	}
340 
341 	if (error)
342 		kprintf("kdmsg: read failed error %d\n", error);
343 
344 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
345 	if (msg)
346 		kdmsg_msg_free(msg);
347 
348 	/*
349 	 * Shutdown the socket before waiting for the transmit side.
350 	 *
351 	 * If we are dying due to e.g. a socket disconnect verses being
352 	 * killed explicity we have to set KILL in order to kick the tx
353 	 * side when it might not have any other work to do.  KILL might
354 	 * already be set if we are in an unmount or reconnect.
355 	 */
356 	fp_shutdown(iocom->msg_fp, SHUT_RDWR);
357 
358 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILL);
359 	wakeup(&iocom->msg_ctl);
360 
361 	/*
362 	 * Wait for the transmit side to drain remaining messages
363 	 * before cleaning up the rx state.  The transmit side will
364 	 * set KILLTX and wait for the rx side to completely finish
365 	 * (set msgrd_td to NULL) before cleaning up any remaining
366 	 * tx states.
367 	 */
368 	lockmgr(&iocom->msglk, LK_RELEASE);
369 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
370 	wakeup(&iocom->msg_ctl);
371 	while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLTX) == 0) {
372 		wakeup(&iocom->msg_ctl);
373 		tsleep(iocom, 0, "clstrkw", hz);
374 	}
375 
376 	iocom->msgrd_td = NULL;
377 
378 	/*
379 	 * iocom can be ripped out from under us at this point but
380 	 * wakeup() is safe.
381 	 */
382 	wakeup(iocom);
383 	lwkt_exit();
384 }
385 
386 static
387 void
388 kdmsg_iocom_thread_wr(void *arg)
389 {
390 	kdmsg_iocom_t *iocom = arg;
391 	kdmsg_msg_t *msg;
392 	kdmsg_state_t *state;
393 	ssize_t res;
394 	size_t abytes;
395 	int error = 0;
396 	int retries = 20;
397 
398 	/*
399 	 * Transmit loop
400 	 */
401 	msg = NULL;
402 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
403 
404 	while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILL) == 0 && error == 0) {
405 		/*
406 		 * Sleep if no messages pending.  Interlock with flag while
407 		 * holding msglk.
408 		 */
409 		if (TAILQ_EMPTY(&iocom->msgq)) {
410 			atomic_set_int(&iocom->msg_ctl,
411 				       KDMSG_CLUSTERCTL_SLEEPING);
412 			lksleep(&iocom->msg_ctl, &iocom->msglk, 0, "msgwr", hz);
413 			atomic_clear_int(&iocom->msg_ctl,
414 					 KDMSG_CLUSTERCTL_SLEEPING);
415 		}
416 
417 		while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
418 			/*
419 			 * Remove msg from the transmit queue and do
420 			 * persist and half-closed state handling.
421 			 */
422 			TAILQ_REMOVE(&iocom->msgq, msg, qentry);
423 			lockmgr(&iocom->msglk, LK_RELEASE);
424 
425 			error = kdmsg_state_msgtx(msg);
426 			if (error == EALREADY) {
427 				error = 0;
428 				kdmsg_msg_free(msg);
429 				lockmgr(&iocom->msglk, LK_EXCLUSIVE);
430 				continue;
431 			}
432 			if (error) {
433 				kdmsg_msg_free(msg);
434 				lockmgr(&iocom->msglk, LK_EXCLUSIVE);
435 				break;
436 			}
437 
438 			/*
439 			 * Dump the message to the pipe or socket.
440 			 *
441 			 * We have to clean up the message as if the transmit
442 			 * succeeded even if it failed.
443 			 */
444 			error = fp_write(iocom->msg_fp, &msg->any,
445 					 msg->hdr_size, &res, UIO_SYSSPACE);
446 			if (error || res != msg->hdr_size) {
447 				if (error == 0)
448 					error = EINVAL;
449 				kdmsg_state_cleanuptx(msg);
450 				lockmgr(&iocom->msglk, LK_EXCLUSIVE);
451 				break;
452 			}
453 			if (msg->aux_size) {
454 				abytes = DMSG_DOALIGN(msg->aux_size);
455 				error = fp_write(iocom->msg_fp,
456 						 msg->aux_data, abytes,
457 						 &res, UIO_SYSSPACE);
458 				if (error || res != abytes) {
459 					if (error == 0)
460 						error = EINVAL;
461 					kdmsg_state_cleanuptx(msg);
462 					lockmgr(&iocom->msglk, LK_EXCLUSIVE);
463 					break;
464 				}
465 			}
466 			kdmsg_state_cleanuptx(msg);
467 			lockmgr(&iocom->msglk, LK_EXCLUSIVE);
468 		}
469 	}
470 
471 	/*
472 	 * Cleanup messages pending transmission and release msgq lock.
473 	 */
474 	if (error)
475 		kprintf("kdmsg: write failed error %d\n", error);
476 	kprintf("thread_wr: Terminating iocom\n");
477 
478 	/*
479 	 * Shutdown the socket.  This will cause the rx thread to get an
480 	 * EOF and ensure that both threads get to a termination state.
481 	 */
482 	fp_shutdown(iocom->msg_fp, SHUT_RDWR);
483 
484 	/*
485 	 * Set KILLTX (which the rx side waits for), then wait for the RX
486 	 * side to completely finish before we clean out any remaining
487 	 * command states.
488 	 */
489 	lockmgr(&iocom->msglk, LK_RELEASE);
490 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLTX);
491 	wakeup(&iocom->msg_ctl);
492 	while (iocom->msgrd_td) {
493 		wakeup(&iocom->msg_ctl);
494 		tsleep(iocom, 0, "clstrkw", hz);
495 	}
496 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
497 
498 	/*
499 	 * Simulate received MSGF_DELETE's for any remaining states.
500 	 * (For remote masters).
501 	 *
502 	 * Drain the message queue to handle any device initiated writes
503 	 * due to state callbacks.
504 	 */
505 cleanuprd:
506 	RB_FOREACH(state, kdmsg_state_tree, &iocom->staterd_tree)
507 		atomic_set_int(&state->flags, KDMSG_STATE_DYING);
508 	RB_FOREACH(state, kdmsg_state_tree, &iocom->statewr_tree)
509 		atomic_set_int(&state->flags, KDMSG_STATE_DYING);
510 	kdmsg_drain_msgq(iocom);
511 	RB_FOREACH(state, kdmsg_state_tree, &iocom->staterd_tree) {
512 		if ((state->rxcmd & DMSGF_DELETE) == 0) {
513 			lockmgr(&iocom->msglk, LK_RELEASE);
514 			kdmsg_state_abort(state);
515 			lockmgr(&iocom->msglk, LK_EXCLUSIVE);
516 			goto cleanuprd;
517 		}
518 	}
519 
520 	/*
521 	 * Simulate received MSGF_DELETE's for any remaining states.
522 	 * (For local masters).
523 	 */
524 	kdmsg_drain_msgq(iocom);
525 	RB_FOREACH(state, kdmsg_state_tree, &iocom->statewr_tree) {
526 		if ((state->rxcmd & DMSGF_DELETE) == 0) {
527 			lockmgr(&iocom->msglk, LK_RELEASE);
528 			kdmsg_state_abort(state);
529 			lockmgr(&iocom->msglk, LK_EXCLUSIVE);
530 			goto cleanuprd;
531 		}
532 	}
533 
534 	/*
535 	 * Retry until all work is done
536 	 */
537 	if (--retries == 0)
538 		panic("kdmsg: comm thread shutdown couldn't drain");
539 	if (TAILQ_FIRST(&iocom->msgq) ||
540 	    RB_ROOT(&iocom->staterd_tree) ||
541 	    RB_ROOT(&iocom->statewr_tree)) {
542 		goto cleanuprd;
543 	}
544 	iocom->flags |= KDMSG_IOCOMF_EXITNOACC;
545 
546 	lockmgr(&iocom->msglk, LK_RELEASE);
547 
548 	/*
549 	 * The state trees had better be empty now
550 	 */
551 	KKASSERT(RB_EMPTY(&iocom->staterd_tree));
552 	KKASSERT(RB_EMPTY(&iocom->statewr_tree));
553 	KKASSERT(iocom->conn_state == NULL);
554 
555 	if (iocom->exit_func) {
556 		/*
557 		 * iocom is invalid after we call the exit function.
558 		 */
559 		iocom->msgwr_td = NULL;
560 		iocom->exit_func(iocom);
561 	} else {
562 		/*
563 		 * iocom can be ripped out from under us once msgwr_td is
564 		 * set to NULL.  The wakeup is safe.
565 		 */
566 		iocom->msgwr_td = NULL;
567 		wakeup(iocom);
568 	}
569 	lwkt_exit();
570 }
571 
572 /*
573  * This cleans out the pending transmit message queue, adjusting any
574  * persistent states properly in the process.
575  *
576  * Caller must hold pmp->iocom.msglk
577  */
578 void
579 kdmsg_drain_msgq(kdmsg_iocom_t *iocom)
580 {
581 	kdmsg_msg_t *msg;
582 
583 	/*
584 	 * Clean out our pending transmit queue, executing the
585 	 * appropriate state adjustments.  If this tries to open
586 	 * any new outgoing transactions we have to loop up and
587 	 * clean them out.
588 	 */
589 	while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
590 		TAILQ_REMOVE(&iocom->msgq, msg, qentry);
591 		lockmgr(&iocom->msglk, LK_RELEASE);
592 		if (kdmsg_state_msgtx(msg))
593 			kdmsg_msg_free(msg);
594 		else
595 			kdmsg_state_cleanuptx(msg);
596 		lockmgr(&iocom->msglk, LK_EXCLUSIVE);
597 	}
598 }
599 
600 /*
601  * Do all processing required to handle a freshly received message
602  * after its low level header has been validated.
603  */
604 static
605 int
606 kdmsg_msg_receive_handling(kdmsg_msg_t *msg)
607 {
608 	kdmsg_iocom_t *iocom = msg->state->iocom;
609 	int error;
610 
611 	/*
612 	 * State machine tracking, state assignment for msg,
613 	 * returns error and discard status.  Errors are fatal
614 	 * to the connection except for EALREADY which forces
615 	 * a discard without execution.
616 	 */
617 	error = kdmsg_state_msgrx(msg);
618 	if (error) {
619 		/*
620 		 * Raw protocol or connection error
621 		 */
622 		kdmsg_msg_free(msg);
623 		if (error == EALREADY)
624 			error = 0;
625 	} else if (msg->state && msg->state->func) {
626 		/*
627 		 * Message related to state which already has a
628 		 * handling function installed for it.
629 		 */
630 		error = msg->state->func(msg->state, msg);
631 		kdmsg_state_cleanuprx(msg);
632 	} else if (iocom->flags & KDMSG_IOCOMF_AUTOANY) {
633 		error = kdmsg_autorxmsg(msg);
634 		kdmsg_state_cleanuprx(msg);
635 	} else {
636 		error = iocom->rcvmsg(msg);
637 		kdmsg_state_cleanuprx(msg);
638 	}
639 	return error;
640 }
641 
642 /*
643  * Process state tracking for a message after reception, prior to
644  * execution.
645  *
 * Takes msglk internally around the state lookup and update (the reader
 * thread calls this without msglk held).
647  *
648  * All messages are called with dummy state and return actual state.
649  * (One-off messages often just return the same dummy state).
650  *
 * May request that the caller discard the message by returning EALREADY.
 * The caller frees the message without executing it in that case.
653  *
654  * --
655  *
656  * These routines handle persistent and command/reply message state via the
657  * CREATE and DELETE flags.  The first message in a command or reply sequence
658  * sets CREATE, the last message in a command or reply sequence sets DELETE.
659  *
660  * There can be any number of intermediate messages belonging to the same
661  * sequence sent inbetween the CREATE message and the DELETE message,
662  * which set neither flag.  This represents a streaming command or reply.
663  *
664  * Any command message received with CREATE set expects a reply sequence to
665  * be returned.  Reply sequences work the same as command sequences except the
666  * REPLY bit is also sent.  Both the command side and reply side can
667  * degenerate into a single message with both CREATE and DELETE set.  Note
668  * that one side can be streaming and the other side not, or neither, or both.
669  *
670  * The msgid is unique for the initiator.  That is, two sides sending a new
671  * message can use the same msgid without colliding.
672  *
673  * --
674  *
675  * ABORT sequences work by setting the ABORT flag along with normal message
676  * state.  However, ABORTs can also be sent on half-closed messages, that is
677  * even if the command or reply side has already sent a DELETE, as long as
678  * the message has not been fully closed it can still send an ABORT+DELETE
679  * to terminate the half-closed message state.
680  *
681  * Since ABORT+DELETEs can race we silently discard ABORT's for message
682  * state which has already been fully closed.  REPLY+ABORT+DELETEs can
683  * also race, and in this situation the other side might have already
684  * initiated a new unrelated command with the same message id.  Since
685  * the abort has not set the CREATE flag the situation can be detected
686  * and the message will also be discarded.
687  *
688  * Non-blocking requests can be initiated with ABORT+CREATE[+DELETE].
689  * The ABORT request is essentially integrated into the command instead
690  * of being sent later on.  In this situation the command implementation
691  * detects that CREATE and ABORT are both set (vs ABORT alone) and can
692  * special-case non-blocking operation for the command.
693  *
694  * NOTE!  Messages with ABORT set without CREATE or DELETE are considered
695  *	  to be mid-stream aborts for command/reply sequences.  ABORTs on
696  *	  one-way messages are not supported.
697  *
698  * NOTE!  If a command sequence does not support aborts the ABORT flag is
699  *	  simply ignored.
700  *
701  * --
702  *
703  * One-off messages (no reply expected) are sent with neither CREATE or DELETE
704  * set.  One-off messages cannot be aborted and typically aren't processed
705  * by these routines.  The REPLY bit can be used to distinguish whether a
706  * one-off message is a command or reply.  For example, one-off replies
707  * will typically just contain status updates.
708  */
709 static
710 int
711 kdmsg_state_msgrx(kdmsg_msg_t *msg)
712 {
713 	kdmsg_iocom_t *iocom = msg->state->iocom;
714 	kdmsg_state_t *state;
715 	kdmsg_state_t *pstate;
716 	kdmsg_state_t sdummy;
717 	int error;
718 
719 	/*
720 	 * Make sure a state structure is ready to go in case we need a new
721 	 * one.  This is the only routine which uses freerd_state so no
722 	 * races are possible.
723 	 */
724 	if ((state = iocom->freerd_state) == NULL) {
725 		state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
726 		state->flags = KDMSG_STATE_DYNAMIC;
727 		state->iocom = iocom;
728 		TAILQ_INIT(&state->subq);
729 		iocom->freerd_state = state;
730 	}
731 
732 	/*
733 	 * Lock RB tree and locate existing persistent state, if any.
734 	 *
735 	 * If received msg is a command state is on staterd_tree.
736 	 * If received msg is a reply state is on statewr_tree.
737 	 */
738 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
739 
740 	sdummy.msgid = msg->any.head.msgid;
741 	sdummy.iocom = iocom;
742 	if (msg->any.head.cmd & DMSGF_REVTRANS) {
743 		state = RB_FIND(kdmsg_state_tree, &iocom->statewr_tree,
744 				&sdummy);
745 	} else {
746 		state = RB_FIND(kdmsg_state_tree, &iocom->staterd_tree,
747 				&sdummy);
748 	}
749 	if (state == NULL)
750 		state = &iocom->state0;
751 	msg->state = state;
752 
753 	/*
754 	 * Short-cut one-off or mid-stream messages.
755 	 */
756 	if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
757 				  DMSGF_ABORT)) == 0) {
758 		error = 0;
759 		goto done;
760 	}
761 
762 	/*
763 	 * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
764 	 * inside the case statements.
765 	 */
766 	switch(msg->any.head.cmd & (DMSGF_CREATE|DMSGF_DELETE|DMSGF_REPLY)) {
767 	case DMSGF_CREATE:
768 	case DMSGF_CREATE | DMSGF_DELETE:
769 		/*
770 		 * New persistant command received.
771 		 */
772 		if (state != &iocom->state0) {
773 			kprintf("kdmsg_state_msgrx: duplicate transaction\n");
774 			error = EINVAL;
775 			break;
776 		}
777 
778 		/*
779 		 * Lookup the circuit.  The circuit is an open transaction.
780 		 * the REVCIRC bit in the message tells us which side
781 		 * initiated the transaction representing the circuit.
782 		 */
783 		if (msg->any.head.circuit) {
784 			sdummy.msgid = msg->any.head.circuit;
785 
786 			if (msg->any.head.cmd & DMSGF_REVCIRC) {
787 				pstate = RB_FIND(kdmsg_state_tree,
788 						 &iocom->statewr_tree,
789 						 &sdummy);
790 			} else {
791 				pstate = RB_FIND(kdmsg_state_tree,
792 						 &iocom->staterd_tree,
793 						 &sdummy);
794 			}
795 			if (pstate == NULL) {
796 				kprintf("kdmsg_state_msgrx: "
797 					"missing parent in stacked trans\n");
798 				error = EINVAL;
799 				break;
800 			}
801 		} else {
802 			pstate = &iocom->state0;
803 		}
804 
805 		/*
806 		 * Allocate new state
807 		 */
808 		state = iocom->freerd_state;
809 		iocom->freerd_state = NULL;
810 
811 		msg->state = state;
812 		state->parent = pstate;
813 		KKASSERT(state->iocom == iocom);
814 		state->flags |= KDMSG_STATE_INSERTED |
815 			        KDMSG_STATE_OPPOSITE;
816 		state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
817 		state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
818 		state->txcmd = DMSGF_REPLY;
819 		state->msgid = msg->any.head.msgid;
820 		RB_INSERT(kdmsg_state_tree, &iocom->staterd_tree, state);
821 		TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
822 		error = 0;
823 		break;
824 	case DMSGF_DELETE:
825 		/*
826 		 * Persistent state is expected but might not exist if an
827 		 * ABORT+DELETE races the close.
828 		 */
829 		if (state == &iocom->state0) {
830 			if (msg->any.head.cmd & DMSGF_ABORT) {
831 				error = EALREADY;
832 			} else {
833 				kprintf("kdmsg_state_msgrx: "
834 					"no state for DELETE\n");
835 				error = EINVAL;
836 			}
837 			break;
838 		}
839 
840 		/*
841 		 * Handle another ABORT+DELETE case if the msgid has already
842 		 * been reused.
843 		 */
844 		if ((state->rxcmd & DMSGF_CREATE) == 0) {
845 			if (msg->any.head.cmd & DMSGF_ABORT) {
846 				error = EALREADY;
847 			} else {
848 				kprintf("kdmsg_state_msgrx: "
849 					"state reused for DELETE\n");
850 				error = EINVAL;
851 			}
852 			break;
853 		}
854 		error = 0;
855 		break;
856 	default:
857 		/*
858 		 * Check for mid-stream ABORT command received, otherwise
859 		 * allow.
860 		 */
861 		if (msg->any.head.cmd & DMSGF_ABORT) {
862 			if (state == &iocom->state0 ||
863 			    (state->rxcmd & DMSGF_CREATE) == 0) {
864 				error = EALREADY;
865 				break;
866 			}
867 		}
868 		error = 0;
869 		break;
870 	case DMSGF_REPLY | DMSGF_CREATE:
871 	case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
872 		/*
873 		 * When receiving a reply with CREATE set the original
874 		 * persistent state message should already exist.
875 		 */
876 		if (state == &iocom->state0) {
877 			kprintf("kdmsg_state_msgrx: no state match for "
878 				"REPLY cmd=%08x msgid=%016jx\n",
879 				msg->any.head.cmd,
880 				(intmax_t)msg->any.head.msgid);
881 			error = EINVAL;
882 			break;
883 		}
884 		state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
885 		error = 0;
886 		break;
887 	case DMSGF_REPLY | DMSGF_DELETE:
888 		/*
889 		 * Received REPLY+ABORT+DELETE in case where msgid has
890 		 * already been fully closed, ignore the message.
891 		 */
892 		if (state == &iocom->state0) {
893 			if (msg->any.head.cmd & DMSGF_ABORT) {
894 				error = EALREADY;
895 			} else {
896 				kprintf("kdmsg_state_msgrx: no state match "
897 					"for REPLY|DELETE\n");
898 				error = EINVAL;
899 			}
900 			break;
901 		}
902 
903 		/*
904 		 * Received REPLY+ABORT+DELETE in case where msgid has
905 		 * already been reused for an unrelated message,
906 		 * ignore the message.
907 		 */
908 		if ((state->rxcmd & DMSGF_CREATE) == 0) {
909 			if (msg->any.head.cmd & DMSGF_ABORT) {
910 				error = EALREADY;
911 			} else {
912 				kprintf("kdmsg_state_msgrx: state reused "
913 					"for REPLY|DELETE\n");
914 				error = EINVAL;
915 			}
916 			break;
917 		}
918 		error = 0;
919 		break;
920 	case DMSGF_REPLY:
921 		/*
922 		 * Check for mid-stream ABORT reply received to sent command.
923 		 */
924 		if (msg->any.head.cmd & DMSGF_ABORT) {
925 			if (state == &iocom->state0 ||
926 			    (state->rxcmd & DMSGF_CREATE) == 0) {
927 				error = EALREADY;
928 				break;
929 			}
930 		}
931 		error = 0;
932 		break;
933 	}
934 
935 	/*
936 	 * Calculate the easy-switch() transactional command.  Represents
937 	 * the outer-transaction command for any transaction-create or
938 	 * transaction-delete, and the inner message command for any
939 	 * non-transaction or inside-transaction command.  tcmd will be
940 	 * set to 0 if the message state is illegal.
941 	 *
942 	 * The two can be told apart because outer-transaction commands
943 	 * always have a DMSGF_CREATE and/or DMSGF_DELETE flag.
944 	 */
945 done:
946 	lockmgr(&iocom->msglk, LK_RELEASE);
947 
948 	if (msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE)) {
949 		if (state != &iocom->state0) {
950 			msg->tcmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
951 				    (msg->any.head.cmd & (DMSGF_CREATE |
952 							  DMSGF_DELETE |
953 							  DMSGF_REPLY));
954 		} else {
955 			msg->tcmd = 0;
956 		}
957 	} else {
958 		msg->tcmd = msg->any.head.cmd & DMSGF_CMDSWMASK;
959 	}
960 	return (error);
961 }
962 
963 /*
964  * Called instead of iocom->rcvmsg() if any of the AUTO flags are set.
965  * This routine must call iocom->rcvmsg() for anything not automatically
966  * handled.
967  */
968 static int
969 kdmsg_autorxmsg(kdmsg_msg_t *msg)
970 {
971 	kdmsg_iocom_t *iocom = msg->state->iocom;
972 	int error = 0;
973 	uint32_t cmd;
974 
975 	/*
976 	 * Main switch processes transaction create/delete sequences only.
977 	 * Use icmd (DELETEs use DMSG_LNK_ERROR
978 	 *
979 	 * NOTE: If processing in-transaction messages you generally want
980 	 *	 an inner switch on msg->any.head.cmd.
981 	 */
982 	if (msg->state) {
983 		cmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
984 		      (msg->any.head.cmd & (DMSGF_CREATE |
985 					    DMSGF_DELETE |
986 					    DMSGF_REPLY));
987 	} else {
988 		cmd = 0;
989 	}
990 
991 	switch(cmd) {
992 	case DMSG_LNK_CONN | DMSGF_CREATE:
993 	case DMSG_LNK_CONN | DMSGF_CREATE | DMSGF_DELETE:
994 		/*
995 		 * Received LNK_CONN transaction.  Transmit response and
996 		 * leave transaction open, which allows the other end to
997 		 * start to the SPAN protocol.
998 		 *
999 		 * Handle shim after acknowledging the CONN.
1000 		 */
1001 		if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1002 			if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1003 				kdmsg_msg_result(msg, 0);
1004 				if (iocom->auto_callback)
1005 					iocom->auto_callback(msg);
1006 			} else {
1007 				error = iocom->rcvmsg(msg);
1008 			}
1009 			break;
1010 		}
1011 		/* fall through */
1012 	case DMSG_LNK_CONN | DMSGF_DELETE:
1013 		/*
1014 		 * This message is usually simulated after a link is lost
1015 		 * to clean up the transaction.
1016 		 */
1017 		if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1018 			if (iocom->auto_callback)
1019 				iocom->auto_callback(msg);
1020 			kdmsg_msg_reply(msg, 0);
1021 		} else {
1022 			error = iocom->rcvmsg(msg);
1023 		}
1024 		break;
1025 	case DMSG_LNK_SPAN | DMSGF_CREATE:
1026 	case DMSG_LNK_SPAN | DMSGF_CREATE | DMSGF_DELETE:
1027 		/*
1028 		 * Received LNK_SPAN transaction.  We do not have to respond
1029 		 * (except on termination), but we must leave the transaction
1030 		 * open.
1031 		 *
1032 		 * Handle shim after acknowledging the SPAN.
1033 		 */
1034 		if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1035 			if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1036 				if (iocom->auto_callback)
1037 					iocom->auto_callback(msg);
1038 				break;
1039 			}
1040 			/* fall through */
1041 		} else {
1042 			error = iocom->rcvmsg(msg);
1043 			break;
1044 		}
1045 		/* fall through */
1046 	case DMSG_LNK_SPAN | DMSGF_DELETE:
1047 		/*
1048 		 * Process shims (auto_callback) before cleaning up the
1049 		 * circuit structure and closing the transactions.  Device
1050 		 * driver should ensure that the circuit is not used after
1051 		 * the auto_callback() returns.
1052 		 *
1053 		 * Handle shim before closing the SPAN transaction.
1054 		 */
1055 		if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1056 			if (iocom->auto_callback)
1057 				iocom->auto_callback(msg);
1058 			kdmsg_msg_reply(msg, 0);
1059 		} else {
1060 			error = iocom->rcvmsg(msg);
1061 		}
1062 		break;
1063 	default:
1064 		/*
1065 		 * Anything unhandled goes into rcvmsg.
1066 		 *
1067 		 * NOTE: Replies to link-level messages initiated by our side
1068 		 *	 are handled by the state callback, they are NOT
1069 		 *	 handled here.
1070 		 */
1071 		error = iocom->rcvmsg(msg);
1072 		break;
1073 	}
1074 	return (error);
1075 }
1076 
1077 /*
1078  * Post-receive-handling message and state cleanup.  This routine is called
1079  * after the state function handling/callback to properly dispose of the
1080  * message and update or dispose of the state.
1081  */
1082 static
1083 void
1084 kdmsg_state_cleanuprx(kdmsg_msg_t *msg)
1085 {
1086 	kdmsg_iocom_t *iocom = msg->state->iocom;
1087 	kdmsg_state_t *state;
1088 	kdmsg_state_t *pstate;
1089 
1090 	if ((state = msg->state) == NULL) {
1091 		kdmsg_msg_free(msg);
1092 	} else if (msg->any.head.cmd & DMSGF_DELETE) {
1093 		lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1094 		KKASSERT((state->rxcmd & DMSGF_DELETE) == 0);
1095 		state->rxcmd |= DMSGF_DELETE;
1096 		if (state->txcmd & DMSGF_DELETE) {
1097 			KKASSERT(state->flags & KDMSG_STATE_INSERTED);
1098 			if (state->rxcmd & DMSGF_REPLY) {
1099 				KKASSERT(msg->any.head.cmd &
1100 					 DMSGF_REPLY);
1101 				RB_REMOVE(kdmsg_state_tree,
1102 					  &iocom->statewr_tree, state);
1103 			} else {
1104 				KKASSERT((msg->any.head.cmd &
1105 					  DMSGF_REPLY) == 0);
1106 				RB_REMOVE(kdmsg_state_tree,
1107 					  &iocom->staterd_tree, state);
1108 			}
1109 			pstate = state->parent;
1110 			TAILQ_REMOVE(&pstate->subq, state, entry);
1111 			if (pstate != &pstate->iocom->state0 &&
1112 			    TAILQ_EMPTY(&pstate->subq) &&
1113 			    (pstate->flags & KDMSG_STATE_INSERTED) == 0) {
1114 				kdmsg_state_free(pstate);
1115 			}
1116 			state->flags &= ~KDMSG_STATE_INSERTED;
1117 			state->parent = NULL;
1118 			kdmsg_msg_free(msg);
1119 			if (TAILQ_EMPTY(&state->subq))
1120 				kdmsg_state_free(state);
1121 			lockmgr(&iocom->msglk, LK_RELEASE);
1122 		} else {
1123 			kdmsg_msg_free(msg);
1124 			lockmgr(&iocom->msglk, LK_RELEASE);
1125 		}
1126 	} else {
1127 		kdmsg_msg_free(msg);
1128 	}
1129 }
1130 
1131 /*
1132  * Simulate receiving a message which terminates an active transaction
1133  * state.  Our simulated received message must set DELETE and may also
1134  * have to set CREATE.  It must also ensure that all fields are set such
1135  * that the receive handling code can find the state (kdmsg_state_msgrx())
1136  * or an endless loop will ensue.
1137  *
1138  * This is used when the other end of the link is dead so the device driver
1139  * gets a completed transaction for all pending states.
1140  */
1141 static
1142 void
1143 kdmsg_state_abort(kdmsg_state_t *state)
1144 {
1145 	kdmsg_msg_t *msg;
1146 
1147 	/*
1148 	 * Prevent recursive aborts which could otherwise occur if the
1149 	 * simulated message reception runs state->func which then turns
1150 	 * around and tries to reply to a broken circuit when then calls
1151 	 * the state abort code again.
1152 	 */
1153 	if (state->flags & KDMSG_STATE_ABORTING)
1154 		return;
1155 	state->flags |= KDMSG_STATE_ABORTING;
1156 
1157 	/*
1158 	 * NOTE: Args to kdmsg_msg_alloc() to avoid dynamic state allocation.
1159 	 *
1160 	 * NOTE: We are simulating a received message using our state
1161 	 *	 (vs a message generated by the other side using its state),
1162 	 *	 so we must invert DMSGF_REVTRANS and DMSGF_REVCIRC.
1163 	 */
1164 	msg = kdmsg_msg_alloc(state, DMSG_LNK_ERROR, NULL, NULL);
1165 	if ((state->rxcmd & DMSGF_CREATE) == 0)
1166 		msg->any.head.cmd |= DMSGF_CREATE;
1167 	msg->any.head.cmd |= DMSGF_DELETE | (state->rxcmd & DMSGF_REPLY);
1168 	msg->any.head.cmd ^= (DMSGF_REVTRANS | DMSGF_REVCIRC);
1169 	msg->any.head.error = DMSG_ERR_LOSTLINK;
1170 	kdmsg_msg_receive_handling(msg);
1171 }
1172 
1173 /*
1174  * Process state tracking for a message prior to transmission.
1175  *
1176  * Called with msglk held and the msg dequeued.  Returns non-zero if
1177  * the message is bad and should be deleted by the caller.
1178  *
1179  * One-off messages are usually with dummy state and msg->state may be NULL
1180  * in this situation.
1181  *
1182  * New transactions (when CREATE is set) will insert the state.
1183  *
1184  * May request that caller discard the message by setting *discardp to 1.
1185  * A NULL state may be returned in this case.
1186  */
1187 static
1188 int
1189 kdmsg_state_msgtx(kdmsg_msg_t *msg)
1190 {
1191 	kdmsg_iocom_t *iocom = msg->state->iocom;
1192 	kdmsg_state_t *state;
1193 	int error;
1194 
1195 	/*
1196 	 * Make sure a state structure is ready to go in case we need a new
1197 	 * one.  This is the only routine which uses freewr_state so no
1198 	 * races are possible.
1199 	 */
1200 	if ((state = iocom->freewr_state) == NULL) {
1201 		state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1202 		state->flags = KDMSG_STATE_DYNAMIC;
1203 		state->iocom = iocom;
1204 		iocom->freewr_state = state;
1205 	}
1206 
1207 	/*
1208 	 * Lock RB tree.  If persistent state is present it will have already
1209 	 * been assigned to msg.
1210 	 */
1211 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1212 	state = msg->state;
1213 
1214 	/*
1215 	 * Short-cut one-off or mid-stream messages (state may be NULL).
1216 	 */
1217 	if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1218 				  DMSGF_ABORT)) == 0) {
1219 		lockmgr(&iocom->msglk, LK_RELEASE);
1220 		return(0);
1221 	}
1222 
1223 
1224 	/*
1225 	 * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
1226 	 * inside the case statements.
1227 	 */
1228 	switch(msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1229 				    DMSGF_REPLY)) {
1230 	case DMSGF_CREATE:
1231 	case DMSGF_CREATE | DMSGF_DELETE:
1232 		/*
1233 		 * Insert the new persistent message state and mark
1234 		 * half-closed if DELETE is set.  Since this is a new
1235 		 * message it isn't possible to transition into the fully
1236 		 * closed state here.
1237 		 *
1238 		 * XXX state must be assigned and inserted by
1239 		 *     kdmsg_msg_write().  txcmd is assigned by us
1240 		 *     on-transmit.
1241 		 */
1242 		KKASSERT(state != NULL);
1243 		state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
1244 		state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1245 		state->rxcmd = DMSGF_REPLY;
1246 		error = 0;
1247 		break;
1248 	case DMSGF_DELETE:
1249 		/*
1250 		 * Sent ABORT+DELETE in case where msgid has already
1251 		 * been fully closed, ignore the message.
1252 		 */
1253 		if (state == &iocom->state0) {
1254 			if (msg->any.head.cmd & DMSGF_ABORT) {
1255 				error = EALREADY;
1256 			} else {
1257 				kprintf("kdmsg_state_msgtx: no state match "
1258 					"for DELETE cmd=%08x msgid=%016jx\n",
1259 					msg->any.head.cmd,
1260 					(intmax_t)msg->any.head.msgid);
1261 				error = EINVAL;
1262 			}
1263 			break;
1264 		}
1265 
1266 		/*
1267 		 * Sent ABORT+DELETE in case where msgid has
1268 		 * already been reused for an unrelated message,
1269 		 * ignore the message.
1270 		 */
1271 		if ((state->txcmd & DMSGF_CREATE) == 0) {
1272 			if (msg->any.head.cmd & DMSGF_ABORT) {
1273 				error = EALREADY;
1274 			} else {
1275 				kprintf("kdmsg_state_msgtx: state reused "
1276 					"for DELETE\n");
1277 				error = EINVAL;
1278 			}
1279 			break;
1280 		}
1281 		error = 0;
1282 		break;
1283 	default:
1284 		/*
1285 		 * Check for mid-stream ABORT command sent
1286 		 */
1287 		if (msg->any.head.cmd & DMSGF_ABORT) {
1288 			if (state == &state->iocom->state0 ||
1289 			    (state->txcmd & DMSGF_CREATE) == 0) {
1290 				error = EALREADY;
1291 				break;
1292 			}
1293 		}
1294 		error = 0;
1295 		break;
1296 	case DMSGF_REPLY | DMSGF_CREATE:
1297 	case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
1298 		/*
1299 		 * When transmitting a reply with CREATE set the original
1300 		 * persistent state message should already exist.
1301 		 */
1302 		if (state == &state->iocom->state0) {
1303 			kprintf("kdmsg_state_msgtx: no state match "
1304 				"for REPLY | CREATE\n");
1305 			error = EINVAL;
1306 			break;
1307 		}
1308 		state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1309 		error = 0;
1310 		break;
1311 	case DMSGF_REPLY | DMSGF_DELETE:
1312 		/*
1313 		 * When transmitting a reply with DELETE set the original
1314 		 * persistent state message should already exist.
1315 		 *
1316 		 * This is very similar to the REPLY|CREATE|* case except
1317 		 * txcmd is already stored, so we just add the DELETE flag.
1318 		 *
1319 		 * Sent REPLY+ABORT+DELETE in case where msgid has
1320 		 * already been fully closed, ignore the message.
1321 		 */
1322 		if (state == &state->iocom->state0) {
1323 			if (msg->any.head.cmd & DMSGF_ABORT) {
1324 				error = EALREADY;
1325 			} else {
1326 				kprintf("kdmsg_state_msgtx: no state match "
1327 					"for REPLY | DELETE\n");
1328 				error = EINVAL;
1329 			}
1330 			break;
1331 		}
1332 
1333 		/*
1334 		 * Sent REPLY+ABORT+DELETE in case where msgid has already
1335 		 * been reused for an unrelated message, ignore the message.
1336 		 */
1337 		if ((state->txcmd & DMSGF_CREATE) == 0) {
1338 			if (msg->any.head.cmd & DMSGF_ABORT) {
1339 				error = EALREADY;
1340 			} else {
1341 				kprintf("kdmsg_state_msgtx: state reused "
1342 					"for REPLY | DELETE\n");
1343 				error = EINVAL;
1344 			}
1345 			break;
1346 		}
1347 		error = 0;
1348 		break;
1349 	case DMSGF_REPLY:
1350 		/*
1351 		 * Check for mid-stream ABORT reply sent.
1352 		 *
1353 		 * One-off REPLY messages are allowed for e.g. status updates.
1354 		 */
1355 		if (msg->any.head.cmd & DMSGF_ABORT) {
1356 			if (state == &state->iocom->state0 ||
1357 			    (state->txcmd & DMSGF_CREATE) == 0) {
1358 				error = EALREADY;
1359 				break;
1360 			}
1361 		}
1362 		error = 0;
1363 		break;
1364 	}
1365 	lockmgr(&iocom->msglk, LK_RELEASE);
1366 	return (error);
1367 }
1368 
1369 static
1370 void
1371 kdmsg_state_cleanuptx(kdmsg_msg_t *msg)
1372 {
1373 	kdmsg_iocom_t *iocom = msg->state->iocom;
1374 	kdmsg_state_t *state;
1375 	kdmsg_state_t *pstate;
1376 
1377 	if ((state = msg->state) == NULL) {
1378 		kdmsg_msg_free(msg);
1379 	} else if (msg->any.head.cmd & DMSGF_DELETE) {
1380 		lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1381 		KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1382 		state->txcmd |= DMSGF_DELETE;
1383 		if (state->rxcmd & DMSGF_DELETE) {
1384 			KKASSERT(state->flags & KDMSG_STATE_INSERTED);
1385 			if (state->txcmd & DMSGF_REPLY) {
1386 				KKASSERT(msg->any.head.cmd &
1387 					 DMSGF_REPLY);
1388 				RB_REMOVE(kdmsg_state_tree,
1389 					  &iocom->staterd_tree, state);
1390 			} else {
1391 				KKASSERT((msg->any.head.cmd &
1392 					  DMSGF_REPLY) == 0);
1393 				RB_REMOVE(kdmsg_state_tree,
1394 					  &iocom->statewr_tree, state);
1395 			}
1396 			pstate = state->parent;
1397 			TAILQ_REMOVE(&pstate->subq, state, entry);
1398 			if (pstate != &pstate->iocom->state0 &&
1399 			    TAILQ_EMPTY(&pstate->subq) &&
1400 			    (pstate->flags & KDMSG_STATE_INSERTED) == 0) {
1401 				kdmsg_state_free(pstate);
1402 			}
1403 			state->flags &= ~KDMSG_STATE_INSERTED;
1404 			state->parent = NULL;
1405 			kdmsg_msg_free(msg);
1406 			if (TAILQ_EMPTY(&state->subq))
1407 				kdmsg_state_free(state);
1408 			lockmgr(&iocom->msglk, LK_RELEASE);
1409 		} else {
1410 			kdmsg_msg_free(msg);
1411 			lockmgr(&iocom->msglk, LK_RELEASE);
1412 		}
1413 	} else {
1414 		kdmsg_msg_free(msg);
1415 	}
1416 }
1417 
1418 static
1419 void
1420 kdmsg_state_free(kdmsg_state_t *state)
1421 {
1422 	kdmsg_iocom_t *iocom = state->iocom;
1423 
1424 	KKASSERT((state->flags & KDMSG_STATE_INSERTED) == 0);
1425 	kfree(state, iocom->mmsg);
1426 }
1427 
1428 kdmsg_msg_t *
1429 kdmsg_msg_alloc(kdmsg_state_t *state, uint32_t cmd,
1430 		int (*func)(kdmsg_state_t *, kdmsg_msg_t *), void *data)
1431 {
1432 	kdmsg_iocom_t *iocom = state->iocom;
1433 	kdmsg_state_t *pstate;
1434 	kdmsg_msg_t *msg;
1435 	size_t hbytes;
1436 
1437 	KKASSERT(iocom != NULL);
1438 	hbytes = (cmd & DMSGF_SIZE) * DMSG_ALIGN;
1439 	msg = kmalloc(offsetof(struct kdmsg_msg, any) + hbytes,
1440 		      iocom->mmsg, M_WAITOK | M_ZERO);
1441 	msg->hdr_size = hbytes;
1442 
1443 	if ((cmd & (DMSGF_CREATE | DMSGF_REPLY)) == DMSGF_CREATE) {
1444 		/*
1445 		 * New transaction, requires tracking state and a unique
1446 		 * msgid to be allocated.
1447 		 */
1448 		pstate = state;
1449 		state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1450 		TAILQ_INIT(&state->subq);
1451 		state->iocom = iocom;
1452 		state->parent = pstate;
1453 		state->flags = KDMSG_STATE_DYNAMIC;
1454 		state->func = func;
1455 		state->any.any = data;
1456 		state->msgid = (uint64_t)(uintptr_t)state;
1457 		/*msg->any.head.msgid = state->msgid;XXX*/
1458 
1459 		lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1460 		if (RB_INSERT(kdmsg_state_tree, &iocom->statewr_tree, state))
1461 			panic("duplicate msgid allocated");
1462 		TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
1463 		state->flags |= KDMSG_STATE_INSERTED;
1464 		lockmgr(&iocom->msglk, LK_RELEASE);
1465 	} else {
1466 		pstate = state->parent;
1467 	}
1468 
1469 	if (state->flags & KDMSG_STATE_OPPOSITE)
1470 		cmd |= DMSGF_REVTRANS;
1471 	if (pstate->flags & KDMSG_STATE_OPPOSITE)
1472 		cmd |= DMSGF_REVCIRC;
1473 
1474 	msg->any.head.magic = DMSG_HDR_MAGIC;
1475 	msg->any.head.cmd = cmd;
1476 	msg->any.head.msgid = state->msgid;
1477 	msg->any.head.circuit = pstate->msgid;
1478 	msg->state = state;
1479 
1480 	return (msg);
1481 }
1482 
1483 void
1484 kdmsg_msg_free(kdmsg_msg_t *msg)
1485 {
1486 	kdmsg_iocom_t *iocom = msg->state->iocom;
1487 
1488 	if ((msg->flags & KDMSG_FLAG_AUXALLOC) &&
1489 	    msg->aux_data && msg->aux_size) {
1490 		kfree(msg->aux_data, iocom->mmsg);
1491 		msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1492 	}
1493 	msg->state = NULL;
1494 	msg->aux_data = NULL;
1495 	msg->aux_size = 0;
1496 
1497 	kfree(msg, iocom->mmsg);
1498 }
1499 
1500 /*
1501  * Indexed messages are stored in a red-black tree indexed by their
1502  * msgid.  Only persistent messages are indexed.
1503  */
1504 int
1505 kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2)
1506 {
1507 	if (state1->iocom < state2->iocom)
1508 		return(-1);
1509 	if (state1->iocom > state2->iocom)
1510 		return(1);
1511 	if (state1->msgid < state2->msgid)
1512 		return(-1);
1513 	if (state1->msgid > state2->msgid)
1514 		return(1);
1515 	return(0);
1516 }
1517 
1518 /*
1519  * Write a message.  All requisit command flags have been set.
1520  *
1521  * If msg->state is non-NULL the message is written to the existing
1522  * transaction.  msgid will be set accordingly.
1523  *
1524  * If msg->state is NULL and CREATE is set new state is allocated and
1525  * (func, data) is installed.  A msgid is assigned.
1526  *
1527  * If msg->state is NULL and CREATE is not set the message is assumed
1528  * to be a one-way message.  The originator must assign the msgid
1529  * (or leave it 0, which is typical.
1530  *
1531  * This function merely queues the message to the management thread, it
1532  * does not write to the message socket/pipe.
1533  */
1534 void
1535 kdmsg_msg_write(kdmsg_msg_t *msg)
1536 {
1537 	kdmsg_iocom_t *iocom = msg->state->iocom;
1538 	kdmsg_state_t *state;
1539 
1540 	if (msg->state) {
1541 		/*
1542 		 * Continuance or termination of existing transaction.
1543 		 * The transaction could have been initiated by either end.
1544 		 *
1545 		 * (Function callback and aux data for the receive side can
1546 		 * be replaced or left alone).
1547 		 */
1548 		state = msg->state;
1549 		msg->any.head.msgid = state->msgid;
1550 		lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1551 	} else {
1552 		/*
1553 		 * One-off message (always uses msgid 0 to distinguish
1554 		 * between a possibly lost in-transaction message due to
1555 		 * competing aborts and a real one-off message?)
1556 		 */
1557 		state = NULL;
1558 		msg->any.head.msgid = 0;
1559 		lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1560 	}
1561 
1562 	/*
1563 	 * This flag is not set until after the tx thread has drained
1564 	 * the txmsgq and simulated responses.  After that point the
1565 	 * txthread is dead and can no longer simulate responses.
1566 	 *
1567 	 * Device drivers should never try to send a message once this
1568 	 * flag is set.  They should have detected (through the state
1569 	 * closures) that the link is in trouble.
1570 	 */
1571 	if (iocom->flags & KDMSG_IOCOMF_EXITNOACC) {
1572 		lockmgr(&iocom->msglk, LK_RELEASE);
1573 		panic("kdmsg_msg_write: Attempt to write message to "
1574 		      "terminated iocom\n");
1575 	}
1576 
1577 	/*
1578 	 * Finish up the msg fields.  Note that msg->aux_size and the
1579 	 * aux_bytes stored in the message header represent the unaligned
1580 	 * (actual) bytes of data, but the buffer is sized to an aligned
1581 	 * size and the CRC is generated over the aligned length.
1582 	 */
1583 	msg->any.head.salt = /* (random << 8) | */ (iocom->msg_seq & 255);
1584 	++iocom->msg_seq;
1585 
1586 	if (msg->aux_data && msg->aux_size) {
1587 		uint32_t abytes = DMSG_DOALIGN(msg->aux_size);
1588 
1589 		msg->any.head.aux_bytes = msg->aux_size;
1590 		msg->any.head.aux_crc = iscsi_crc32(msg->aux_data, abytes);
1591 	}
1592 	msg->any.head.hdr_crc = 0;
1593 	msg->any.head.hdr_crc = iscsi_crc32(msg->any.buf, msg->hdr_size);
1594 
1595 	TAILQ_INSERT_TAIL(&iocom->msgq, msg, qentry);
1596 
1597 	if (iocom->msg_ctl & KDMSG_CLUSTERCTL_SLEEPING) {
1598 		atomic_clear_int(&iocom->msg_ctl,
1599 				 KDMSG_CLUSTERCTL_SLEEPING);
1600 		wakeup(&iocom->msg_ctl);
1601 	}
1602 
1603 	lockmgr(&iocom->msglk, LK_RELEASE);
1604 }
1605 
1606 /*
1607  * Reply to a message and terminate our side of the transaction.
1608  *
1609  * If msg->state is non-NULL we are replying to a one-way message.
1610  */
1611 void
1612 kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error)
1613 {
1614 	kdmsg_state_t *state = msg->state;
1615 	kdmsg_msg_t *nmsg;
1616 	uint32_t cmd;
1617 
1618 	/*
1619 	 * Reply with a simple error code and terminate the transaction.
1620 	 */
1621 	cmd = DMSG_LNK_ERROR;
1622 
1623 	/*
1624 	 * Check if our direction has even been initiated yet, set CREATE.
1625 	 *
1626 	 * Check what direction this is (command or reply direction).  Note
1627 	 * that txcmd might not have been initiated yet.
1628 	 *
1629 	 * If our direction has already been closed we just return without
1630 	 * doing anything.
1631 	 */
1632 	if (state != &state->iocom->state0) {
1633 		if (state->txcmd & DMSGF_DELETE)
1634 			return;
1635 		if ((state->txcmd & DMSGF_CREATE) == 0)
1636 			cmd |= DMSGF_CREATE;
1637 		if (state->txcmd & DMSGF_REPLY)
1638 			cmd |= DMSGF_REPLY;
1639 		cmd |= DMSGF_DELETE;
1640 	} else {
1641 		if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
1642 			cmd |= DMSGF_REPLY;
1643 	}
1644 
1645 	nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
1646 	nmsg->any.head.error = error;
1647 	kdmsg_msg_write(nmsg);
1648 }
1649 
1650 /*
1651  * Reply to a message and continue our side of the transaction.
1652  *
1653  * If msg->state is non-NULL we are replying to a one-way message and this
1654  * function degenerates into the same as kdmsg_msg_reply().
1655  */
1656 void
1657 kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error)
1658 {
1659 	kdmsg_state_t *state = msg->state;
1660 	kdmsg_msg_t *nmsg;
1661 	uint32_t cmd;
1662 
1663 	/*
1664 	 * Return a simple result code, do NOT terminate the transaction.
1665 	 */
1666 	cmd = DMSG_LNK_ERROR;
1667 
1668 	/*
1669 	 * Check if our direction has even been initiated yet, set CREATE.
1670 	 *
1671 	 * Check what direction this is (command or reply direction).  Note
1672 	 * that txcmd might not have been initiated yet.
1673 	 *
1674 	 * If our direction has already been closed we just return without
1675 	 * doing anything.
1676 	 */
1677 	if (state != &state->iocom->state0) {
1678 		if (state->txcmd & DMSGF_DELETE)
1679 			return;
1680 		if ((state->txcmd & DMSGF_CREATE) == 0)
1681 			cmd |= DMSGF_CREATE;
1682 		if (state->txcmd & DMSGF_REPLY)
1683 			cmd |= DMSGF_REPLY;
1684 		/* continuing transaction, do not set MSGF_DELETE */
1685 	} else {
1686 		if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
1687 			cmd |= DMSGF_REPLY;
1688 	}
1689 
1690 	nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
1691 	nmsg->any.head.error = error;
1692 	kdmsg_msg_write(nmsg);
1693 }
1694 
1695 /*
1696  * Reply to a message and terminate our side of the transaction.
1697  *
1698  * If msg->state is non-NULL we are replying to a one-way message.
1699  */
1700 void
1701 kdmsg_state_reply(kdmsg_state_t *state, uint32_t error)
1702 {
1703 	kdmsg_msg_t *nmsg;
1704 	uint32_t cmd;
1705 
1706 	/*
1707 	 * Reply with a simple error code and terminate the transaction.
1708 	 */
1709 	cmd = DMSG_LNK_ERROR;
1710 
1711 	/*
1712 	 * Check if our direction has even been initiated yet, set CREATE.
1713 	 *
1714 	 * Check what direction this is (command or reply direction).  Note
1715 	 * that txcmd might not have been initiated yet.
1716 	 *
1717 	 * If our direction has already been closed we just return without
1718 	 * doing anything.
1719 	 */
1720 	KKASSERT(state);
1721 	if (state->txcmd & DMSGF_DELETE)
1722 		return;
1723 	if ((state->txcmd & DMSGF_CREATE) == 0)
1724 		cmd |= DMSGF_CREATE;
1725 	if (state->txcmd & DMSGF_REPLY)
1726 		cmd |= DMSGF_REPLY;
1727 	cmd |= DMSGF_DELETE;
1728 
1729 	nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
1730 	nmsg->any.head.error = error;
1731 	kdmsg_msg_write(nmsg);
1732 }
1733 
1734 /*
1735  * Reply to a message and continue our side of the transaction.
1736  *
1737  * If msg->state is non-NULL we are replying to a one-way message and this
1738  * function degenerates into the same as kdmsg_msg_reply().
1739  */
1740 void
1741 kdmsg_state_result(kdmsg_state_t *state, uint32_t error)
1742 {
1743 	kdmsg_msg_t *nmsg;
1744 	uint32_t cmd;
1745 
1746 	/*
1747 	 * Return a simple result code, do NOT terminate the transaction.
1748 	 */
1749 	cmd = DMSG_LNK_ERROR;
1750 
1751 	/*
1752 	 * Check if our direction has even been initiated yet, set CREATE.
1753 	 *
1754 	 * Check what direction this is (command or reply direction).  Note
1755 	 * that txcmd might not have been initiated yet.
1756 	 *
1757 	 * If our direction has already been closed we just return without
1758 	 * doing anything.
1759 	 */
1760 	KKASSERT(state);
1761 	if (state->txcmd & DMSGF_DELETE)
1762 		return;
1763 	if ((state->txcmd & DMSGF_CREATE) == 0)
1764 		cmd |= DMSGF_CREATE;
1765 	if (state->txcmd & DMSGF_REPLY)
1766 		cmd |= DMSGF_REPLY;
1767 	/* continuing transaction, do not set MSGF_DELETE */
1768 
1769 	nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
1770 	nmsg->any.head.error = error;
1771 	kdmsg_msg_write(nmsg);
1772 }
1773