xref: /dragonfly/sys/kern/kern_dmsg.c (revision 20c2db9a)
1 /*-
2  * Copyright (c) 2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * TODO: txcmd CREATE state is deferred by txmsgq, need to calculate
36  *	 a streaming response.  See subr_diskiocom()'s diskiodone().
37  */
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/kernel.h>
41 #include <sys/conf.h>
42 #include <sys/systm.h>
43 #include <sys/queue.h>
44 #include <sys/tree.h>
45 #include <sys/malloc.h>
46 #include <sys/mount.h>
47 #include <sys/socket.h>
48 #include <sys/vnode.h>
49 #include <sys/file.h>
50 #include <sys/proc.h>
51 #include <sys/priv.h>
52 #include <sys/thread.h>
53 #include <sys/globaldata.h>
54 #include <sys/limits.h>
55 
56 #include <sys/dmsg.h>
57 
58 RB_GENERATE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp);
59 RB_GENERATE(kdmsg_circuit_tree, kdmsg_circuit, rbnode, kdmsg_circuit_cmp);
60 
61 static int kdmsg_msg_receive_handling(kdmsg_msg_t *msg);
62 static int kdmsg_circ_msgrx(kdmsg_msg_t *msg);
63 static int kdmsg_state_msgrx(kdmsg_msg_t *msg);
64 static int kdmsg_state_msgtx(kdmsg_msg_t *msg);
65 static void kdmsg_state_cleanuprx(kdmsg_msg_t *msg);
66 static void kdmsg_state_cleanuptx(kdmsg_msg_t *msg);
67 static void kdmsg_state_abort(kdmsg_state_t *state);
68 static void kdmsg_state_free(kdmsg_state_t *state);
69 
70 static void kdmsg_iocom_thread_rd(void *arg);
71 static void kdmsg_iocom_thread_wr(void *arg);
72 static int kdmsg_autorxmsg(kdmsg_msg_t *msg);
73 static void kdmsg_autocirc(kdmsg_msg_t *msg);
74 static int kdmsg_autocirc_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
75 
76 static struct lwkt_token kdmsg_token = LWKT_TOKEN_INITIALIZER(kdmsg_token);
77 
78 void
79 kdmsg_circ_hold(kdmsg_circuit_t *circ)
80 {
81 	atomic_add_int(&circ->refs, 1);
82 }
83 
84 void
85 kdmsg_circ_drop(kdmsg_circuit_t *circ)
86 {
87 	kdmsg_iocom_t *iocom;
88 
89 	if (atomic_fetchadd_int(&circ->refs, -1) == 1) {
90 		KKASSERT(circ->span_state == NULL &&
91 			 circ->circ_state == NULL &&
92 			 circ->rcirc_state == NULL &&
93 			 circ->recorded == 0);
94 		iocom = circ->iocom;
95 		circ->iocom = NULL;
96 		kfree(circ, iocom->mmsg);
97 	}
98 }
99 
100 
101 /*
102  * Initialize the roll-up communications structure for a network
103  * messaging session.  This function does not install the socket.
104  */
105 void
106 kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, uint32_t flags,
107 		 struct malloc_type *mmsg,
108 		 int (*rcvmsg)(kdmsg_msg_t *msg))
109 {
110 	bzero(iocom, sizeof(*iocom));
111 	iocom->handle = handle;
112 	iocom->mmsg = mmsg;
113 	iocom->rcvmsg = rcvmsg;
114 	iocom->flags = flags;
115 	lockinit(&iocom->msglk, "h2msg", 0, 0);
116 	TAILQ_INIT(&iocom->msgq);
117 	RB_INIT(&iocom->circ_tree);
118 	RB_INIT(&iocom->staterd_tree);
119 	RB_INIT(&iocom->statewr_tree);
120 }
121 
122 /*
123  * [Re]connect using the passed file pointer.  The caller must ref the
124  * fp for us.  We own that ref now.
125  */
126 void
127 kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp,
128 		      const char *subsysname)
129 {
130 	/*
131 	 * Destroy the current connection
132 	 */
133 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
134 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILL);
135 	while (iocom->msgrd_td || iocom->msgwr_td) {
136 		wakeup(&iocom->msg_ctl);
137 		lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
138 	}
139 
140 	/*
141 	 * Drop communications descriptor
142 	 */
143 	if (iocom->msg_fp) {
144 		fdrop(iocom->msg_fp);
145 		iocom->msg_fp = NULL;
146 	}
147 
148 	/*
149 	 * Setup new communications descriptor
150 	 */
151 	iocom->msg_ctl = 0;
152 	iocom->msg_fp = fp;
153 	iocom->msg_seq = 0;
154 	iocom->flags &= ~KDMSG_IOCOMF_EXITNOACC;
155 
156 	lwkt_create(kdmsg_iocom_thread_rd, iocom, &iocom->msgrd_td,
157 		    NULL, 0, -1, "%s-msgrd", subsysname);
158 	lwkt_create(kdmsg_iocom_thread_wr, iocom, &iocom->msgwr_td,
159 		    NULL, 0, -1, "%s-msgwr", subsysname);
160 	lockmgr(&iocom->msglk, LK_RELEASE);
161 }
162 
163 /*
164  * Caller sets up iocom->auto_lnk_conn and iocom->auto_lnk_span, then calls
165  * this function to handle the state machine for LNK_CONN and LNK_SPAN.
166  *
167  * NOTE: Caller typically also sets the IOCOMF_AUTOCONN, IOCOMF_AUTOSPAN,
168  *	 and IOCOMF_AUTOCIRC in the kdmsg_iocom_init() call.  Clients
169  *	 typically set IOCOMF_AUTOFORGE to automatically forged circuits
170  *	 for received SPANs.
171  */
172 static int kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
173 static int kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
174 
175 void
176 kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom,
177 			 void (*auto_callback)(kdmsg_msg_t *msg))
178 {
179 	kdmsg_msg_t *msg;
180 
181 	iocom->auto_callback = auto_callback;
182 
183 	msg = kdmsg_msg_alloc(iocom, NULL,
184 			      DMSG_LNK_CONN | DMSGF_CREATE,
185 			      kdmsg_lnk_conn_reply, NULL);
186 	iocom->auto_lnk_conn.head = msg->any.head;
187 	msg->any.lnk_conn = iocom->auto_lnk_conn;
188 	iocom->conn_state = msg->state;
189 	kdmsg_msg_write(msg);
190 }
191 
192 static
193 int
194 kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
195 {
196 	kdmsg_iocom_t *iocom = state->iocom;
197 	kdmsg_msg_t *rmsg;
198 
199 	if (msg->any.head.cmd & DMSGF_CREATE) {
200 		rmsg = kdmsg_msg_alloc(iocom, NULL,
201 				       DMSG_LNK_SPAN | DMSGF_CREATE,
202 				       kdmsg_lnk_span_reply, NULL);
203 		iocom->auto_lnk_span.head = rmsg->any.head;
204 		rmsg->any.lnk_span = iocom->auto_lnk_span;
205 		kdmsg_msg_write(rmsg);
206 	}
207 
208 	/*
209 	 * Process shim after the CONN is acknowledged and before the CONN
210 	 * transaction is deleted.  For deletions this gives device drivers
211 	 * the ability to interlock new operations on the circuit before
212 	 * it becomes illegal and panics.
213 	 */
214 	if (iocom->auto_callback)
215 		iocom->auto_callback(msg);
216 
217 	if ((state->txcmd & DMSGF_DELETE) == 0 &&
218 	    (msg->any.head.cmd & DMSGF_DELETE)) {
219 		iocom->conn_state = NULL;
220 		kdmsg_msg_reply(msg, 0);
221 	}
222 
223 	return (0);
224 }
225 
226 static
227 int
228 kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
229 {
230 	/*
231 	 * Be sure to process shim before terminating the SPAN
232 	 * transaction.  Gives device drivers the ability to
233 	 * interlock new operations on the circuit before it
234 	 * becomes illegal and panics.
235 	 */
236 	if (state->iocom->auto_callback)
237 		state->iocom->auto_callback(msg);
238 
239 	if ((state->txcmd & DMSGF_DELETE) == 0 &&
240 	    (msg->any.head.cmd & DMSGF_DELETE)) {
241 		kdmsg_msg_reply(msg, 0);
242 	}
243 	return (0);
244 }
245 
246 /*
247  * Disconnect and clean up
248  */
249 void
250 kdmsg_iocom_uninit(kdmsg_iocom_t *iocom)
251 {
252 	kdmsg_state_t *state;
253 
254 	/*
255 	 * Ask the cluster controller to go away
256 	 */
257 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
258 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILL);
259 
260 	while (iocom->msgrd_td || iocom->msgwr_td) {
261 		wakeup(&iocom->msg_ctl);
262 		lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
263 	}
264 
265 	/*
266 	 * Cleanup caches
267 	 */
268 	if ((state = iocom->freerd_state) != NULL) {
269 		iocom->freerd_state = NULL;
270 		kdmsg_state_free(state);
271 	}
272 
273 	if ((state = iocom->freewr_state) != NULL) {
274 		iocom->freewr_state = NULL;
275 		kdmsg_state_free(state);
276 	}
277 
278 	/*
279 	 * Drop communications descriptor
280 	 */
281 	if (iocom->msg_fp) {
282 		fdrop(iocom->msg_fp);
283 		iocom->msg_fp = NULL;
284 	}
285 	lockmgr(&iocom->msglk, LK_RELEASE);
286 }
287 
288 /*
289  * Cluster controller thread.  Perform messaging functions.  We have one
290  * thread for the reader and one for the writer.  The writer handles
291  * shutdown requests (which should break the reader thread).
292  */
293 static
294 void
295 kdmsg_iocom_thread_rd(void *arg)
296 {
297 	kdmsg_iocom_t *iocom = arg;
298 	dmsg_hdr_t hdr;
299 	kdmsg_msg_t *msg = NULL;
300 	size_t hbytes;
301 	size_t abytes;
302 	int error = 0;
303 
304 	while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILL) == 0) {
305 		/*
306 		 * Retrieve the message from the pipe or socket.
307 		 */
308 		error = fp_read(iocom->msg_fp, &hdr, sizeof(hdr),
309 				NULL, 1, UIO_SYSSPACE);
310 		if (error)
311 			break;
312 		if (hdr.magic != DMSG_HDR_MAGIC) {
313 			kprintf("kdmsg: bad magic: %04x\n", hdr.magic);
314 			error = EINVAL;
315 			break;
316 		}
317 		hbytes = (hdr.cmd & DMSGF_SIZE) * DMSG_ALIGN;
318 		if (hbytes < sizeof(hdr) || hbytes > DMSG_AUX_MAX) {
319 			kprintf("kdmsg: bad header size %zd\n", hbytes);
320 			error = EINVAL;
321 			break;
322 		}
323 		/* XXX messy: mask cmd to avoid allocating state */
324 		msg = kdmsg_msg_alloc(iocom, NULL,
325 				      hdr.cmd & DMSGF_BASECMDMASK,
326 				      NULL, NULL);
327 		msg->any.head = hdr;
328 		msg->hdr_size = hbytes;
329 		if (hbytes > sizeof(hdr)) {
330 			error = fp_read(iocom->msg_fp, &msg->any.head + 1,
331 					hbytes - sizeof(hdr),
332 					NULL, 1, UIO_SYSSPACE);
333 			if (error) {
334 				kprintf("kdmsg: short msg received\n");
335 				error = EINVAL;
336 				break;
337 			}
338 		}
339 		msg->aux_size = hdr.aux_bytes;
340 		if (msg->aux_size > DMSG_AUX_MAX) {
341 			kprintf("kdmsg: illegal msg payload size %zd\n",
342 				msg->aux_size);
343 			error = EINVAL;
344 			break;
345 		}
346 		if (msg->aux_size) {
347 			abytes = DMSG_DOALIGN(msg->aux_size);
348 			msg->aux_data = kmalloc(abytes, iocom->mmsg, M_WAITOK);
349 			msg->flags |= KDMSG_FLAG_AUXALLOC;
350 			error = fp_read(iocom->msg_fp, msg->aux_data,
351 					abytes, NULL, 1, UIO_SYSSPACE);
352 			if (error) {
353 				kprintf("kdmsg: short msg payload received\n");
354 				break;
355 			}
356 		}
357 
358 		(void)kdmsg_circ_msgrx(msg);
359 		error = kdmsg_msg_receive_handling(msg);
360 		msg = NULL;
361 	}
362 
363 	if (error)
364 		kprintf("kdmsg: read failed error %d\n", error);
365 
366 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
367 	if (msg)
368 		kdmsg_msg_free(msg);
369 
370 	/*
371 	 * Shutdown the socket before waiting for the transmit side.
372 	 *
373 	 * If we are dying due to e.g. a socket disconnect verses being
374 	 * killed explicity we have to set KILL in order to kick the tx
375 	 * side when it might not have any other work to do.  KILL might
376 	 * already be set if we are in an unmount or reconnect.
377 	 */
378 	fp_shutdown(iocom->msg_fp, SHUT_RDWR);
379 
380 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILL);
381 	wakeup(&iocom->msg_ctl);
382 
383 	/*
384 	 * Wait for the transmit side to drain remaining messages
385 	 * before cleaning up the rx state.  The transmit side will
386 	 * set KILLTX and wait for the rx side to completely finish
387 	 * (set msgrd_td to NULL) before cleaning up any remaining
388 	 * tx states.
389 	 */
390 	lockmgr(&iocom->msglk, LK_RELEASE);
391 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
392 	wakeup(&iocom->msg_ctl);
393 	while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLTX) == 0) {
394 		wakeup(&iocom->msg_ctl);
395 		tsleep(iocom, 0, "clstrkw", hz);
396 	}
397 
398 	iocom->msgrd_td = NULL;
399 
400 	/*
401 	 * iocom can be ripped out from under us at this point but
402 	 * wakeup() is safe.
403 	 */
404 	wakeup(iocom);
405 	lwkt_exit();
406 }
407 
408 static
409 void
410 kdmsg_iocom_thread_wr(void *arg)
411 {
412 	kdmsg_iocom_t *iocom = arg;
413 	kdmsg_msg_t *msg;
414 	kdmsg_state_t *state;
415 	ssize_t res;
416 	size_t abytes;
417 	int error = 0;
418 	int retries = 20;
419 
420 	/*
421 	 * Transmit loop
422 	 */
423 	msg = NULL;
424 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
425 
426 	while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILL) == 0 && error == 0) {
427 		/*
428 		 * Sleep if no messages pending.  Interlock with flag while
429 		 * holding msglk.
430 		 */
431 		if (TAILQ_EMPTY(&iocom->msgq)) {
432 			atomic_set_int(&iocom->msg_ctl,
433 				       KDMSG_CLUSTERCTL_SLEEPING);
434 			lksleep(&iocom->msg_ctl, &iocom->msglk, 0, "msgwr", hz);
435 			atomic_clear_int(&iocom->msg_ctl,
436 					 KDMSG_CLUSTERCTL_SLEEPING);
437 		}
438 
439 		while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
440 			/*
441 			 * Remove msg from the transmit queue and do
442 			 * persist and half-closed state handling.
443 			 */
444 			TAILQ_REMOVE(&iocom->msgq, msg, qentry);
445 			lockmgr(&iocom->msglk, LK_RELEASE);
446 
447 			error = kdmsg_state_msgtx(msg);
448 			if (error == EALREADY) {
449 				error = 0;
450 				kdmsg_msg_free(msg);
451 				lockmgr(&iocom->msglk, LK_EXCLUSIVE);
452 				continue;
453 			}
454 			if (error) {
455 				kdmsg_msg_free(msg);
456 				lockmgr(&iocom->msglk, LK_EXCLUSIVE);
457 				break;
458 			}
459 
460 			/*
461 			 * Dump the message to the pipe or socket.
462 			 *
463 			 * We have to clean up the message as if the transmit
464 			 * succeeded even if it failed.
465 			 */
466 			error = fp_write(iocom->msg_fp, &msg->any,
467 					 msg->hdr_size, &res, UIO_SYSSPACE);
468 			if (error || res != msg->hdr_size) {
469 				if (error == 0)
470 					error = EINVAL;
471 				kdmsg_state_cleanuptx(msg);
472 				lockmgr(&iocom->msglk, LK_EXCLUSIVE);
473 				break;
474 			}
475 			if (msg->aux_size) {
476 				abytes = DMSG_DOALIGN(msg->aux_size);
477 				error = fp_write(iocom->msg_fp,
478 						 msg->aux_data, abytes,
479 						 &res, UIO_SYSSPACE);
480 				if (error || res != abytes) {
481 					if (error == 0)
482 						error = EINVAL;
483 					kdmsg_state_cleanuptx(msg);
484 					lockmgr(&iocom->msglk, LK_EXCLUSIVE);
485 					break;
486 				}
487 			}
488 			kdmsg_state_cleanuptx(msg);
489 			lockmgr(&iocom->msglk, LK_EXCLUSIVE);
490 		}
491 	}
492 
493 	/*
494 	 * Cleanup messages pending transmission and release msgq lock.
495 	 */
496 	if (error)
497 		kprintf("kdmsg: write failed error %d\n", error);
498 	kprintf("thread_wr: Terminating iocom\n");
499 
500 	/*
501 	 * Shutdown the socket.  This will cause the rx thread to get an
502 	 * EOF and ensure that both threads get to a termination state.
503 	 */
504 	fp_shutdown(iocom->msg_fp, SHUT_RDWR);
505 
506 	/*
507 	 * Set KILLTX (which the rx side waits for), then wait for the RX
508 	 * side to completely finish before we clean out any remaining
509 	 * command states.
510 	 */
511 	lockmgr(&iocom->msglk, LK_RELEASE);
512 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLTX);
513 	wakeup(&iocom->msg_ctl);
514 	while (iocom->msgrd_td) {
515 		wakeup(&iocom->msg_ctl);
516 		tsleep(iocom, 0, "clstrkw", hz);
517 	}
518 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
519 
520 	/*
521 	 * Simulate received MSGF_DELETE's for any remaining states.
522 	 * (For remote masters).
523 	 *
524 	 * Drain the message queue to handle any device initiated writes
525 	 * due to state callbacks.
526 	 */
527 cleanuprd:
528 	kdmsg_drain_msgq(iocom);
529 	RB_FOREACH(state, kdmsg_state_tree, &iocom->staterd_tree) {
530 		if ((state->rxcmd & DMSGF_DELETE) == 0) {
531 			lockmgr(&iocom->msglk, LK_RELEASE);
532 			kdmsg_state_abort(state);
533 			lockmgr(&iocom->msglk, LK_EXCLUSIVE);
534 			goto cleanuprd;
535 		}
536 	}
537 
538 	/*
539 	 * Simulate received MSGF_DELETE's for any remaining states.
540 	 * (For local masters).
541 	 */
542 cleanupwr:
543 	kdmsg_drain_msgq(iocom);
544 	RB_FOREACH(state, kdmsg_state_tree, &iocom->statewr_tree) {
545 		if ((state->rxcmd & DMSGF_DELETE) == 0) {
546 			lockmgr(&iocom->msglk, LK_RELEASE);
547 			kdmsg_state_abort(state);
548 			lockmgr(&iocom->msglk, LK_EXCLUSIVE);
549 			goto cleanupwr;
550 		}
551 	}
552 
553 	/*
554 	 * Retry until all work is done
555 	 */
556 	if (--retries == 0)
557 		panic("kdmsg: comm thread shutdown couldn't drain");
558 	if (TAILQ_FIRST(&iocom->msgq) ||
559 	    RB_ROOT(&iocom->staterd_tree) ||
560 	    RB_ROOT(&iocom->statewr_tree)) {
561 		goto cleanuprd;
562 	}
563 	iocom->flags |= KDMSG_IOCOMF_EXITNOACC;
564 
565 	lockmgr(&iocom->msglk, LK_RELEASE);
566 
567 	/*
568 	 * The state trees had better be empty now
569 	 */
570 	KKASSERT(RB_EMPTY(&iocom->staterd_tree));
571 	KKASSERT(RB_EMPTY(&iocom->statewr_tree));
572 	KKASSERT(iocom->conn_state == NULL);
573 
574 	if (iocom->exit_func) {
575 		/*
576 		 * iocom is invalid after we call the exit function.
577 		 */
578 		iocom->msgwr_td = NULL;
579 		iocom->exit_func(iocom);
580 	} else {
581 		/*
582 		 * iocom can be ripped out from under us once msgwr_td is
583 		 * set to NULL.  The wakeup is safe.
584 		 */
585 		iocom->msgwr_td = NULL;
586 		wakeup(iocom);
587 	}
588 	lwkt_exit();
589 }
590 
591 /*
592  * This cleans out the pending transmit message queue, adjusting any
593  * persistent states properly in the process.
594  *
595  * Caller must hold pmp->iocom.msglk
596  */
597 void
598 kdmsg_drain_msgq(kdmsg_iocom_t *iocom)
599 {
600 	kdmsg_msg_t *msg;
601 
602 	/*
603 	 * Clean out our pending transmit queue, executing the
604 	 * appropriate state adjustments.  If this tries to open
605 	 * any new outgoing transactions we have to loop up and
606 	 * clean them out.
607 	 */
608 	while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
609 		TAILQ_REMOVE(&iocom->msgq, msg, qentry);
610 		lockmgr(&iocom->msglk, LK_RELEASE);
611 		if (kdmsg_state_msgtx(msg))
612 			kdmsg_msg_free(msg);
613 		else
614 			kdmsg_state_cleanuptx(msg);
615 		lockmgr(&iocom->msglk, LK_EXCLUSIVE);
616 	}
617 }
618 
619 /*
620  * Do all processing required to handle a freshly received message
621  * after its low level header has been validated.
622  */
623 static
624 int
625 kdmsg_msg_receive_handling(kdmsg_msg_t *msg)
626 {
627 	kdmsg_iocom_t *iocom = msg->iocom;
628 	int error;
629 
630 	/*
631 	 * State machine tracking, state assignment for msg,
632 	 * returns error and discard status.  Errors are fatal
633 	 * to the connection except for EALREADY which forces
634 	 * a discard without execution.
635 	 */
636 	error = kdmsg_state_msgrx(msg);
637 	if (error) {
638 		/*
639 		 * Raw protocol or connection error
640 		 */
641 		kdmsg_msg_free(msg);
642 		if (error == EALREADY)
643 			error = 0;
644 	} else if (msg->state && msg->state->func) {
645 		/*
646 		 * Message related to state which already has a
647 		 * handling function installed for it.
648 		 */
649 		error = msg->state->func(msg->state, msg);
650 		kdmsg_state_cleanuprx(msg);
651 	} else if (iocom->flags & KDMSG_IOCOMF_AUTOANY) {
652 		error = kdmsg_autorxmsg(msg);
653 		kdmsg_state_cleanuprx(msg);
654 	} else {
655 		error = iocom->rcvmsg(msg);
656 		kdmsg_state_cleanuprx(msg);
657 	}
658 	return error;
659 }
660 
661 /*
662  * Process circuit tracking (NEEDS WORK)
663  */
664 static
665 int
666 kdmsg_circ_msgrx(kdmsg_msg_t *msg)
667 {
668 	kdmsg_circuit_t dummy;
669 	kdmsg_circuit_t *circ;
670 	int error = 0;
671 
672 	if (msg->any.head.circuit) {
673 		dummy.msgid = msg->any.head.circuit;
674 		lwkt_gettoken(&kdmsg_token);
675 		circ = RB_FIND(kdmsg_circuit_tree, &msg->iocom->circ_tree,
676 			       &dummy);
677 		if (circ) {
678 			msg->circ = circ;
679 			kdmsg_circ_hold(circ);
680 		}
681 		if (circ == NULL) {
682 			kprintf("KDMSG_CIRC_MSGRX CMD %08x: IOCOM %p "
683 				"Bad circuit %016jx\n",
684 				msg->any.head.cmd,
685 				msg->iocom,
686 				(intmax_t)msg->any.head.circuit);
687 			kprintf("KDMSG_CIRC_MSGRX: Avail circuits: ");
688 			RB_FOREACH(circ, kdmsg_circuit_tree,
689 				   &msg->iocom->circ_tree) {
690 				kprintf(" %016jx", (intmax_t)circ->msgid);
691 			}
692 			kprintf("\n");
693 			error = EINVAL;
694 		}
695 		lwkt_reltoken(&kdmsg_token);
696 	}
697 	return (error);
698 }
699 
/*
 * Process state tracking for a message after reception, prior to
 * execution.
 *
 * Acquires iocom->msglk internally and releases it before returning.
 *
 * On return msg->state points at the persistent state matching the
 * message, or is NULL if there is none (e.g. for one-off messages).
 *
 * Returns 0 if the message should be executed, EALREADY if the caller
 * should silently discard it, or EINVAL on a protocol error.
711  *
712  * --
713  *
714  * These routines handle persistent and command/reply message state via the
715  * CREATE and DELETE flags.  The first message in a command or reply sequence
716  * sets CREATE, the last message in a command or reply sequence sets DELETE.
717  *
718  * There can be any number of intermediate messages belonging to the same
719  * sequence sent inbetween the CREATE message and the DELETE message,
720  * which set neither flag.  This represents a streaming command or reply.
721  *
722  * Any command message received with CREATE set expects a reply sequence to
723  * be returned.  Reply sequences work the same as command sequences except the
724  * REPLY bit is also sent.  Both the command side and reply side can
725  * degenerate into a single message with both CREATE and DELETE set.  Note
726  * that one side can be streaming and the other side not, or neither, or both.
727  *
728  * The msgid is unique for the initiator.  That is, two sides sending a new
729  * message can use the same msgid without colliding.
730  *
731  * --
732  *
733  * ABORT sequences work by setting the ABORT flag along with normal message
734  * state.  However, ABORTs can also be sent on half-closed messages, that is
735  * even if the command or reply side has already sent a DELETE, as long as
736  * the message has not been fully closed it can still send an ABORT+DELETE
737  * to terminate the half-closed message state.
738  *
739  * Since ABORT+DELETEs can race we silently discard ABORT's for message
740  * state which has already been fully closed.  REPLY+ABORT+DELETEs can
741  * also race, and in this situation the other side might have already
742  * initiated a new unrelated command with the same message id.  Since
743  * the abort has not set the CREATE flag the situation can be detected
744  * and the message will also be discarded.
745  *
746  * Non-blocking requests can be initiated with ABORT+CREATE[+DELETE].
747  * The ABORT request is essentially integrated into the command instead
748  * of being sent later on.  In this situation the command implementation
749  * detects that CREATE and ABORT are both set (vs ABORT alone) and can
750  * special-case non-blocking operation for the command.
751  *
752  * NOTE!  Messages with ABORT set without CREATE or DELETE are considered
753  *	  to be mid-stream aborts for command/reply sequences.  ABORTs on
754  *	  one-way messages are not supported.
755  *
756  * NOTE!  If a command sequence does not support aborts the ABORT flag is
757  *	  simply ignored.
758  *
759  * --
760  *
761  * One-off messages (no reply expected) are sent with neither CREATE or DELETE
762  * set.  One-off messages cannot be aborted and typically aren't processed
763  * by these routines.  The REPLY bit can be used to distinguish whether a
764  * one-off message is a command or reply.  For example, one-off replies
765  * will typically just contain status updates.
766  */
767 static
768 int
769 kdmsg_state_msgrx(kdmsg_msg_t *msg)
770 {
771 	kdmsg_iocom_t *iocom = msg->iocom;
772 	kdmsg_state_t *state;
773 	int error;
774 
775 	/*
776 	 * Make sure a state structure is ready to go in case we need a new
777 	 * one.  This is the only routine which uses freerd_state so no
778 	 * races are possible.
779 	 */
780 	if ((state = iocom->freerd_state) == NULL) {
781 		state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
782 		state->flags = KDMSG_STATE_DYNAMIC;
783 		iocom->freerd_state = state;
784 	}
785 
786 	/*
787 	 * Lock RB tree and locate existing persistent state, if any.
788 	 *
789 	 * If received msg is a command state is on staterd_tree.
790 	 * If received msg is a reply state is on statewr_tree.
791 	 */
792 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
793 
794 	state->msgid = msg->any.head.msgid;
795 	state->circ = msg->circ;
796 	state->iocom = iocom;
797 	if (msg->any.head.cmd & DMSGF_REPLY)
798 		state = RB_FIND(kdmsg_state_tree, &iocom->statewr_tree, state);
799 	else
800 		state = RB_FIND(kdmsg_state_tree, &iocom->staterd_tree, state);
801 	msg->state = state;
802 
803 	/*
804 	 * Short-cut one-off or mid-stream messages (state may be NULL).
805 	 */
806 	if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
807 				  DMSGF_ABORT)) == 0) {
808 		lockmgr(&iocom->msglk, LK_RELEASE);
809 		return(0);
810 	}
811 
812 	/*
813 	 * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
814 	 * inside the case statements.
815 	 */
816 	switch(msg->any.head.cmd & (DMSGF_CREATE|DMSGF_DELETE|DMSGF_REPLY)) {
817 	case DMSGF_CREATE:
818 	case DMSGF_CREATE | DMSGF_DELETE:
819 		/*
820 		 * New persistant command received.
821 		 */
822 		if (state) {
823 			kprintf("kdmsg_state_msgrx: duplicate transaction\n");
824 			error = EINVAL;
825 			break;
826 		}
827 		state = iocom->freerd_state;
828 		iocom->freerd_state = NULL;
829 		msg->state = state;
830 		state->msg = msg;
831 		state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
832 		state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
833 		state->txcmd = DMSGF_REPLY;
834 		state->msgid = msg->any.head.msgid;
835 		if ((state->circ = msg->circ) != NULL)
836 			kdmsg_circ_hold(state->circ);
837 		RB_INSERT(kdmsg_state_tree, &iocom->staterd_tree, state);
838 		state->flags |= KDMSG_STATE_INSERTED;
839 		error = 0;
840 		break;
841 	case DMSGF_DELETE:
842 		/*
843 		 * Persistent state is expected but might not exist if an
844 		 * ABORT+DELETE races the close.
845 		 */
846 		if (state == NULL) {
847 			if (msg->any.head.cmd & DMSGF_ABORT) {
848 				error = EALREADY;
849 			} else {
850 				kprintf("kdmsg_state_msgrx: "
851 					"no state for DELETE\n");
852 				error = EINVAL;
853 			}
854 			break;
855 		}
856 
857 		/*
858 		 * Handle another ABORT+DELETE case if the msgid has already
859 		 * been reused.
860 		 */
861 		if ((state->rxcmd & DMSGF_CREATE) == 0) {
862 			if (msg->any.head.cmd & DMSGF_ABORT) {
863 				error = EALREADY;
864 			} else {
865 				kprintf("kdmsg_state_msgrx: "
866 					"state reused for DELETE\n");
867 				error = EINVAL;
868 			}
869 			break;
870 		}
871 		error = 0;
872 		break;
873 	default:
874 		/*
875 		 * Check for mid-stream ABORT command received, otherwise
876 		 * allow.
877 		 */
878 		if (msg->any.head.cmd & DMSGF_ABORT) {
879 			if (state == NULL ||
880 			    (state->rxcmd & DMSGF_CREATE) == 0) {
881 				error = EALREADY;
882 				break;
883 			}
884 		}
885 		error = 0;
886 		break;
887 	case DMSGF_REPLY | DMSGF_CREATE:
888 	case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
889 		/*
890 		 * When receiving a reply with CREATE set the original
891 		 * persistent state message should already exist.
892 		 */
893 		if (state == NULL) {
894 			kprintf("kdmsg_state_msgrx: no state match for "
895 				"REPLY cmd=%08x msgid=%016jx\n",
896 				msg->any.head.cmd,
897 				(intmax_t)msg->any.head.msgid);
898 			error = EINVAL;
899 			break;
900 		}
901 		state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
902 		error = 0;
903 		break;
904 	case DMSGF_REPLY | DMSGF_DELETE:
905 		/*
906 		 * Received REPLY+ABORT+DELETE in case where msgid has
907 		 * already been fully closed, ignore the message.
908 		 */
909 		if (state == NULL) {
910 			if (msg->any.head.cmd & DMSGF_ABORT) {
911 				error = EALREADY;
912 			} else {
913 				kprintf("kdmsg_state_msgrx: no state match "
914 					"for REPLY|DELETE\n");
915 				error = EINVAL;
916 			}
917 			break;
918 		}
919 
920 		/*
921 		 * Received REPLY+ABORT+DELETE in case where msgid has
922 		 * already been reused for an unrelated message,
923 		 * ignore the message.
924 		 */
925 		if ((state->rxcmd & DMSGF_CREATE) == 0) {
926 			if (msg->any.head.cmd & DMSGF_ABORT) {
927 				error = EALREADY;
928 			} else {
929 				kprintf("kdmsg_state_msgrx: state reused "
930 					"for REPLY|DELETE\n");
931 				error = EINVAL;
932 			}
933 			break;
934 		}
935 		error = 0;
936 		break;
937 	case DMSGF_REPLY:
938 		/*
939 		 * Check for mid-stream ABORT reply received to sent command.
940 		 */
941 		if (msg->any.head.cmd & DMSGF_ABORT) {
942 			if (state == NULL ||
943 			    (state->rxcmd & DMSGF_CREATE) == 0) {
944 				error = EALREADY;
945 				break;
946 			}
947 		}
948 		error = 0;
949 		break;
950 	}
951 	lockmgr(&iocom->msglk, LK_RELEASE);
952 	return (error);
953 }
954 
955 /*
956  * Called instead of iocom->rcvmsg() if any of the AUTO flags are set.
957  * This routine must call iocom->rcvmsg() for anything not automatically
958  * handled.
959  */
static int
kdmsg_autorxmsg(kdmsg_msg_t *msg)
{
	kdmsg_iocom_t *iocom = msg->iocom;
	kdmsg_circuit_t *circ;
	int error = 0;	/* only ever replaced by an iocom->rcvmsg() result */
	uint32_t cmd;

	/*
	 * Process a combination of the transaction command and the message
	 * flags.  For the purposes of this routine, the message command is
	 * only relevant when it initiates a transaction (where it is
	 * recorded in icmd).
	 *
	 * The switch key is therefore the base command (taken from the
	 * state's icmd when state exists, else from the message header)
	 * combined with the CREATE/DELETE/REPLY flags of this particular
	 * message.
	 */
	cmd = (msg->state ? msg->state->icmd : msg->any.head.cmd) &
	      DMSGF_BASECMDMASK;
	cmd |= msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE | DMSGF_REPLY);

	switch(cmd) {
	case DMSG_LNK_CONN | DMSGF_CREATE:
	case DMSG_LNK_CONN | DMSGF_CREATE | DMSGF_DELETE:
		/*
		 * Received LNK_CONN transaction.  Transmit response and
		 * leave transaction open, which allows the other end to
		 * start the SPAN protocol.
		 *
		 * Handle shim after acknowledging the CONN.
		 *
		 * If DELETE is also set we skip this block and fall
		 * through to the termination case below.
		 */
		if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
			if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
				kdmsg_msg_result(msg, 0);
				if (iocom->auto_callback)
					iocom->auto_callback(msg);
			} else {
				error = iocom->rcvmsg(msg);
			}
			break;
		}
		/* fall through */
	case DMSG_LNK_CONN | DMSGF_DELETE:
		/*
		 * This message is usually simulated after a link is lost
		 * to clean up the transaction.
		 *
		 * Handle shim before closing our side of the CONN.
		 */
		if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
			if (iocom->auto_callback)
				iocom->auto_callback(msg);
			kdmsg_msg_reply(msg, 0);
		} else {
			error = iocom->rcvmsg(msg);
		}
		break;
	case DMSG_LNK_SPAN | DMSGF_CREATE:
	case DMSG_LNK_SPAN | DMSGF_CREATE | DMSGF_DELETE:
		/*
		 * Received LNK_SPAN transaction.  We do not have to respond
		 * but we must leave the transaction open.
		 *
		 * If AUTOFORGE is set automatically initiate a virtual
		 * circuit to the received span.  This will attach a
		 * kdmsg_circuit to the SPAN state.  The circuit is lost
		 * when the span is lost.
		 *
		 * Handle shim after acknowledging the SPAN.
		 *
		 * Without AUTOSPAN the message is simply handed to rcvmsg.
		 * With AUTOSPAN and DELETE set we fall through to the
		 * termination case below.
		 */
		if (iocom->flags & KDMSG_IOCOMF_AUTOSPAN) {
			if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
				if (iocom->flags & KDMSG_IOCOMF_AUTOFORGE)
					kdmsg_autocirc(msg);
				if (iocom->auto_callback)
					iocom->auto_callback(msg);
				break;
			}
			/* fall through */
		} else {
			error = iocom->rcvmsg(msg);
			break;
		}
		/* fall through */
	case DMSG_LNK_SPAN | DMSGF_DELETE:
		/*
		 * Process shims (auto_callback) before cleaning up the
		 * circuit structure and closing the transactions.  Device
		 * driver should ensure that the circuit is not used after
		 * the auto_callback() returns.
		 *
		 * Handle shim before closing the SPAN transaction.
		 */
		if (iocom->flags & KDMSG_IOCOMF_AUTOSPAN) {
			if (iocom->auto_callback)
				iocom->auto_callback(msg);
			if (iocom->flags & KDMSG_IOCOMF_AUTOFORGE)
				kdmsg_autocirc(msg);
			kdmsg_msg_reply(msg, 0);
		} else {
			error = iocom->rcvmsg(msg);
		}
		break;
	case DMSG_LNK_CIRC | DMSGF_CREATE:
	case DMSG_LNK_CIRC | DMSGF_CREATE | DMSGF_DELETE:
		/*
		 * Received LNK_CIRC transaction.  We must respond and should
		 * leave the transaction open, allowing the circuit.  The
		 * remote can start issuing commands to us over the circuit
		 * even before we respond.
		 *
		 * The circuit structure is attached to the transaction
		 * state and indexed in iocom->circ_tree under kdmsg_token.
		 */
		if (iocom->flags & KDMSG_IOCOMF_AUTOCIRC) {
			if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
				circ = kmalloc(sizeof(*circ), iocom->mmsg,
					       M_WAITOK | M_ZERO);
				lwkt_gettoken(&kdmsg_token);
				msg->state->any.circ = circ;
				circ->iocom = iocom;
				circ->rcirc_state = msg->state;
				kdmsg_circ_hold(circ);	/* for rcirc_state */
				circ->weight = 0;
				circ->msgid = circ->rcirc_state->msgid;
				/* XXX no span link for received circuits */
				/*
				 * NOTE(review): circ_state is not assigned
				 * on this path, and the DELETE case below
				 * only drops the rcirc_state hold.  Confirm
				 * where this second hold is released.
				 */
				kdmsg_circ_hold(circ);	/* for circ_state */

				if (RB_INSERT(kdmsg_circuit_tree,
					      &iocom->circ_tree, circ)) {
					panic("duplicate circuitid allocated");
				}
				lwkt_reltoken(&kdmsg_token);
				kdmsg_msg_result(msg, 0);

				/*
				 * Handle shim after adding the circuit and
				 * after acknowledging the CIRC.
				 */
				if (iocom->auto_callback)
					iocom->auto_callback(msg);
				break;
			}
			/* fall through */
		} else {
			error = iocom->rcvmsg(msg);
			break;
		}
		/* fall through */
	case DMSG_LNK_CIRC | DMSGF_DELETE:
		/*
		 * The remote is terminating a circuit it initiated.  Undo
		 * the bookkeeping done by the CREATE case and close our
		 * side of the transaction.
		 */
		if (iocom->flags & KDMSG_IOCOMF_AUTOCIRC) {
			circ = msg->state->any.circ;
			/*
			 * NOTE(review): when no circuit is attached we
			 * return without replying and without calling the
			 * shim.  A CREATE|DELETE message reaches here
			 * without the CREATE case having attached one.
			 * Confirm the transaction gets closed elsewhere.
			 */
			if (circ == NULL)
				break;

			/*
			 * Handle shim before terminating the circuit.
			 */
#if 0
			kprintf("KDMSG VC: RECEIVE CIRC DELETE "
				"IOCOM %p MSGID %016jx\n",
				msg->iocom, circ->msgid);
#endif
			if (iocom->auto_callback)
				iocom->auto_callback(msg);

			KKASSERT(circ->rcirc_state == msg->state);
			lwkt_gettoken(&kdmsg_token);
			circ->rcirc_state = NULL;
			msg->state->any.circ = NULL;
			RB_REMOVE(kdmsg_circuit_tree, &iocom->circ_tree, circ);
			lwkt_reltoken(&kdmsg_token);
			kdmsg_circ_drop(circ);	/* for rcirc_state */
			kdmsg_msg_reply(msg, 0);
		} else {
			error = iocom->rcvmsg(msg);
		}
		break;
	default:
		/*
		 * Anything unhandled goes into rcvmsg.
		 *
		 * NOTE: Replies to link-level messages initiated by our side
		 *	 are handled by the state callback, they are NOT
		 *	 handled here.
		 */
		error = iocom->rcvmsg(msg);
		break;
	}
	return (error);
}
1143 
1144 /*
1145  * Handle automatic forging of virtual circuits based on received SPANs.
1146  * (AUTOFORGE).  Note that other code handles tracking received circuit
1147  * transactions (AUTOCIRC).
1148  *
1149  * We can ignore non-transactions here.  Use trans->icmd to test the
1150  * transactional command (once past the CREATE the individual message
1151  * commands are not usually the icmd).
1152  *
1153  * XXX locks
1154  */
1155 static
1156 void
1157 kdmsg_autocirc(kdmsg_msg_t *msg)
1158 {
1159 	kdmsg_iocom_t *iocom = msg->iocom;
1160 	kdmsg_circuit_t *circ;
1161 	kdmsg_msg_t *xmsg;	/* CIRC */
1162 
1163 	if (msg->state == NULL)
1164 		return;
1165 
1166 	/*
1167 	 * Gaining the SPAN, automatically forge a circuit to the target.
1168 	 *
1169 	 * NOTE!! The shim is not executed until we receive an acknowlegement
1170 	 *	  to our forged LNK_CIRC (see kdmsg_autocirc_reply()).
1171 	 */
1172 	if (msg->state->icmd == DMSG_LNK_SPAN &&
1173 	    (msg->any.head.cmd & DMSGF_CREATE)) {
1174 		circ = kmalloc(sizeof(*circ), iocom->mmsg, M_WAITOK | M_ZERO);
1175 		lwkt_gettoken(&kdmsg_token);
1176 		msg->state->any.circ = circ;
1177 		circ->iocom = iocom;
1178 		circ->span_state = msg->state;
1179 		kdmsg_circ_hold(circ);	/* for span_state */
1180 		xmsg = kdmsg_msg_alloc(iocom, NULL,
1181 				       DMSG_LNK_CIRC | DMSGF_CREATE,
1182 				       kdmsg_autocirc_reply, circ);
1183 		circ->circ_state = xmsg->state;
1184 		circ->weight = msg->any.lnk_span.dist;
1185 		circ->msgid = circ->circ_state->msgid;
1186 		kdmsg_circ_hold(circ);	/* for circ_state */
1187 #if 0
1188 		kprintf("KDMSG VC: CREATE SPAN->CIRC IOCOM %p MSGID %016jx\n",
1189 			msg->iocom, circ->msgid);
1190 #endif
1191 
1192 		if (RB_INSERT(kdmsg_circuit_tree, &iocom->circ_tree, circ))
1193 			panic("duplicate circuitid allocated");
1194 		lwkt_reltoken(&kdmsg_token);
1195 
1196 		xmsg->any.lnk_circ.target = msg->any.head.msgid;
1197 		kdmsg_msg_write(xmsg);
1198 	}
1199 
1200 	/*
1201 	 * Losing the SPAN
1202 	 *
1203 	 * NOTE: When losing a SPAN, any circuits using the span should be
1204 	 *	 deleted by the remote end first.  XXX might not be ordered
1205 	 *	 on actual loss of connection.
1206 	 */
1207 	if (msg->state->icmd == DMSG_LNK_SPAN &&
1208 	    (msg->any.head.cmd & DMSGF_DELETE) &&
1209 	    msg->state->any.circ) {
1210 		circ = msg->state->any.circ;
1211 		lwkt_gettoken(&kdmsg_token);
1212 		circ->span_state = NULL;
1213 		msg->state->any.circ = NULL;
1214 		RB_REMOVE(kdmsg_circuit_tree, &iocom->circ_tree, circ);
1215 #if 0
1216 		kprintf("KDMSG VC: DELETE SPAN->CIRC IOCOM %p MSGID %016jx\n",
1217 			msg->iocom, (intmax_t)circ->msgid);
1218 #endif
1219 		kdmsg_circ_drop(circ);	/* for span_state */
1220 		lwkt_reltoken(&kdmsg_token);
1221 	}
1222 }
1223 
1224 static
1225 int
1226 kdmsg_autocirc_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
1227 {
1228 	kdmsg_iocom_t *iocom = state->iocom;
1229 	kdmsg_circuit_t *circ = state->any.circ;
1230 
1231 	/*
1232 	 * Call shim after receiving an acknowlegement to our forged
1233 	 * circuit and before processing a received termination.
1234 	 */
1235 	if (iocom->auto_callback)
1236 		iocom->auto_callback(msg);
1237 
1238 	/*
1239 	 * If the remote is terminating the VC we terminate our side
1240 	 */
1241 	if ((state->txcmd & DMSGF_DELETE) == 0 &&
1242 	    (msg->any.head.cmd & DMSGF_DELETE)) {
1243 #if 0
1244 		kprintf("KDMSG VC: DELETE CIRC FROM REMOTE\n");
1245 #endif
1246 		lwkt_gettoken(&kdmsg_token);
1247 		circ->circ_state = NULL;
1248 		state->any.circ = NULL;
1249 		kdmsg_circ_drop(circ);		/* for circ_state */
1250 		lwkt_reltoken(&kdmsg_token);
1251 		kdmsg_msg_reply(msg, 0);
1252 	}
1253 	return (0);
1254 }
1255 
1256 /*
1257  * Post-receive-handling message and state cleanup.  This routine is called
1258  * after the state function handling/callback to properly dispose of the
1259  * message and update or dispose of the state.
1260  */
1261 static
1262 void
1263 kdmsg_state_cleanuprx(kdmsg_msg_t *msg)
1264 {
1265 	kdmsg_iocom_t *iocom = msg->iocom;
1266 	kdmsg_state_t *state;
1267 
1268 	if ((state = msg->state) == NULL) {
1269 		kdmsg_msg_free(msg);
1270 	} else if (msg->any.head.cmd & DMSGF_DELETE) {
1271 		lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1272 		KKASSERT((state->rxcmd & DMSGF_DELETE) == 0);
1273 		state->rxcmd |= DMSGF_DELETE;
1274 		if (state->txcmd & DMSGF_DELETE) {
1275 			KKASSERT(state->flags & KDMSG_STATE_INSERTED);
1276 			if (state->rxcmd & DMSGF_REPLY) {
1277 				KKASSERT(msg->any.head.cmd &
1278 					 DMSGF_REPLY);
1279 				RB_REMOVE(kdmsg_state_tree,
1280 					  &iocom->statewr_tree, state);
1281 			} else {
1282 				KKASSERT((msg->any.head.cmd &
1283 					  DMSGF_REPLY) == 0);
1284 				RB_REMOVE(kdmsg_state_tree,
1285 					  &iocom->staterd_tree, state);
1286 			}
1287 			state->flags &= ~KDMSG_STATE_INSERTED;
1288 			if (msg != state->msg)
1289 				kdmsg_msg_free(msg);
1290 			lockmgr(&iocom->msglk, LK_RELEASE);
1291 			kdmsg_state_free(state);
1292 		} else {
1293 			if (msg != state->msg)
1294 				kdmsg_msg_free(msg);
1295 			lockmgr(&iocom->msglk, LK_RELEASE);
1296 		}
1297 	} else if (msg != state->msg) {
1298 		kdmsg_msg_free(msg);
1299 	}
1300 }
1301 
1302 /*
1303  * Simulate receiving a message which terminates an active transaction
1304  * state.  Our simulated received message must set DELETE and may also
1305  * have to set CREATE.  It must also ensure that all fields are set such
1306  * that the receive handling code can find the state (kdmsg_state_msgrx())
1307  * or an endless loop will ensue.
1308  *
1309  * This is used when the other end of the link or virtual circuit is dead
1310  * so the device driver gets a completed transaction for all pending states.
1311  */
1312 static
1313 void
1314 kdmsg_state_abort(kdmsg_state_t *state)
1315 {
1316 	kdmsg_iocom_t *iocom = state->iocom;
1317 	kdmsg_msg_t *msg;
1318 
1319 	/*
1320 	 * Prevent recursive aborts which could otherwise occur if the
1321 	 * simulated message reception runs state->func which then turns
1322 	 * around and tries to reply to a broken circuit when then calls
1323 	 * the state abort code again.
1324 	 */
1325 	if (state->flags & KDMSG_STATE_ABORTING)
1326 		return;
1327 	state->flags |= KDMSG_STATE_ABORTING;
1328 
1329 	/*
1330 	 * Simulatem essage reception
1331 	 */
1332 	msg = kdmsg_msg_alloc(iocom, state->circ,
1333 			      DMSG_LNK_ERROR,
1334 			      NULL, NULL);
1335 	if ((state->rxcmd & DMSGF_CREATE) == 0)
1336 		msg->any.head.cmd |= DMSGF_CREATE;
1337 	msg->any.head.cmd |= DMSGF_DELETE | (state->rxcmd & DMSGF_REPLY);
1338 	msg->any.head.error = DMSG_ERR_LOSTLINK;
1339 	msg->any.head.msgid = state->msgid;
1340 	msg->state = state;
1341 	kdmsg_msg_receive_handling(msg);
1342 }
1343 
/*
 * Process state tracking for a message prior to transmission.
 *
 * Returns 0 if the message is acceptable.  Returns non-zero (EALREADY or
 * EINVAL) if the message is bad and should be deleted by the caller.
 *
 * This routine acquires and releases iocom->msglk itself.
 * NOTE(review): an earlier revision of this comment said the caller holds
 * msglk; kdmsg_msg_write() releases msglk before calling here.  Confirm
 * for any other callers.
 *
 * One-off messages usually have no state and msg->state may be NULL
 * in this situation.
 *
 * New transactions (when CREATE is set) must already have their state
 * attached to the message (see kdmsg_msg_alloc()); this routine only
 * initializes icmd, txcmd and rxcmd for them.
 *
 * The DELETE flag is deliberately never recorded in txcmd here.
 */
static
int
kdmsg_state_msgtx(kdmsg_msg_t *msg)
{
	kdmsg_iocom_t *iocom = msg->iocom;
	kdmsg_state_t *state;
	int error;

	/*
	 * Make sure a state structure is ready to go in case we need a new
	 * one.
	 *
	 * NOTE(review): the spare is parked in iocom->freewr_state but is
	 * not consumed anywhere in this routine; the local 'state' is
	 * overwritten with msg->state below.  The original comment claimed
	 * this is the only user of freewr_state and hence race-free;
	 * confirm, since this runs before msglk is taken.
	 */
	if ((state = iocom->freewr_state) == NULL) {
		state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
		state->flags = KDMSG_STATE_DYNAMIC;
		state->iocom = iocom;
		iocom->freewr_state = state;
	}

	/*
	 * Lock RB tree.  If persistent state is present it will have already
	 * been assigned to msg.
	 */
	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
	state = msg->state;

	/*
	 * Short-cut one-off or mid-stream messages (state may be NULL).
	 * Anything past this point has at least one of CREATE, DELETE or
	 * ABORT set.
	 */
	if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
				  DMSGF_ABORT)) == 0) {
		lockmgr(&iocom->msglk, LK_RELEASE);
		return(0);
	}


	/*
	 * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
	 * inside the case statements.
	 */
	switch(msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
				    DMSGF_REPLY)) {
	case DMSGF_CREATE:
	case DMSGF_CREATE | DMSGF_DELETE:
		/*
		 * New transaction initiated by us.  Record the initiating
		 * command and our transmit flags, and preset rxcmd to
		 * REPLY since the other side will be the replier.  DELETE
		 * is masked out of txcmd here.  Since this is a new
		 * message it isn't possible to transition into the fully
		 * closed state here.
		 *
		 * XXX state must already be assigned to the message (and
		 *     inserted in the tree) before we get here.  txcmd is
		 *     assigned by us on-transmit.
		 */
		KKASSERT(state != NULL);
		state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
		state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
		state->rxcmd = DMSGF_REPLY;
		error = 0;
		break;
	case DMSGF_DELETE:
		/*
		 * Sent ABORT+DELETE in case where msgid has already
		 * been fully closed, ignore the message.
		 */
		if (state == NULL) {
			if (msg->any.head.cmd & DMSGF_ABORT) {
				error = EALREADY;
			} else {
				kprintf("kdmsg_state_msgtx: no state match "
					"for DELETE cmd=%08x msgid=%016jx\n",
					msg->any.head.cmd,
					(intmax_t)msg->any.head.msgid);
				error = EINVAL;
			}
			break;
		}

		/*
		 * Sent ABORT+DELETE in case where msgid has
		 * already been reused for an unrelated message,
		 * ignore the message.
		 */
		if ((state->txcmd & DMSGF_CREATE) == 0) {
			if (msg->any.head.cmd & DMSGF_ABORT) {
				error = EALREADY;
			} else {
				kprintf("kdmsg_state_msgtx: state reused "
					"for DELETE\n");
				error = EINVAL;
			}
			break;
		}
		error = 0;
		break;
	default:
		/*
		 * None of CREATE, DELETE or REPLY is set.
		 *
		 * Check for mid-stream ABORT command sent
		 */
		if (msg->any.head.cmd & DMSGF_ABORT) {
			if (state == NULL ||
			    (state->txcmd & DMSGF_CREATE) == 0) {
				error = EALREADY;
				break;
			}
		}
		error = 0;
		break;
	case DMSGF_REPLY | DMSGF_CREATE:
	case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
		/*
		 * When transmitting a reply with CREATE set the original
		 * persistent state message should already exist.
		 *
		 * Our transmit direction opens here; DELETE is masked out
		 * of txcmd.
		 */
		if (state == NULL) {
			kprintf("kdmsg_state_msgtx: no state match "
				"for REPLY | CREATE\n");
			error = EINVAL;
			break;
		}
		state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
		error = 0;
		break;
	case DMSGF_REPLY | DMSGF_DELETE:
		/*
		 * When transmitting a reply with DELETE set the original
		 * persistent state message should already exist.
		 *
		 * This is very similar to the REPLY|CREATE|* case except
		 * txcmd is already stored, so nothing is recorded here.
		 *
		 * Sent REPLY+ABORT+DELETE in case where msgid has
		 * already been fully closed, ignore the message.
		 */
		if (state == NULL) {
			if (msg->any.head.cmd & DMSGF_ABORT) {
				error = EALREADY;
			} else {
				kprintf("kdmsg_state_msgtx: no state match "
					"for REPLY | DELETE\n");
				error = EINVAL;
			}
			break;
		}

		/*
		 * Sent REPLY+ABORT+DELETE in case where msgid has already
		 * been reused for an unrelated message, ignore the message.
		 */
		if ((state->txcmd & DMSGF_CREATE) == 0) {
			if (msg->any.head.cmd & DMSGF_ABORT) {
				error = EALREADY;
			} else {
				kprintf("kdmsg_state_msgtx: state reused "
					"for REPLY | DELETE\n");
				error = EINVAL;
			}
			break;
		}
		error = 0;
		break;
	case DMSGF_REPLY:
		/*
		 * Check for mid-stream ABORT reply sent.
		 *
		 * One-off REPLY messages are allowed for e.g. status updates.
		 */
		if (msg->any.head.cmd & DMSGF_ABORT) {
			if (state == NULL ||
			    (state->txcmd & DMSGF_CREATE) == 0) {
				error = EALREADY;
				break;
			}
		}
		error = 0;
		break;
	}
	lockmgr(&iocom->msglk, LK_RELEASE);
	return (error);
}
1539 
1540 static
1541 void
1542 kdmsg_state_cleanuptx(kdmsg_msg_t *msg)
1543 {
1544 	kdmsg_iocom_t *iocom = msg->iocom;
1545 	kdmsg_state_t *state;
1546 
1547 	if ((state = msg->state) == NULL) {
1548 		kdmsg_msg_free(msg);
1549 	} else if (msg->any.head.cmd & DMSGF_DELETE) {
1550 		lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1551 		KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1552 		state->txcmd |= DMSGF_DELETE;
1553 		if (state->rxcmd & DMSGF_DELETE) {
1554 			KKASSERT(state->flags & KDMSG_STATE_INSERTED);
1555 			if (state->txcmd & DMSGF_REPLY) {
1556 				KKASSERT(msg->any.head.cmd &
1557 					 DMSGF_REPLY);
1558 				RB_REMOVE(kdmsg_state_tree,
1559 					  &iocom->staterd_tree, state);
1560 			} else {
1561 				KKASSERT((msg->any.head.cmd &
1562 					  DMSGF_REPLY) == 0);
1563 				RB_REMOVE(kdmsg_state_tree,
1564 					  &iocom->statewr_tree, state);
1565 			}
1566 			state->flags &= ~KDMSG_STATE_INSERTED;
1567 			if (msg != state->msg)
1568 				kdmsg_msg_free(msg);
1569 			lockmgr(&iocom->msglk, LK_RELEASE);
1570 			kdmsg_state_free(state);
1571 		} else {
1572 			if (msg != state->msg)
1573 				kdmsg_msg_free(msg);
1574 			lockmgr(&iocom->msglk, LK_RELEASE);
1575 		}
1576 	} else if (msg != state->msg) {
1577 		kdmsg_msg_free(msg);
1578 	}
1579 }
1580 
1581 static
1582 void
1583 kdmsg_state_free(kdmsg_state_t *state)
1584 {
1585 	kdmsg_iocom_t *iocom = state->iocom;
1586 	kdmsg_msg_t *msg;
1587 
1588 	KKASSERT((state->flags & KDMSG_STATE_INSERTED) == 0);
1589 	msg = state->msg;
1590 	state->msg = NULL;
1591 	kfree(state, iocom->mmsg);
1592 	if (msg) {
1593 		msg->state = NULL;
1594 		kdmsg_msg_free(msg);
1595 	}
1596 }
1597 
1598 kdmsg_msg_t *
1599 kdmsg_msg_alloc(kdmsg_iocom_t *iocom, kdmsg_circuit_t *circ, uint32_t cmd,
1600 		int (*func)(kdmsg_state_t *, kdmsg_msg_t *), void *data)
1601 {
1602 	kdmsg_msg_t *msg;
1603 	kdmsg_state_t *state;
1604 	size_t hbytes;
1605 
1606 	KKASSERT(iocom != NULL);
1607 	hbytes = (cmd & DMSGF_SIZE) * DMSG_ALIGN;
1608 	msg = kmalloc(offsetof(struct kdmsg_msg, any) + hbytes,
1609 		      iocom->mmsg, M_WAITOK | M_ZERO);
1610 	msg->hdr_size = hbytes;
1611 	msg->iocom = iocom;
1612 	msg->any.head.magic = DMSG_HDR_MAGIC;
1613 	msg->any.head.cmd = cmd;
1614 	if (circ) {
1615 		kdmsg_circ_hold(circ);
1616 		msg->circ = circ;
1617 		msg->any.head.circuit = circ->msgid;
1618 	}
1619 
1620 	if (cmd & DMSGF_CREATE) {
1621 		/*
1622 		 * New transaction, requires tracking state and a unique
1623 		 * msgid to be allocated.
1624 		 */
1625 		KKASSERT(msg->state == NULL);
1626 		state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1627 		state->flags = KDMSG_STATE_DYNAMIC;
1628 		state->func = func;
1629 		state->any.any = data;
1630 		state->msg = msg;
1631 		state->msgid = (uint64_t)(uintptr_t)state;
1632 		state->circ = circ;
1633 		state->iocom = iocom;
1634 		msg->state = state;
1635 		if (circ)
1636 			kdmsg_circ_hold(circ);
1637 		/*msg->any.head.msgid = state->msgid;XXX*/
1638 
1639 		lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1640 		if (RB_INSERT(kdmsg_state_tree, &iocom->statewr_tree, state))
1641 			panic("duplicate msgid allocated");
1642 		state->flags |= KDMSG_STATE_INSERTED;
1643 		msg->any.head.msgid = state->msgid;
1644 		lockmgr(&iocom->msglk, LK_RELEASE);
1645 	}
1646 	return (msg);
1647 }
1648 
1649 kdmsg_msg_t *
1650 kdmsg_msg_alloc_state(kdmsg_state_t *state, uint32_t cmd,
1651 		      int (*func)(kdmsg_state_t *, kdmsg_msg_t *), void *data)
1652 {
1653 	kdmsg_iocom_t *iocom = state->iocom;
1654 	kdmsg_msg_t *msg;
1655 	size_t hbytes;
1656 
1657 	KKASSERT(iocom != NULL);
1658 	hbytes = (cmd & DMSGF_SIZE) * DMSG_ALIGN;
1659 	msg = kmalloc(offsetof(struct kdmsg_msg, any) + hbytes,
1660 		      iocom->mmsg, M_WAITOK | M_ZERO);
1661 	msg->hdr_size = hbytes;
1662 	msg->iocom = iocom;
1663 	msg->any.head.magic = DMSG_HDR_MAGIC;
1664 	msg->any.head.cmd = cmd;
1665 	msg->state = state;
1666 	if (state->circ) {
1667 		kdmsg_circ_hold(state->circ);
1668 		msg->circ = state->circ;
1669 		msg->any.head.circuit = state->circ->msgid;
1670 	}
1671 	return(msg);
1672 }
1673 
1674 void
1675 kdmsg_msg_free(kdmsg_msg_t *msg)
1676 {
1677 	kdmsg_iocom_t *iocom = msg->iocom;
1678 
1679 	if ((msg->flags & KDMSG_FLAG_AUXALLOC) &&
1680 	    msg->aux_data && msg->aux_size) {
1681 		kfree(msg->aux_data, iocom->mmsg);
1682 		msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1683 	}
1684 	if (msg->circ) {
1685 		kdmsg_circ_drop(msg->circ);
1686 		msg->circ = NULL;
1687 	}
1688 	if (msg->state) {
1689 		if (msg->state->msg == msg)
1690 			msg->state->msg = NULL;
1691 		msg->state = NULL;
1692 	}
1693 	msg->aux_data = NULL;
1694 	msg->aux_size = 0;
1695 	msg->iocom = NULL;
1696 	kfree(msg, iocom->mmsg);
1697 }
1698 
1699 /*
1700  * Circuits are tracked in a red-black tree by their circuit id (msgid).
1701  */
1702 int
1703 kdmsg_circuit_cmp(kdmsg_circuit_t *circ1, kdmsg_circuit_t *circ2)
1704 {
1705 	if (circ1->msgid < circ2->msgid)
1706 		return(-1);
1707 	if (circ1->msgid > circ2->msgid)
1708 		return(1);
1709 	return (0);
1710 }
1711 
1712 /*
1713  * Indexed messages are stored in a red-black tree indexed by their
1714  * msgid.  Only persistent messages are indexed.
1715  */
1716 int
1717 kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2)
1718 {
1719 	if (state1->iocom < state2->iocom)
1720 		return(-1);
1721 	if (state1->iocom > state2->iocom)
1722 		return(1);
1723 	if (state1->circ < state2->circ)
1724 		return(-1);
1725 	if (state1->circ > state2->circ)
1726 		return(1);
1727 	if (state1->msgid < state2->msgid)
1728 		return(-1);
1729 	if (state1->msgid > state2->msgid)
1730 		return(1);
1731 	return(0);
1732 }
1733 
1734 /*
1735  * Write a message.  All requisit command flags have been set.
1736  *
1737  * If msg->state is non-NULL the message is written to the existing
1738  * transaction.  msgid will be set accordingly.
1739  *
1740  * If msg->state is NULL and CREATE is set new state is allocated and
1741  * (func, data) is installed.  A msgid is assigned.
1742  *
1743  * If msg->state is NULL and CREATE is not set the message is assumed
1744  * to be a one-way message.  The originator must assign the msgid
1745  * (or leave it 0, which is typical.
1746  *
1747  * This function merely queues the message to the management thread, it
1748  * does not write to the message socket/pipe.
1749  */
1750 void
1751 kdmsg_msg_write(kdmsg_msg_t *msg)
1752 {
1753 	kdmsg_iocom_t *iocom = msg->iocom;
1754 	kdmsg_state_t *state;
1755 
1756 	if (msg->state) {
1757 		/*
1758 		 * Continuance or termination of existing transaction.
1759 		 * The transaction could have been initiated by either end.
1760 		 *
1761 		 * (Function callback and aux data for the receive side can
1762 		 * be replaced or left alone).
1763 		 */
1764 		state = msg->state;
1765 		msg->any.head.msgid = state->msgid;
1766 		lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1767 	} else {
1768 		/*
1769 		 * One-off message (always uses msgid 0 to distinguish
1770 		 * between a possibly lost in-transaction message due to
1771 		 * competing aborts and a real one-off message?)
1772 		 */
1773 		state = NULL;
1774 		msg->any.head.msgid = 0;
1775 		lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1776 	}
1777 
1778 	/*
1779 	 * With AUTOCIRC and AUTOFORGE it is possible for the circuit to
1780 	 * get ripped out in the rxthread while some other thread is
1781 	 * holding a ref on it inbetween allocating and sending a dmsg.
1782 	 */
1783 	if (msg->circ && msg->circ->rcirc_state == NULL &&
1784 	    (msg->circ->span_state == NULL || msg->circ->circ_state == NULL)) {
1785 		kprintf("kdmsg_msg_write: Attempt to write message to "
1786 		        "terminated circuit: msg %08x\n", msg->any.head.cmd);
1787 		lockmgr(&iocom->msglk, LK_RELEASE);
1788 		if (kdmsg_state_msgtx(msg)) {
1789 			if (state == NULL || msg != state->msg)
1790 				kdmsg_msg_free(msg);
1791 		} else if ((msg->state->rxcmd & DMSGF_DELETE) == 0) {
1792 			/* XXX SMP races simulating a response here */
1793 			kdmsg_state_t *state = msg->state;
1794 			kdmsg_state_cleanuptx(msg);
1795 			kdmsg_state_abort(state);
1796 		} else {
1797 			kdmsg_state_cleanuptx(msg);
1798 		}
1799 		return;
1800 	}
1801 
1802 	/*
1803 	 * This flag is not set until after the tx thread has drained
1804 	 * the txmsgq and simulated responses.  After that point the
1805 	 * txthread is dead and can no longer simulate responses.
1806 	 *
1807 	 * Device drivers should never try to send a message once this
1808 	 * flag is set.  They should have detected (through the state
1809 	 * closures) that the link is in trouble.
1810 	 */
1811 	if (iocom->flags & KDMSG_IOCOMF_EXITNOACC) {
1812 		lockmgr(&iocom->msglk, LK_RELEASE);
1813 		panic("kdmsg_msg_write: Attempt to write message to "
1814 		      "terminated iocom\n");
1815 	}
1816 
1817 	/*
1818 	 * Finish up the msg fields.  Note that msg->aux_size and the
1819 	 * aux_bytes stored in the message header represent the unaligned
1820 	 * (actual) bytes of data, but the buffer is sized to an aligned
1821 	 * size and the CRC is generated over the aligned length.
1822 	 */
1823 	msg->any.head.salt = /* (random << 8) | */ (iocom->msg_seq & 255);
1824 	++iocom->msg_seq;
1825 
1826 	if (msg->aux_data && msg->aux_size) {
1827 		uint32_t abytes = DMSG_DOALIGN(msg->aux_size);
1828 
1829 		msg->any.head.aux_bytes = msg->aux_size;
1830 		msg->any.head.aux_crc = iscsi_crc32(msg->aux_data, abytes);
1831 	}
1832 	msg->any.head.hdr_crc = 0;
1833 	msg->any.head.hdr_crc = iscsi_crc32(msg->any.buf, msg->hdr_size);
1834 
1835 	TAILQ_INSERT_TAIL(&iocom->msgq, msg, qentry);
1836 
1837 	if (iocom->msg_ctl & KDMSG_CLUSTERCTL_SLEEPING) {
1838 		atomic_clear_int(&iocom->msg_ctl,
1839 				 KDMSG_CLUSTERCTL_SLEEPING);
1840 		wakeup(&iocom->msg_ctl);
1841 	}
1842 
1843 	lockmgr(&iocom->msglk, LK_RELEASE);
1844 }
1845 
1846 /*
1847  * Reply to a message and terminate our side of the transaction.
1848  *
1849  * If msg->state is non-NULL we are replying to a one-way message.
1850  */
1851 void
1852 kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error)
1853 {
1854 	kdmsg_state_t *state = msg->state;
1855 	kdmsg_msg_t *nmsg;
1856 	uint32_t cmd;
1857 
1858 	/*
1859 	 * Reply with a simple error code and terminate the transaction.
1860 	 */
1861 	cmd = DMSG_LNK_ERROR;
1862 
1863 	/*
1864 	 * Check if our direction has even been initiated yet, set CREATE.
1865 	 *
1866 	 * Check what direction this is (command or reply direction).  Note
1867 	 * that txcmd might not have been initiated yet.
1868 	 *
1869 	 * If our direction has already been closed we just return without
1870 	 * doing anything.
1871 	 */
1872 	if (state) {
1873 		if (state->txcmd & DMSGF_DELETE)
1874 			return;
1875 		if ((state->txcmd & DMSGF_CREATE) == 0)
1876 			cmd |= DMSGF_CREATE;
1877 		if (state->txcmd & DMSGF_REPLY)
1878 			cmd |= DMSGF_REPLY;
1879 		cmd |= DMSGF_DELETE;
1880 	} else {
1881 		if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
1882 			cmd |= DMSGF_REPLY;
1883 	}
1884 
1885 	/* XXX messy mask cmd to avoid allocating state */
1886 	nmsg = kdmsg_msg_alloc_state(state, cmd, NULL, NULL);
1887 	nmsg->any.head.error = error;
1888 	kdmsg_msg_write(nmsg);
1889 }
1890 
1891 /*
1892  * Reply to a message and continue our side of the transaction.
1893  *
1894  * If msg->state is non-NULL we are replying to a one-way message and this
1895  * function degenerates into the same as kdmsg_msg_reply().
1896  */
1897 void
1898 kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error)
1899 {
1900 	kdmsg_state_t *state = msg->state;
1901 	kdmsg_msg_t *nmsg;
1902 	uint32_t cmd;
1903 
1904 	/*
1905 	 * Return a simple result code, do NOT terminate the transaction.
1906 	 */
1907 	cmd = DMSG_LNK_ERROR;
1908 
1909 	/*
1910 	 * Check if our direction has even been initiated yet, set CREATE.
1911 	 *
1912 	 * Check what direction this is (command or reply direction).  Note
1913 	 * that txcmd might not have been initiated yet.
1914 	 *
1915 	 * If our direction has already been closed we just return without
1916 	 * doing anything.
1917 	 */
1918 	if (state) {
1919 		if (state->txcmd & DMSGF_DELETE)
1920 			return;
1921 		if ((state->txcmd & DMSGF_CREATE) == 0)
1922 			cmd |= DMSGF_CREATE;
1923 		if (state->txcmd & DMSGF_REPLY)
1924 			cmd |= DMSGF_REPLY;
1925 		/* continuing transaction, do not set MSGF_DELETE */
1926 	} else {
1927 		if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
1928 			cmd |= DMSGF_REPLY;
1929 	}
1930 
1931 	/* XXX messy mask cmd to avoid allocating state */
1932 	nmsg = kdmsg_msg_alloc_state(state, cmd, NULL, NULL);
1933 	nmsg->any.head.error = error;
1934 	kdmsg_msg_write(nmsg);
1935 }
1936 
1937 /*
1938  * Reply to a message and terminate our side of the transaction.
1939  *
1940  * If msg->state is non-NULL we are replying to a one-way message.
1941  */
1942 void
1943 kdmsg_state_reply(kdmsg_state_t *state, uint32_t error)
1944 {
1945 	kdmsg_msg_t *nmsg;
1946 	uint32_t cmd;
1947 
1948 	/*
1949 	 * Reply with a simple error code and terminate the transaction.
1950 	 */
1951 	cmd = DMSG_LNK_ERROR;
1952 
1953 	/*
1954 	 * Check if our direction has even been initiated yet, set CREATE.
1955 	 *
1956 	 * Check what direction this is (command or reply direction).  Note
1957 	 * that txcmd might not have been initiated yet.
1958 	 *
1959 	 * If our direction has already been closed we just return without
1960 	 * doing anything.
1961 	 */
1962 	if (state) {
1963 		if (state->txcmd & DMSGF_DELETE)
1964 			return;
1965 		if ((state->txcmd & DMSGF_CREATE) == 0)
1966 			cmd |= DMSGF_CREATE;
1967 		if (state->txcmd & DMSGF_REPLY)
1968 			cmd |= DMSGF_REPLY;
1969 		cmd |= DMSGF_DELETE;
1970 	} else {
1971 		if ((state->txcmd & DMSGF_REPLY) == 0)
1972 			cmd |= DMSGF_REPLY;
1973 	}
1974 
1975 	/* XXX messy mask cmd to avoid allocating state */
1976 	nmsg = kdmsg_msg_alloc_state(state, cmd, NULL, NULL);
1977 	nmsg->any.head.error = error;
1978 	kdmsg_msg_write(nmsg);
1979 }
1980 
1981 /*
1982  * Reply to a message and continue our side of the transaction.
1983  *
1984  * If msg->state is non-NULL we are replying to a one-way message and this
1985  * function degenerates into the same as kdmsg_msg_reply().
1986  */
1987 void
1988 kdmsg_state_result(kdmsg_state_t *state, uint32_t error)
1989 {
1990 	kdmsg_msg_t *nmsg;
1991 	uint32_t cmd;
1992 
1993 	/*
1994 	 * Return a simple result code, do NOT terminate the transaction.
1995 	 */
1996 	cmd = DMSG_LNK_ERROR;
1997 
1998 	/*
1999 	 * Check if our direction has even been initiated yet, set CREATE.
2000 	 *
2001 	 * Check what direction this is (command or reply direction).  Note
2002 	 * that txcmd might not have been initiated yet.
2003 	 *
2004 	 * If our direction has already been closed we just return without
2005 	 * doing anything.
2006 	 */
2007 	if (state) {
2008 		if (state->txcmd & DMSGF_DELETE)
2009 			return;
2010 		if ((state->txcmd & DMSGF_CREATE) == 0)
2011 			cmd |= DMSGF_CREATE;
2012 		if (state->txcmd & DMSGF_REPLY)
2013 			cmd |= DMSGF_REPLY;
2014 		/* continuing transaction, do not set MSGF_DELETE */
2015 	} else {
2016 		if ((state->txcmd & DMSGF_REPLY) == 0)
2017 			cmd |= DMSGF_REPLY;
2018 	}
2019 
2020 	/* XXX messy mask cmd to avoid allocating state */
2021 	nmsg = kdmsg_msg_alloc_state(state, cmd, NULL, NULL);
2022 	nmsg->any.head.error = error;
2023 	kdmsg_msg_write(nmsg);
2024 }
2025