xref: /dragonfly/sys/kern/kern_dmsg.c (revision 2b3f93ea)
1 /*-
2  * Copyright (c) 2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * TODO: txcmd CREATE state is deferred by tx msgq, need to calculate
36  *	 a streaming response.  See subr_diskiocom()'s diskiodone().
37  */
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/kernel.h>
41 #include <sys/conf.h>
42 #include <sys/systm.h>
43 #include <sys/queue.h>
44 #include <sys/tree.h>
45 #include <sys/malloc.h>
46 #include <sys/mount.h>
47 #include <sys/socket.h>
48 #include <sys/vnode.h>
49 #include <sys/sysctl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/caps.h>
53 #include <sys/thread.h>
54 #include <sys/globaldata.h>
55 #include <sys/limits.h>
56 
57 #include <sys/dmsg.h>
58 
59 RB_GENERATE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp);
60 
61 SYSCTL_NODE(, OID_AUTO, kdmsg, CTLFLAG_RW, 0, "kdmsg");
62 static int kdmsg_debug = 1;
63 SYSCTL_INT(_kdmsg, OID_AUTO, debug, CTLFLAG_RW, &kdmsg_debug, 0,
64 	   "Set debug level for kernel dmsg layer");
65 
/*
 * kd_printf(level, ctl, ...) - conditional debug output.
 *
 * Prints via kprintf(), prefixed with "kdmsg: ", when kdmsg_debug is at
 * least 'level'.  'ctl' must be a string literal (it is pasted onto the
 * prefix) and at least one variadic argument is required.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement: an 'else' that follows a kd_printf() call binds to the
 * caller's 'if', not to the level test hidden inside the macro.
 */
#define kd_printf(level, ctl, ...)					\
	do {								\
		if (kdmsg_debug >= (level))				\
			kprintf("kdmsg: " ctl, __VA_ARGS__);		\
	} while (0)
68 
/*
 * kdio_printf(iocom, level, ctl, ...) - conditional debug output for code
 * that has an iocom at hand.
 *
 * Behaves like kd_printf(): prints via kprintf() with a "kdmsg: " prefix
 * when kdmsg_debug is at least 'level'.  The 'iocom' argument is accepted
 * but not expanded.  'ctl' must be a string literal and at least one
 * variadic argument is required.
 *
 * Wrapped in do { } while (0) so the macro is a single statement and
 * cannot capture an 'else' belonging to the caller.
 */
#define kdio_printf(iocom, level, ctl, ...)				\
	do {								\
		if (kdmsg_debug >= (level))				\
			kprintf("kdmsg: " ctl, __VA_ARGS__);		\
	} while (0)
71 
72 static int kdmsg_msg_receive_handling(kdmsg_msg_t *msg);
73 static int kdmsg_state_msgrx(kdmsg_msg_t *msg);
74 static int kdmsg_state_msgtx(kdmsg_msg_t *msg);
75 static void kdmsg_msg_write_locked(kdmsg_iocom_t *iocom, kdmsg_msg_t *msg);
76 static void kdmsg_state_cleanuprx(kdmsg_msg_t *msg);
77 static void kdmsg_state_cleanuptx(kdmsg_msg_t *msg);
78 static void kdmsg_subq_delete(kdmsg_state_t *state);
79 static void kdmsg_simulate_failure(kdmsg_state_t *state, int meto, int error);
80 static void kdmsg_state_abort(kdmsg_state_t *state);
81 static void kdmsg_state_dying(kdmsg_state_t *state);
82 static void kdmsg_state_free(kdmsg_state_t *state);
83 static void kdmsg_drain_msg(kdmsg_msg_t *msg);
84 
85 #ifdef KDMSG_DEBUG
86 #define KDMSG_DEBUG_ARGS	, const char *file, int line
87 #define kdmsg_state_hold(state)	_kdmsg_state_hold(state, __FILE__, __LINE__)
88 #define kdmsg_state_drop(state)	_kdmsg_state_drop(state, __FILE__, __LINE__)
89 #else
90 #define KDMSG_DEBUG 0
91 #define KDMSG_DEBUG_ARGS
92 #define kdmsg_state_hold(state)	_kdmsg_state_hold(state)
93 #define kdmsg_state_drop(state)	_kdmsg_state_drop(state)
94 #endif
95 static void _kdmsg_state_hold(kdmsg_state_t *state KDMSG_DEBUG_ARGS);
96 static void _kdmsg_state_drop(kdmsg_state_t *state KDMSG_DEBUG_ARGS);
97 
98 static void kdmsg_iocom_thread_rd(void *arg);
99 static void kdmsg_iocom_thread_wr(void *arg);
100 static int kdmsg_autorxmsg(kdmsg_msg_t *msg);
101 
102 /*static struct lwkt_token kdmsg_token = LWKT_TOKEN_INITIALIZER(kdmsg_token);*/
103 
104 /*
105  * Initialize the roll-up communications structure for a network
106  * messaging session.  This function does not install the socket.
107  */
108 void
kdmsg_iocom_init(kdmsg_iocom_t * iocom,void * handle,uint32_t flags,struct malloc_type * mmsg,int (* rcvmsg)(kdmsg_msg_t * msg))109 kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, uint32_t flags,
110 		 struct malloc_type *mmsg,
111 		 int (*rcvmsg)(kdmsg_msg_t *msg))
112 {
113 	bzero(iocom, sizeof(*iocom));
114 	iocom->handle = handle;
115 	iocom->mmsg = mmsg;
116 	iocom->rcvmsg = rcvmsg;
117 	iocom->flags = flags;
118 	lockinit(&iocom->msglk, "h2msg", 0, 0);
119 	TAILQ_INIT(&iocom->msgq);
120 	RB_INIT(&iocom->staterd_tree);
121 	RB_INIT(&iocom->statewr_tree);
122 
123 	iocom->state0.iocom = iocom;
124 	iocom->state0.parent = &iocom->state0;
125 	TAILQ_INIT(&iocom->state0.subq);
126 }
127 
128 /*
129  * [Re]connect using the passed file pointer.  The caller must ref the
130  * fp for us.  We own that ref now.
131  */
132 void
kdmsg_iocom_reconnect(kdmsg_iocom_t * iocom,struct file * fp,const char * subsysname)133 kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp,
134 		      const char *subsysname)
135 {
136 	/*
137 	 * Destroy the current connection
138 	 */
139 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
140 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
141 	while (iocom->msgrd_td || iocom->msgwr_td) {
142 		wakeup(&iocom->msg_ctl);
143 		lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
144 	}
145 
146 	/*
147 	 * Drop communications descriptor
148 	 */
149 	if (iocom->msg_fp) {
150 		fdrop(iocom->msg_fp);
151 		iocom->msg_fp = NULL;
152 	}
153 
154 	/*
155 	 * Setup new communications descriptor
156 	 */
157 	iocom->msg_ctl = 0;
158 	iocom->msg_fp = fp;
159 	iocom->msg_seq = 0;
160 	iocom->flags &= ~KDMSG_IOCOMF_EXITNOACC;
161 
162 	lwkt_create(kdmsg_iocom_thread_rd, iocom, &iocom->msgrd_td,
163 		    NULL, 0, -1, "%s-msgrd", subsysname);
164 	lwkt_create(kdmsg_iocom_thread_wr, iocom, &iocom->msgwr_td,
165 		    NULL, 0, -1, "%s-msgwr", subsysname);
166 	lockmgr(&iocom->msglk, LK_RELEASE);
167 }
168 
169 /*
170  * Caller sets up iocom->auto_lnk_conn and iocom->auto_lnk_span, then calls
171  * this function to handle the state machine for LNK_CONN and LNK_SPAN.
172  */
173 static int kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
174 static int kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
175 
176 void
kdmsg_iocom_autoinitiate(kdmsg_iocom_t * iocom,void (* auto_callback)(kdmsg_msg_t * msg))177 kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom,
178 			 void (*auto_callback)(kdmsg_msg_t *msg))
179 {
180 	kdmsg_msg_t *msg;
181 
182 	iocom->auto_callback = auto_callback;
183 
184 	msg = kdmsg_msg_alloc(&iocom->state0,
185 			      DMSG_LNK_CONN | DMSGF_CREATE,
186 			      kdmsg_lnk_conn_reply, NULL);
187 	iocom->auto_lnk_conn.head = msg->any.head;
188 	msg->any.lnk_conn = iocom->auto_lnk_conn;
189 	iocom->conn_state = msg->state;
190 	kdmsg_state_hold(msg->state);	/* iocom->conn_state */
191 	kdmsg_msg_write(msg);
192 }
193 
194 static
195 int
kdmsg_lnk_conn_reply(kdmsg_state_t * state,kdmsg_msg_t * msg)196 kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
197 {
198 	kdmsg_iocom_t *iocom = state->iocom;
199 	kdmsg_msg_t *rmsg;
200 
201 	/*
202 	 * Upon receipt of the LNK_CONN acknowledgement initiate an
203 	 * automatic SPAN if we were asked to.  Used by e.g. xdisk, but
204 	 * not used by HAMMER2 which must manage more than one transmitted
205 	 * SPAN.
206 	 */
207 	if ((msg->any.head.cmd & DMSGF_CREATE) &&
208 	    (iocom->flags & KDMSG_IOCOMF_AUTOTXSPAN)) {
209 		rmsg = kdmsg_msg_alloc(&iocom->state0,
210 				       DMSG_LNK_SPAN | DMSGF_CREATE,
211 				       kdmsg_lnk_span_reply, NULL);
212 		iocom->auto_lnk_span.head = rmsg->any.head;
213 		rmsg->any.lnk_span = iocom->auto_lnk_span;
214 		kdmsg_msg_write(rmsg);
215 	}
216 
217 	/*
218 	 * Process shim after the CONN is acknowledged and before the CONN
219 	 * transaction is deleted.  For deletions this gives device drivers
220 	 * the ability to interlock new operations on the circuit before
221 	 * it becomes illegal and panics.
222 	 */
223 	if (iocom->auto_callback)
224 		iocom->auto_callback(msg);
225 
226 	if ((state->txcmd & DMSGF_DELETE) == 0 &&
227 	    (msg->any.head.cmd & DMSGF_DELETE)) {
228 		/*
229 		 * iocom->conn_state has a state ref, drop it when clearing.
230 		 */
231 		if (iocom->conn_state)
232 			kdmsg_state_drop(iocom->conn_state);
233 		iocom->conn_state = NULL;
234 		kdmsg_msg_reply(msg, 0);
235 	}
236 
237 	return (0);
238 }
239 
240 static
241 int
kdmsg_lnk_span_reply(kdmsg_state_t * state,kdmsg_msg_t * msg)242 kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
243 {
244 	/*
245 	 * Be sure to process shim before terminating the SPAN
246 	 * transaction.  Gives device drivers the ability to
247 	 * interlock new operations on the circuit before it
248 	 * becomes illegal and panics.
249 	 */
250 	if (state->iocom->auto_callback)
251 		state->iocom->auto_callback(msg);
252 
253 	if ((state->txcmd & DMSGF_DELETE) == 0 &&
254 	    (msg->any.head.cmd & DMSGF_DELETE)) {
255 		kdmsg_msg_reply(msg, 0);
256 	}
257 	return (0);
258 }
259 
260 /*
261  * Disconnect and clean up
262  */
263 void
kdmsg_iocom_uninit(kdmsg_iocom_t * iocom)264 kdmsg_iocom_uninit(kdmsg_iocom_t *iocom)
265 {
266 	kdmsg_state_t *state;
267 	kdmsg_msg_t *msg;
268 	int retries;
269 
270 	/*
271 	 * Ask the cluster controller to go away by setting
272 	 * KILLRX.  Send a PING to get a response to unstick reading
273 	 * from the pipe.
274 	 *
275 	 * After 10 seconds shitcan the pipe and do an unclean shutdown.
276 	 */
277 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
278 
279 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
280 	msg = kdmsg_msg_alloc(&iocom->state0, DMSG_LNK_PING, NULL, NULL);
281 	kdmsg_msg_write_locked(iocom, msg);
282 
283 	retries = 10;
284 	while (iocom->msgrd_td || iocom->msgwr_td) {
285 		wakeup(&iocom->msg_ctl);
286 		lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
287 		if (--retries == 0 && iocom->msg_fp) {
288 			kdio_printf(iocom, 0, "%s\n",
289 				    "iocom_uninit: "
290 				    "shitcanning unresponsive pipe");
291 			fp_shutdown(iocom->msg_fp, SHUT_RDWR);
292 			/* retries allowed to go negative, keep looping */
293 		}
294 	}
295 
296 	/*
297 	 * Cleanup caches
298 	 */
299 	if ((state = iocom->freerd_state) != NULL) {
300 		iocom->freerd_state = NULL;
301 		kdmsg_state_drop(state);
302 	}
303 
304 	if ((state = iocom->freewr_state) != NULL) {
305 		iocom->freewr_state = NULL;
306 		kdmsg_state_drop(state);
307 	}
308 
309 	/*
310 	 * Drop communications descriptor
311 	 */
312 	if (iocom->msg_fp) {
313 		fdrop(iocom->msg_fp);
314 		iocom->msg_fp = NULL;
315 	}
316 	lockmgr(&iocom->msglk, LK_RELEASE);
317 }
318 
319 /*
320  * Cluster controller thread.  Perform messaging functions.  We have one
321  * thread for the reader and one for the writer.  The writer handles
322  * shutdown requests (which should break the reader thread).
323  */
324 static
325 void
kdmsg_iocom_thread_rd(void * arg)326 kdmsg_iocom_thread_rd(void *arg)
327 {
328 	kdmsg_iocom_t *iocom = arg;
329 	dmsg_hdr_t hdr;
330 	kdmsg_msg_t *msg = NULL;
331 	size_t hbytes;
332 	size_t abytes;
333 	int error = 0;
334 
335 	while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLRX) == 0) {
336 		/*
337 		 * Retrieve the message from the pipe or socket.
338 		 */
339 		error = fp_read(iocom->msg_fp, &hdr, sizeof(hdr),
340 				NULL, 1, UIO_SYSSPACE);
341 		if (error)
342 			break;
343 		if (hdr.magic != DMSG_HDR_MAGIC) {
344 			kdio_printf(iocom, 1, "bad magic: %04x\n", hdr.magic);
345 			error = EINVAL;
346 			break;
347 		}
348 		hbytes = (hdr.cmd & DMSGF_SIZE) * DMSG_ALIGN;
349 		if (hbytes < sizeof(hdr) || hbytes > DMSG_HDR_MAX) {
350 			kdio_printf(iocom, 1, "bad header size %zd\n", hbytes);
351 			error = EINVAL;
352 			break;
353 		}
354 
355 		/* XXX messy: mask cmd to avoid allocating state */
356 		msg = kdmsg_msg_alloc(&iocom->state0,
357 				      hdr.cmd & DMSGF_BASECMDMASK,
358 				      NULL, NULL);
359 		msg->any.head = hdr;
360 		msg->hdr_size = hbytes;
361 		if (hbytes > sizeof(hdr)) {
362 			error = fp_read(iocom->msg_fp, &msg->any.head + 1,
363 					hbytes - sizeof(hdr),
364 					NULL, 1, UIO_SYSSPACE);
365 			if (error) {
366 				kdio_printf(iocom, 1, "%s\n",
367 					    "short msg received");
368 				error = EINVAL;
369 				break;
370 			}
371 		}
372 		msg->aux_size = hdr.aux_bytes;
373 		if (msg->aux_size > DMSG_AUX_MAX) {
374 			kdio_printf(iocom, 1,
375 				    "illegal msg payload size %zd\n",
376 				    msg->aux_size);
377 			error = EINVAL;
378 			break;
379 		}
380 		if (msg->aux_size) {
381 			abytes = DMSG_DOALIGN(msg->aux_size);
382 			msg->aux_data = kmalloc(abytes, iocom->mmsg, M_WAITOK);
383 			msg->flags |= KDMSG_FLAG_AUXALLOC;
384 			error = fp_read(iocom->msg_fp, msg->aux_data,
385 					abytes, NULL, 1, UIO_SYSSPACE);
386 			if (error) {
387 				kdio_printf(iocom, 1, "%s\n",
388 					    "short msg payload received");
389 				break;
390 			}
391 		}
392 
393 		error = kdmsg_msg_receive_handling(msg);
394 		msg = NULL;
395 	}
396 
397 #if 0
398 	kdio_printf(iocom, 1, "read thread terminating error=%d\n", error);
399 #endif
400 
401 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
402 	if (msg)
403 		kdmsg_msg_free(msg);
404 
405 	/*
406 	 * Shutdown the socket and set KILLRX for consistency in case the
407 	 * shutdown was not commanded.  Signal the transmit side to shutdown
408 	 * by setting KILLTX and waking it up.
409 	 */
410 	fp_shutdown(iocom->msg_fp, SHUT_RDWR);
411 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX |
412 					KDMSG_CLUSTERCTL_KILLTX);
413 	iocom->msgrd_td = NULL;
414 	lockmgr(&iocom->msglk, LK_RELEASE);
415 	wakeup(&iocom->msg_ctl);
416 
417 	/*
418 	 * iocom can be ripped out at any time once the lock is
419 	 * released with msgrd_td set to NULL.  The wakeup()s are safe but
420 	 * that is all.
421 	 */
422 	wakeup(iocom);
423 	lwkt_exit();
424 }
425 
426 static
427 void
kdmsg_iocom_thread_wr(void * arg)428 kdmsg_iocom_thread_wr(void *arg)
429 {
430 	kdmsg_iocom_t *iocom = arg;
431 	kdmsg_msg_t *msg;
432 	ssize_t res;
433 	size_t abytes;
434 	int error = 0;
435 	int save_ticks;
436 	int didwarn;
437 
438 	/*
439 	 * Transmit loop
440 	 */
441 	msg = NULL;
442 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
443 
444 	while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLTX) == 0 && error == 0) {
445 		/*
446 		 * Sleep if no messages pending.  Interlock with flag while
447 		 * holding msglk.
448 		 */
449 		if (TAILQ_EMPTY(&iocom->msgq)) {
450 			atomic_set_int(&iocom->msg_ctl,
451 				       KDMSG_CLUSTERCTL_SLEEPING);
452 			lksleep(&iocom->msg_ctl, &iocom->msglk, 0, "msgwr", hz);
453 			atomic_clear_int(&iocom->msg_ctl,
454 					 KDMSG_CLUSTERCTL_SLEEPING);
455 		}
456 
457 		while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
458 			/*
459 			 * Remove msg from the transmit queue and do
460 			 * persist and half-closed state handling.
461 			 */
462 			TAILQ_REMOVE(&iocom->msgq, msg, qentry);
463 
464 			error = kdmsg_state_msgtx(msg);
465 			if (error == EALREADY) {
466 				error = 0;
467 				kdmsg_msg_free(msg);
468 				continue;
469 			}
470 			if (error) {
471 				kdmsg_msg_free(msg);
472 				break;
473 			}
474 
475 			/*
476 			 * Dump the message to the pipe or socket.
477 			 *
478 			 * We have to clean up the message as if the transmit
479 			 * succeeded even if it failed.
480 			 */
481 			lockmgr(&iocom->msglk, LK_RELEASE);
482 			error = fp_write(iocom->msg_fp, &msg->any,
483 					 msg->hdr_size, &res, UIO_SYSSPACE);
484 			if (error || res != msg->hdr_size) {
485 				if (error == 0)
486 					error = EINVAL;
487 				lockmgr(&iocom->msglk, LK_EXCLUSIVE);
488 				kdmsg_state_cleanuptx(msg);
489 				break;
490 			}
491 			if (msg->aux_size) {
492 				abytes = DMSG_DOALIGN(msg->aux_size);
493 				error = fp_write(iocom->msg_fp,
494 						 msg->aux_data, abytes,
495 						 &res, UIO_SYSSPACE);
496 				if (error || res != abytes) {
497 					if (error == 0)
498 						error = EINVAL;
499 					lockmgr(&iocom->msglk, LK_EXCLUSIVE);
500 					kdmsg_state_cleanuptx(msg);
501 					break;
502 				}
503 			}
504 			lockmgr(&iocom->msglk, LK_EXCLUSIVE);
505 			kdmsg_state_cleanuptx(msg);
506 		}
507 	}
508 
509 #if 0
510 	kdio_printf(iocom, 1, "write thread terminating error=%d\n", error);
511 #endif
512 
513 	/*
514 	 * Shutdown the socket and set KILLTX for consistency in case the
515 	 * shutdown was not commanded.  Signal the receive side to shutdown
516 	 * by setting KILLRX and waking it up.
517 	 */
518 	fp_shutdown(iocom->msg_fp, SHUT_RDWR);
519 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX |
520 					KDMSG_CLUSTERCTL_KILLTX);
521 	wakeup(&iocom->msg_ctl);
522 
523 	/*
524 	 * The transmit thread is responsible for final cleanups, wait
525 	 * for the receive side to terminate to prevent new received
526 	 * states from interfering with our cleanup.
527 	 *
528 	 * Do not set msgwr_td to NULL until we actually exit.
529 	 */
530 	while (iocom->msgrd_td) {
531 		wakeup(&iocom->msg_ctl);
532 		lksleep(iocom, &iocom->msglk, 0, "clstrkt", hz);
533 	}
534 
535 	/*
536 	 * We can no longer receive new messages.  We must drain the transmit
537 	 * message queue and simulate received messages to close anay remaining
538 	 * states.
539 	 *
540 	 * Loop until all the states are gone and there are no messages
541 	 * pending transmit.
542 	 */
543 	save_ticks = ticks;
544 	didwarn = 0;
545 	iocom->flags |= KDMSG_IOCOMF_EXITNOACC;
546 
547 	while (TAILQ_FIRST(&iocom->msgq) ||
548 	       RB_ROOT(&iocom->staterd_tree) ||
549 	       RB_ROOT(&iocom->statewr_tree) ||
550 	       iocom->conn_state) {
551 		/*
552 		 * Simulate failure for all sub-states of state0.
553 		 */
554 		kdmsg_drain_msgq(iocom);
555 		kdmsg_simulate_failure(&iocom->state0, 0, DMSG_ERR_LOSTLINK);
556 
557 		lksleep(iocom, &iocom->msglk, 0, "clstrtk", hz / 2);
558 
559 		if ((int)(ticks - save_ticks) > hz*2 && didwarn == 0) {
560 			didwarn = 1;
561 			kdio_printf(iocom, 0,
562 				    "Warning, write thread on %p "
563 				    "still terminating\n",
564 				    iocom);
565 		}
566 		if ((int)(ticks - save_ticks) > hz*15 && didwarn == 1) {
567 			didwarn = 2;
568 			kdio_printf(iocom, 0,
569 				    "Warning, write thread on %p "
570 				    "still terminating\n",
571 				    iocom);
572 		}
573 		if ((int)(ticks - save_ticks) > hz*60) {
574 			kdio_printf(iocom, 0,
575 				    "Can't terminate: msgq %p "
576 				    "rd_tree %p wr_tree %p\n",
577 				    TAILQ_FIRST(&iocom->msgq),
578 				    RB_ROOT(&iocom->staterd_tree),
579 				    RB_ROOT(&iocom->statewr_tree));
580 			lksleep(iocom, &iocom->msglk, 0, "clstrtk", hz * 10);
581 		}
582 	}
583 
584 	/*
585 	 * Exit handling is done by the write thread.
586 	 */
587 	lockmgr(&iocom->msglk, LK_RELEASE);
588 
589 	/*
590 	 * The state trees had better be empty now
591 	 */
592 	KKASSERT(RB_EMPTY(&iocom->staterd_tree));
593 	KKASSERT(RB_EMPTY(&iocom->statewr_tree));
594 	KKASSERT(iocom->conn_state == NULL);
595 
596 	if (iocom->exit_func) {
597 		/*
598 		 * iocom is invalid after we call the exit function.
599 		 */
600 		iocom->msgwr_td = NULL;
601 		iocom->exit_func(iocom);
602 	} else {
603 		/*
604 		 * iocom can be ripped out from under us once msgwr_td is
605 		 * set to NULL.  The wakeup is safe.
606 		 */
607 		iocom->msgwr_td = NULL;
608 		wakeup(iocom);
609 	}
610 	lwkt_exit();
611 }
612 
613 /*
614  * This cleans out the pending transmit message queue, adjusting any
615  * persistent states properly in the process.
616  *
617  * Called with iocom locked.
618  */
619 void
kdmsg_drain_msgq(kdmsg_iocom_t * iocom)620 kdmsg_drain_msgq(kdmsg_iocom_t *iocom)
621 {
622 	kdmsg_msg_t *msg;
623 
624 	/*
625 	 * Clean out our pending transmit queue, executing the
626 	 * appropriate state adjustments as if the messages were
627 	 * sent.
628 	 */
629 	while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
630 		TAILQ_REMOVE(&iocom->msgq, msg, qentry);
631 		kdmsg_drain_msg(msg);
632 	}
633 }
634 
635 /*
636  * Drain one message by simulating transmission and also simulating a
637  * receive failure.
638  */
639 static void
kdmsg_drain_msg(kdmsg_msg_t * msg)640 kdmsg_drain_msg(kdmsg_msg_t *msg)
641 {
642 	if (kdmsg_state_msgtx(msg)) {
643 		kdmsg_msg_free(msg);
644 	} else {
645 		if (msg->state) {
646 			kdmsg_simulate_failure(msg->state,
647 					       0, DMSG_ERR_LOSTLINK);
648 		}
649 		kdmsg_state_cleanuptx(msg);
650 	}
651 }
652 
653 /*
654  * Do all processing required to handle a freshly received message
655  * after its low level header has been validated.
656  *
657  * iocom is not locked.
658  */
659 static
660 int
kdmsg_msg_receive_handling(kdmsg_msg_t * msg)661 kdmsg_msg_receive_handling(kdmsg_msg_t *msg)
662 {
663 	kdmsg_iocom_t *iocom = msg->state->iocom;
664 	int error;
665 
666 	/*
667 	 * State machine tracking, state assignment for msg,
668 	 * returns error and discard status.  Errors are fatal
669 	 * to the connection except for EALREADY which forces
670 	 * a discard without execution.
671 	 */
672 	error = kdmsg_state_msgrx(msg);
673 	if (msg->state->flags & KDMSG_STATE_ABORTING) {
674 		kdio_printf(iocom, 5,
675 			    "kdmsg_state_abort(b): state %p rxcmd=%08x "
676 			    "txcmd=%08x msgrx error %d\n",
677 			    msg->state, msg->state->rxcmd,
678 			    msg->state->txcmd, error);
679 	}
680 	if (error) {
681 		/*
682 		 * Raw protocol or connection error
683 		 */
684 		if (msg->state->flags & KDMSG_STATE_ABORTING)
685 			kdio_printf(iocom, 5,
686 				    "X1 state %p error %d\n",
687 				    msg->state, error);
688 		kdmsg_msg_free(msg);
689 		if (error == EALREADY)
690 			error = 0;
691 	} else if (msg->state && msg->state->func) {
692 		/*
693 		 * Message related to state which already has a
694 		 * handling function installed for it.
695 		 */
696 		if (msg->state->flags & KDMSG_STATE_ABORTING)
697 			kdio_printf(iocom, 5,
698 				    "X2 state %p func %p\n",
699 				    msg->state, msg->state->func);
700 		error = msg->state->func(msg->state, msg);
701 		kdmsg_state_cleanuprx(msg);
702 	} else if (iocom->flags & KDMSG_IOCOMF_AUTOANY) {
703 		if (msg->state->flags & KDMSG_STATE_ABORTING)
704 			kdio_printf(iocom, 5,
705 				    "X3 state %p\n", msg->state);
706 		error = kdmsg_autorxmsg(msg);
707 		kdmsg_state_cleanuprx(msg);
708 	} else {
709 		if (msg->state->flags & KDMSG_STATE_ABORTING)
710 			kdio_printf(iocom, 5,
711 				    "X4 state %p\n", msg->state);
712 		error = iocom->rcvmsg(msg);
713 		kdmsg_state_cleanuprx(msg);
714 	}
715 	return error;
716 }
717 
718 /*
719  * Process state tracking for a message after reception and dequeueing,
720  * prior to execution of the state callback.  The state is updated and
721  * will be removed from the RBTREE if completely closed, but the state->parent
722  * and subq linkage is not cleaned up until after the callback (see
723  * cleanuprx()).
724  *
725  * msglk is not held.
726  *
727  * NOTE: A message transaction can consist of several messages in either
728  *	 direction.
729  *
730  * NOTE: The msgid is unique to the initiator, not necessarily unique for
731  *	 us or for any relay or for the return direction for that matter.
732  *	 That is, two sides sending a new message can use the same msgid
733  *	 without colliding.
734  *
735  * --
736  *
737  * ABORT sequences work by setting the ABORT flag along with normal message
738  * state.  However, ABORTs can also be sent on half-closed messages, that is
739  * even if the command or reply side has already sent a DELETE, as long as
740  * the message has not been fully closed it can still send an ABORT+DELETE
741  * to terminate the half-closed message state.
742  *
743  * Since ABORT+DELETEs can race we silently discard ABORT's for message
744  * state which has already been fully closed.  REPLY+ABORT+DELETEs can
745  * also race, and in this situation the other side might have already
746  * initiated a new unrelated command with the same message id.  Since
747  * the abort has not set the CREATE flag the situation can be detected
748   * and the message will also be discarded.
749  *
750  * Non-blocking requests can be initiated with ABORT+CREATE[+DELETE].
751  * The ABORT request is essentially integrated into the command instead
752  * of being sent later on.  In this situation the command implementation
753  * detects that CREATE and ABORT are both set (vs ABORT alone) and can
754  * special-case non-blocking operation for the command.
755  *
756  * NOTE!  Messages with ABORT set without CREATE or DELETE are considered
757  *	  to be mid-stream aborts for command/reply sequences.  ABORTs on
758  *	  one-way messages are not supported.
759  *
760  * NOTE!  If a command sequence does not support aborts the ABORT flag is
761  *	  simply ignored.
762  *
763  * --
764  *
765  * One-off messages (no reply expected) are sent with neither CREATE or DELETE
766  * set.  One-off messages cannot be aborted and typically aren't processed
767  * by these routines.  The REPLY bit can be used to distinguish whether a
768  * one-off message is a command or reply.  For example, one-off replies
769  * will typically just contain status updates.
770  */
771 static
772 int
kdmsg_state_msgrx(kdmsg_msg_t * msg)773 kdmsg_state_msgrx(kdmsg_msg_t *msg)
774 {
775 	kdmsg_iocom_t *iocom = msg->state->iocom;
776 	kdmsg_state_t *state;
777 	kdmsg_state_t *pstate;
778 	kdmsg_state_t sdummy;
779 	int error;
780 
781 	bzero(&sdummy, sizeof(sdummy));	/* avoid gcc warnings */
782 
783 	/*
784 	 * Make sure a state structure is ready to go in case we need a new
785 	 * one.  This is the only routine which uses freerd_state so no
786 	 * races are possible.
787 	 */
788 	if ((state = iocom->freerd_state) == NULL) {
789 		state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
790 		state->flags = KDMSG_STATE_DYNAMIC;
791 		state->iocom = iocom;
792 		state->refs = 1;
793 		TAILQ_INIT(&state->subq);
794 		iocom->freerd_state = state;
795 	}
796 	state = NULL;	/* safety */
797 
798 	/*
799 	 * Lock RB tree and locate existing persistent state, if any.
800 	 *
801 	 * If received msg is a command state is on staterd_tree.
802 	 * If received msg is a reply state is on statewr_tree.
803 	 */
804 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
805 
806 again:
807 	if (msg->state == &iocom->state0) {
808 		sdummy.msgid = msg->any.head.msgid;
809 		sdummy.iocom = iocom;
810 		if (msg->any.head.cmd & DMSGF_REVTRANS) {
811 			state = RB_FIND(kdmsg_state_tree, &iocom->statewr_tree,
812 					&sdummy);
813 		} else {
814 			state = RB_FIND(kdmsg_state_tree, &iocom->staterd_tree,
815 					&sdummy);
816 		}
817 
818 		/*
819 		 * Set message state unconditionally.  If this is a CREATE
820 		 * message this state will become the parent state and new
821 		 * state will be allocated for the message state.
822 		 */
823 		if (state == NULL)
824 			state = &iocom->state0;
825 		if (state->flags & KDMSG_STATE_INTERLOCK) {
826 			state->flags |= KDMSG_STATE_SIGNAL;
827 			lksleep(state, &iocom->msglk, 0, "dmrace", hz);
828 			goto again;
829 		}
830 		kdmsg_state_hold(state);
831 		kdmsg_state_drop(msg->state);	/* iocom->state0 */
832 		msg->state = state;
833 	} else {
834 		state = msg->state;
835 	}
836 
837 	/*
838 	 * Short-cut one-off or mid-stream messages.
839 	 */
840 	if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
841 				  DMSGF_ABORT)) == 0) {
842 		error = 0;
843 		goto done;
844 	}
845 
846 	/*
847 	 * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
848 	 * inside the case statements.
849 	 */
850 	switch(msg->any.head.cmd & (DMSGF_CREATE|DMSGF_DELETE|DMSGF_REPLY)) {
851 	case DMSGF_CREATE:
852 	case DMSGF_CREATE | DMSGF_DELETE:
853 		/*
854 		 * New persistent command received.
855 		 */
856 		if (state != &iocom->state0) {
857 			kdio_printf(iocom, 1, "%s\n",
858 				    "duplicate transaction");
859 			error = EINVAL;
860 			break;
861 		}
862 
863 		/*
864 		 * Lookup the circuit.  The circuit is an open transaction.
865 		 * the REVCIRC bit in the message tells us which side
866 		 * initiated the transaction representing the circuit.
867 		 */
868 		if (msg->any.head.circuit) {
869 			sdummy.msgid = msg->any.head.circuit;
870 
871 			if (msg->any.head.cmd & DMSGF_REVCIRC) {
872 				pstate = RB_FIND(kdmsg_state_tree,
873 						 &iocom->statewr_tree,
874 						 &sdummy);
875 			} else {
876 				pstate = RB_FIND(kdmsg_state_tree,
877 						 &iocom->staterd_tree,
878 						 &sdummy);
879 			}
880 			if (pstate == NULL) {
881 				kdio_printf(iocom, 1, "%s\n",
882 					    "missing parent in "
883 					    "stacked trans");
884 				error = EINVAL;
885 				break;
886 			}
887 		} else {
888 			pstate = &iocom->state0;
889 		}
890 
891 		/*
892 		 * Allocate new state.
893 		 *
894 		 * msg->state becomes the owner of the ref we inherit from
895 		 * freerd_state.
896 		 */
897 		kdmsg_state_drop(state);
898 		state = iocom->freerd_state;
899 		iocom->freerd_state = NULL;
900 
901 		msg->state = state;		/* inherits freerd ref */
902 		state->parent = pstate;
903 		KKASSERT(state->iocom == iocom);
904 		state->flags |= KDMSG_STATE_RBINSERTED |
905 				KDMSG_STATE_SUBINSERTED |
906 			        KDMSG_STATE_OPPOSITE;
907 		if (TAILQ_EMPTY(&pstate->subq))
908 			kdmsg_state_hold(pstate);/* states on pstate->subq */
909 		kdmsg_state_hold(state);	/* state on pstate->subq */
910 		kdmsg_state_hold(state);	/* state on rbtree */
911 		state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
912 		state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
913 		state->txcmd = DMSGF_REPLY;
914 		state->msgid = msg->any.head.msgid;
915 		state->flags &= ~KDMSG_STATE_NEW;
916 		RB_INSERT(kdmsg_state_tree, &iocom->staterd_tree, state);
917 		TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
918 		error = 0;
919 		break;
920 	case DMSGF_DELETE:
921 		/*
922 		 * Persistent state is expected but might not exist if an
923 		 * ABORT+DELETE races the close.
924 		 */
925 		if (state == &iocom->state0) {
926 			if (msg->any.head.cmd & DMSGF_ABORT) {
927 				kdio_printf(iocom, 1, "%s\n",
928 					    "msgrx: "
929 					    "state already A");
930 				error = EALREADY;
931 			} else {
932 				kdio_printf(iocom, 1, "%s\n",
933 					    "msgrx: no state for DELETE");
934 				error = EINVAL;
935 			}
936 			break;
937 		}
938 
939 		/*
940 		 * Handle another ABORT+DELETE case if the msgid has already
941 		 * been reused.
942 		 */
943 		if ((state->rxcmd & DMSGF_CREATE) == 0) {
944 			if (msg->any.head.cmd & DMSGF_ABORT) {
945 				kdio_printf(iocom, 1, "%s\n",
946 					    "msgrx: state already B");
947 				error = EALREADY;
948 			} else {
949 				kdio_printf(iocom, 1, "%s\n",
950 					    "msgrx: state reused for DELETE");
951 				error = EINVAL;
952 			}
953 			break;
954 		}
955 		error = 0;
956 		break;
957 	default:
958 		/*
959 		 * Check for mid-stream ABORT command received, otherwise
960 		 * allow.
961 		 */
962 		if (msg->any.head.cmd & DMSGF_ABORT) {
963 			if (state == &iocom->state0 ||
964 			    (state->rxcmd & DMSGF_CREATE) == 0) {
965 				error = EALREADY;
966 				break;
967 			}
968 		}
969 		error = 0;
970 		break;
971 	case DMSGF_REPLY | DMSGF_CREATE:
972 	case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
973 		/*
974 		 * When receiving a reply with CREATE set the original
975 		 * persistent state message should already exist.
976 		 */
977 		if (state == &iocom->state0) {
978 			kdio_printf(iocom, 1,
979 				    "msgrx: no state match for "
980 				    "REPLY cmd=%08x msgid=%016jx\n",
981 				    msg->any.head.cmd,
982 				    (intmax_t)msg->any.head.msgid);
983 			error = EINVAL;
984 			break;
985 		}
986 		state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
987 		error = 0;
988 		break;
989 	case DMSGF_REPLY | DMSGF_DELETE:
990 		/*
991 		 * Received REPLY+ABORT+DELETE in case where msgid has
992 		 * already been fully closed, ignore the message.
993 		 */
994 		if (state == &iocom->state0) {
995 			if (msg->any.head.cmd & DMSGF_ABORT) {
996 				error = EALREADY;
997 			} else {
998 				kdio_printf(iocom, 1, "%s\n",
999 					    "msgrx: no state match "
1000 					    "for REPLY|DELETE");
1001 				error = EINVAL;
1002 			}
1003 			break;
1004 		}
1005 
1006 		/*
1007 		 * Received REPLY+ABORT+DELETE in case where msgid has
1008 		 * already been reused for an unrelated message,
1009 		 * ignore the message.
1010 		 */
1011 		if ((state->rxcmd & DMSGF_CREATE) == 0) {
1012 			if (msg->any.head.cmd & DMSGF_ABORT) {
1013 				error = EALREADY;
1014 			} else {
1015 				kdio_printf(iocom, 1, "%s\n",
1016 					    "msgrx: state reused "
1017 					    "for REPLY|DELETE");
1018 				error = EINVAL;
1019 			}
1020 			break;
1021 		}
1022 		error = 0;
1023 		break;
1024 	case DMSGF_REPLY:
1025 		/*
1026 		 * Check for mid-stream ABORT reply received to sent command.
1027 		 */
1028 		if (msg->any.head.cmd & DMSGF_ABORT) {
1029 			if (state == &iocom->state0 ||
1030 			    (state->rxcmd & DMSGF_CREATE) == 0) {
1031 				error = EALREADY;
1032 				break;
1033 			}
1034 		}
1035 		error = 0;
1036 		break;
1037 	}
1038 
1039 	/*
1040 	 * Calculate the easy-switch() transactional command.  Represents
1041 	 * the outer-transaction command for any transaction-create or
1042 	 * transaction-delete, and the inner message command for any
1043 	 * non-transaction or inside-transaction command.  tcmd will be
1044 	 * set to 0 if the message state is illegal.
1045 	 *
1046 	 * The two can be told apart because outer-transaction commands
1047 	 * always have a DMSGF_CREATE and/or DMSGF_DELETE flag.
1048 	 */
1049 done:
1050 	if (msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE)) {
1051 		if (state != &iocom->state0) {
1052 			msg->tcmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
1053 				    (msg->any.head.cmd & (DMSGF_CREATE |
1054 							  DMSGF_DELETE |
1055 							  DMSGF_REPLY));
1056 		} else {
1057 			msg->tcmd = 0;
1058 		}
1059 	} else {
1060 		msg->tcmd = msg->any.head.cmd & DMSGF_CMDSWMASK;
1061 	}
1062 
1063 	/*
1064 	 * Adjust the state for DELETE handling now, before making the
1065 	 * callback so we are atomic with other state updates.
1066 	 *
1067 	 * Subq/parent linkages are cleaned up after the callback.
1068 	 * If an error occurred the message is ignored and state is not
1069 	 * updated.
1070 	 */
1071 	if ((state = msg->state) == NULL || error != 0) {
1072 		kdio_printf(iocom, 1,
1073 			    "msgrx: state=%p error %d\n",
1074 			    state, error);
1075 	} else if (msg->any.head.cmd & DMSGF_DELETE) {
1076 		KKASSERT((state->rxcmd & DMSGF_DELETE) == 0);
1077 		state->rxcmd |= DMSGF_DELETE;
1078 		if (state->txcmd & DMSGF_DELETE) {
1079 			KKASSERT(state->flags & KDMSG_STATE_RBINSERTED);
1080 			if (state->rxcmd & DMSGF_REPLY) {
1081 				KKASSERT(msg->any.head.cmd &
1082 					 DMSGF_REPLY);
1083 				RB_REMOVE(kdmsg_state_tree,
1084 					  &iocom->statewr_tree, state);
1085 			} else {
1086 				KKASSERT((msg->any.head.cmd &
1087 					  DMSGF_REPLY) == 0);
1088 				RB_REMOVE(kdmsg_state_tree,
1089 					  &iocom->staterd_tree, state);
1090 			}
1091 			state->flags &= ~KDMSG_STATE_RBINSERTED;
1092 			kdmsg_state_drop(state);	/* state on rbtree */
1093 		}
1094 	}
1095 	lockmgr(&iocom->msglk, LK_RELEASE);
1096 
1097 	return (error);
1098 }
1099 
1100 /*
1101  * Called instead of iocom->rcvmsg() if any of the AUTO flags are set.
1102  * This routine must call iocom->rcvmsg() for anything not automatically
1103  * handled.
1104  */
1105 static int
kdmsg_autorxmsg(kdmsg_msg_t * msg)1106 kdmsg_autorxmsg(kdmsg_msg_t *msg)
1107 {
1108 	kdmsg_iocom_t *iocom = msg->state->iocom;
1109 	kdmsg_msg_t *rep;
1110 	int error = 0;
1111 	uint32_t cmd;
1112 
1113 	/*
1114 	 * Main switch processes transaction create/delete sequences only.
1115 	 * Use icmd (DELETEs use DMSG_LNK_ERROR
1116 	 *
1117 	 * NOTE: If processing in-transaction messages you generally want
1118 	 *	 an inner switch on msg->any.head.cmd.
1119 	 */
1120 	if (msg->state) {
1121 		cmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
1122 		      (msg->any.head.cmd & (DMSGF_CREATE |
1123 					    DMSGF_DELETE |
1124 					    DMSGF_REPLY));
1125 	} else {
1126 		cmd = 0;
1127 	}
1128 
1129 	switch(cmd) {
1130 	case DMSG_LNK_PING:
1131 		/*
1132 		 * Received ping, send reply
1133 		 */
1134 		rep = kdmsg_msg_alloc(msg->state, DMSG_LNK_PING | DMSGF_REPLY,
1135 				      NULL, NULL);
1136 		kdmsg_msg_write(rep);
1137 		break;
1138 	case DMSG_LNK_PING | DMSGF_REPLY:
1139 		/* ignore replies */
1140 		break;
1141 	case DMSG_LNK_CONN | DMSGF_CREATE:
1142 	case DMSG_LNK_CONN | DMSGF_CREATE | DMSGF_DELETE:
1143 		/*
1144 		 * Received LNK_CONN transaction.  Transmit response and
1145 		 * leave transaction open, which allows the other end to
1146 		 * start to the SPAN protocol.
1147 		 *
1148 		 * Handle shim after acknowledging the CONN.
1149 		 */
1150 		if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1151 			if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1152 				kdmsg_msg_result(msg, 0);
1153 				if (iocom->auto_callback)
1154 					iocom->auto_callback(msg);
1155 			} else {
1156 				error = iocom->rcvmsg(msg);
1157 			}
1158 			break;
1159 		}
1160 		/* fall through */
1161 	case DMSG_LNK_CONN | DMSGF_DELETE:
1162 		/*
1163 		 * This message is usually simulated after a link is lost
1164 		 * to clean up the transaction.
1165 		 */
1166 		if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1167 			if (iocom->auto_callback)
1168 				iocom->auto_callback(msg);
1169 			kdmsg_msg_reply(msg, 0);
1170 		} else {
1171 			error = iocom->rcvmsg(msg);
1172 		}
1173 		break;
1174 	case DMSG_LNK_SPAN | DMSGF_CREATE:
1175 	case DMSG_LNK_SPAN | DMSGF_CREATE | DMSGF_DELETE:
1176 		/*
1177 		 * Received LNK_SPAN transaction.  We do not have to respond
1178 		 * (except on termination), but we must leave the transaction
1179 		 * open.
1180 		 *
1181 		 * Handle shim after acknowledging the SPAN.
1182 		 */
1183 		if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1184 			if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1185 				if (iocom->auto_callback)
1186 					iocom->auto_callback(msg);
1187 				break;
1188 			}
1189 			/* fall through */
1190 		} else {
1191 			error = iocom->rcvmsg(msg);
1192 			break;
1193 		}
1194 		/* fall through */
1195 	case DMSG_LNK_SPAN | DMSGF_DELETE:
1196 		/*
1197 		 * Process shims (auto_callback) before cleaning up the
1198 		 * circuit structure and closing the transactions.  Device
1199 		 * driver should ensure that the circuit is not used after
1200 		 * the auto_callback() returns.
1201 		 *
1202 		 * Handle shim before closing the SPAN transaction.
1203 		 */
1204 		if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1205 			if (iocom->auto_callback)
1206 				iocom->auto_callback(msg);
1207 			kdmsg_msg_reply(msg, 0);
1208 		} else {
1209 			error = iocom->rcvmsg(msg);
1210 		}
1211 		break;
1212 	default:
1213 		/*
1214 		 * Anything unhandled goes into rcvmsg.
1215 		 *
1216 		 * NOTE: Replies to link-level messages initiated by our side
1217 		 *	 are handled by the state callback, they are NOT
1218 		 *	 handled here.
1219 		 */
1220 		error = iocom->rcvmsg(msg);
1221 		break;
1222 	}
1223 	return (error);
1224 }
1225 
1226 /*
1227  * Post-receive-handling message and state cleanup.  This routine is called
1228  * after the state function handling/callback to properly dispose of the
1229  * message and unlink the state's parent/subq linkage if the state is
1230  * completely closed.
1231  *
1232  * msglk is not held.
1233  */
1234 static
1235 void
kdmsg_state_cleanuprx(kdmsg_msg_t * msg)1236 kdmsg_state_cleanuprx(kdmsg_msg_t *msg)
1237 {
1238 	kdmsg_state_t *state = msg->state;
1239 	kdmsg_iocom_t *iocom = state->iocom;
1240 
1241 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1242 	if (state != &iocom->state0) {
1243 		/*
1244 		 * When terminating a transaction (in either direction), all
1245 		 * sub-states are aborted.
1246 		 */
1247 		if ((msg->any.head.cmd & DMSGF_DELETE) &&
1248 		    TAILQ_FIRST(&msg->state->subq)) {
1249 			kdio_printf(iocom, 2,
1250 				    "simulate failure for substates of "
1251 				    "state %p cmd %08x/%08x\n",
1252 				    msg->state,
1253 				    msg->state->rxcmd,
1254 				    msg->state->txcmd);
1255 			kdmsg_simulate_failure(msg->state,
1256 					       0, DMSG_ERR_LOSTLINK);
1257 		}
1258 
1259 		/*
1260 		 * Once the state is fully closed we can (try to) remove it
1261 		 * from the subq topology.
1262 		 */
1263 		if ((state->flags & KDMSG_STATE_SUBINSERTED) &&
1264 		    (state->rxcmd & DMSGF_DELETE) &&
1265 		    (state->txcmd & DMSGF_DELETE)) {
1266 			/*
1267 			 * Remove parent linkage if state is completely closed.
1268 			 */
1269 			kdmsg_subq_delete(state);
1270 		}
1271 	}
1272 	kdmsg_msg_free(msg);
1273 
1274 	lockmgr(&iocom->msglk, LK_RELEASE);
1275 }
1276 
1277 /*
1278  * Remove state from its parent's subq.  This can wind up recursively
1279  * dropping the parent upward.
1280  *
1281  * NOTE: Once we drop the parent, our pstate pointer may become invalid.
1282  */
1283 static
1284 void
kdmsg_subq_delete(kdmsg_state_t * state)1285 kdmsg_subq_delete(kdmsg_state_t *state)
1286 {
1287 	kdmsg_state_t *pstate;
1288 
1289 	if (state->flags & KDMSG_STATE_SUBINSERTED) {
1290 		pstate = state->parent;
1291 		KKASSERT(pstate);
1292 		if (pstate->scan == state)
1293 			pstate->scan = NULL;
1294 		TAILQ_REMOVE(&pstate->subq, state, entry);
1295 		state->flags &= ~KDMSG_STATE_SUBINSERTED;
1296 		state->parent = NULL;
1297 		if (TAILQ_EMPTY(&pstate->subq)) {
1298 			kdmsg_state_drop(pstate);/* pstate->subq */
1299 		}
1300 		pstate = NULL;			 /* safety */
1301 		kdmsg_state_drop(state);  	 /* pstate->subq */
1302 	} else {
1303 		KKASSERT(state->parent == NULL);
1304 	}
1305 }
1306 
1307 /*
1308  * Simulate receiving a message which terminates an active transaction
1309  * state.  Our simulated received message must set DELETE and may also
1310  * have to set CREATE.  It must also ensure that all fields are set such
1311  * that the receive handling code can find the state (kdmsg_state_msgrx())
1312  * or an endless loop will ensue.
1313  *
1314  * This is used when the other end of the link is dead so the device driver
1315  * gets a completed transaction for all pending states.
1316  *
1317  * Called with iocom locked.
1318  */
1319 static
1320 void
kdmsg_simulate_failure(kdmsg_state_t * state,int meto,int error)1321 kdmsg_simulate_failure(kdmsg_state_t *state, int meto, int error)
1322 {
1323 	kdmsg_state_t *substate;
1324 
1325 	kdmsg_state_hold(state);		/* aborting */
1326 
1327 	/*
1328 	 * Abort parent state first. Parent will not actually disappear
1329 	 * until children are gone.  Device drivers must handle the situation.
1330 	 * The advantage of this is that device drivers can flag the situation
1331 	 * as an interlock against new operations on dying states.  And since
1332 	 * device operations are often asynchronous anyway, this sequence of
1333 	 * events works out better.
1334 	 */
1335 	if (meto)
1336 		kdmsg_state_abort(state);
1337 
1338 	/*
1339 	 * Recurse through any children.
1340 	 */
1341 again:
1342 	TAILQ_FOREACH(substate, &state->subq, entry) {
1343 		if (substate->flags & KDMSG_STATE_ABORTING)
1344 			continue;
1345 		state->scan = substate;
1346 		kdmsg_simulate_failure(substate, 1, error);
1347 		if (state->scan != substate)
1348 			goto again;
1349 	}
1350 	kdmsg_state_drop(state);		/* aborting */
1351 }
1352 
1353 static
1354 void
kdmsg_state_abort(kdmsg_state_t * state)1355 kdmsg_state_abort(kdmsg_state_t *state)
1356 {
1357 	kdmsg_msg_t *msg;
1358 
1359 	/*
1360 	 * Set ABORTING and DYING, return if already set.  If the state was
1361 	 * just allocated we defer the abort operation until the related
1362 	 * message is processed.
1363 	 */
1364 	KKASSERT((state->flags & KDMSG_STATE_ABORTING) == 0);
1365 	if (state->flags & KDMSG_STATE_ABORTING)
1366 		return;
1367 	state->flags |= KDMSG_STATE_ABORTING;
1368 	kdmsg_state_dying(state);
1369 	if (state->flags & KDMSG_STATE_NEW) {
1370 		kdio_printf(iocom, 5,
1371 			    "kdmsg_state_abort(0): state %p rxcmd %08x "
1372 			    "txcmd %08x flags %08x - in NEW state\n",
1373 			    state, state->rxcmd,
1374 			    state->txcmd, state->flags);
1375 		return;
1376 	}
1377 
1378 	/*
1379 	 * NOTE: The DELETE flag might already be set due to an early
1380 	 *	 termination.
1381 	 *
1382 	 * NOTE: Args to kdmsg_msg_alloc() to avoid dynamic state allocation.
1383 	 *
1384 	 * NOTE: We are simulating a received message using our state
1385 	 *	 (vs a message generated by the other side using its state),
1386 	 *	 so we must invert DMSGF_REVTRANS and DMSGF_REVCIRC.
1387 	 */
1388 	kdio_printf(iocom, 5,
1389 		    "kdmsg_state_abort(1): state %p rxcmd %08x txcmd %08x\n",
1390 		    state, state->rxcmd, state->txcmd);
1391 	if ((state->rxcmd & DMSGF_DELETE) == 0) {
1392 		msg = kdmsg_msg_alloc(state, DMSG_LNK_ERROR, NULL, NULL);
1393 		if ((state->rxcmd & DMSGF_CREATE) == 0)
1394 			msg->any.head.cmd |= DMSGF_CREATE;
1395 		msg->any.head.cmd |= DMSGF_DELETE |
1396 				     (state->rxcmd & DMSGF_REPLY);
1397 		msg->any.head.cmd ^= (DMSGF_REVTRANS | DMSGF_REVCIRC);
1398 		msg->any.head.error = DMSG_ERR_LOSTLINK;
1399 		kdio_printf(iocom, 5,
1400 			    "kdmsg_state_abort(a): state %p msgcmd %08x\n",
1401 			    state, msg->any.head.cmd);
1402 		/* circuit not initialized */
1403 		lockmgr(&state->iocom->msglk, LK_RELEASE);
1404 		kdmsg_msg_receive_handling(msg);
1405 		lockmgr(&state->iocom->msglk, LK_EXCLUSIVE);
1406 		msg = NULL;
1407 	}
1408 	kdio_printf(iocom, 5,
1409 		    "kdmsg_state_abort(2): state %p rxcmd %08x txcmd %08x\n",
1410 		    state, state->rxcmd, state->txcmd);
1411 }
1412 
1413 /*
1414  * Recursively sets KDMSG_STATE_DYING on state and all sub-states, preventing
1415  * the transmission of any new messages on these states.  This is done
1416  * atomically when parent state is terminating, whereas setting ABORTING is
1417  * not atomic and can leak races.
1418  */
1419 static
1420 void
kdmsg_state_dying(kdmsg_state_t * state)1421 kdmsg_state_dying(kdmsg_state_t *state)
1422 {
1423 	kdmsg_state_t *scan;
1424 
1425 	if ((state->flags & KDMSG_STATE_DYING) == 0) {
1426 		state->flags |= KDMSG_STATE_DYING;
1427 		TAILQ_FOREACH(scan, &state->subq, entry)
1428 			kdmsg_state_dying(scan);
1429 	}
1430 }
1431 
1432 /*
1433  * Process state tracking for a message prior to transmission.
1434  *
1435  * Called with msglk held and the msg dequeued.  Returns non-zero if
1436  * the message is bad and should be deleted by the caller.
1437  *
1438  * One-off messages are usually with dummy state and msg->state may be NULL
1439  * in this situation.
1440  *
1441  * New transactions (when CREATE is set) will insert the state.
1442  *
1443  * May request that caller discard the message by setting *discardp to 1.
1444  * A NULL state may be returned in this case.
1445  */
1446 static
1447 int
kdmsg_state_msgtx(kdmsg_msg_t * msg)1448 kdmsg_state_msgtx(kdmsg_msg_t *msg)
1449 {
1450 	kdmsg_iocom_t *iocom = msg->state->iocom;
1451 	kdmsg_state_t *state;
1452 	int error;
1453 
1454 	/*
1455 	 * Make sure a state structure is ready to go in case we need a new
1456 	 * one.  This is the only routine which uses freewr_state so no
1457 	 * races are possible.
1458 	 */
1459 	if ((state = iocom->freewr_state) == NULL) {
1460 		state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1461 		state->flags = KDMSG_STATE_DYNAMIC;
1462 		state->iocom = iocom;
1463 		state->refs = 1;
1464 		TAILQ_INIT(&state->subq);
1465 		iocom->freewr_state = state;
1466 	}
1467 
1468 	/*
1469 	 * Lock RB tree.  If persistent state is present it will have already
1470 	 * been assigned to msg.
1471 	 */
1472 	state = msg->state;
1473 
1474 	/*
1475 	 * Short-cut one-off or mid-stream messages (state may be NULL).
1476 	 */
1477 	if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1478 				  DMSGF_ABORT)) == 0) {
1479 		return(0);
1480 	}
1481 
1482 
1483 	/*
1484 	 * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
1485 	 * inside the case statements.
1486 	 */
1487 	switch(msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1488 				    DMSGF_REPLY)) {
1489 	case DMSGF_CREATE:
1490 	case DMSGF_CREATE | DMSGF_DELETE:
1491 		/*
1492 		 * Insert the new persistent message state and mark
1493 		 * half-closed if DELETE is set.  Since this is a new
1494 		 * message it isn't possible to transition into the fully
1495 		 * closed state here.
1496 		 *
1497 		 * XXX state must be assigned and inserted by
1498 		 *     kdmsg_msg_write().  txcmd is assigned by us
1499 		 *     on-transmit.
1500 		 */
1501 		KKASSERT(state != NULL);
1502 		state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
1503 		state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1504 		state->rxcmd = DMSGF_REPLY;
1505 		state->flags &= ~KDMSG_STATE_NEW;
1506 		error = 0;
1507 		break;
1508 	case DMSGF_DELETE:
1509 		/*
1510 		 * Sent ABORT+DELETE in case where msgid has already
1511 		 * been fully closed, ignore the message.
1512 		 */
1513 		if (state == &iocom->state0) {
1514 			if (msg->any.head.cmd & DMSGF_ABORT) {
1515 				error = EALREADY;
1516 			} else {
1517 				kdio_printf(iocom, 1,
1518 					"msgtx: no state match "
1519 					"for DELETE cmd=%08x msgid=%016jx\n",
1520 					msg->any.head.cmd,
1521 					(intmax_t)msg->any.head.msgid);
1522 				error = EINVAL;
1523 			}
1524 			break;
1525 		}
1526 
1527 		/*
1528 		 * Sent ABORT+DELETE in case where msgid has
1529 		 * already been reused for an unrelated message,
1530 		 * ignore the message.
1531 		 */
1532 		if ((state->txcmd & DMSGF_CREATE) == 0) {
1533 			if (msg->any.head.cmd & DMSGF_ABORT) {
1534 				error = EALREADY;
1535 			} else {
1536 				kdio_printf(iocom, 1, "%s\n",
1537 					    "msgtx: state reused "
1538 					    "for DELETE");
1539 				error = EINVAL;
1540 			}
1541 			break;
1542 		}
1543 		error = 0;
1544 		break;
1545 	default:
1546 		/*
1547 		 * Check for mid-stream ABORT command sent
1548 		 */
1549 		if (msg->any.head.cmd & DMSGF_ABORT) {
1550 			if (state == &state->iocom->state0 ||
1551 			    (state->txcmd & DMSGF_CREATE) == 0) {
1552 				error = EALREADY;
1553 				break;
1554 			}
1555 		}
1556 		error = 0;
1557 		break;
1558 	case DMSGF_REPLY | DMSGF_CREATE:
1559 	case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
1560 		/*
1561 		 * When transmitting a reply with CREATE set the original
1562 		 * persistent state message should already exist.
1563 		 */
1564 		if (state == &state->iocom->state0) {
1565 			kdio_printf(iocom, 1, "%s\n",
1566 				    "msgtx: no state match "
1567 				    "for REPLY | CREATE");
1568 			error = EINVAL;
1569 			break;
1570 		}
1571 		state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1572 		error = 0;
1573 		break;
1574 	case DMSGF_REPLY | DMSGF_DELETE:
1575 		/*
1576 		 * When transmitting a reply with DELETE set the original
1577 		 * persistent state message should already exist.
1578 		 *
1579 		 * This is very similar to the REPLY|CREATE|* case except
1580 		 * txcmd is already stored, so we just add the DELETE flag.
1581 		 *
1582 		 * Sent REPLY+ABORT+DELETE in case where msgid has
1583 		 * already been fully closed, ignore the message.
1584 		 */
1585 		if (state == &state->iocom->state0) {
1586 			if (msg->any.head.cmd & DMSGF_ABORT) {
1587 				error = EALREADY;
1588 			} else {
1589 				kdio_printf(iocom, 1, "%s\n",
1590 					    "msgtx: no state match "
1591 					    "for REPLY | DELETE");
1592 				error = EINVAL;
1593 			}
1594 			break;
1595 		}
1596 
1597 		/*
1598 		 * Sent REPLY+ABORT+DELETE in case where msgid has already
1599 		 * been reused for an unrelated message, ignore the message.
1600 		 */
1601 		if ((state->txcmd & DMSGF_CREATE) == 0) {
1602 			if (msg->any.head.cmd & DMSGF_ABORT) {
1603 				error = EALREADY;
1604 			} else {
1605 				kdio_printf(iocom, 1, "%s\n",
1606 					    "msgtx: state reused "
1607 					    "for REPLY | DELETE");
1608 				error = EINVAL;
1609 			}
1610 			break;
1611 		}
1612 		error = 0;
1613 		break;
1614 	case DMSGF_REPLY:
1615 		/*
1616 		 * Check for mid-stream ABORT reply sent.
1617 		 *
1618 		 * One-off REPLY messages are allowed for e.g. status updates.
1619 		 */
1620 		if (msg->any.head.cmd & DMSGF_ABORT) {
1621 			if (state == &state->iocom->state0 ||
1622 			    (state->txcmd & DMSGF_CREATE) == 0) {
1623 				error = EALREADY;
1624 				break;
1625 			}
1626 		}
1627 		error = 0;
1628 		break;
1629 	}
1630 
1631 	/*
1632 	 * Set interlock (XXX hack) in case the send side blocks and a
1633 	 * response is returned before kdmsg_state_cleanuptx() can be
1634 	 * run.
1635 	 */
1636 	if (state && error == 0)
1637 		state->flags |= KDMSG_STATE_INTERLOCK;
1638 
1639 	return (error);
1640 }
1641 
1642 /*
1643  * Called with iocom locked.
1644  */
1645 static
1646 void
kdmsg_state_cleanuptx(kdmsg_msg_t * msg)1647 kdmsg_state_cleanuptx(kdmsg_msg_t *msg)
1648 {
1649 	kdmsg_iocom_t *iocom = msg->state->iocom;
1650 	kdmsg_state_t *state;
1651 
1652 	if ((state = msg->state) == NULL) {
1653 		kdmsg_msg_free(msg);
1654 		return;
1655 	}
1656 
1657 	/*
1658 	 * Clear interlock (XXX hack) in case the send side blocks and a
1659 	 * response is returned in the other thread before
1660 	 * kdmsg_state_cleanuptx() can be run.  We maintain our hold on
1661 	 * iocom->msglk so we can do this before completing our task.
1662 	 */
1663 	if (state->flags & KDMSG_STATE_SIGNAL) {
1664 		kdio_printf(iocom, 1, "state %p interlock!\n", state);
1665 		wakeup(state);
1666 	}
1667 	state->flags &= ~(KDMSG_STATE_INTERLOCK | KDMSG_STATE_SIGNAL);
1668 	kdmsg_state_hold(state);
1669 
1670 	if (msg->any.head.cmd & DMSGF_DELETE) {
1671 		KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1672 		state->txcmd |= DMSGF_DELETE;
1673 		if (state->rxcmd & DMSGF_DELETE) {
1674 			KKASSERT(state->flags & KDMSG_STATE_RBINSERTED);
1675 			if (state->txcmd & DMSGF_REPLY) {
1676 				KKASSERT(msg->any.head.cmd &
1677 					 DMSGF_REPLY);
1678 				RB_REMOVE(kdmsg_state_tree,
1679 					  &iocom->staterd_tree, state);
1680 			} else {
1681 				KKASSERT((msg->any.head.cmd &
1682 					  DMSGF_REPLY) == 0);
1683 				RB_REMOVE(kdmsg_state_tree,
1684 					  &iocom->statewr_tree, state);
1685 			}
1686 			state->flags &= ~KDMSG_STATE_RBINSERTED;
1687 
1688 			/*
1689 			 * The subq recursion is used for parent linking and
1690 			 * scanning the topology for aborts, we can only
1691 			 * remove leafs.  The circuit is effectively dead now,
1692 			 * but topology won't be torn down until all of its
1693 			 * children have finished/aborted.
1694 			 *
1695 			 * This is particularly important for end-point
1696 			 * devices which might need to access private data
1697 			 * in parent states.  Out of order disconnects can
1698 			 * occur if an end-point device is processing a
1699 			 * message transaction asynchronously because abort
1700 			 * requests are basically synchronous and it probably
1701 			 * isn't convenient (or possible) for the end-point
1702 			 * to abort an asynchronous operation.
1703 			 */
1704 			if (TAILQ_EMPTY(&state->subq))
1705 				kdmsg_subq_delete(state);
1706 			kdmsg_msg_free(msg);
1707 			kdmsg_state_drop(state);   /* state on rbtree */
1708 		} else {
1709 			kdmsg_msg_free(msg);
1710 		}
1711 	} else {
1712 		kdmsg_msg_free(msg);
1713 	}
1714 
1715 	/*
1716 	 * Deferred abort after transmission.
1717 	 */
1718 	if ((state->flags & (KDMSG_STATE_ABORTING | KDMSG_STATE_DYING)) &&
1719 	    (state->rxcmd & DMSGF_DELETE) == 0) {
1720 		kdio_printf(iocom, 5,
1721 			    "kdmsg_state_cleanuptx: state=%p "
1722 			    "executing deferred abort\n",
1723 			    state);
1724 		state->flags &= ~KDMSG_STATE_ABORTING;
1725 		kdmsg_state_abort(state);
1726 	}
1727 	kdmsg_state_drop(state);
1728 }
1729 
1730 static
1731 void
_kdmsg_state_hold(kdmsg_state_t * state KDMSG_DEBUG_ARGS)1732 _kdmsg_state_hold(kdmsg_state_t *state KDMSG_DEBUG_ARGS)
1733 {
1734 	atomic_add_int(&state->refs, 1);
1735 #if KDMSG_DEBUG
1736 	kd_printf(4, "state %p +%d\t%s:%d\n", state, state->refs, file, line);
1737 #endif
1738 }
1739 
1740 static
1741 void
_kdmsg_state_drop(kdmsg_state_t * state KDMSG_DEBUG_ARGS)1742 _kdmsg_state_drop(kdmsg_state_t *state KDMSG_DEBUG_ARGS)
1743 {
1744 	KKASSERT(state->refs > 0);
1745 #if KDMSG_DEBUG
1746 	kd_printf(4, "state %p -%d\t%s:%d\n", state, state->refs, file, line);
1747 #endif
1748 	if (atomic_fetchadd_int(&state->refs, -1) == 1)
1749 		kdmsg_state_free(state);
1750 }
1751 
1752 static
1753 void
kdmsg_state_free(kdmsg_state_t * state)1754 kdmsg_state_free(kdmsg_state_t *state)
1755 {
1756 	kdmsg_iocom_t *iocom = state->iocom;
1757 
1758 	KKASSERT((state->flags & KDMSG_STATE_RBINSERTED) == 0);
1759 	KKASSERT((state->flags & KDMSG_STATE_SUBINSERTED) == 0);
1760 	KKASSERT(TAILQ_EMPTY(&state->subq));
1761 
1762 	if (state != &state->iocom->state0)
1763 		kfree(state, iocom->mmsg);
1764 }
1765 
1766 kdmsg_msg_t *
kdmsg_msg_alloc(kdmsg_state_t * state,uint32_t cmd,int (* func)(kdmsg_state_t *,kdmsg_msg_t *),void * data)1767 kdmsg_msg_alloc(kdmsg_state_t *state, uint32_t cmd,
1768 		int (*func)(kdmsg_state_t *, kdmsg_msg_t *), void *data)
1769 {
1770 	kdmsg_iocom_t *iocom = state->iocom;
1771 	kdmsg_state_t *pstate;
1772 	kdmsg_msg_t *msg;
1773 	size_t hbytes;
1774 
1775 	KKASSERT(iocom != NULL);
1776 	hbytes = (cmd & DMSGF_SIZE) * DMSG_ALIGN;
1777 	msg = kmalloc(offsetof(struct kdmsg_msg, any) + hbytes,
1778 		      iocom->mmsg, M_WAITOK | M_ZERO);
1779 	msg->hdr_size = hbytes;
1780 
1781 	if ((cmd & (DMSGF_CREATE | DMSGF_REPLY)) == DMSGF_CREATE) {
1782 		/*
1783 		 * New transaction, requires tracking state and a unique
1784 		 * msgid to be allocated.
1785 		 *
1786 		 * It is possible to race a circuit failure, inherit the
1787 		 * parent's STATE_DYING flag to trigger an abort sequence
1788 		 * in the transmit path.  By not inheriting ABORTING the
1789 		 * abort sequence can recurse.
1790 		 *
1791 		 * NOTE: The transactions has not yet been initiated so we
1792 		 *	 cannot set DMSGF_CREATE/DELETE bits in txcmd or rxcmd.
1793 		 *	 We have to properly setup DMSGF_REPLY, however.
1794 		 */
1795 		pstate = state;
1796 		state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1797 		TAILQ_INIT(&state->subq);
1798 		state->iocom = iocom;
1799 		state->parent = pstate;
1800 		state->flags = KDMSG_STATE_DYNAMIC |
1801 			       KDMSG_STATE_NEW;
1802 		state->func = func;
1803 		state->any.any = data;
1804 		state->msgid = (uint64_t)(uintptr_t)state;
1805 		/*msg->any.head.msgid = state->msgid;XXX*/
1806 
1807 		lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1808 		if (RB_INSERT(kdmsg_state_tree, &iocom->statewr_tree, state))
1809 			panic("duplicate msgid allocated");
1810 		if (TAILQ_EMPTY(&pstate->subq))
1811 			kdmsg_state_hold(pstate);/* pstate->subq */
1812 		TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
1813 		state->flags |= KDMSG_STATE_RBINSERTED |
1814 				KDMSG_STATE_SUBINSERTED;
1815 		state->flags |= pstate->flags & KDMSG_STATE_DYING;
1816 		kdmsg_state_hold(state);	/* pstate->subq */
1817 		kdmsg_state_hold(state);	/* state on rbtree */
1818 		kdmsg_state_hold(state);	/* msg->state */
1819 		lockmgr(&iocom->msglk, LK_RELEASE);
1820 	} else {
1821 		pstate = state->parent;
1822 		KKASSERT(pstate != NULL);
1823 		kdmsg_state_hold(state);	/* msg->state */
1824 	}
1825 
1826 	if (state->flags & KDMSG_STATE_OPPOSITE)
1827 		cmd |= DMSGF_REVTRANS;
1828 	if (pstate->flags & KDMSG_STATE_OPPOSITE)
1829 		cmd |= DMSGF_REVCIRC;
1830 
1831 	msg->any.head.magic = DMSG_HDR_MAGIC;
1832 	msg->any.head.cmd = cmd;
1833 	msg->any.head.msgid = state->msgid;
1834 	msg->any.head.circuit = pstate->msgid;
1835 	msg->state = state;
1836 
1837 	return (msg);
1838 }
1839 
1840 void
kdmsg_msg_free(kdmsg_msg_t * msg)1841 kdmsg_msg_free(kdmsg_msg_t *msg)
1842 {
1843 	kdmsg_iocom_t *iocom = msg->state->iocom;
1844 	kdmsg_state_t *state;
1845 
1846 	if ((msg->flags & KDMSG_FLAG_AUXALLOC) &&
1847 	    msg->aux_data && msg->aux_size) {
1848 		kfree(msg->aux_data, iocom->mmsg);
1849 		msg->aux_data = NULL;
1850 		msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1851 	}
1852 	if ((state = msg->state) != NULL) {
1853 		msg->state = NULL;
1854 		kdmsg_state_drop(state);	/* msg->state */
1855 	}
1856 	msg->aux_data = NULL;
1857 	msg->aux_size = 0;
1858 
1859 	kfree(msg, iocom->mmsg);
1860 }
1861 
1862 void
kdmsg_detach_aux_data(kdmsg_msg_t * msg,kdmsg_data_t * data)1863 kdmsg_detach_aux_data(kdmsg_msg_t *msg, kdmsg_data_t *data)
1864 {
1865 	if (msg->flags & KDMSG_FLAG_AUXALLOC) {
1866 		data->aux_data = msg->aux_data;
1867 		data->aux_size = msg->aux_size;
1868 		data->iocom = msg->state->iocom;
1869 		msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1870 	} else {
1871 		data->aux_data = NULL;
1872 		data->aux_size = 0;
1873 		data->iocom = msg->state->iocom;
1874 	}
1875 }
1876 
1877 void
kdmsg_free_aux_data(kdmsg_data_t * data)1878 kdmsg_free_aux_data(kdmsg_data_t *data)
1879 {
1880 	if (data->aux_data) {
1881 		kfree(data->aux_data, data->iocom->mmsg);
1882 		data->aux_data = NULL;
1883 	}
1884 }
1885 
1886 /*
1887  * Indexed messages are stored in a red-black tree indexed by their
1888  * msgid.  Only persistent messages are indexed.
1889  */
1890 int
kdmsg_state_cmp(kdmsg_state_t * state1,kdmsg_state_t * state2)1891 kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2)
1892 {
1893 	if (state1->iocom < state2->iocom)
1894 		return(-1);
1895 	if (state1->iocom > state2->iocom)
1896 		return(1);
1897 	if (state1->msgid < state2->msgid)
1898 		return(-1);
1899 	if (state1->msgid > state2->msgid)
1900 		return(1);
1901 	return(0);
1902 }
1903 
1904 /*
1905  * Write a message.  All requisit command flags have been set.
1906  *
1907  * If msg->state is non-NULL the message is written to the existing
1908  * transaction.  msgid will be set accordingly.
1909  *
1910  * If msg->state is NULL and CREATE is set new state is allocated and
1911  * (func, data) is installed.  A msgid is assigned.
1912  *
1913  * If msg->state is NULL and CREATE is not set the message is assumed
1914  * to be a one-way message.  The originator must assign the msgid
1915  * (or leave it 0, which is typical.
1916  *
1917  * This function merely queues the message to the management thread, it
1918  * does not write to the message socket/pipe.
1919  */
1920 void
kdmsg_msg_write(kdmsg_msg_t * msg)1921 kdmsg_msg_write(kdmsg_msg_t *msg)
1922 {
1923 	kdmsg_iocom_t *iocom = msg->state->iocom;
1924 
1925 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1926 	kdmsg_msg_write_locked(iocom, msg);
1927 	lockmgr(&iocom->msglk, LK_RELEASE);
1928 }
1929 
1930 static void
kdmsg_msg_write_locked(kdmsg_iocom_t * iocom,kdmsg_msg_t * msg)1931 kdmsg_msg_write_locked(kdmsg_iocom_t *iocom, kdmsg_msg_t *msg)
1932 {
1933 	kdmsg_state_t *state;
1934 
1935 	if (msg->state) {
1936 		/*
1937 		 * Continuance or termination of existing transaction.
1938 		 * The transaction could have been initiated by either end.
1939 		 *
1940 		 * (Function callback and aux data for the receive side can
1941 		 * be replaced or left alone).
1942 		 */
1943 		state = msg->state;
1944 		msg->any.head.msgid = state->msgid;
1945 	} else {
1946 		/*
1947 		 * One-off message (always uses msgid 0 to distinguish
1948 		 * between a possibly lost in-transaction message due to
1949 		 * competing aborts and a real one-off message?)
1950 		 */
1951 		state = NULL;
1952 		msg->any.head.msgid = 0;
1953 	}
1954 
1955 	/*
1956 	 * For stateful messages, if the circuit is dead or dying we have
1957 	 * to abort the potentially newly-created state and discard the
1958 	 * message.
1959 	 *
1960 	 * - We must discard the message because the other end will not
1961 	 *   be expecting any more messages over the dead or dying circuit
1962 	 *   and might not be able to receive them.
1963 	 *
1964 	 * - We abort the state by simulating a failure to generate a fake
1965 	 *   incoming DELETE.  This will trigger the state callback and allow
1966 	 *   the device to clean things up and reply, closing the outgoing
1967 	 *   direction and allowing the state to be freed.
1968 	 *
1969 	 * This situation occurs quite often, particularly as SPANs stabilize.
1970 	 * End-points must do the right thing.
1971 	 */
1972 	if (state) {
1973 		KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1974 		if (state->flags & KDMSG_STATE_DYING) {
1975 #if 0
1976 		if ((state->flags & KDMSG_STATE_DYING) ||
1977 		    (state->parent->txcmd & DMSGF_DELETE) ||
1978 		    (state->parent->flags & KDMSG_STATE_DYING)) {
1979 #endif
1980 			kdio_printf(iocom, 4,
1981 				    "kdmsg_msg_write: Write to dying circuit "
1982 				    "state=%p "
1983 				    "ptxcmd=%08x prxcmd=%08x flags=%08x\n",
1984 				    state,
1985 				    state->parent->rxcmd,
1986 				    state->parent->txcmd,
1987 				    state->parent->flags);
1988 			kdmsg_state_hold(state);
1989 			kdmsg_state_msgtx(msg);
1990 			kdmsg_state_cleanuptx(msg);
1991 			kdmsg_state_drop(state);
1992 			return;
1993 		}
1994 	}
1995 
1996 	/*
1997 	 * Finish up the msg fields.  Note that msg->aux_size and the
1998 	 * aux_bytes stored in the message header represent the unaligned
1999 	 * (actual) bytes of data, but the buffer is sized to an aligned
2000 	 * size and the CRC is generated over the aligned length.
2001 	 */
2002 	msg->any.head.salt = /* (random << 8) | */ (iocom->msg_seq & 255);
2003 	++iocom->msg_seq;
2004 
2005 	if (msg->aux_data && msg->aux_size) {
2006 		uint32_t abytes = DMSG_DOALIGN(msg->aux_size);
2007 
2008 		msg->any.head.aux_bytes = msg->aux_size;
2009 		msg->any.head.aux_crc = iscsi_crc32(msg->aux_data, abytes);
2010 	}
2011 	msg->any.head.hdr_crc = 0;
2012 	msg->any.head.hdr_crc = iscsi_crc32(msg->any.buf, msg->hdr_size);
2013 
2014 	/*
2015 	 * If termination races new message senders we must drain the
2016 	 * message immediately instead of queue it.
2017 	 */
2018 	if (iocom->flags & KDMSG_IOCOMF_EXITNOACC)
2019 		kdmsg_drain_msg(msg);
2020 	else
2021 		TAILQ_INSERT_TAIL(&iocom->msgq, msg, qentry);
2022 
2023 	if (iocom->msg_ctl & KDMSG_CLUSTERCTL_SLEEPING) {
2024 		atomic_clear_int(&iocom->msg_ctl,
2025 				 KDMSG_CLUSTERCTL_SLEEPING);
2026 		wakeup(&iocom->msg_ctl);
2027 	}
2028 }
2029 
2030 /*
2031  * Reply to a message and terminate our side of the transaction.
2032  *
2033  * If msg->state is non-NULL we are replying to a one-way message.
2034  */
2035 void
2036 kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error)
2037 {
2038 	kdmsg_state_t *state = msg->state;
2039 	kdmsg_msg_t *nmsg;
2040 	uint32_t cmd;
2041 
2042 	/*
2043 	 * Reply with a simple error code and terminate the transaction.
2044 	 */
2045 	cmd = DMSG_LNK_ERROR;
2046 
2047 	/*
2048 	 * Check if our direction has even been initiated yet, set CREATE.
2049 	 *
2050 	 * Check what direction this is (command or reply direction).  Note
2051 	 * that txcmd might not have been initiated yet.
2052 	 *
2053 	 * If our direction has already been closed we just return without
2054 	 * doing anything.
2055 	 */
2056 	if (state != &state->iocom->state0) {
2057 		if (state->txcmd & DMSGF_DELETE)
2058 			return;
2059 		if ((state->txcmd & DMSGF_CREATE) == 0)
2060 			cmd |= DMSGF_CREATE;
2061 		if (state->txcmd & DMSGF_REPLY)
2062 			cmd |= DMSGF_REPLY;
2063 		cmd |= DMSGF_DELETE;
2064 	} else {
2065 		if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
2066 			cmd |= DMSGF_REPLY;
2067 	}
2068 
2069 	nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2070 	nmsg->any.head.error = error;
2071 	kdmsg_msg_write(nmsg);
2072 }
2073 
2074 /*
2075  * Reply to a message and continue our side of the transaction.
2076  *
2077  * If msg->state is non-NULL we are replying to a one-way message and this
2078  * function degenerates into the same as kdmsg_msg_reply().
2079  */
2080 void
2081 kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error)
2082 {
2083 	kdmsg_state_t *state = msg->state;
2084 	kdmsg_msg_t *nmsg;
2085 	uint32_t cmd;
2086 
2087 	/*
2088 	 * Return a simple result code, do NOT terminate the transaction.
2089 	 */
2090 	cmd = DMSG_LNK_ERROR;
2091 
2092 	/*
2093 	 * Check if our direction has even been initiated yet, set CREATE.
2094 	 *
2095 	 * Check what direction this is (command or reply direction).  Note
2096 	 * that txcmd might not have been initiated yet.
2097 	 *
2098 	 * If our direction has already been closed we just return without
2099 	 * doing anything.
2100 	 */
2101 	if (state != &state->iocom->state0) {
2102 		if (state->txcmd & DMSGF_DELETE)
2103 			return;
2104 		if ((state->txcmd & DMSGF_CREATE) == 0)
2105 			cmd |= DMSGF_CREATE;
2106 		if (state->txcmd & DMSGF_REPLY)
2107 			cmd |= DMSGF_REPLY;
2108 		/* continuing transaction, do not set MSGF_DELETE */
2109 	} else {
2110 		if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
2111 			cmd |= DMSGF_REPLY;
2112 	}
2113 
2114 	nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2115 	nmsg->any.head.error = error;
2116 	kdmsg_msg_write(nmsg);
2117 }
2118 
2119 /*
2120  * Reply to a message and terminate our side of the transaction.
2121  *
2122  * If msg->state is non-NULL we are replying to a one-way message.
2123  */
2124 void
2125 kdmsg_state_reply(kdmsg_state_t *state, uint32_t error)
2126 {
2127 	kdmsg_msg_t *nmsg;
2128 	uint32_t cmd;
2129 
2130 	/*
2131 	 * Reply with a simple error code and terminate the transaction.
2132 	 */
2133 	cmd = DMSG_LNK_ERROR;
2134 
2135 	/*
2136 	 * Check if our direction has even been initiated yet, set CREATE.
2137 	 *
2138 	 * Check what direction this is (command or reply direction).  Note
2139 	 * that txcmd might not have been initiated yet.
2140 	 *
2141 	 * If our direction has already been closed we just return without
2142 	 * doing anything.
2143 	 */
2144 	KKASSERT(state);
2145 	if (state->txcmd & DMSGF_DELETE)
2146 		return;
2147 	if ((state->txcmd & DMSGF_CREATE) == 0)
2148 		cmd |= DMSGF_CREATE;
2149 	if (state->txcmd & DMSGF_REPLY)
2150 		cmd |= DMSGF_REPLY;
2151 	cmd |= DMSGF_DELETE;
2152 
2153 	nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2154 	nmsg->any.head.error = error;
2155 	kdmsg_msg_write(nmsg);
2156 }
2157 
2158 /*
2159  * Reply to a message and continue our side of the transaction.
2160  *
2161  * If msg->state is non-NULL we are replying to a one-way message and this
2162  * function degenerates into the same as kdmsg_msg_reply().
2163  */
2164 void
2165 kdmsg_state_result(kdmsg_state_t *state, uint32_t error)
2166 {
2167 	kdmsg_msg_t *nmsg;
2168 	uint32_t cmd;
2169 
2170 	/*
2171 	 * Return a simple result code, do NOT terminate the transaction.
2172 	 */
2173 	cmd = DMSG_LNK_ERROR;
2174 
2175 	/*
2176 	 * Check if our direction has even been initiated yet, set CREATE.
2177 	 *
2178 	 * Check what direction this is (command or reply direction).  Note
2179 	 * that txcmd might not have been initiated yet.
2180 	 *
2181 	 * If our direction has already been closed we just return without
2182 	 * doing anything.
2183 	 */
2184 	KKASSERT(state);
2185 	if (state->txcmd & DMSGF_DELETE)
2186 		return;
2187 	if ((state->txcmd & DMSGF_CREATE) == 0)
2188 		cmd |= DMSGF_CREATE;
2189 	if (state->txcmd & DMSGF_REPLY)
2190 		cmd |= DMSGF_REPLY;
2191 	/* continuing transaction, do not set MSGF_DELETE */
2192 
2193 	nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2194 	nmsg->any.head.error = error;
2195 	kdmsg_msg_write(nmsg);
2196 }
2197