1 /*-
2 * Copyright (c) 2012 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34 /*
35 * TODO: txcmd CREATE state is deferred by tx msgq, need to calculate
36 * a streaming response. See subr_diskiocom()'s diskiodone().
37 */
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/kernel.h>
41 #include <sys/conf.h>
42 #include <sys/systm.h>
43 #include <sys/queue.h>
44 #include <sys/tree.h>
45 #include <sys/malloc.h>
46 #include <sys/mount.h>
47 #include <sys/socket.h>
48 #include <sys/vnode.h>
49 #include <sys/sysctl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/caps.h>
53 #include <sys/thread.h>
54 #include <sys/globaldata.h>
55 #include <sys/limits.h>
56
57 #include <sys/dmsg.h>
58
/*
 * Instantiate the red-black tree support functions for kdmsg_state_tree,
 * keyed by kdmsg_state_cmp().
 */
RB_GENERATE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp);

/*
 * Top-level "kdmsg" sysctl node and the read/write debug level consulted
 * by the kd_printf()/kdio_printf() macros.  The level defaults to 1.
 */
SYSCTL_NODE(, OID_AUTO, kdmsg, CTLFLAG_RW, 0, "kdmsg");
static int kdmsg_debug = 1;
SYSCTL_INT(_kdmsg, OID_AUTO, debug, CTLFLAG_RW, &kdmsg_debug, 0,
	   "Set debug level for kernel dmsg layer");
65
/*
 * Debug output helpers.  A message is emitted through kprintf() only
 * when kdmsg_debug is at least (level).  The format (ctl) must be a
 * string literal because it is pasted onto the "kdmsg: " prefix, and at
 * least one variadic argument must be supplied.
 *
 * Each expansion is wrapped in do { } while (0) so an invocation is a
 * single statement and cannot capture an 'else' that follows it in the
 * caller (the bare-if form silently re-bound such an 'else').
 *
 * kdio_printf() accepts the iocom for context but does not evaluate it.
 */
#define kd_printf(level, ctl, ...)					\
	do {								\
		if (kdmsg_debug >= (level))				\
			kprintf("kdmsg: " ctl, __VA_ARGS__);		\
	} while (0)

#define kdio_printf(iocom, level, ctl, ...)				\
	do {								\
		if (kdmsg_debug >= (level))				\
			kprintf("kdmsg: " ctl, __VA_ARGS__);		\
	} while (0)
71
/*
 * Forward declarations for file-local helpers.
 */
static int kdmsg_msg_receive_handling(kdmsg_msg_t *msg);
static int kdmsg_state_msgrx(kdmsg_msg_t *msg);
static int kdmsg_state_msgtx(kdmsg_msg_t *msg);
static void kdmsg_msg_write_locked(kdmsg_iocom_t *iocom, kdmsg_msg_t *msg);
static void kdmsg_state_cleanuprx(kdmsg_msg_t *msg);
static void kdmsg_state_cleanuptx(kdmsg_msg_t *msg);
static void kdmsg_subq_delete(kdmsg_state_t *state);
static void kdmsg_simulate_failure(kdmsg_state_t *state, int meto, int error);
static void kdmsg_state_abort(kdmsg_state_t *state);
static void kdmsg_state_dying(kdmsg_state_t *state);
static void kdmsg_state_free(kdmsg_state_t *state);
static void kdmsg_drain_msg(kdmsg_msg_t *msg);

/*
 * State reference wrappers.  When the build defines KDMSG_DEBUG the
 * hold/drop helpers are additionally passed the caller's file and line.
 * Otherwise KDMSG_DEBUG is defined to 0 here and no extra arguments are
 * passed.
 *
 * NOTE(review): because the #else arm defines KDMSG_DEBUG (to 0), any
 * later test must use "#if KDMSG_DEBUG" rather than "#ifdef" -- confirm
 * against the rest of the file.
 */
#ifdef KDMSG_DEBUG
#define KDMSG_DEBUG_ARGS	, const char *file, int line
#define kdmsg_state_hold(state)	_kdmsg_state_hold(state, __FILE__, __LINE__)
#define kdmsg_state_drop(state)	_kdmsg_state_drop(state, __FILE__, __LINE__)
#else
#define KDMSG_DEBUG 0
#define KDMSG_DEBUG_ARGS
#define kdmsg_state_hold(state)	_kdmsg_state_hold(state)
#define kdmsg_state_drop(state)	_kdmsg_state_drop(state)
#endif
static void _kdmsg_state_hold(kdmsg_state_t *state KDMSG_DEBUG_ARGS);
static void _kdmsg_state_drop(kdmsg_state_t *state KDMSG_DEBUG_ARGS);

/*
 * Reader/writer service threads and the automatic receive handler.
 */
static void kdmsg_iocom_thread_rd(void *arg);
static void kdmsg_iocom_thread_wr(void *arg);
static int kdmsg_autorxmsg(kdmsg_msg_t *msg);
101
102 /*static struct lwkt_token kdmsg_token = LWKT_TOKEN_INITIALIZER(kdmsg_token);*/
103
104 /*
105 * Initialize the roll-up communications structure for a network
106 * messaging session. This function does not install the socket.
107 */
108 void
kdmsg_iocom_init(kdmsg_iocom_t * iocom,void * handle,uint32_t flags,struct malloc_type * mmsg,int (* rcvmsg)(kdmsg_msg_t * msg))109 kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, uint32_t flags,
110 struct malloc_type *mmsg,
111 int (*rcvmsg)(kdmsg_msg_t *msg))
112 {
113 bzero(iocom, sizeof(*iocom));
114 iocom->handle = handle;
115 iocom->mmsg = mmsg;
116 iocom->rcvmsg = rcvmsg;
117 iocom->flags = flags;
118 lockinit(&iocom->msglk, "h2msg", 0, 0);
119 TAILQ_INIT(&iocom->msgq);
120 RB_INIT(&iocom->staterd_tree);
121 RB_INIT(&iocom->statewr_tree);
122
123 iocom->state0.iocom = iocom;
124 iocom->state0.parent = &iocom->state0;
125 TAILQ_INIT(&iocom->state0.subq);
126 }
127
128 /*
129 * [Re]connect using the passed file pointer. The caller must ref the
130 * fp for us. We own that ref now.
131 */
132 void
kdmsg_iocom_reconnect(kdmsg_iocom_t * iocom,struct file * fp,const char * subsysname)133 kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp,
134 const char *subsysname)
135 {
136 /*
137 * Destroy the current connection
138 */
139 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
140 atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
141 while (iocom->msgrd_td || iocom->msgwr_td) {
142 wakeup(&iocom->msg_ctl);
143 lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
144 }
145
146 /*
147 * Drop communications descriptor
148 */
149 if (iocom->msg_fp) {
150 fdrop(iocom->msg_fp);
151 iocom->msg_fp = NULL;
152 }
153
154 /*
155 * Setup new communications descriptor
156 */
157 iocom->msg_ctl = 0;
158 iocom->msg_fp = fp;
159 iocom->msg_seq = 0;
160 iocom->flags &= ~KDMSG_IOCOMF_EXITNOACC;
161
162 lwkt_create(kdmsg_iocom_thread_rd, iocom, &iocom->msgrd_td,
163 NULL, 0, -1, "%s-msgrd", subsysname);
164 lwkt_create(kdmsg_iocom_thread_wr, iocom, &iocom->msgwr_td,
165 NULL, 0, -1, "%s-msgwr", subsysname);
166 lockmgr(&iocom->msglk, LK_RELEASE);
167 }
168
169 /*
170 * Caller sets up iocom->auto_lnk_conn and iocom->auto_lnk_span, then calls
171 * this function to handle the state machine for LNK_CONN and LNK_SPAN.
172 */
173 static int kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
174 static int kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
175
176 void
kdmsg_iocom_autoinitiate(kdmsg_iocom_t * iocom,void (* auto_callback)(kdmsg_msg_t * msg))177 kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom,
178 void (*auto_callback)(kdmsg_msg_t *msg))
179 {
180 kdmsg_msg_t *msg;
181
182 iocom->auto_callback = auto_callback;
183
184 msg = kdmsg_msg_alloc(&iocom->state0,
185 DMSG_LNK_CONN | DMSGF_CREATE,
186 kdmsg_lnk_conn_reply, NULL);
187 iocom->auto_lnk_conn.head = msg->any.head;
188 msg->any.lnk_conn = iocom->auto_lnk_conn;
189 iocom->conn_state = msg->state;
190 kdmsg_state_hold(msg->state); /* iocom->conn_state */
191 kdmsg_msg_write(msg);
192 }
193
194 static
195 int
kdmsg_lnk_conn_reply(kdmsg_state_t * state,kdmsg_msg_t * msg)196 kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
197 {
198 kdmsg_iocom_t *iocom = state->iocom;
199 kdmsg_msg_t *rmsg;
200
201 /*
202 * Upon receipt of the LNK_CONN acknowledgement initiate an
203 * automatic SPAN if we were asked to. Used by e.g. xdisk, but
204 * not used by HAMMER2 which must manage more than one transmitted
205 * SPAN.
206 */
207 if ((msg->any.head.cmd & DMSGF_CREATE) &&
208 (iocom->flags & KDMSG_IOCOMF_AUTOTXSPAN)) {
209 rmsg = kdmsg_msg_alloc(&iocom->state0,
210 DMSG_LNK_SPAN | DMSGF_CREATE,
211 kdmsg_lnk_span_reply, NULL);
212 iocom->auto_lnk_span.head = rmsg->any.head;
213 rmsg->any.lnk_span = iocom->auto_lnk_span;
214 kdmsg_msg_write(rmsg);
215 }
216
217 /*
218 * Process shim after the CONN is acknowledged and before the CONN
219 * transaction is deleted. For deletions this gives device drivers
220 * the ability to interlock new operations on the circuit before
221 * it becomes illegal and panics.
222 */
223 if (iocom->auto_callback)
224 iocom->auto_callback(msg);
225
226 if ((state->txcmd & DMSGF_DELETE) == 0 &&
227 (msg->any.head.cmd & DMSGF_DELETE)) {
228 /*
229 * iocom->conn_state has a state ref, drop it when clearing.
230 */
231 if (iocom->conn_state)
232 kdmsg_state_drop(iocom->conn_state);
233 iocom->conn_state = NULL;
234 kdmsg_msg_reply(msg, 0);
235 }
236
237 return (0);
238 }
239
240 static
241 int
kdmsg_lnk_span_reply(kdmsg_state_t * state,kdmsg_msg_t * msg)242 kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
243 {
244 /*
245 * Be sure to process shim before terminating the SPAN
246 * transaction. Gives device drivers the ability to
247 * interlock new operations on the circuit before it
248 * becomes illegal and panics.
249 */
250 if (state->iocom->auto_callback)
251 state->iocom->auto_callback(msg);
252
253 if ((state->txcmd & DMSGF_DELETE) == 0 &&
254 (msg->any.head.cmd & DMSGF_DELETE)) {
255 kdmsg_msg_reply(msg, 0);
256 }
257 return (0);
258 }
259
260 /*
261 * Disconnect and clean up
262 */
263 void
kdmsg_iocom_uninit(kdmsg_iocom_t * iocom)264 kdmsg_iocom_uninit(kdmsg_iocom_t *iocom)
265 {
266 kdmsg_state_t *state;
267 kdmsg_msg_t *msg;
268 int retries;
269
270 /*
271 * Ask the cluster controller to go away by setting
272 * KILLRX. Send a PING to get a response to unstick reading
273 * from the pipe.
274 *
275 * After 10 seconds shitcan the pipe and do an unclean shutdown.
276 */
277 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
278
279 atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
280 msg = kdmsg_msg_alloc(&iocom->state0, DMSG_LNK_PING, NULL, NULL);
281 kdmsg_msg_write_locked(iocom, msg);
282
283 retries = 10;
284 while (iocom->msgrd_td || iocom->msgwr_td) {
285 wakeup(&iocom->msg_ctl);
286 lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
287 if (--retries == 0 && iocom->msg_fp) {
288 kdio_printf(iocom, 0, "%s\n",
289 "iocom_uninit: "
290 "shitcanning unresponsive pipe");
291 fp_shutdown(iocom->msg_fp, SHUT_RDWR);
292 /* retries allowed to go negative, keep looping */
293 }
294 }
295
296 /*
297 * Cleanup caches
298 */
299 if ((state = iocom->freerd_state) != NULL) {
300 iocom->freerd_state = NULL;
301 kdmsg_state_drop(state);
302 }
303
304 if ((state = iocom->freewr_state) != NULL) {
305 iocom->freewr_state = NULL;
306 kdmsg_state_drop(state);
307 }
308
309 /*
310 * Drop communications descriptor
311 */
312 if (iocom->msg_fp) {
313 fdrop(iocom->msg_fp);
314 iocom->msg_fp = NULL;
315 }
316 lockmgr(&iocom->msglk, LK_RELEASE);
317 }
318
319 /*
320 * Cluster controller thread. Perform messaging functions. We have one
321 * thread for the reader and one for the writer. The writer handles
322 * shutdown requests (which should break the reader thread).
323 */
324 static
325 void
kdmsg_iocom_thread_rd(void * arg)326 kdmsg_iocom_thread_rd(void *arg)
327 {
328 kdmsg_iocom_t *iocom = arg;
329 dmsg_hdr_t hdr;
330 kdmsg_msg_t *msg = NULL;
331 size_t hbytes;
332 size_t abytes;
333 int error = 0;
334
335 while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLRX) == 0) {
336 /*
337 * Retrieve the message from the pipe or socket.
338 */
339 error = fp_read(iocom->msg_fp, &hdr, sizeof(hdr),
340 NULL, 1, UIO_SYSSPACE);
341 if (error)
342 break;
343 if (hdr.magic != DMSG_HDR_MAGIC) {
344 kdio_printf(iocom, 1, "bad magic: %04x\n", hdr.magic);
345 error = EINVAL;
346 break;
347 }
348 hbytes = (hdr.cmd & DMSGF_SIZE) * DMSG_ALIGN;
349 if (hbytes < sizeof(hdr) || hbytes > DMSG_HDR_MAX) {
350 kdio_printf(iocom, 1, "bad header size %zd\n", hbytes);
351 error = EINVAL;
352 break;
353 }
354
355 /* XXX messy: mask cmd to avoid allocating state */
356 msg = kdmsg_msg_alloc(&iocom->state0,
357 hdr.cmd & DMSGF_BASECMDMASK,
358 NULL, NULL);
359 msg->any.head = hdr;
360 msg->hdr_size = hbytes;
361 if (hbytes > sizeof(hdr)) {
362 error = fp_read(iocom->msg_fp, &msg->any.head + 1,
363 hbytes - sizeof(hdr),
364 NULL, 1, UIO_SYSSPACE);
365 if (error) {
366 kdio_printf(iocom, 1, "%s\n",
367 "short msg received");
368 error = EINVAL;
369 break;
370 }
371 }
372 msg->aux_size = hdr.aux_bytes;
373 if (msg->aux_size > DMSG_AUX_MAX) {
374 kdio_printf(iocom, 1,
375 "illegal msg payload size %zd\n",
376 msg->aux_size);
377 error = EINVAL;
378 break;
379 }
380 if (msg->aux_size) {
381 abytes = DMSG_DOALIGN(msg->aux_size);
382 msg->aux_data = kmalloc(abytes, iocom->mmsg, M_WAITOK);
383 msg->flags |= KDMSG_FLAG_AUXALLOC;
384 error = fp_read(iocom->msg_fp, msg->aux_data,
385 abytes, NULL, 1, UIO_SYSSPACE);
386 if (error) {
387 kdio_printf(iocom, 1, "%s\n",
388 "short msg payload received");
389 break;
390 }
391 }
392
393 error = kdmsg_msg_receive_handling(msg);
394 msg = NULL;
395 }
396
397 #if 0
398 kdio_printf(iocom, 1, "read thread terminating error=%d\n", error);
399 #endif
400
401 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
402 if (msg)
403 kdmsg_msg_free(msg);
404
405 /*
406 * Shutdown the socket and set KILLRX for consistency in case the
407 * shutdown was not commanded. Signal the transmit side to shutdown
408 * by setting KILLTX and waking it up.
409 */
410 fp_shutdown(iocom->msg_fp, SHUT_RDWR);
411 atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX |
412 KDMSG_CLUSTERCTL_KILLTX);
413 iocom->msgrd_td = NULL;
414 lockmgr(&iocom->msglk, LK_RELEASE);
415 wakeup(&iocom->msg_ctl);
416
417 /*
418 * iocom can be ripped out at any time once the lock is
419 * released with msgrd_td set to NULL. The wakeup()s are safe but
420 * that is all.
421 */
422 wakeup(iocom);
423 lwkt_exit();
424 }
425
/*
 * Transmit-side service thread.
 *
 * Dequeues messages from iocom->msgq, runs transmit-side state tracking
 * (kdmsg_state_msgtx()) on each one and writes the header and any aux
 * payload to msg_fp.  msglk is released around the writes and
 * re-acquired before kdmsg_state_cleanuptx().
 *
 * When the loop ends (KILLTX set or an error) this thread performs the
 * final teardown: it shuts the descriptor down, waits for the reader to
 * exit, then repeatedly drains the transmit queue and simulates link
 * failure until the queue, both state trees and conn_state are all
 * empty.  It finally calls iocom->exit_func if set, otherwise wakes up
 * waiters sleeping on the iocom.
 */
static
void
kdmsg_iocom_thread_wr(void *arg)
{
	kdmsg_iocom_t *iocom = arg;
	kdmsg_msg_t *msg;
	ssize_t res;
	size_t abytes;
	int error = 0;
	int save_ticks;
	int didwarn;

	/*
	 * Transmit loop
	 */
	msg = NULL;
	lockmgr(&iocom->msglk, LK_EXCLUSIVE);

	while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLTX) == 0 && error == 0) {
		/*
		 * Sleep if no messages pending.  Interlock with flag while
		 * holding msglk.
		 */
		if (TAILQ_EMPTY(&iocom->msgq)) {
			atomic_set_int(&iocom->msg_ctl,
				       KDMSG_CLUSTERCTL_SLEEPING);
			lksleep(&iocom->msg_ctl, &iocom->msglk, 0, "msgwr", hz);
			atomic_clear_int(&iocom->msg_ctl,
					 KDMSG_CLUSTERCTL_SLEEPING);
		}

		while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
			/*
			 * Remove msg from the transmit queue and do
			 * persist and half-closed state handling.
			 */
			TAILQ_REMOVE(&iocom->msgq, msg, qentry);

			/*
			 * EALREADY discards the message without failing
			 * the connection; any other error ends both
			 * loops (the outer loop tests error).
			 */
			error = kdmsg_state_msgtx(msg);
			if (error == EALREADY) {
				error = 0;
				kdmsg_msg_free(msg);
				continue;
			}
			if (error) {
				kdmsg_msg_free(msg);
				break;
			}

			/*
			 * Dump the message to the pipe or socket.
			 *
			 * We have to clean up the message as if the transmit
			 * succeeded even if it failed.
			 */
			lockmgr(&iocom->msglk, LK_RELEASE);
			error = fp_write(iocom->msg_fp, &msg->any,
					 msg->hdr_size, &res, UIO_SYSSPACE);
			if (error || res != msg->hdr_size) {
				/* a short write is reported as EINVAL */
				if (error == 0)
					error = EINVAL;
				lockmgr(&iocom->msglk, LK_EXCLUSIVE);
				kdmsg_state_cleanuptx(msg);
				break;
			}
			if (msg->aux_size) {
				/* aux payload is written in its aligned size */
				abytes = DMSG_DOALIGN(msg->aux_size);
				error = fp_write(iocom->msg_fp,
						 msg->aux_data, abytes,
						 &res, UIO_SYSSPACE);
				if (error || res != abytes) {
					if (error == 0)
						error = EINVAL;
					lockmgr(&iocom->msglk, LK_EXCLUSIVE);
					kdmsg_state_cleanuptx(msg);
					break;
				}
			}
			lockmgr(&iocom->msglk, LK_EXCLUSIVE);
			kdmsg_state_cleanuptx(msg);
		}
	}

#if 0
	kdio_printf(iocom, 1, "write thread terminating error=%d\n", error);
#endif

	/*
	 * Shutdown the socket and set KILLTX for consistency in case the
	 * shutdown was not commanded.  Signal the receive side to shutdown
	 * by setting KILLRX and waking it up.
	 */
	fp_shutdown(iocom->msg_fp, SHUT_RDWR);
	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX |
					KDMSG_CLUSTERCTL_KILLTX);
	wakeup(&iocom->msg_ctl);

	/*
	 * The transmit thread is responsible for final cleanups, wait
	 * for the receive side to terminate to prevent new received
	 * states from interfering with our cleanup.
	 *
	 * Do not set msgwr_td to NULL until we actually exit.
	 */
	while (iocom->msgrd_td) {
		wakeup(&iocom->msg_ctl);
		lksleep(iocom, &iocom->msglk, 0, "clstrkt", hz);
	}

	/*
	 * We can no longer receive new messages.  We must drain the transmit
	 * message queue and simulate received messages to close any remaining
	 * states.
	 *
	 * Loop until all the states are gone and there are no messages
	 * pending transmit.
	 */
	save_ticks = ticks;
	didwarn = 0;
	iocom->flags |= KDMSG_IOCOMF_EXITNOACC;

	while (TAILQ_FIRST(&iocom->msgq) ||
	       RB_ROOT(&iocom->staterd_tree) ||
	       RB_ROOT(&iocom->statewr_tree) ||
	       iocom->conn_state) {
		/*
		 * Simulate failure for all sub-states of state0.
		 */
		kdmsg_drain_msgq(iocom);
		kdmsg_simulate_failure(&iocom->state0, 0, DMSG_ERR_LOSTLINK);

		lksleep(iocom, &iocom->msglk, 0, "clstrtk", hz / 2);

		/*
		 * Escalating diagnostics: one warning once more than
		 * hz*2 ticks have elapsed, a second after hz*15, and
		 * past hz*60 a dump of what is still outstanding on
		 * every pass, with a longer sleep between passes.
		 *
		 * NOTE(review): the dump does not print conn_state even
		 * though the loop also waits on it.
		 */
		if ((int)(ticks - save_ticks) > hz*2 && didwarn == 0) {
			didwarn = 1;
			kdio_printf(iocom, 0,
				    "Warning, write thread on %p "
				    "still terminating\n",
				    iocom);
		}
		if ((int)(ticks - save_ticks) > hz*15 && didwarn == 1) {
			didwarn = 2;
			kdio_printf(iocom, 0,
				    "Warning, write thread on %p "
				    "still terminating\n",
				    iocom);
		}
		if ((int)(ticks - save_ticks) > hz*60) {
			kdio_printf(iocom, 0,
				    "Can't terminate: msgq %p "
				    "rd_tree %p wr_tree %p\n",
				    TAILQ_FIRST(&iocom->msgq),
				    RB_ROOT(&iocom->staterd_tree),
				    RB_ROOT(&iocom->statewr_tree));
			lksleep(iocom, &iocom->msglk, 0, "clstrtk", hz * 10);
		}
	}

	/*
	 * Exit handling is done by the write thread.
	 */
	lockmgr(&iocom->msglk, LK_RELEASE);

	/*
	 * The state trees had better be empty now
	 */
	KKASSERT(RB_EMPTY(&iocom->staterd_tree));
	KKASSERT(RB_EMPTY(&iocom->statewr_tree));
	KKASSERT(iocom->conn_state == NULL);

	if (iocom->exit_func) {
		/*
		 * iocom is invalid after we call the exit function.
		 */
		iocom->msgwr_td = NULL;
		iocom->exit_func(iocom);
	} else {
		/*
		 * iocom can be ripped out from under us once msgwr_td is
		 * set to NULL.  The wakeup is safe.
		 */
		iocom->msgwr_td = NULL;
		wakeup(iocom);
	}
	lwkt_exit();
}
612
613 /*
614 * This cleans out the pending transmit message queue, adjusting any
615 * persistent states properly in the process.
616 *
617 * Called with iocom locked.
618 */
619 void
kdmsg_drain_msgq(kdmsg_iocom_t * iocom)620 kdmsg_drain_msgq(kdmsg_iocom_t *iocom)
621 {
622 kdmsg_msg_t *msg;
623
624 /*
625 * Clean out our pending transmit queue, executing the
626 * appropriate state adjustments as if the messages were
627 * sent.
628 */
629 while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
630 TAILQ_REMOVE(&iocom->msgq, msg, qentry);
631 kdmsg_drain_msg(msg);
632 }
633 }
634
635 /*
636 * Drain one message by simulating transmission and also simulating a
637 * receive failure.
638 */
639 static void
kdmsg_drain_msg(kdmsg_msg_t * msg)640 kdmsg_drain_msg(kdmsg_msg_t *msg)
641 {
642 if (kdmsg_state_msgtx(msg)) {
643 kdmsg_msg_free(msg);
644 } else {
645 if (msg->state) {
646 kdmsg_simulate_failure(msg->state,
647 0, DMSG_ERR_LOSTLINK);
648 }
649 kdmsg_state_cleanuptx(msg);
650 }
651 }
652
653 /*
654 * Do all processing required to handle a freshly received message
655 * after its low level header has been validated.
656 *
657 * iocom is not locked.
658 */
659 static
660 int
kdmsg_msg_receive_handling(kdmsg_msg_t * msg)661 kdmsg_msg_receive_handling(kdmsg_msg_t *msg)
662 {
663 kdmsg_iocom_t *iocom = msg->state->iocom;
664 int error;
665
666 /*
667 * State machine tracking, state assignment for msg,
668 * returns error and discard status. Errors are fatal
669 * to the connection except for EALREADY which forces
670 * a discard without execution.
671 */
672 error = kdmsg_state_msgrx(msg);
673 if (msg->state->flags & KDMSG_STATE_ABORTING) {
674 kdio_printf(iocom, 5,
675 "kdmsg_state_abort(b): state %p rxcmd=%08x "
676 "txcmd=%08x msgrx error %d\n",
677 msg->state, msg->state->rxcmd,
678 msg->state->txcmd, error);
679 }
680 if (error) {
681 /*
682 * Raw protocol or connection error
683 */
684 if (msg->state->flags & KDMSG_STATE_ABORTING)
685 kdio_printf(iocom, 5,
686 "X1 state %p error %d\n",
687 msg->state, error);
688 kdmsg_msg_free(msg);
689 if (error == EALREADY)
690 error = 0;
691 } else if (msg->state && msg->state->func) {
692 /*
693 * Message related to state which already has a
694 * handling function installed for it.
695 */
696 if (msg->state->flags & KDMSG_STATE_ABORTING)
697 kdio_printf(iocom, 5,
698 "X2 state %p func %p\n",
699 msg->state, msg->state->func);
700 error = msg->state->func(msg->state, msg);
701 kdmsg_state_cleanuprx(msg);
702 } else if (iocom->flags & KDMSG_IOCOMF_AUTOANY) {
703 if (msg->state->flags & KDMSG_STATE_ABORTING)
704 kdio_printf(iocom, 5,
705 "X3 state %p\n", msg->state);
706 error = kdmsg_autorxmsg(msg);
707 kdmsg_state_cleanuprx(msg);
708 } else {
709 if (msg->state->flags & KDMSG_STATE_ABORTING)
710 kdio_printf(iocom, 5,
711 "X4 state %p\n", msg->state);
712 error = iocom->rcvmsg(msg);
713 kdmsg_state_cleanuprx(msg);
714 }
715 return error;
716 }
717
/*
 * Process state tracking for a message after reception and dequeueing,
 * prior to execution of the state callback.  The state is updated and
 * will be removed from the RBTREE if completely closed, but the state->parent
 * and subq linkage is not cleaned up until after the callback (see
 * cleanuprx()).
 *
 * msglk is not held on entry; it is acquired and released internally.
 *
 * Returns 0 on success, EALREADY when the message must be silently
 * discarded, or EINVAL on a protocol error.  On return msg->state points
 * at the matching or newly created state, or remains &iocom->state0 if
 * there is none, and msg->tcmd has been computed.
 *
 * NOTE: A message transaction can consist of several messages in either
 *	 direction.
 *
 * NOTE: The msgid is unique to the initiator, not necessarily unique for
 *	 us or for any relay or for the return direction for that matter.
 *	 That is, two sides sending a new message can use the same msgid
 *	 without colliding.
 *
 * --
 *
 * ABORT sequences work by setting the ABORT flag along with normal message
 * state.  However, ABORTs can also be sent on half-closed messages, that is
 * even if the command or reply side has already sent a DELETE, as long as
 * the message has not been fully closed it can still send an ABORT+DELETE
 * to terminate the half-closed message state.
 *
 * Since ABORT+DELETEs can race we silently discard ABORT's for message
 * state which has already been fully closed.  REPLY+ABORT+DELETEs can
 * also race, and in this situation the other side might have already
 * initiated a new unrelated command with the same message id.  Since
 * the abort has not set the CREATE flag the situation can be detected
 * and the message will also be discarded.
 *
 * Non-blocking requests can be initiated with ABORT+CREATE[+DELETE].
 * The ABORT request is essentially integrated into the command instead
 * of being sent later on.  In this situation the command implementation
 * detects that CREATE and ABORT are both set (vs ABORT alone) and can
 * special-case non-blocking operation for the command.
 *
 * NOTE!  Messages with ABORT set without CREATE or DELETE are considered
 *	  to be mid-stream aborts for command/reply sequences.  ABORTs on
 *	  one-way messages are not supported.
 *
 * NOTE!  If a command sequence does not support aborts the ABORT flag is
 *	  simply ignored.
 *
 * --
 *
 * One-off messages (no reply expected) are sent with neither CREATE nor
 * DELETE set.  One-off messages cannot be aborted and typically aren't
 * processed by these routines.  The REPLY bit can be used to distinguish
 * whether a one-off message is a command or reply.  For example, one-off
 * replies will typically just contain status updates.
 */
static
int
kdmsg_state_msgrx(kdmsg_msg_t *msg)
{
	kdmsg_iocom_t *iocom = msg->state->iocom;
	kdmsg_state_t *state;
	kdmsg_state_t *pstate;
	kdmsg_state_t sdummy;
	int error;

	bzero(&sdummy, sizeof(sdummy));	/* avoid gcc warnings */

	/*
	 * Make sure a state structure is ready to go in case we need a new
	 * one.  This is the only routine which uses freerd_state so no
	 * races are possible.
	 *
	 * NOTE(review): kdmsg_iocom_uninit() also releases freerd_state;
	 * presumably that only happens after the reader has exited.
	 */
	if ((state = iocom->freerd_state) == NULL) {
		state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
		state->flags = KDMSG_STATE_DYNAMIC;
		state->iocom = iocom;
		state->refs = 1;
		TAILQ_INIT(&state->subq);
		iocom->freerd_state = state;
	}
	state = NULL;	/* safety */

	/*
	 * Lock RB tree and locate existing persistent state, if any.
	 *
	 * If received msg is a command state is on staterd_tree.
	 * If received msg is a reply state is on statewr_tree.
	 * (The tree is selected by the DMSGF_REVTRANS bit.)
	 */
	lockmgr(&iocom->msglk, LK_EXCLUSIVE);

again:
	if (msg->state == &iocom->state0) {
		/* sdummy serves only as the lookup key */
		sdummy.msgid = msg->any.head.msgid;
		sdummy.iocom = iocom;
		if (msg->any.head.cmd & DMSGF_REVTRANS) {
			state = RB_FIND(kdmsg_state_tree, &iocom->statewr_tree,
					&sdummy);
		} else {
			state = RB_FIND(kdmsg_state_tree, &iocom->staterd_tree,
					&sdummy);
		}

		/*
		 * Set message state unconditionally.  If this is a CREATE
		 * message this state will become the parent state and new
		 * state will be allocated for the message state.
		 */
		if (state == NULL)
			state = &iocom->state0;

		/*
		 * The state is interlocked: request a signal, sleep
		 * (lksleep releases msglk while asleep) and redo the
		 * lookup from scratch.
		 */
		if (state->flags & KDMSG_STATE_INTERLOCK) {
			state->flags |= KDMSG_STATE_SIGNAL;
			lksleep(state, &iocom->msglk, 0, "dmrace", hz);
			goto again;
		}
		kdmsg_state_hold(state);
		kdmsg_state_drop(msg->state);	/* iocom->state0 */
		msg->state = state;
	} else {
		state = msg->state;
	}

	/*
	 * Short-cut one-off or mid-stream messages.
	 */
	if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
				  DMSGF_ABORT)) == 0) {
		error = 0;
		goto done;
	}

	/*
	 * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
	 * inside the case statements.
	 */
	switch(msg->any.head.cmd & (DMSGF_CREATE|DMSGF_DELETE|DMSGF_REPLY)) {
	case DMSGF_CREATE:
	case DMSGF_CREATE | DMSGF_DELETE:
		/*
		 * New persistent command received.
		 */
		if (state != &iocom->state0) {
			kdio_printf(iocom, 1, "%s\n",
				    "duplicate transaction");
			error = EINVAL;
			break;
		}

		/*
		 * Lookup the circuit.  The circuit is an open transaction.
		 * the REVCIRC bit in the message tells us which side
		 * initiated the transaction representing the circuit.
		 */
		if (msg->any.head.circuit) {
			sdummy.msgid = msg->any.head.circuit;

			if (msg->any.head.cmd & DMSGF_REVCIRC) {
				pstate = RB_FIND(kdmsg_state_tree,
						 &iocom->statewr_tree,
						 &sdummy);
			} else {
				pstate = RB_FIND(kdmsg_state_tree,
						 &iocom->staterd_tree,
						 &sdummy);
			}
			if (pstate == NULL) {
				kdio_printf(iocom, 1, "%s\n",
					    "missing parent in "
					    "stacked trans");
				error = EINVAL;
				break;
			}
		} else {
			pstate = &iocom->state0;
		}

		/*
		 * Allocate new state.
		 *
		 * msg->state becomes the owner of the ref we inherit from
		 * freerd_state.
		 */
		kdmsg_state_drop(state);
		state = iocom->freerd_state;
		iocom->freerd_state = NULL;

		msg->state = state;		/* inherits freerd ref */
		state->parent = pstate;
		KKASSERT(state->iocom == iocom);
		state->flags |= KDMSG_STATE_RBINSERTED |
				KDMSG_STATE_SUBINSERTED |
				KDMSG_STATE_OPPOSITE;
		if (TAILQ_EMPTY(&pstate->subq))
			kdmsg_state_hold(pstate);/* states on pstate->subq */
		kdmsg_state_hold(state);	/* state on pstate->subq */
		kdmsg_state_hold(state);	/* state on rbtree */
		state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
		/* DELETE is folded into rxcmd after the switch */
		state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
		state->txcmd = DMSGF_REPLY;
		state->msgid = msg->any.head.msgid;
		state->flags &= ~KDMSG_STATE_NEW;
		RB_INSERT(kdmsg_state_tree, &iocom->staterd_tree, state);
		TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
		error = 0;
		break;
	case DMSGF_DELETE:
		/*
		 * Persistent state is expected but might not exist if an
		 * ABORT+DELETE races the close.
		 */
		if (state == &iocom->state0) {
			if (msg->any.head.cmd & DMSGF_ABORT) {
				kdio_printf(iocom, 1, "%s\n",
					    "msgrx: "
					    "state already A");
				error = EALREADY;
			} else {
				kdio_printf(iocom, 1, "%s\n",
					    "msgrx: no state for DELETE");
				error = EINVAL;
			}
			break;
		}

		/*
		 * Handle another ABORT+DELETE case if the msgid has already
		 * been reused.
		 */
		if ((state->rxcmd & DMSGF_CREATE) == 0) {
			if (msg->any.head.cmd & DMSGF_ABORT) {
				kdio_printf(iocom, 1, "%s\n",
					    "msgrx: state already B");
				error = EALREADY;
			} else {
				kdio_printf(iocom, 1, "%s\n",
					    "msgrx: state reused for DELETE");
				error = EINVAL;
			}
			break;
		}
		error = 0;
		break;
	default:
		/*
		 * Check for mid-stream ABORT command received, otherwise
		 * allow.
		 */
		if (msg->any.head.cmd & DMSGF_ABORT) {
			if (state == &iocom->state0 ||
			    (state->rxcmd & DMSGF_CREATE) == 0) {
				error = EALREADY;
				break;
			}
		}
		error = 0;
		break;
	case DMSGF_REPLY | DMSGF_CREATE:
	case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
		/*
		 * When receiving a reply with CREATE set the original
		 * persistent state message should already exist.
		 */
		if (state == &iocom->state0) {
			kdio_printf(iocom, 1,
				    "msgrx: no state match for "
				    "REPLY cmd=%08x msgid=%016jx\n",
				    msg->any.head.cmd,
				    (intmax_t)msg->any.head.msgid);
			error = EINVAL;
			break;
		}
		state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
		error = 0;
		break;
	case DMSGF_REPLY | DMSGF_DELETE:
		/*
		 * Received REPLY+ABORT+DELETE in case where msgid has
		 * already been fully closed, ignore the message.
		 */
		if (state == &iocom->state0) {
			if (msg->any.head.cmd & DMSGF_ABORT) {
				error = EALREADY;
			} else {
				kdio_printf(iocom, 1, "%s\n",
					    "msgrx: no state match "
					    "for REPLY|DELETE");
				error = EINVAL;
			}
			break;
		}

		/*
		 * Received REPLY+ABORT+DELETE in case where msgid has
		 * already been reused for an unrelated message,
		 * ignore the message.
		 */
		if ((state->rxcmd & DMSGF_CREATE) == 0) {
			if (msg->any.head.cmd & DMSGF_ABORT) {
				error = EALREADY;
			} else {
				kdio_printf(iocom, 1, "%s\n",
					    "msgrx: state reused "
					    "for REPLY|DELETE");
				error = EINVAL;
			}
			break;
		}
		error = 0;
		break;
	case DMSGF_REPLY:
		/*
		 * Check for mid-stream ABORT reply received to sent command.
		 */
		if (msg->any.head.cmd & DMSGF_ABORT) {
			if (state == &iocom->state0 ||
			    (state->rxcmd & DMSGF_CREATE) == 0) {
				error = EALREADY;
				break;
			}
		}
		error = 0;
		break;
	}

	/*
	 * Calculate the easy-switch() transactional command.  Represents
	 * the outer-transaction command for any transaction-create or
	 * transaction-delete, and the inner message command for any
	 * non-transaction or inside-transaction command.  tcmd will be
	 * set to 0 if the message state is illegal.
	 *
	 * The two can be told apart because outer-transaction commands
	 * always have a DMSGF_CREATE and/or DMSGF_DELETE flag.
	 */
done:
	if (msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE)) {
		if (state != &iocom->state0) {
			msg->tcmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
				    (msg->any.head.cmd & (DMSGF_CREATE |
							  DMSGF_DELETE |
							  DMSGF_REPLY));
		} else {
			msg->tcmd = 0;
		}
	} else {
		msg->tcmd = msg->any.head.cmd & DMSGF_CMDSWMASK;
	}

	/*
	 * Adjust the state for DELETE handling now, before making the
	 * callback so we are atomic with other state updates.
	 *
	 * Subq/parent linkages are cleaned up after the callback.
	 * If an error occurred the message is ignored and state is not
	 * updated.
	 */
	if ((state = msg->state) == NULL || error != 0) {
		kdio_printf(iocom, 1,
			    "msgrx: state=%p error %d\n",
			    state, error);
	} else if (msg->any.head.cmd & DMSGF_DELETE) {
		/*
		 * NOTE(review): a second DELETE arriving on a state whose
		 * rxcmd already has DELETE set appears to reach this
		 * assertion -- confirm a remote peer cannot trigger it.
		 */
		KKASSERT((state->rxcmd & DMSGF_DELETE) == 0);
		state->rxcmd |= DMSGF_DELETE;
		if (state->txcmd & DMSGF_DELETE) {
			/*
			 * Both directions are now closed, remove the
			 * state from whichever tree it lives on.
			 */
			KKASSERT(state->flags & KDMSG_STATE_RBINSERTED);
			if (state->rxcmd & DMSGF_REPLY) {
				KKASSERT(msg->any.head.cmd &
					 DMSGF_REPLY);
				RB_REMOVE(kdmsg_state_tree,
					  &iocom->statewr_tree, state);
			} else {
				KKASSERT((msg->any.head.cmd &
					  DMSGF_REPLY) == 0);
				RB_REMOVE(kdmsg_state_tree,
					  &iocom->staterd_tree, state);
			}
			state->flags &= ~KDMSG_STATE_RBINSERTED;
			kdmsg_state_drop(state);	/* state on rbtree */
		}
	}
	lockmgr(&iocom->msglk, LK_RELEASE);

	return (error);
}
1099
1100 /*
1101 * Called instead of iocom->rcvmsg() if any of the AUTO flags are set.
1102 * This routine must call iocom->rcvmsg() for anything not automatically
1103 * handled.
1104 */
1105 static int
kdmsg_autorxmsg(kdmsg_msg_t * msg)1106 kdmsg_autorxmsg(kdmsg_msg_t *msg)
1107 {
1108 kdmsg_iocom_t *iocom = msg->state->iocom;
1109 kdmsg_msg_t *rep;
1110 int error = 0;
1111 uint32_t cmd;
1112
1113 /*
1114 * Main switch processes transaction create/delete sequences only.
1115 * Use icmd (DELETEs use DMSG_LNK_ERROR
1116 *
1117 * NOTE: If processing in-transaction messages you generally want
1118 * an inner switch on msg->any.head.cmd.
1119 */
1120 if (msg->state) {
1121 cmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
1122 (msg->any.head.cmd & (DMSGF_CREATE |
1123 DMSGF_DELETE |
1124 DMSGF_REPLY));
1125 } else {
1126 cmd = 0;
1127 }
1128
1129 switch(cmd) {
1130 case DMSG_LNK_PING:
1131 /*
1132 * Received ping, send reply
1133 */
1134 rep = kdmsg_msg_alloc(msg->state, DMSG_LNK_PING | DMSGF_REPLY,
1135 NULL, NULL);
1136 kdmsg_msg_write(rep);
1137 break;
1138 case DMSG_LNK_PING | DMSGF_REPLY:
1139 /* ignore replies */
1140 break;
1141 case DMSG_LNK_CONN | DMSGF_CREATE:
1142 case DMSG_LNK_CONN | DMSGF_CREATE | DMSGF_DELETE:
1143 /*
1144 * Received LNK_CONN transaction. Transmit response and
1145 * leave transaction open, which allows the other end to
1146 * start to the SPAN protocol.
1147 *
1148 * Handle shim after acknowledging the CONN.
1149 */
1150 if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1151 if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1152 kdmsg_msg_result(msg, 0);
1153 if (iocom->auto_callback)
1154 iocom->auto_callback(msg);
1155 } else {
1156 error = iocom->rcvmsg(msg);
1157 }
1158 break;
1159 }
1160 /* fall through */
1161 case DMSG_LNK_CONN | DMSGF_DELETE:
1162 /*
1163 * This message is usually simulated after a link is lost
1164 * to clean up the transaction.
1165 */
1166 if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1167 if (iocom->auto_callback)
1168 iocom->auto_callback(msg);
1169 kdmsg_msg_reply(msg, 0);
1170 } else {
1171 error = iocom->rcvmsg(msg);
1172 }
1173 break;
1174 case DMSG_LNK_SPAN | DMSGF_CREATE:
1175 case DMSG_LNK_SPAN | DMSGF_CREATE | DMSGF_DELETE:
1176 /*
1177 * Received LNK_SPAN transaction. We do not have to respond
1178 * (except on termination), but we must leave the transaction
1179 * open.
1180 *
1181 * Handle shim after acknowledging the SPAN.
1182 */
1183 if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1184 if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1185 if (iocom->auto_callback)
1186 iocom->auto_callback(msg);
1187 break;
1188 }
1189 /* fall through */
1190 } else {
1191 error = iocom->rcvmsg(msg);
1192 break;
1193 }
1194 /* fall through */
1195 case DMSG_LNK_SPAN | DMSGF_DELETE:
1196 /*
1197 * Process shims (auto_callback) before cleaning up the
1198 * circuit structure and closing the transactions. Device
1199 * driver should ensure that the circuit is not used after
1200 * the auto_callback() returns.
1201 *
1202 * Handle shim before closing the SPAN transaction.
1203 */
1204 if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1205 if (iocom->auto_callback)
1206 iocom->auto_callback(msg);
1207 kdmsg_msg_reply(msg, 0);
1208 } else {
1209 error = iocom->rcvmsg(msg);
1210 }
1211 break;
1212 default:
1213 /*
1214 * Anything unhandled goes into rcvmsg.
1215 *
1216 * NOTE: Replies to link-level messages initiated by our side
1217 * are handled by the state callback, they are NOT
1218 * handled here.
1219 */
1220 error = iocom->rcvmsg(msg);
1221 break;
1222 }
1223 return (error);
1224 }
1225
1226 /*
1227 * Post-receive-handling message and state cleanup. This routine is called
1228 * after the state function handling/callback to properly dispose of the
1229 * message and unlink the state's parent/subq linkage if the state is
1230 * completely closed.
1231 *
1232 * msglk is not held.
1233 */
1234 static
1235 void
kdmsg_state_cleanuprx(kdmsg_msg_t * msg)1236 kdmsg_state_cleanuprx(kdmsg_msg_t *msg)
1237 {
1238 kdmsg_state_t *state = msg->state;
1239 kdmsg_iocom_t *iocom = state->iocom;
1240
1241 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1242 if (state != &iocom->state0) {
1243 /*
1244 * When terminating a transaction (in either direction), all
1245 * sub-states are aborted.
1246 */
1247 if ((msg->any.head.cmd & DMSGF_DELETE) &&
1248 TAILQ_FIRST(&msg->state->subq)) {
1249 kdio_printf(iocom, 2,
1250 "simulate failure for substates of "
1251 "state %p cmd %08x/%08x\n",
1252 msg->state,
1253 msg->state->rxcmd,
1254 msg->state->txcmd);
1255 kdmsg_simulate_failure(msg->state,
1256 0, DMSG_ERR_LOSTLINK);
1257 }
1258
1259 /*
1260 * Once the state is fully closed we can (try to) remove it
1261 * from the subq topology.
1262 */
1263 if ((state->flags & KDMSG_STATE_SUBINSERTED) &&
1264 (state->rxcmd & DMSGF_DELETE) &&
1265 (state->txcmd & DMSGF_DELETE)) {
1266 /*
1267 * Remove parent linkage if state is completely closed.
1268 */
1269 kdmsg_subq_delete(state);
1270 }
1271 }
1272 kdmsg_msg_free(msg);
1273
1274 lockmgr(&iocom->msglk, LK_RELEASE);
1275 }
1276
1277 /*
1278 * Remove state from its parent's subq. This can wind up recursively
1279 * dropping the parent upward.
1280 *
1281 * NOTE: Once we drop the parent, our pstate pointer may become invalid.
1282 */
1283 static
1284 void
kdmsg_subq_delete(kdmsg_state_t * state)1285 kdmsg_subq_delete(kdmsg_state_t *state)
1286 {
1287 kdmsg_state_t *pstate;
1288
1289 if (state->flags & KDMSG_STATE_SUBINSERTED) {
1290 pstate = state->parent;
1291 KKASSERT(pstate);
1292 if (pstate->scan == state)
1293 pstate->scan = NULL;
1294 TAILQ_REMOVE(&pstate->subq, state, entry);
1295 state->flags &= ~KDMSG_STATE_SUBINSERTED;
1296 state->parent = NULL;
1297 if (TAILQ_EMPTY(&pstate->subq)) {
1298 kdmsg_state_drop(pstate);/* pstate->subq */
1299 }
1300 pstate = NULL; /* safety */
1301 kdmsg_state_drop(state); /* pstate->subq */
1302 } else {
1303 KKASSERT(state->parent == NULL);
1304 }
1305 }
1306
1307 /*
1308 * Simulate receiving a message which terminates an active transaction
1309 * state. Our simulated received message must set DELETE and may also
1310 * have to set CREATE. It must also ensure that all fields are set such
1311 * that the receive handling code can find the state (kdmsg_state_msgrx())
1312 * or an endless loop will ensue.
1313 *
1314 * This is used when the other end of the link is dead so the device driver
1315 * gets a completed transaction for all pending states.
1316 *
1317 * Called with iocom locked.
1318 */
1319 static
1320 void
kdmsg_simulate_failure(kdmsg_state_t * state,int meto,int error)1321 kdmsg_simulate_failure(kdmsg_state_t *state, int meto, int error)
1322 {
1323 kdmsg_state_t *substate;
1324
1325 kdmsg_state_hold(state); /* aborting */
1326
1327 /*
1328 * Abort parent state first. Parent will not actually disappear
1329 * until children are gone. Device drivers must handle the situation.
1330 * The advantage of this is that device drivers can flag the situation
1331 * as an interlock against new operations on dying states. And since
1332 * device operations are often asynchronous anyway, this sequence of
1333 * events works out better.
1334 */
1335 if (meto)
1336 kdmsg_state_abort(state);
1337
1338 /*
1339 * Recurse through any children.
1340 */
1341 again:
1342 TAILQ_FOREACH(substate, &state->subq, entry) {
1343 if (substate->flags & KDMSG_STATE_ABORTING)
1344 continue;
1345 state->scan = substate;
1346 kdmsg_simulate_failure(substate, 1, error);
1347 if (state->scan != substate)
1348 goto again;
1349 }
1350 kdmsg_state_drop(state); /* aborting */
1351 }
1352
1353 static
1354 void
kdmsg_state_abort(kdmsg_state_t * state)1355 kdmsg_state_abort(kdmsg_state_t *state)
1356 {
1357 kdmsg_msg_t *msg;
1358
1359 /*
1360 * Set ABORTING and DYING, return if already set. If the state was
1361 * just allocated we defer the abort operation until the related
1362 * message is processed.
1363 */
1364 KKASSERT((state->flags & KDMSG_STATE_ABORTING) == 0);
1365 if (state->flags & KDMSG_STATE_ABORTING)
1366 return;
1367 state->flags |= KDMSG_STATE_ABORTING;
1368 kdmsg_state_dying(state);
1369 if (state->flags & KDMSG_STATE_NEW) {
1370 kdio_printf(iocom, 5,
1371 "kdmsg_state_abort(0): state %p rxcmd %08x "
1372 "txcmd %08x flags %08x - in NEW state\n",
1373 state, state->rxcmd,
1374 state->txcmd, state->flags);
1375 return;
1376 }
1377
1378 /*
1379 * NOTE: The DELETE flag might already be set due to an early
1380 * termination.
1381 *
1382 * NOTE: Args to kdmsg_msg_alloc() to avoid dynamic state allocation.
1383 *
1384 * NOTE: We are simulating a received message using our state
1385 * (vs a message generated by the other side using its state),
1386 * so we must invert DMSGF_REVTRANS and DMSGF_REVCIRC.
1387 */
1388 kdio_printf(iocom, 5,
1389 "kdmsg_state_abort(1): state %p rxcmd %08x txcmd %08x\n",
1390 state, state->rxcmd, state->txcmd);
1391 if ((state->rxcmd & DMSGF_DELETE) == 0) {
1392 msg = kdmsg_msg_alloc(state, DMSG_LNK_ERROR, NULL, NULL);
1393 if ((state->rxcmd & DMSGF_CREATE) == 0)
1394 msg->any.head.cmd |= DMSGF_CREATE;
1395 msg->any.head.cmd |= DMSGF_DELETE |
1396 (state->rxcmd & DMSGF_REPLY);
1397 msg->any.head.cmd ^= (DMSGF_REVTRANS | DMSGF_REVCIRC);
1398 msg->any.head.error = DMSG_ERR_LOSTLINK;
1399 kdio_printf(iocom, 5,
1400 "kdmsg_state_abort(a): state %p msgcmd %08x\n",
1401 state, msg->any.head.cmd);
1402 /* circuit not initialized */
1403 lockmgr(&state->iocom->msglk, LK_RELEASE);
1404 kdmsg_msg_receive_handling(msg);
1405 lockmgr(&state->iocom->msglk, LK_EXCLUSIVE);
1406 msg = NULL;
1407 }
1408 kdio_printf(iocom, 5,
1409 "kdmsg_state_abort(2): state %p rxcmd %08x txcmd %08x\n",
1410 state, state->rxcmd, state->txcmd);
1411 }
1412
1413 /*
1414 * Recursively sets KDMSG_STATE_DYING on state and all sub-states, preventing
1415 * the transmission of any new messages on these states. This is done
1416 * atomically when parent state is terminating, whereas setting ABORTING is
1417 * not atomic and can leak races.
1418 */
1419 static
1420 void
kdmsg_state_dying(kdmsg_state_t * state)1421 kdmsg_state_dying(kdmsg_state_t *state)
1422 {
1423 kdmsg_state_t *scan;
1424
1425 if ((state->flags & KDMSG_STATE_DYING) == 0) {
1426 state->flags |= KDMSG_STATE_DYING;
1427 TAILQ_FOREACH(scan, &state->subq, entry)
1428 kdmsg_state_dying(scan);
1429 }
1430 }
1431
1432 /*
1433 * Process state tracking for a message prior to transmission.
1434 *
1435 * Called with msglk held and the msg dequeued. Returns non-zero if
1436 * the message is bad and should be deleted by the caller.
1437 *
1438 * One-off messages are usually with dummy state and msg->state may be NULL
1439 * in this situation.
1440 *
1441 * New transactions (when CREATE is set) will insert the state.
1442 *
1443 * May request that caller discard the message by setting *discardp to 1.
1444 * A NULL state may be returned in this case.
1445 */
1446 static
1447 int
kdmsg_state_msgtx(kdmsg_msg_t * msg)1448 kdmsg_state_msgtx(kdmsg_msg_t *msg)
1449 {
1450 kdmsg_iocom_t *iocom = msg->state->iocom;
1451 kdmsg_state_t *state;
1452 int error;
1453
1454 /*
1455 * Make sure a state structure is ready to go in case we need a new
1456 * one. This is the only routine which uses freewr_state so no
1457 * races are possible.
1458 */
1459 if ((state = iocom->freewr_state) == NULL) {
1460 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1461 state->flags = KDMSG_STATE_DYNAMIC;
1462 state->iocom = iocom;
1463 state->refs = 1;
1464 TAILQ_INIT(&state->subq);
1465 iocom->freewr_state = state;
1466 }
1467
1468 /*
1469 * Lock RB tree. If persistent state is present it will have already
1470 * been assigned to msg.
1471 */
1472 state = msg->state;
1473
1474 /*
1475 * Short-cut one-off or mid-stream messages (state may be NULL).
1476 */
1477 if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1478 DMSGF_ABORT)) == 0) {
1479 return(0);
1480 }
1481
1482
1483 /*
1484 * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
1485 * inside the case statements.
1486 */
1487 switch(msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1488 DMSGF_REPLY)) {
1489 case DMSGF_CREATE:
1490 case DMSGF_CREATE | DMSGF_DELETE:
1491 /*
1492 * Insert the new persistent message state and mark
1493 * half-closed if DELETE is set. Since this is a new
1494 * message it isn't possible to transition into the fully
1495 * closed state here.
1496 *
1497 * XXX state must be assigned and inserted by
1498 * kdmsg_msg_write(). txcmd is assigned by us
1499 * on-transmit.
1500 */
1501 KKASSERT(state != NULL);
1502 state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
1503 state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1504 state->rxcmd = DMSGF_REPLY;
1505 state->flags &= ~KDMSG_STATE_NEW;
1506 error = 0;
1507 break;
1508 case DMSGF_DELETE:
1509 /*
1510 * Sent ABORT+DELETE in case where msgid has already
1511 * been fully closed, ignore the message.
1512 */
1513 if (state == &iocom->state0) {
1514 if (msg->any.head.cmd & DMSGF_ABORT) {
1515 error = EALREADY;
1516 } else {
1517 kdio_printf(iocom, 1,
1518 "msgtx: no state match "
1519 "for DELETE cmd=%08x msgid=%016jx\n",
1520 msg->any.head.cmd,
1521 (intmax_t)msg->any.head.msgid);
1522 error = EINVAL;
1523 }
1524 break;
1525 }
1526
1527 /*
1528 * Sent ABORT+DELETE in case where msgid has
1529 * already been reused for an unrelated message,
1530 * ignore the message.
1531 */
1532 if ((state->txcmd & DMSGF_CREATE) == 0) {
1533 if (msg->any.head.cmd & DMSGF_ABORT) {
1534 error = EALREADY;
1535 } else {
1536 kdio_printf(iocom, 1, "%s\n",
1537 "msgtx: state reused "
1538 "for DELETE");
1539 error = EINVAL;
1540 }
1541 break;
1542 }
1543 error = 0;
1544 break;
1545 default:
1546 /*
1547 * Check for mid-stream ABORT command sent
1548 */
1549 if (msg->any.head.cmd & DMSGF_ABORT) {
1550 if (state == &state->iocom->state0 ||
1551 (state->txcmd & DMSGF_CREATE) == 0) {
1552 error = EALREADY;
1553 break;
1554 }
1555 }
1556 error = 0;
1557 break;
1558 case DMSGF_REPLY | DMSGF_CREATE:
1559 case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
1560 /*
1561 * When transmitting a reply with CREATE set the original
1562 * persistent state message should already exist.
1563 */
1564 if (state == &state->iocom->state0) {
1565 kdio_printf(iocom, 1, "%s\n",
1566 "msgtx: no state match "
1567 "for REPLY | CREATE");
1568 error = EINVAL;
1569 break;
1570 }
1571 state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1572 error = 0;
1573 break;
1574 case DMSGF_REPLY | DMSGF_DELETE:
1575 /*
1576 * When transmitting a reply with DELETE set the original
1577 * persistent state message should already exist.
1578 *
1579 * This is very similar to the REPLY|CREATE|* case except
1580 * txcmd is already stored, so we just add the DELETE flag.
1581 *
1582 * Sent REPLY+ABORT+DELETE in case where msgid has
1583 * already been fully closed, ignore the message.
1584 */
1585 if (state == &state->iocom->state0) {
1586 if (msg->any.head.cmd & DMSGF_ABORT) {
1587 error = EALREADY;
1588 } else {
1589 kdio_printf(iocom, 1, "%s\n",
1590 "msgtx: no state match "
1591 "for REPLY | DELETE");
1592 error = EINVAL;
1593 }
1594 break;
1595 }
1596
1597 /*
1598 * Sent REPLY+ABORT+DELETE in case where msgid has already
1599 * been reused for an unrelated message, ignore the message.
1600 */
1601 if ((state->txcmd & DMSGF_CREATE) == 0) {
1602 if (msg->any.head.cmd & DMSGF_ABORT) {
1603 error = EALREADY;
1604 } else {
1605 kdio_printf(iocom, 1, "%s\n",
1606 "msgtx: state reused "
1607 "for REPLY | DELETE");
1608 error = EINVAL;
1609 }
1610 break;
1611 }
1612 error = 0;
1613 break;
1614 case DMSGF_REPLY:
1615 /*
1616 * Check for mid-stream ABORT reply sent.
1617 *
1618 * One-off REPLY messages are allowed for e.g. status updates.
1619 */
1620 if (msg->any.head.cmd & DMSGF_ABORT) {
1621 if (state == &state->iocom->state0 ||
1622 (state->txcmd & DMSGF_CREATE) == 0) {
1623 error = EALREADY;
1624 break;
1625 }
1626 }
1627 error = 0;
1628 break;
1629 }
1630
1631 /*
1632 * Set interlock (XXX hack) in case the send side blocks and a
1633 * response is returned before kdmsg_state_cleanuptx() can be
1634 * run.
1635 */
1636 if (state && error == 0)
1637 state->flags |= KDMSG_STATE_INTERLOCK;
1638
1639 return (error);
1640 }
1641
1642 /*
1643 * Called with iocom locked.
1644 */
1645 static
1646 void
kdmsg_state_cleanuptx(kdmsg_msg_t * msg)1647 kdmsg_state_cleanuptx(kdmsg_msg_t *msg)
1648 {
1649 kdmsg_iocom_t *iocom = msg->state->iocom;
1650 kdmsg_state_t *state;
1651
1652 if ((state = msg->state) == NULL) {
1653 kdmsg_msg_free(msg);
1654 return;
1655 }
1656
1657 /*
1658 * Clear interlock (XXX hack) in case the send side blocks and a
1659 * response is returned in the other thread before
1660 * kdmsg_state_cleanuptx() can be run. We maintain our hold on
1661 * iocom->msglk so we can do this before completing our task.
1662 */
1663 if (state->flags & KDMSG_STATE_SIGNAL) {
1664 kdio_printf(iocom, 1, "state %p interlock!\n", state);
1665 wakeup(state);
1666 }
1667 state->flags &= ~(KDMSG_STATE_INTERLOCK | KDMSG_STATE_SIGNAL);
1668 kdmsg_state_hold(state);
1669
1670 if (msg->any.head.cmd & DMSGF_DELETE) {
1671 KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1672 state->txcmd |= DMSGF_DELETE;
1673 if (state->rxcmd & DMSGF_DELETE) {
1674 KKASSERT(state->flags & KDMSG_STATE_RBINSERTED);
1675 if (state->txcmd & DMSGF_REPLY) {
1676 KKASSERT(msg->any.head.cmd &
1677 DMSGF_REPLY);
1678 RB_REMOVE(kdmsg_state_tree,
1679 &iocom->staterd_tree, state);
1680 } else {
1681 KKASSERT((msg->any.head.cmd &
1682 DMSGF_REPLY) == 0);
1683 RB_REMOVE(kdmsg_state_tree,
1684 &iocom->statewr_tree, state);
1685 }
1686 state->flags &= ~KDMSG_STATE_RBINSERTED;
1687
1688 /*
1689 * The subq recursion is used for parent linking and
1690 * scanning the topology for aborts, we can only
1691 * remove leafs. The circuit is effectively dead now,
1692 * but topology won't be torn down until all of its
1693 * children have finished/aborted.
1694 *
1695 * This is particularly important for end-point
1696 * devices which might need to access private data
1697 * in parent states. Out of order disconnects can
1698 * occur if an end-point device is processing a
1699 * message transaction asynchronously because abort
1700 * requests are basically synchronous and it probably
1701 * isn't convenient (or possible) for the end-point
1702 * to abort an asynchronous operation.
1703 */
1704 if (TAILQ_EMPTY(&state->subq))
1705 kdmsg_subq_delete(state);
1706 kdmsg_msg_free(msg);
1707 kdmsg_state_drop(state); /* state on rbtree */
1708 } else {
1709 kdmsg_msg_free(msg);
1710 }
1711 } else {
1712 kdmsg_msg_free(msg);
1713 }
1714
1715 /*
1716 * Deferred abort after transmission.
1717 */
1718 if ((state->flags & (KDMSG_STATE_ABORTING | KDMSG_STATE_DYING)) &&
1719 (state->rxcmd & DMSGF_DELETE) == 0) {
1720 kdio_printf(iocom, 5,
1721 "kdmsg_state_cleanuptx: state=%p "
1722 "executing deferred abort\n",
1723 state);
1724 state->flags &= ~KDMSG_STATE_ABORTING;
1725 kdmsg_state_abort(state);
1726 }
1727 kdmsg_state_drop(state);
1728 }
1729
1730 static
1731 void
_kdmsg_state_hold(kdmsg_state_t * state KDMSG_DEBUG_ARGS)1732 _kdmsg_state_hold(kdmsg_state_t *state KDMSG_DEBUG_ARGS)
1733 {
1734 atomic_add_int(&state->refs, 1);
1735 #if KDMSG_DEBUG
1736 kd_printf(4, "state %p +%d\t%s:%d\n", state, state->refs, file, line);
1737 #endif
1738 }
1739
1740 static
1741 void
_kdmsg_state_drop(kdmsg_state_t * state KDMSG_DEBUG_ARGS)1742 _kdmsg_state_drop(kdmsg_state_t *state KDMSG_DEBUG_ARGS)
1743 {
1744 KKASSERT(state->refs > 0);
1745 #if KDMSG_DEBUG
1746 kd_printf(4, "state %p -%d\t%s:%d\n", state, state->refs, file, line);
1747 #endif
1748 if (atomic_fetchadd_int(&state->refs, -1) == 1)
1749 kdmsg_state_free(state);
1750 }
1751
1752 static
1753 void
kdmsg_state_free(kdmsg_state_t * state)1754 kdmsg_state_free(kdmsg_state_t *state)
1755 {
1756 kdmsg_iocom_t *iocom = state->iocom;
1757
1758 KKASSERT((state->flags & KDMSG_STATE_RBINSERTED) == 0);
1759 KKASSERT((state->flags & KDMSG_STATE_SUBINSERTED) == 0);
1760 KKASSERT(TAILQ_EMPTY(&state->subq));
1761
1762 if (state != &state->iocom->state0)
1763 kfree(state, iocom->mmsg);
1764 }
1765
1766 kdmsg_msg_t *
kdmsg_msg_alloc(kdmsg_state_t * state,uint32_t cmd,int (* func)(kdmsg_state_t *,kdmsg_msg_t *),void * data)1767 kdmsg_msg_alloc(kdmsg_state_t *state, uint32_t cmd,
1768 int (*func)(kdmsg_state_t *, kdmsg_msg_t *), void *data)
1769 {
1770 kdmsg_iocom_t *iocom = state->iocom;
1771 kdmsg_state_t *pstate;
1772 kdmsg_msg_t *msg;
1773 size_t hbytes;
1774
1775 KKASSERT(iocom != NULL);
1776 hbytes = (cmd & DMSGF_SIZE) * DMSG_ALIGN;
1777 msg = kmalloc(offsetof(struct kdmsg_msg, any) + hbytes,
1778 iocom->mmsg, M_WAITOK | M_ZERO);
1779 msg->hdr_size = hbytes;
1780
1781 if ((cmd & (DMSGF_CREATE | DMSGF_REPLY)) == DMSGF_CREATE) {
1782 /*
1783 * New transaction, requires tracking state and a unique
1784 * msgid to be allocated.
1785 *
1786 * It is possible to race a circuit failure, inherit the
1787 * parent's STATE_DYING flag to trigger an abort sequence
1788 * in the transmit path. By not inheriting ABORTING the
1789 * abort sequence can recurse.
1790 *
1791 * NOTE: The transactions has not yet been initiated so we
1792 * cannot set DMSGF_CREATE/DELETE bits in txcmd or rxcmd.
1793 * We have to properly setup DMSGF_REPLY, however.
1794 */
1795 pstate = state;
1796 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1797 TAILQ_INIT(&state->subq);
1798 state->iocom = iocom;
1799 state->parent = pstate;
1800 state->flags = KDMSG_STATE_DYNAMIC |
1801 KDMSG_STATE_NEW;
1802 state->func = func;
1803 state->any.any = data;
1804 state->msgid = (uint64_t)(uintptr_t)state;
1805 /*msg->any.head.msgid = state->msgid;XXX*/
1806
1807 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1808 if (RB_INSERT(kdmsg_state_tree, &iocom->statewr_tree, state))
1809 panic("duplicate msgid allocated");
1810 if (TAILQ_EMPTY(&pstate->subq))
1811 kdmsg_state_hold(pstate);/* pstate->subq */
1812 TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
1813 state->flags |= KDMSG_STATE_RBINSERTED |
1814 KDMSG_STATE_SUBINSERTED;
1815 state->flags |= pstate->flags & KDMSG_STATE_DYING;
1816 kdmsg_state_hold(state); /* pstate->subq */
1817 kdmsg_state_hold(state); /* state on rbtree */
1818 kdmsg_state_hold(state); /* msg->state */
1819 lockmgr(&iocom->msglk, LK_RELEASE);
1820 } else {
1821 pstate = state->parent;
1822 KKASSERT(pstate != NULL);
1823 kdmsg_state_hold(state); /* msg->state */
1824 }
1825
1826 if (state->flags & KDMSG_STATE_OPPOSITE)
1827 cmd |= DMSGF_REVTRANS;
1828 if (pstate->flags & KDMSG_STATE_OPPOSITE)
1829 cmd |= DMSGF_REVCIRC;
1830
1831 msg->any.head.magic = DMSG_HDR_MAGIC;
1832 msg->any.head.cmd = cmd;
1833 msg->any.head.msgid = state->msgid;
1834 msg->any.head.circuit = pstate->msgid;
1835 msg->state = state;
1836
1837 return (msg);
1838 }
1839
1840 void
kdmsg_msg_free(kdmsg_msg_t * msg)1841 kdmsg_msg_free(kdmsg_msg_t *msg)
1842 {
1843 kdmsg_iocom_t *iocom = msg->state->iocom;
1844 kdmsg_state_t *state;
1845
1846 if ((msg->flags & KDMSG_FLAG_AUXALLOC) &&
1847 msg->aux_data && msg->aux_size) {
1848 kfree(msg->aux_data, iocom->mmsg);
1849 msg->aux_data = NULL;
1850 msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1851 }
1852 if ((state = msg->state) != NULL) {
1853 msg->state = NULL;
1854 kdmsg_state_drop(state); /* msg->state */
1855 }
1856 msg->aux_data = NULL;
1857 msg->aux_size = 0;
1858
1859 kfree(msg, iocom->mmsg);
1860 }
1861
1862 void
kdmsg_detach_aux_data(kdmsg_msg_t * msg,kdmsg_data_t * data)1863 kdmsg_detach_aux_data(kdmsg_msg_t *msg, kdmsg_data_t *data)
1864 {
1865 if (msg->flags & KDMSG_FLAG_AUXALLOC) {
1866 data->aux_data = msg->aux_data;
1867 data->aux_size = msg->aux_size;
1868 data->iocom = msg->state->iocom;
1869 msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1870 } else {
1871 data->aux_data = NULL;
1872 data->aux_size = 0;
1873 data->iocom = msg->state->iocom;
1874 }
1875 }
1876
1877 void
kdmsg_free_aux_data(kdmsg_data_t * data)1878 kdmsg_free_aux_data(kdmsg_data_t *data)
1879 {
1880 if (data->aux_data) {
1881 kfree(data->aux_data, data->iocom->mmsg);
1882 data->aux_data = NULL;
1883 }
1884 }
1885
1886 /*
1887 * Indexed messages are stored in a red-black tree indexed by their
1888 * msgid. Only persistent messages are indexed.
1889 */
1890 int
kdmsg_state_cmp(kdmsg_state_t * state1,kdmsg_state_t * state2)1891 kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2)
1892 {
1893 if (state1->iocom < state2->iocom)
1894 return(-1);
1895 if (state1->iocom > state2->iocom)
1896 return(1);
1897 if (state1->msgid < state2->msgid)
1898 return(-1);
1899 if (state1->msgid > state2->msgid)
1900 return(1);
1901 return(0);
1902 }
1903
1904 /*
1905 * Write a message. All requisit command flags have been set.
1906 *
1907 * If msg->state is non-NULL the message is written to the existing
1908 * transaction. msgid will be set accordingly.
1909 *
1910 * If msg->state is NULL and CREATE is set new state is allocated and
1911 * (func, data) is installed. A msgid is assigned.
1912 *
1913 * If msg->state is NULL and CREATE is not set the message is assumed
1914 * to be a one-way message. The originator must assign the msgid
1915 * (or leave it 0, which is typical.
1916 *
1917 * This function merely queues the message to the management thread, it
1918 * does not write to the message socket/pipe.
1919 */
1920 void
kdmsg_msg_write(kdmsg_msg_t * msg)1921 kdmsg_msg_write(kdmsg_msg_t *msg)
1922 {
1923 kdmsg_iocom_t *iocom = msg->state->iocom;
1924
1925 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1926 kdmsg_msg_write_locked(iocom, msg);
1927 lockmgr(&iocom->msglk, LK_RELEASE);
1928 }
1929
1930 static void
kdmsg_msg_write_locked(kdmsg_iocom_t * iocom,kdmsg_msg_t * msg)1931 kdmsg_msg_write_locked(kdmsg_iocom_t *iocom, kdmsg_msg_t *msg)
1932 {
1933 kdmsg_state_t *state;
1934
1935 if (msg->state) {
1936 /*
1937 * Continuance or termination of existing transaction.
1938 * The transaction could have been initiated by either end.
1939 *
1940 * (Function callback and aux data for the receive side can
1941 * be replaced or left alone).
1942 */
1943 state = msg->state;
1944 msg->any.head.msgid = state->msgid;
1945 } else {
1946 /*
1947 * One-off message (always uses msgid 0 to distinguish
1948 * between a possibly lost in-transaction message due to
1949 * competing aborts and a real one-off message?)
1950 */
1951 state = NULL;
1952 msg->any.head.msgid = 0;
1953 }
1954
1955 /*
1956 * For stateful messages, if the circuit is dead or dying we have
1957 * to abort the potentially newly-created state and discard the
1958 * message.
1959 *
1960 * - We must discard the message because the other end will not
1961 * be expecting any more messages over the dead or dying circuit
1962 * and might not be able to receive them.
1963 *
1964 * - We abort the state by simulating a failure to generate a fake
1965 * incoming DELETE. This will trigger the state callback and allow
1966 * the device to clean things up and reply, closing the outgoing
1967 * direction and allowing the state to be freed.
1968 *
1969 * This situation occurs quite often, particularly as SPANs stabilize.
1970 * End-points must do the right thing.
1971 */
1972 if (state) {
1973 KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1974 if (state->flags & KDMSG_STATE_DYING) {
1975 #if 0
1976 if ((state->flags & KDMSG_STATE_DYING) ||
1977 (state->parent->txcmd & DMSGF_DELETE) ||
1978 (state->parent->flags & KDMSG_STATE_DYING)) {
1979 #endif
1980 kdio_printf(iocom, 4,
1981 "kdmsg_msg_write: Write to dying circuit "
1982 "state=%p "
1983 "ptxcmd=%08x prxcmd=%08x flags=%08x\n",
1984 state,
1985 state->parent->rxcmd,
1986 state->parent->txcmd,
1987 state->parent->flags);
1988 kdmsg_state_hold(state);
1989 kdmsg_state_msgtx(msg);
1990 kdmsg_state_cleanuptx(msg);
1991 kdmsg_state_drop(state);
1992 return;
1993 }
1994 }
1995
1996 /*
1997 * Finish up the msg fields. Note that msg->aux_size and the
1998 * aux_bytes stored in the message header represent the unaligned
1999 * (actual) bytes of data, but the buffer is sized to an aligned
2000 * size and the CRC is generated over the aligned length.
2001 */
2002 msg->any.head.salt = /* (random << 8) | */ (iocom->msg_seq & 255);
2003 ++iocom->msg_seq;
2004
2005 if (msg->aux_data && msg->aux_size) {
2006 uint32_t abytes = DMSG_DOALIGN(msg->aux_size);
2007
2008 msg->any.head.aux_bytes = msg->aux_size;
2009 msg->any.head.aux_crc = iscsi_crc32(msg->aux_data, abytes);
2010 }
2011 msg->any.head.hdr_crc = 0;
2012 msg->any.head.hdr_crc = iscsi_crc32(msg->any.buf, msg->hdr_size);
2013
2014 /*
2015 * If termination races new message senders we must drain the
2016 * message immediately instead of queue it.
2017 */
2018 if (iocom->flags & KDMSG_IOCOMF_EXITNOACC)
2019 kdmsg_drain_msg(msg);
2020 else
2021 TAILQ_INSERT_TAIL(&iocom->msgq, msg, qentry);
2022
2023 if (iocom->msg_ctl & KDMSG_CLUSTERCTL_SLEEPING) {
2024 atomic_clear_int(&iocom->msg_ctl,
2025 KDMSG_CLUSTERCTL_SLEEPING);
2026 wakeup(&iocom->msg_ctl);
2027 }
2028 }
2029
2030 /*
2031 * Reply to a message and terminate our side of the transaction.
2032 *
2033 * If msg->state is non-NULL we are replying to a one-way message.
2034 */
2035 void
2036 kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error)
2037 {
2038 kdmsg_state_t *state = msg->state;
2039 kdmsg_msg_t *nmsg;
2040 uint32_t cmd;
2041
2042 /*
2043 * Reply with a simple error code and terminate the transaction.
2044 */
2045 cmd = DMSG_LNK_ERROR;
2046
2047 /*
2048 * Check if our direction has even been initiated yet, set CREATE.
2049 *
2050 * Check what direction this is (command or reply direction). Note
2051 * that txcmd might not have been initiated yet.
2052 *
2053 * If our direction has already been closed we just return without
2054 * doing anything.
2055 */
2056 if (state != &state->iocom->state0) {
2057 if (state->txcmd & DMSGF_DELETE)
2058 return;
2059 if ((state->txcmd & DMSGF_CREATE) == 0)
2060 cmd |= DMSGF_CREATE;
2061 if (state->txcmd & DMSGF_REPLY)
2062 cmd |= DMSGF_REPLY;
2063 cmd |= DMSGF_DELETE;
2064 } else {
2065 if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
2066 cmd |= DMSGF_REPLY;
2067 }
2068
2069 nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2070 nmsg->any.head.error = error;
2071 kdmsg_msg_write(nmsg);
2072 }
2073
2074 /*
2075 * Reply to a message and continue our side of the transaction.
2076 *
2077 * If msg->state is non-NULL we are replying to a one-way message and this
2078 * function degenerates into the same as kdmsg_msg_reply().
2079 */
2080 void
2081 kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error)
2082 {
2083 kdmsg_state_t *state = msg->state;
2084 kdmsg_msg_t *nmsg;
2085 uint32_t cmd;
2086
2087 /*
2088 * Return a simple result code, do NOT terminate the transaction.
2089 */
2090 cmd = DMSG_LNK_ERROR;
2091
2092 /*
2093 * Check if our direction has even been initiated yet, set CREATE.
2094 *
2095 * Check what direction this is (command or reply direction). Note
2096 * that txcmd might not have been initiated yet.
2097 *
2098 * If our direction has already been closed we just return without
2099 * doing anything.
2100 */
2101 if (state != &state->iocom->state0) {
2102 if (state->txcmd & DMSGF_DELETE)
2103 return;
2104 if ((state->txcmd & DMSGF_CREATE) == 0)
2105 cmd |= DMSGF_CREATE;
2106 if (state->txcmd & DMSGF_REPLY)
2107 cmd |= DMSGF_REPLY;
2108 /* continuing transaction, do not set MSGF_DELETE */
2109 } else {
2110 if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
2111 cmd |= DMSGF_REPLY;
2112 }
2113
2114 nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2115 nmsg->any.head.error = error;
2116 kdmsg_msg_write(nmsg);
2117 }
2118
2119 /*
2120 * Reply to a message and terminate our side of the transaction.
2121 *
2122 * If msg->state is non-NULL we are replying to a one-way message.
2123 */
2124 void
2125 kdmsg_state_reply(kdmsg_state_t *state, uint32_t error)
2126 {
2127 kdmsg_msg_t *nmsg;
2128 uint32_t cmd;
2129
2130 /*
2131 * Reply with a simple error code and terminate the transaction.
2132 */
2133 cmd = DMSG_LNK_ERROR;
2134
2135 /*
2136 * Check if our direction has even been initiated yet, set CREATE.
2137 *
2138 * Check what direction this is (command or reply direction). Note
2139 * that txcmd might not have been initiated yet.
2140 *
2141 * If our direction has already been closed we just return without
2142 * doing anything.
2143 */
2144 KKASSERT(state);
2145 if (state->txcmd & DMSGF_DELETE)
2146 return;
2147 if ((state->txcmd & DMSGF_CREATE) == 0)
2148 cmd |= DMSGF_CREATE;
2149 if (state->txcmd & DMSGF_REPLY)
2150 cmd |= DMSGF_REPLY;
2151 cmd |= DMSGF_DELETE;
2152
2153 nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2154 nmsg->any.head.error = error;
2155 kdmsg_msg_write(nmsg);
2156 }
2157
2158 /*
2159 * Reply to a message and continue our side of the transaction.
2160 *
2161 * If msg->state is non-NULL we are replying to a one-way message and this
2162 * function degenerates into the same as kdmsg_msg_reply().
2163 */
2164 void
2165 kdmsg_state_result(kdmsg_state_t *state, uint32_t error)
2166 {
2167 kdmsg_msg_t *nmsg;
2168 uint32_t cmd;
2169
2170 /*
2171 * Return a simple result code, do NOT terminate the transaction.
2172 */
2173 cmd = DMSG_LNK_ERROR;
2174
2175 /*
2176 * Check if our direction has even been initiated yet, set CREATE.
2177 *
2178 * Check what direction this is (command or reply direction). Note
2179 * that txcmd might not have been initiated yet.
2180 *
2181 * If our direction has already been closed we just return without
2182 * doing anything.
2183 */
2184 KKASSERT(state);
2185 if (state->txcmd & DMSGF_DELETE)
2186 return;
2187 if ((state->txcmd & DMSGF_CREATE) == 0)
2188 cmd |= DMSGF_CREATE;
2189 if (state->txcmd & DMSGF_REPLY)
2190 cmd |= DMSGF_REPLY;
2191 /* continuing transaction, do not set MSGF_DELETE */
2192
2193 nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
2194 nmsg->any.head.error = error;
2195 kdmsg_msg_write(nmsg);
2196 }
2197