xref: /freebsd/sys/dev/hyperv/utilities/hv_snapshot.c (revision 06c3fb27)
1 /*-
2  * Copyright (c) 2016 Microsoft Corp.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice unmodified, this list of conditions, and the following
10  *    disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include <sys/param.h>
28 #include <sys/kernel.h>
29 #include <sys/conf.h>
30 #include <sys/uio.h>
31 #include <sys/bus.h>
32 #include <sys/malloc.h>
33 #include <sys/mbuf.h>
34 #include <sys/module.h>
35 #include <sys/lock.h>
36 #include <sys/taskqueue.h>
37 #include <sys/selinfo.h>
38 #include <sys/sysctl.h>
39 #include <sys/poll.h>
40 #include <sys/proc.h>
41 #include <sys/queue.h>
42 #include <sys/kthread.h>
43 #include <sys/syscallsubr.h>
44 #include <sys/sysproto.h>
45 #include <sys/un.h>
46 #include <sys/endian.h>
47 #include <sys/sema.h>
48 #include <sys/signal.h>
49 #include <sys/syslog.h>
50 #include <sys/systm.h>
51 #include <sys/mutex.h>
52 #include <sys/callout.h>
53 
54 #include <dev/hyperv/include/hyperv.h>
55 #include <dev/hyperv/utilities/hv_utilreg.h>
56 #include <dev/hyperv/utilities/vmbus_icreg.h>
57 #include <dev/hyperv/utilities/vmbus_icvar.h>
58 
59 #include "hv_snapshot.h"
60 #include "vmbus_if.h"
61 
/* VSS message protocol version offered to the host (5.0). */
#define VSS_MAJOR		5
#define VSS_MINOR		0
#define VSS_MSGVER		VMBUS_IC_VERSION(VSS_MAJOR, VSS_MINOR)

/* VSS framework (IC channel) version offered to the host (3.0). */
#define VSS_FWVER_MAJOR		3
#define VSS_FWVER		VMBUS_IC_VERSION(VSS_FWVER_MAJOR, 0)

/* How long to wait for userland to answer before failing a request back to the host. */
#define TIMEOUT_LIMIT		(15)	// seconds
/* Operation codes carried in hv_vss_hdr.operation. */
enum hv_vss_op {
	VSS_OP_CREATE = 0,
	VSS_OP_DELETE,
	VSS_OP_HOT_BACKUP,
	VSS_OP_GET_DM_INFO,
	VSS_OP_BU_COMPLETE,
	/*
	 * Following operations are only supported with IC version >= 5.0
	 */
	VSS_OP_FREEZE, /* Freeze the file systems in the VM */
	VSS_OP_THAW, /* Unfreeze the file systems */
	VSS_OP_AUTO_RECOVER,
	VSS_OP_COUNT /* Number of operations, must be last */
};
84 
85 /*
86  * Header for all VSS messages.
87  */
88 struct hv_vss_hdr {
89 	struct vmbus_icmsg_hdr	ic_hdr;
90 	uint8_t			operation;
91 	uint8_t			reserved[7];
92 } __packed;
93 
94 
95 /*
96  * Flag values for the hv_vss_check_feature. Here supports only
97  * one value.
98  */
99 #define VSS_HBU_NO_AUTO_RECOVERY		0x00000005
100 
101 struct hv_vss_check_feature {
102 	uint32_t flags;
103 } __packed;
104 
105 struct hv_vss_check_dm_info {
106 	uint32_t flags;
107 } __packed;
108 
109 struct hv_vss_msg {
110 	union {
111 		struct hv_vss_hdr vss_hdr;
112 	} hdr;
113 	union {
114 		struct hv_vss_check_feature vss_cf;
115 		struct hv_vss_check_dm_info dm_info;
116 	} body;
117 } __packed;
118 
119 struct hv_vss_req {
120 	struct hv_vss_opt_msg	opt_msg;	/* used to communicate with daemon */
121 	struct hv_vss_msg	msg;		/* used to communicate with host */
122 } __packed;
123 
/* hv_vss debug control */
static int hv_vss_log = 0;

/* Log at LOG_ERR when the hv_vss_log sysctl/tunable is > 0. */
#define	hv_vss_log_error(...)	do {				\
	if (hv_vss_log > 0)					\
		log(LOG_ERR, "hv_vss: " __VA_ARGS__);		\
} while (0)

/* Log at LOG_INFO when the hv_vss_log sysctl/tunable is > 1. */
#define	hv_vss_log_info(...) do {				\
	if (hv_vss_log > 1)					\
		log(LOG_INFO, "hv_vss: " __VA_ARGS__);		\
} while (0)

/* VMBus channel GUID identifying the Hyper-V VSS integration service. */
static const struct vmbus_ic_desc vmbus_vss_descs[] = {
	{
		.ic_guid = { .hv_guid = {
		    0x29, 0x2e, 0xfa, 0x35, 0x23, 0xea, 0x36, 0x42,
		    0x96, 0xae, 0x3a, 0x6e, 0xba, 0xcb, 0xa4,  0x40} },
		.ic_desc = "Hyper-V VSS"
	},
	VMBUS_IC_DESC_END
};

/* Printable names for the HV_VSS_* option codes, indexed by option value. */
static const char * vss_opt_name[] = {"None", "VSSCheck", "Freeze", "Thaw"};
148 
/* character device prototypes */
static d_open_t		hv_vss_dev_open;
static d_close_t	hv_vss_dev_close;
static d_poll_t		hv_vss_dev_daemon_poll;
static d_ioctl_t	hv_vss_dev_daemon_ioctl;

static d_open_t		hv_appvss_dev_open;
static d_close_t	hv_appvss_dev_close;
static d_poll_t		hv_appvss_dev_poll;
static d_ioctl_t	hv_appvss_dev_ioctl;

/* hv_vss character device structure */
/* cdev used by the VSS daemon for file-system freeze/thaw. */
static struct cdevsw hv_vss_cdevsw =
{
	.d_version	= D_VERSION,
	.d_open		= hv_vss_dev_open,
	.d_close	= hv_vss_dev_close,
	.d_poll		= hv_vss_dev_daemon_poll,
	.d_ioctl	= hv_vss_dev_daemon_ioctl,
	.d_name		= FS_VSS_DEV_NAME,
};

/* cdev used by an optional application for application-level freeze/thaw. */
static struct cdevsw hv_appvss_cdevsw =
{
	.d_version	= D_VERSION,
	.d_open		= hv_appvss_dev_open,
	.d_close	= hv_appvss_dev_close,
	.d_poll		= hv_appvss_dev_poll,
	.d_ioctl	= hv_appvss_dev_ioctl,
	.d_name		= APP_VSS_DEV_NAME,
};
180 
struct hv_vss_sc;
/*
 * Global state to track cdev
 */
struct hv_vss_dev_sc {
	/*
	 * msg was transferred from host to notify queue, and
	 * ack queue. Finally, it was recyled to free list.
	 */
	STAILQ_HEAD(, hv_vss_req_internal) 	to_notify_queue;	/* pending delivery to userland */
	STAILQ_HEAD(, hv_vss_req_internal) 	to_ack_queue;	/* delivered, awaiting userland ack */
	struct hv_vss_sc			*sc;		/* back-pointer to driver softc */
	struct proc				*proc_task;	/* process that opened this cdev */
	struct selinfo				hv_vss_selinfo;	/* poll/select wakeup state */
};
196 /*
197  * Global state to track and synchronize the transaction requests from the host.
198  * The VSS allows user to register their function to do freeze/thaw for application.
199  * VSS kernel will notify both vss daemon and user application if it is registered.
200  * The implementation state transition is illustrated by:
201  * https://clovertrail.github.io/assets/vssdot.png
202  */
203 typedef struct hv_vss_sc {
204 	struct vmbus_ic_softc			util_sc;
205 	device_t				dev;
206 
207 	struct task				task;
208 
209 	/*
210 	 * mutex is used to protect access of list/queue,
211 	 * callout in request is also used this mutex.
212 	 */
213 	struct mtx				pending_mutex;
214 	/*
215 	 * req_free_list contains all free items
216 	 */
217 	LIST_HEAD(, hv_vss_req_internal)	req_free_list;
218 
219 	/* Indicates if daemon registered with driver */
220 	boolean_t				register_done;
221 
222 	boolean_t				app_register_done;
223 
224 	/* cdev for file system freeze/thaw */
225 	struct cdev				*hv_vss_dev;
226 	/* cdev for application freeze/thaw */
227 	struct cdev				*hv_appvss_dev;
228 
229 	/* sc for app */
230 	struct hv_vss_dev_sc			app_sc;
231 	/* sc for deamon */
232 	struct hv_vss_dev_sc			daemon_sc;
233 } hv_vss_sc;
234 
/* Tracks one in-flight host request from arrival until it is acked/failed. */
typedef struct hv_vss_req_internal {
	LIST_ENTRY(hv_vss_req_internal)		link;	/* linkage on req_free_list */
	STAILQ_ENTRY(hv_vss_req_internal)	slink;	/* linkage on notify/ack queues */
	struct hv_vss_req			vss_req;

	/* Rcv buffer for communicating with the host*/
	uint8_t					*rcv_buf;
	/* Length of host message */
	uint32_t				host_msg_len;
	/* Host message id */
	uint64_t				host_msg_id;

	hv_vss_sc				*sc;

	/* timeout timer; initialized with pending_mutex (see callout_init_mtx) */
	struct callout				callout;
} hv_vss_req_internal;
251 
/*
 * Find the request with matching msgid on `queue` and unlink it.
 * Caller must hold pending_mutex.  On exit `reqp` points at the removed
 * request, or is NULL when no entry matched (STAILQ_FOREACH_SAFE leaves
 * the iterator NULL when the loop runs to completion).
 */
#define SEARCH_REMOVE_REQ_LOCKED(reqp, queue, link, tmp, id)		\
	do {								\
		STAILQ_FOREACH_SAFE(reqp, queue, link, tmp) {		\
			if (reqp->vss_req.opt_msg.msgid == id) {	\
				STAILQ_REMOVE(queue,			\
				    reqp, hv_vss_req_internal, link);	\
				break;					\
			}						\
		}							\
	} while (0)
262 
263 static bool
264 hv_vss_is_daemon_killed_after_launch(hv_vss_sc *sc)
265 {
266 	return (!sc->register_done && sc->daemon_sc.proc_task);
267 }
268 
269 /*
270  * Callback routine that gets called whenever there is a message from host
271  */
272 static void
273 hv_vss_callback(struct vmbus_channel *chan __unused, void *context)
274 {
275 	hv_vss_sc *sc = (hv_vss_sc*)context;
276 	if (hv_vss_is_daemon_killed_after_launch(sc))
277 		hv_vss_log_info("%s: daemon was killed!\n", __func__);
278 	if (sc->register_done || sc->daemon_sc.proc_task) {
279 		hv_vss_log_info("%s: Queuing work item\n", __func__);
280 		if (hv_vss_is_daemon_killed_after_launch(sc))
281 			hv_vss_log_info("%s: daemon was killed!\n", __func__);
282 		taskqueue_enqueue(taskqueue_thread, &sc->task);
283 	} else {
284 		hv_vss_log_info("%s: daemon has never been registered\n", __func__);
285 	}
286 	hv_vss_log_info("%s: received msg from host\n", __func__);
287 }
288 /*
289  * Send the response back to the host.
290  */
291 static void
292 hv_vss_respond_host(uint8_t *rcv_buf, struct vmbus_channel *ch,
293     uint32_t recvlen, uint64_t requestid, uint32_t error)
294 {
295 	struct vmbus_icmsg_hdr *hv_icmsg_hdrp;
296 
297 	hv_icmsg_hdrp = (struct vmbus_icmsg_hdr *)rcv_buf;
298 
299 	hv_icmsg_hdrp->ic_status = error;
300 	hv_icmsg_hdrp->ic_flags = HV_ICMSGHDRFLAG_TRANSACTION | HV_ICMSGHDRFLAG_RESPONSE;
301 
302 	error = vmbus_chan_send(ch, VMBUS_CHANPKT_TYPE_INBAND, 0,
303 	    rcv_buf, recvlen, requestid);
304 	if (error)
305 		hv_vss_log_info("%s: hv_vss_respond_host: sendpacket error:%d\n",
306 		    __func__, error);
307 }
308 
309 static void
310 hv_vss_notify_host_result_locked(struct hv_vss_req_internal *reqp, uint32_t status)
311 {
312 	struct hv_vss_msg* msg = (struct hv_vss_msg *)reqp->rcv_buf;
313 	hv_vss_sc *sc = reqp->sc;
314 	if (reqp->vss_req.opt_msg.opt == HV_VSS_CHECK) {
315 		msg->body.vss_cf.flags = VSS_HBU_NO_AUTO_RECOVERY;
316 	}
317 	hv_vss_log_info("%s, %s response %s to host\n", __func__,
318 	    vss_opt_name[reqp->vss_req.opt_msg.opt],
319 	    status == HV_S_OK ? "Success" : "Fail");
320 	hv_vss_respond_host(reqp->rcv_buf, vmbus_get_channel(reqp->sc->dev),
321 	    reqp->host_msg_len, reqp->host_msg_id, status);
322 	/* recycle the request */
323 	LIST_INSERT_HEAD(&sc->req_free_list, reqp, link);
324 }
325 
326 static void
327 hv_vss_notify_host_result(struct hv_vss_req_internal *reqp, uint32_t status)
328 {
329 	mtx_lock(&reqp->sc->pending_mutex);
330 	hv_vss_notify_host_result_locked(reqp, status);
331 	mtx_unlock(&reqp->sc->pending_mutex);
332 }
333 
334 static void
335 hv_vss_cp_vssreq_to_user(struct hv_vss_req_internal *reqp,
336     struct hv_vss_opt_msg *userdata)
337 {
338 	struct hv_vss_req *hv_vss_dev_buf;
339 	hv_vss_dev_buf = &reqp->vss_req;
340 	hv_vss_dev_buf->opt_msg.opt = HV_VSS_NONE;
341 	switch (reqp->vss_req.msg.hdr.vss_hdr.operation) {
342 	case VSS_OP_FREEZE:
343 		hv_vss_dev_buf->opt_msg.opt = HV_VSS_FREEZE;
344 		break;
345 	case VSS_OP_THAW:
346 		hv_vss_dev_buf->opt_msg.opt = HV_VSS_THAW;
347 		break;
348 	case VSS_OP_HOT_BACKUP:
349 		hv_vss_dev_buf->opt_msg.opt = HV_VSS_CHECK;
350 		break;
351 	}
352 	*userdata = hv_vss_dev_buf->opt_msg;
353 	hv_vss_log_info("%s, read data from user for "
354 	    "%s (%ju) \n", __func__, vss_opt_name[userdata->opt],
355 	    (uintmax_t)userdata->msgid);
356 }
357 
358 /**
359  * Remove the request id from app notifiy or ack queue,
360  * and recyle the request by inserting it to free list.
361  *
362  * When app was notified but not yet sending ack, the request
363  * should locate in either notify queue or ack queue.
364  */
365 static struct hv_vss_req_internal*
366 hv_vss_drain_req_queue_locked(hv_vss_sc *sc, uint64_t req_id)
367 {
368 	struct hv_vss_req_internal *reqp, *tmp;
369 	SEARCH_REMOVE_REQ_LOCKED(reqp, &sc->daemon_sc.to_notify_queue,
370 	    slink, tmp, req_id);
371 	if (reqp == NULL)
372 		SEARCH_REMOVE_REQ_LOCKED(reqp, &sc->daemon_sc.to_ack_queue,
373 		    slink, tmp, req_id);
374 	if (reqp == NULL)
375 		SEARCH_REMOVE_REQ_LOCKED(reqp, &sc->app_sc.to_notify_queue,
376 		    slink, tmp, req_id);
377 	if (reqp == NULL)
378 		SEARCH_REMOVE_REQ_LOCKED(reqp, &sc->app_sc.to_ack_queue, slink,
379 		    tmp, req_id);
380 	return (reqp);
381 }
382 /**
383  * Actions for daemon who has been notified.
384  */
385 static void
386 hv_vss_notified(struct hv_vss_dev_sc *dev_sc, struct hv_vss_opt_msg *userdata)
387 {
388 	struct hv_vss_req_internal *reqp;
389 	mtx_lock(&dev_sc->sc->pending_mutex);
390 	if (!STAILQ_EMPTY(&dev_sc->to_notify_queue)) {
391 		reqp = STAILQ_FIRST(&dev_sc->to_notify_queue);
392 		hv_vss_cp_vssreq_to_user(reqp, userdata);
393 		STAILQ_REMOVE_HEAD(&dev_sc->to_notify_queue, slink);
394 		/* insert the msg to queue for write */
395 		STAILQ_INSERT_TAIL(&dev_sc->to_ack_queue, reqp, slink);
396 		userdata->status = VSS_SUCCESS;
397 	} else {
398 		/* Timeout occur, thus request was removed from queue. */
399 		hv_vss_log_info("%s: notify queue is empty!\n", __func__);
400 		userdata->status = VSS_FAIL;
401 	}
402 	mtx_unlock(&dev_sc->sc->pending_mutex);
403 }
404 
405 static void
406 hv_vss_notify(struct hv_vss_dev_sc *dev_sc, struct hv_vss_req_internal *reqp)
407 {
408 	uint32_t opt = reqp->vss_req.opt_msg.opt;
409 	mtx_lock(&dev_sc->sc->pending_mutex);
410 	STAILQ_INSERT_TAIL(&dev_sc->to_notify_queue, reqp, slink);
411 	hv_vss_log_info("%s: issuing query %s (%ju) to %s\n", __func__,
412 	    vss_opt_name[opt], (uintmax_t)reqp->vss_req.opt_msg.msgid,
413 	    &dev_sc->sc->app_sc == dev_sc ? "app" : "daemon");
414 	mtx_unlock(&dev_sc->sc->pending_mutex);
415 	selwakeup(&dev_sc->hv_vss_selinfo);
416 }
417 
418 /**
419  * Actions for daemon who has acknowledged.
420  */
421 static void
422 hv_vss_daemon_acked(struct hv_vss_dev_sc *dev_sc, struct hv_vss_opt_msg *userdata)
423 {
424 	struct hv_vss_req_internal	*reqp, *tmp;
425 	uint64_t			req_id;
426 	int				opt;
427 	uint32_t			status;
428 
429 	opt = userdata->opt;
430 	req_id = userdata->msgid;
431 	status = userdata->status;
432 	/* make sure the reserved fields are all zeros. */
433 	memset(&userdata->reserved, 0, sizeof(struct hv_vss_opt_msg) -
434 	    __offsetof(struct hv_vss_opt_msg, reserved));
435 	mtx_lock(&dev_sc->sc->pending_mutex);
436 	SEARCH_REMOVE_REQ_LOCKED(reqp, &dev_sc->to_ack_queue, slink, tmp, req_id);
437 	mtx_unlock(&dev_sc->sc->pending_mutex);
438 	if (reqp == NULL) {
439 		hv_vss_log_info("%s Timeout: fail to find daemon ack request\n",
440 		    __func__);
441 		userdata->status = VSS_FAIL;
442 		return;
443 	}
444 	KASSERT(opt == reqp->vss_req.opt_msg.opt, ("Mismatched VSS operation!"));
445 	hv_vss_log_info("%s, get response %d from daemon for %s (%ju) \n", __func__,
446 	    status, vss_opt_name[opt], (uintmax_t)req_id);
447 	switch (opt) {
448 	case HV_VSS_CHECK:
449 	case HV_VSS_FREEZE:
450 		callout_drain(&reqp->callout);
451 		hv_vss_notify_host_result(reqp,
452 		    status == VSS_SUCCESS ? HV_S_OK : HV_E_FAIL);
453 		break;
454 	case HV_VSS_THAW:
455 		if (dev_sc->sc->app_register_done) {
456 			if (status == VSS_SUCCESS) {
457 				hv_vss_notify(&dev_sc->sc->app_sc, reqp);
458 			} else {
459 				/* handle error */
460 				callout_drain(&reqp->callout);
461 				hv_vss_notify_host_result(reqp, HV_E_FAIL);
462 			}
463 		} else {
464 			callout_drain(&reqp->callout);
465 			hv_vss_notify_host_result(reqp,
466 			    status == VSS_SUCCESS ? HV_S_OK : HV_E_FAIL);
467 		}
468 		break;
469 	}
470 }
471 
472 /**
473  * Actions for app who has acknowledged.
474  */
475 static void
476 hv_vss_app_acked(struct hv_vss_dev_sc *dev_sc, struct hv_vss_opt_msg *userdata)
477 {
478 	struct hv_vss_req_internal	*reqp, *tmp;
479 	uint64_t			req_id;
480 	int				opt;
481 	uint8_t				status;
482 
483 	opt = userdata->opt;
484 	req_id = userdata->msgid;
485 	status = userdata->status;
486 	/* make sure the reserved fields are all zeros. */
487 	memset(&userdata->reserved, 0, sizeof(struct hv_vss_opt_msg) -
488 	    __offsetof(struct hv_vss_opt_msg, reserved));
489 	mtx_lock(&dev_sc->sc->pending_mutex);
490 	SEARCH_REMOVE_REQ_LOCKED(reqp, &dev_sc->to_ack_queue, slink, tmp, req_id);
491 	mtx_unlock(&dev_sc->sc->pending_mutex);
492 	if (reqp == NULL) {
493 		hv_vss_log_info("%s Timeout: fail to find app ack request\n",
494 		    __func__);
495 		userdata->status = VSS_FAIL;
496 		return;
497 	}
498 	KASSERT(opt == reqp->vss_req.opt_msg.opt, ("Mismatched VSS operation!"));
499 	hv_vss_log_info("%s, get response %d from app for %s (%ju) \n",
500 	    __func__, status, vss_opt_name[opt], (uintmax_t)req_id);
501 	if (dev_sc->sc->register_done) {
502 		switch (opt) {
503 		case HV_VSS_CHECK:
504 		case HV_VSS_FREEZE:
505 			if (status == VSS_SUCCESS) {
506 				hv_vss_notify(&dev_sc->sc->daemon_sc, reqp);
507 			} else {
508 				/* handle error */
509 				callout_drain(&reqp->callout);
510 				hv_vss_notify_host_result(reqp, HV_E_FAIL);
511 			}
512 			break;
513 		case HV_VSS_THAW:
514 			callout_drain(&reqp->callout);
515 			hv_vss_notify_host_result(reqp,
516 			    status == VSS_SUCCESS ? HV_S_OK : HV_E_FAIL);
517 			break;
518 		}
519 	} else {
520 		hv_vss_log_info("%s, Fatal: vss daemon was killed\n", __func__);
521 	}
522 }
523 
524 static int
525 hv_vss_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
526 {
527 	struct proc     *td_proc;
528 	td_proc = td->td_proc;
529 
530 	struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
531 	hv_vss_log_info("%s: %s opens device \"%s\" successfully.\n",
532 	    __func__, td_proc->p_comm, FS_VSS_DEV_NAME);
533 
534 	if (dev_sc->sc->register_done)
535 		return (EBUSY);
536 
537 	dev_sc->sc->register_done = true;
538 	hv_vss_callback(vmbus_get_channel(dev_sc->sc->dev), dev_sc->sc);
539 
540 	dev_sc->proc_task = curproc;
541 	return (0);
542 }
543 
544 static int
545 hv_vss_dev_close(struct cdev *dev, int fflag __unused, int devtype __unused,
546 				 struct thread *td)
547 {
548 	struct proc     *td_proc;
549 	td_proc = td->td_proc;
550 
551 	struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
552 
553 	hv_vss_log_info("%s: %s closes device \"%s\"\n",
554 	    __func__, td_proc->p_comm, FS_VSS_DEV_NAME);
555 	dev_sc->sc->register_done = false;
556 	return (0);
557 }
558 
559 static int
560 hv_vss_dev_daemon_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag,
561     struct thread *td)
562 {
563 	struct proc			*td_proc;
564 	struct hv_vss_dev_sc		*sc;
565 
566 	td_proc = td->td_proc;
567 	sc = (struct hv_vss_dev_sc*)dev->si_drv1;
568 
569 	hv_vss_log_info("%s: %s invoked vss ioctl\n", __func__, td_proc->p_comm);
570 
571 	struct hv_vss_opt_msg* userdata = (struct hv_vss_opt_msg*)data;
572 	switch(cmd) {
573 	case IOCHVVSSREAD:
574 		hv_vss_notified(sc, userdata);
575 		break;
576 	case IOCHVVSSWRITE:
577 		hv_vss_daemon_acked(sc, userdata);
578 		break;
579 	}
580 	return (0);
581 }
582 
583 /*
584  * hv_vss_daemon poll invokes this function to check if data is available
585  * for daemon to read.
586  */
587 static int
588 hv_vss_dev_daemon_poll(struct cdev *dev, int events, struct thread *td)
589 {
590 	int revent = 0;
591 	struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
592 
593 	mtx_lock(&dev_sc->sc->pending_mutex);
594 	/**
595 	 * if there is data ready, inform daemon's poll
596 	 */
597 	if (!STAILQ_EMPTY(&dev_sc->to_notify_queue))
598 		revent = POLLIN;
599 	if (revent == 0)
600 		selrecord(td, &dev_sc->hv_vss_selinfo);
601 	hv_vss_log_info("%s return 0x%x\n", __func__, revent);
602 	mtx_unlock(&dev_sc->sc->pending_mutex);
603 	return (revent);
604 }
605 
606 static int
607 hv_appvss_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
608 {
609 	struct proc     *td_proc;
610 	td_proc = td->td_proc;
611 
612 	struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
613 	hv_vss_log_info("%s: %s opens device \"%s\" successfully.\n",
614 	    __func__, td_proc->p_comm, APP_VSS_DEV_NAME);
615 
616 	if (dev_sc->sc->app_register_done)
617 		return (EBUSY);
618 
619 	dev_sc->sc->app_register_done = true;
620 	dev_sc->proc_task = curproc;
621 	return (0);
622 }
623 
624 static int
625 hv_appvss_dev_close(struct cdev *dev, int fflag __unused, int devtype __unused,
626 				 struct thread *td)
627 {
628 	struct proc     *td_proc;
629 	td_proc = td->td_proc;
630 
631 	struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
632 
633 	hv_vss_log_info("%s: %s closes device \"%s\".\n",
634 	    __func__, td_proc->p_comm, APP_VSS_DEV_NAME);
635 	dev_sc->sc->app_register_done = false;
636 	return (0);
637 }
638 
639 static int
640 hv_appvss_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag,
641     struct thread *td)
642 {
643 	struct proc			*td_proc;
644 	struct hv_vss_dev_sc		*dev_sc;
645 
646 	td_proc = td->td_proc;
647 	dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
648 
649 	hv_vss_log_info("%s: %s invoked vss ioctl\n", __func__, td_proc->p_comm);
650 
651 	struct hv_vss_opt_msg* userdata = (struct hv_vss_opt_msg*)data;
652 	switch(cmd) {
653 	case IOCHVVSSREAD:
654 		hv_vss_notified(dev_sc, userdata);
655 		break;
656 	case IOCHVVSSWRITE:
657 		hv_vss_app_acked(dev_sc, userdata);
658 		break;
659 	}
660 	return (0);
661 }
662 
663 /*
664  * hv_vss_daemon poll invokes this function to check if data is available
665  * for daemon to read.
666  */
667 static int
668 hv_appvss_dev_poll(struct cdev *dev, int events, struct thread *td)
669 {
670 	int revent = 0;
671 	struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
672 
673 	mtx_lock(&dev_sc->sc->pending_mutex);
674 	/**
675 	 * if there is data ready, inform daemon's poll
676 	 */
677 	if (!STAILQ_EMPTY(&dev_sc->to_notify_queue))
678 		revent = POLLIN;
679 	if (revent == 0)
680 		selrecord(td, &dev_sc->hv_vss_selinfo);
681 	hv_vss_log_info("%s return 0x%x\n", __func__, revent);
682 	mtx_unlock(&dev_sc->sc->pending_mutex);
683 	return (revent);
684 }
685 
/*
 * Callout handler: userland did not answer within TIMEOUT_LIMIT.  Runs with
 * pending_mutex held (requests use callout_init_mtx with that mutex); pulls
 * the request off whichever notify/ack queue it sits on and fails it back
 * to the host.
 */
static void
hv_vss_timeout(void *arg)
{
	hv_vss_req_internal *reqp = arg;
	hv_vss_req_internal *request __diagused;
	hv_vss_sc* sc = reqp->sc;
	uint64_t req_id = reqp->vss_req.opt_msg.msgid;
	/* This thread is locked */
	KASSERT(mtx_owned(&sc->pending_mutex), ("mutex lock is not owned!"));
	request = hv_vss_drain_req_queue_locked(sc, req_id);
	KASSERT(request != NULL, ("timeout but fail to find request"));
	hv_vss_notify_host_result_locked(reqp, HV_E_FAIL);
}
699 
700 /*
701  * This routine is called whenever a message is received from the host
702  */
703 static void
704 hv_vss_init_req(hv_vss_req_internal *reqp,
705     uint32_t recvlen, uint64_t requestid, uint8_t *vss_buf, hv_vss_sc *sc)
706 {
707 	struct timespec vm_ts;
708 	struct hv_vss_msg* msg = (struct hv_vss_msg *)vss_buf;
709 
710 	memset(reqp, 0, __offsetof(hv_vss_req_internal, callout));
711 	reqp->host_msg_len = recvlen;
712 	reqp->host_msg_id = requestid;
713 	reqp->rcv_buf = vss_buf;
714 	reqp->sc = sc;
715 	memcpy(&reqp->vss_req.msg,
716 	    (struct hv_vss_msg *)vss_buf, sizeof(struct hv_vss_msg));
717 	/* set the opt for users */
718 	switch (msg->hdr.vss_hdr.operation) {
719 	case VSS_OP_FREEZE:
720 		reqp->vss_req.opt_msg.opt = HV_VSS_FREEZE;
721 		break;
722 	case VSS_OP_THAW:
723 		reqp->vss_req.opt_msg.opt = HV_VSS_THAW;
724 		break;
725 	case VSS_OP_HOT_BACKUP:
726 		reqp->vss_req.opt_msg.opt = HV_VSS_CHECK;
727 		break;
728 	}
729 	/* Use a timestamp as msg request ID */
730 	nanotime(&vm_ts);
731 	reqp->vss_req.opt_msg.msgid = (vm_ts.tv_sec * NANOSEC) + vm_ts.tv_nsec;
732 }
733 
734 static hv_vss_req_internal*
735 hv_vss_get_new_req_locked(hv_vss_sc *sc)
736 {
737 	hv_vss_req_internal *reqp;
738 	if (!STAILQ_EMPTY(&sc->daemon_sc.to_notify_queue) ||
739 	    !STAILQ_EMPTY(&sc->daemon_sc.to_ack_queue) ||
740 	    !STAILQ_EMPTY(&sc->app_sc.to_notify_queue) ||
741 	    !STAILQ_EMPTY(&sc->app_sc.to_ack_queue)) {
742 		/*
743 		 * There is request coming from host before
744 		 * finishing previous requests
745 		 */
746 		hv_vss_log_info("%s: Warning: there is new request "
747 		    "coming before finishing previous requests\n", __func__);
748 		return (NULL);
749 	}
750 	if (LIST_EMPTY(&sc->req_free_list)) {
751 		/* TODO Error: no buffer */
752 		hv_vss_log_info("Error: No buffer\n");
753 		return (NULL);
754 	}
755 	reqp = LIST_FIRST(&sc->req_free_list);
756 	LIST_REMOVE(reqp, link);
757 	return (reqp);
758 }
759 
760 static void
761 hv_vss_start_notify(hv_vss_req_internal *reqp, uint32_t opt)
762 {
763 	hv_vss_sc *sc = reqp->sc;
764 	/*
765 	 * Freeze/Check notification sequence: kernel -> app -> daemon(fs)
766 	 * Thaw notification sequence:         kernel -> daemon(fs) -> app
767 	 *
768 	 * We should wake up the daemon, in case it's doing poll().
769 	 * The response should be received after 5s, otherwise, trigger timeout.
770 	 */
771 	switch (opt) {
772 	case VSS_OP_FREEZE:
773 	case VSS_OP_HOT_BACKUP:
774 		if (sc->app_register_done)
775 			hv_vss_notify(&sc->app_sc, reqp);
776 		else
777 			hv_vss_notify(&sc->daemon_sc, reqp);
778 		callout_reset(&reqp->callout, TIMEOUT_LIMIT * hz,
779 		    hv_vss_timeout, reqp);
780 		break;
781 	case VSS_OP_THAW:
782 		hv_vss_notify(&sc->daemon_sc, reqp);
783 		callout_reset(&reqp->callout, TIMEOUT_LIMIT * hz,
784 		    hv_vss_timeout, reqp);
785 		break;
786 	}
787 }
788 
789 /*
790  * Function to read the vss request buffer from host
791  * and interact with daemon
792  */
793 static void
794 hv_vss_process_request(void *context, int pending __unused)
795 {
796 	uint8_t *vss_buf;
797 	struct vmbus_channel *channel;
798 	uint32_t recvlen = 0;
799 	uint64_t requestid;
800 	struct vmbus_icmsg_hdr *icmsghdrp;
801 	int ret = 0;
802 	hv_vss_sc *sc;
803 	hv_vss_req_internal *reqp;
804 
805 	hv_vss_log_info("%s: entering hv_vss_process_request\n", __func__);
806 
807 	sc = (hv_vss_sc*)context;
808 	vss_buf = sc->util_sc.ic_buf;
809 	channel = vmbus_get_channel(sc->dev);
810 
811 	recvlen = sc->util_sc.ic_buflen;
812 	ret = vmbus_chan_recv(channel, vss_buf, &recvlen, &requestid);
813 	KASSERT(ret != ENOBUFS, ("hvvss recvbuf is not large enough"));
814 	/* XXX check recvlen to make sure that it contains enough data */
815 
816 	while ((ret == 0) && (recvlen > 0)) {
817 		icmsghdrp = (struct vmbus_icmsg_hdr *)vss_buf;
818 
819 		if (icmsghdrp->ic_type == HV_ICMSGTYPE_NEGOTIATE) {
820 			ret = vmbus_ic_negomsg(&sc->util_sc, vss_buf,
821 			    &recvlen, VSS_FWVER, VSS_MSGVER);
822 			hv_vss_respond_host(vss_buf, vmbus_get_channel(sc->dev),
823 			    recvlen, requestid, ret);
824 			hv_vss_log_info("%s: version negotiated\n", __func__);
825 		} else if (!hv_vss_is_daemon_killed_after_launch(sc)) {
826 			struct hv_vss_msg* msg = (struct hv_vss_msg *)vss_buf;
827 			switch(msg->hdr.vss_hdr.operation) {
828 			case VSS_OP_FREEZE:
829 			case VSS_OP_THAW:
830 			case VSS_OP_HOT_BACKUP:
831 				mtx_lock(&sc->pending_mutex);
832 				reqp = hv_vss_get_new_req_locked(sc);
833 				mtx_unlock(&sc->pending_mutex);
834 				if (reqp == NULL) {
835 					/* ignore this request from host */
836 					break;
837 				}
838 				hv_vss_init_req(reqp, recvlen, requestid, vss_buf, sc);
839 				hv_vss_log_info("%s: receive %s (%ju) from host\n",
840 				    __func__,
841 				    vss_opt_name[reqp->vss_req.opt_msg.opt],
842 				    (uintmax_t)reqp->vss_req.opt_msg.msgid);
843 				hv_vss_start_notify(reqp, msg->hdr.vss_hdr.operation);
844 				break;
845 			case VSS_OP_GET_DM_INFO:
846 				hv_vss_log_info("%s: receive GET_DM_INFO from host\n",
847 				    __func__);
848 				msg->body.dm_info.flags = 0;
849 				hv_vss_respond_host(vss_buf, vmbus_get_channel(sc->dev),
850 				    recvlen, requestid, HV_S_OK);
851 				break;
852 			default:
853 				device_printf(sc->dev, "Unknown opt from host: %d\n",
854 				    msg->hdr.vss_hdr.operation);
855 				break;
856 			}
857 		} else {
858 			/* daemon was killed for some reason after it was launched */
859 			struct hv_vss_msg* msg = (struct hv_vss_msg *)vss_buf;
860 			switch(msg->hdr.vss_hdr.operation) {
861 			case VSS_OP_FREEZE:
862 				hv_vss_log_info("%s: response fail for FREEZE\n",
863 				    __func__);
864 				break;
865 			case VSS_OP_THAW:
866 				hv_vss_log_info("%s: response fail for THAW\n",
867 				    __func__);
868 				break;
869 			case VSS_OP_HOT_BACKUP:
870 				hv_vss_log_info("%s: response fail for HOT_BACKUP\n",
871 				    __func__);
872 				msg->body.vss_cf.flags = VSS_HBU_NO_AUTO_RECOVERY;
873 				break;
874 			case VSS_OP_GET_DM_INFO:
875 				hv_vss_log_info("%s: response fail for GET_DM_INFO\n",
876 				    __func__);
877 				msg->body.dm_info.flags = 0;
878 				break;
879 			default:
880 				device_printf(sc->dev, "Unknown opt from host: %d\n",
881 				    msg->hdr.vss_hdr.operation);
882 				break;
883 			}
884 			hv_vss_respond_host(vss_buf, vmbus_get_channel(sc->dev),
885 			    recvlen, requestid, HV_E_FAIL);
886 		}
887 		/*
888 		 * Try reading next buffer
889 		 */
890 		recvlen = sc->util_sc.ic_buflen;
891 		ret = vmbus_chan_recv(channel, vss_buf, &recvlen, &requestid);
892 		KASSERT(ret != ENOBUFS, ("hvvss recvbuf is not large enough"));
893 		/* XXX check recvlen to make sure that it contains enough data */
894 
895 		hv_vss_log_info("%s: read: context %p, ret =%d, recvlen=%d\n",
896 		    __func__, context, ret, recvlen);
897 	}
898 }
899 
/* Match this driver against the Hyper-V VSS integration-service channel GUID. */
static int
hv_vss_probe(device_t dev)
{
	return (vmbus_ic_probe(dev, vmbus_vss_descs));
}
905 
906 static int
907 hv_vss_init_send_receive_queue(device_t dev)
908 {
909 	hv_vss_sc *sc = (hv_vss_sc*)device_get_softc(dev);
910 	int i;
911 	const int max_list = 4; /* It is big enough for the list */
912 	struct hv_vss_req_internal* reqp;
913 
914 	LIST_INIT(&sc->req_free_list);
915 	STAILQ_INIT(&sc->daemon_sc.to_notify_queue);
916 	STAILQ_INIT(&sc->daemon_sc.to_ack_queue);
917 	STAILQ_INIT(&sc->app_sc.to_notify_queue);
918 	STAILQ_INIT(&sc->app_sc.to_ack_queue);
919 
920 	for (i = 0; i < max_list; i++) {
921 		reqp = malloc(sizeof(struct hv_vss_req_internal),
922 		    M_DEVBUF, M_WAITOK|M_ZERO);
923 		LIST_INSERT_HEAD(&sc->req_free_list, reqp, link);
924 		callout_init_mtx(&reqp->callout, &sc->pending_mutex, 0);
925 	}
926 	return (0);
927 }
928 
929 static int
930 hv_vss_destroy_send_receive_queue(device_t dev)
931 {
932 	hv_vss_sc *sc = (hv_vss_sc*)device_get_softc(dev);
933 	hv_vss_req_internal* reqp;
934 
935 	while (!LIST_EMPTY(&sc->req_free_list)) {
936 		reqp = LIST_FIRST(&sc->req_free_list);
937 		LIST_REMOVE(reqp, link);
938 		free(reqp, M_DEVBUF);
939 	}
940 
941 	while (!STAILQ_EMPTY(&sc->daemon_sc.to_notify_queue)) {
942 		reqp = STAILQ_FIRST(&sc->daemon_sc.to_notify_queue);
943 		STAILQ_REMOVE_HEAD(&sc->daemon_sc.to_notify_queue, slink);
944 		free(reqp, M_DEVBUF);
945 	}
946 
947 	while (!STAILQ_EMPTY(&sc->daemon_sc.to_ack_queue)) {
948 		reqp = STAILQ_FIRST(&sc->daemon_sc.to_ack_queue);
949 		STAILQ_REMOVE_HEAD(&sc->daemon_sc.to_ack_queue, slink);
950 		free(reqp, M_DEVBUF);
951 	}
952 
953 	while (!STAILQ_EMPTY(&sc->app_sc.to_notify_queue)) {
954 		reqp = STAILQ_FIRST(&sc->app_sc.to_notify_queue);
955 		STAILQ_REMOVE_HEAD(&sc->app_sc.to_notify_queue, slink);
956 		free(reqp, M_DEVBUF);
957 	}
958 
959 	while (!STAILQ_EMPTY(&sc->app_sc.to_ack_queue)) {
960 		reqp = STAILQ_FIRST(&sc->app_sc.to_ack_queue);
961 		STAILQ_REMOVE_HEAD(&sc->app_sc.to_ack_queue, slink);
962 		free(reqp, M_DEVBUF);
963 	}
964 	return (0);
965 }
966 
967 static int
968 hv_vss_attach(device_t dev)
969 {
970 	int error;
971 	struct sysctl_oid_list *child;
972 	struct sysctl_ctx_list *ctx;
973 
974 	hv_vss_sc *sc = (hv_vss_sc*)device_get_softc(dev);
975 
976 	sc->dev = dev;
977 	mtx_init(&sc->pending_mutex, "hv_vss pending mutex", NULL, MTX_DEF);
978 
979 	ctx = device_get_sysctl_ctx(dev);
980 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
981 
982 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "hv_vss_log",
983 	    CTLFLAG_RWTUN, &hv_vss_log, 0, "Hyperv VSS service log level");
984 
985 	TASK_INIT(&sc->task, 0, hv_vss_process_request, sc);
986 	hv_vss_init_send_receive_queue(dev);
987 	/* create character device for file system freeze/thaw */
988 	error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK,
989 		    &sc->hv_vss_dev,
990 		    &hv_vss_cdevsw,
991 		    0,
992 		    UID_ROOT,
993 		    GID_WHEEL,
994 		    0640,
995 		    FS_VSS_DEV_NAME);
996 
997 	if (error != 0) {
998 		hv_vss_log_info("Fail to create '%s': %d\n", FS_VSS_DEV_NAME, error);
999 		return (error);
1000 	}
1001 	sc->hv_vss_dev->si_drv1 = &sc->daemon_sc;
1002 	sc->daemon_sc.sc = sc;
1003 	/* create character device for application freeze/thaw */
1004 	error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK,
1005 		    &sc->hv_appvss_dev,
1006 		    &hv_appvss_cdevsw,
1007 		    0,
1008 		    UID_ROOT,
1009 		    GID_WHEEL,
1010 		    0640,
1011 		    APP_VSS_DEV_NAME);
1012 
1013 	if (error != 0) {
1014 		hv_vss_log_info("Fail to create '%s': %d\n", APP_VSS_DEV_NAME, error);
1015 		return (error);
1016 	}
1017 	sc->hv_appvss_dev->si_drv1 = &sc->app_sc;
1018 	sc->app_sc.sc = sc;
1019 
1020 	return (vmbus_ic_attach(dev, hv_vss_callback));
1021 }
1022 
1023 static int
1024 hv_vss_detach(device_t dev)
1025 {
1026 	hv_vss_sc *sc = (hv_vss_sc*)device_get_softc(dev);
1027 	mtx_destroy(&sc->pending_mutex);
1028 	if (sc->daemon_sc.proc_task != NULL) {
1029 		PROC_LOCK(sc->daemon_sc.proc_task);
1030 		kern_psignal(sc->daemon_sc.proc_task, SIGKILL);
1031 		PROC_UNLOCK(sc->daemon_sc.proc_task);
1032 	}
1033 	if (sc->app_sc.proc_task != NULL) {
1034 		PROC_LOCK(sc->app_sc.proc_task);
1035 		kern_psignal(sc->app_sc.proc_task, SIGKILL);
1036 		PROC_UNLOCK(sc->app_sc.proc_task);
1037 	}
1038 	hv_vss_destroy_send_receive_queue(dev);
1039 	destroy_dev(sc->hv_vss_dev);
1040 	destroy_dev(sc->hv_appvss_dev);
1041 	return (vmbus_ic_detach(dev));
1042 }
1043 
/* Newbus glue: probe/attach/detach methods for the hvvss driver. */
static device_method_t vss_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe, hv_vss_probe),
	DEVMETHOD(device_attach, hv_vss_attach),
	DEVMETHOD(device_detach, hv_vss_detach),
	{ 0, 0 }
};

static driver_t vss_driver = { "hvvss", vss_methods, sizeof(hv_vss_sc)};

/* Attach under the vmbus bus driver; module depends on vmbus. */
DRIVER_MODULE(hv_vss, vmbus, vss_driver, NULL, NULL);
MODULE_VERSION(hv_vss, 1);
MODULE_DEPEND(hv_vss, vmbus, 1, 1, 1);
1057