xref: /freebsd/sys/dev/xen/xenstore/xenstore.c (revision 1d84e2b3)
1 /******************************************************************************
2  * xenstore.c
3  *
4  * Low-level kernel interface to the XenStore.
5  *
6  * Copyright (C) 2005 Rusty Russell, IBM Corporation
7  * Copyright (C) 2009,2010 Spectra Logic Corporation
8  *
9  * This file may be distributed separately from the Linux kernel, or
10  * incorporated into other software packages, subject to the following license:
11  *
12  * Permission is hereby granted, free of charge, to any person obtaining a copy
13  * of this source file (the "Software"), to deal in the Software without
14  * restriction, including without limitation the rights to use, copy, modify,
15  * merge, publish, distribute, sublicense, and/or sell copies of the Software,
16  * and to permit persons to whom the Software is furnished to do so, subject to
17  * the following conditions:
18  *
19  * The above copyright notice and this permission notice shall be included in
20  * all copies or substantial portions of the Software.
21  *
22  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
27  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
28  * IN THE SOFTWARE.
29  */
30 
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #include <sys/param.h>
36 #include <sys/bus.h>
37 #include <sys/kernel.h>
38 #include <sys/lock.h>
39 #include <sys/module.h>
40 #include <sys/mutex.h>
41 #include <sys/sx.h>
42 #include <sys/syslog.h>
43 #include <sys/malloc.h>
44 #include <sys/systm.h>
45 #include <sys/proc.h>
46 #include <sys/kthread.h>
47 #include <sys/sbuf.h>
48 #include <sys/sysctl.h>
49 #include <sys/uio.h>
50 #include <sys/unistd.h>
51 #include <sys/queue.h>
52 #include <sys/taskqueue.h>
53 
54 #include <machine/stdarg.h>
55 
56 #include <xen/xen-os.h>
57 #include <xen/hypervisor.h>
58 #include <xen/xen_intr.h>
59 
60 #include <xen/interface/hvm/params.h>
61 #include <xen/hvm.h>
62 
63 #include <xen/xenstore/xenstorevar.h>
64 #include <xen/xenstore/xenstore_internal.h>
65 
66 #include <vm/vm.h>
67 #include <vm/pmap.h>
68 
69 /**
70  * \file xenstore.c
71  * \brief XenStore interface
72  *
73  * The XenStore interface is a simple storage system that is a means of
74  * communicating state and configuration data between the Xen Domain 0
75  * and the various guest domains.  All configuration data other than
76  * a small amount of essential information required during the early
77  * boot process of launching a Xen aware guest, is managed using the
78  * XenStore.
79  *
 * The XenStore is ASCII string based, and has a structure and semantics
 * similar to a filesystem.  There are files and directories, and
 * directories can contain files or other directories.  The depth of the
 * hierarchy is only limited by the XenStore's maximum path length.
84  *
85  * The communication channel between the XenStore service and other
86  * domains is via two, guest specific, ring buffers in a shared memory
87  * area.  One ring buffer is used for communicating in each direction.
88  * The grant table references for this shared memory are given to the
89  * guest either via the xen_start_info structure for a fully para-
90  * virtualized guest, or via HVM hypercalls for a hardware virtualized
91  * guest.
92  *
93  * The XenStore communication relies on an event channel and thus
94  * interrupts.  For this reason, the attachment of the XenStore
95  * relies on an interrupt driven configuration hook to hold off
96  * boot processing until communication with the XenStore service
97  * can be established.
98  *
99  * Several Xen services depend on the XenStore, most notably the
100  * XenBus used to discover and manage Xen devices.  These services
101  * are implemented as NewBus child attachments to a bus exported
102  * by this XenStore driver.
103  */
104 
/*
 * Forward declaration: find_watch() is used by xs_process_msg() before
 * its definition later in this file.
 */
static struct xs_watch *find_watch(const char *token);

/* Malloc type tagging the allocations made by the XenStore code below. */
MALLOC_DEFINE(M_XENSTORE, "xenstore", "XenStore data and results");

/**
 * Pointer to shared memory communication structures allowing us
 * to communicate with the XenStore service.
 *
 * When operating in full PV mode, this pointer is set early in kernel
 * startup from within xen_machdep.c.  In HVM mode, we use hypercalls
 * to get the guest frame number for the shared page and then map it
 * into kva.  See xs_init() for details.
 *
 * The pointer value also serves as the sleep/wakeup channel for ring
 * activity: xs_write_store() and xs_read_store() sleep on it and
 * xs_intr() wakes it.
 */
struct xenstore_domain_interface *xen_store;
119 
120 /*-------------------------- Private Data Structures ------------------------*/
121 
/**
 * Structure capturing messages received from the XenStore service.
 *
 * xs_process_msg() reads the header into hdr and then fills exactly one
 * arm of the union, chosen by hdr.type: "watch" for XS_WATCH_EVENT
 * messages and "reply" for every other message type.
 */
struct xs_stored_msg {
	/** Linkage on either xs.reply_list or xs.watch_events. */
	TAILQ_ENTRY(xs_stored_msg) list;

	/** Message header as read from the response ring. */
	struct xsd_sockmsg hdr;

	union {
		/* Queued replies. */
		struct {
			/** NUL terminated copy of the message payload. */
			char *body;
		} reply;

		/* Queued watch events. */
		struct {
			/** Registered watch matching the event's token. */
			struct xs_watch *handle;
			/** String vector built from the payload by split(). */
			const char **vec;
			/** Number of entries in vec. */
			u_int vec_size;
		} watch;
	} u;
};
TAILQ_HEAD(xs_stored_msg_list, xs_stored_msg);
145 
/**
 * Container for all XenStore related state.
 */
struct xs_softc {
	/** Newbus device for the XenStore. */
	device_t xs_dev;

	/**
	 * Lock serializing access to ring producer/consumer
	 * indexes.  Use of this lock guarantees that wakeups
	 * of blocking readers/writers are not missed due to
	 * races with the XenStore service.
	 */
	struct mtx ring_lock;

	/**
	 * Mutex used to ensure exclusive access to the outgoing
	 * communication ring.  We use a lock type that can be
	 * held while sleeping so that xs_write_store() can block
	 * waiting for space in the ring to free up, without allowing
	 * another writer to come in and corrupt a partial message write.
	 */
	struct sx request_mutex;

	/**
	 * A list of replies to our requests.
	 *
	 * The reply list is filled by xs_rcv_thread().  It
	 * is consumed by the context that issued the request
	 * to which a reply is made.  The requester blocks in
	 * xs_read_reply().
	 *
	 * \note Only one requesting context can be active at a time.
	 *       This is guaranteed by the request_mutex and ensures
	 *	 that the requester sees replies matching the order
	 *	 of its requests.
	 */
	struct xs_stored_msg_list reply_list;

	/** Lock protecting the reply list. */
	struct mtx reply_lock;

	/**
	 * List of registered watches.
	 */
	struct xs_watch_list  registered_watches;

	/** Lock protecting the registered watches list. */
	struct mtx registered_watches_lock;

	/**
	 * List of pending watch callback events.
	 */
	struct xs_stored_msg_list watch_events;

	/** Lock protecting the watch callback list. */
	struct mtx watch_events_lock;

	/**
	 * Sleepable lock used to prevent VM suspension while a
	 * xenstore transaction is outstanding.
	 *
	 * Each active transaction holds a shared lock on the
	 * suspend mutex.  Our suspend method blocks waiting
	 * to acquire an exclusive lock.  This guarantees that
	 * suspend processing will only proceed once all active
	 * transactions have been retired.
	 */
	struct sx suspend_mutex;

	/**
	 * The process id of the xenwatch thread.
	 */
	pid_t xenwatch_pid;

	/**
	 * Sleepable mutex used to gate the execution of XenStore
	 * watch event callbacks.
	 *
	 * xenwatch_thread holds an exclusive lock on this mutex
	 * while delivering event callbacks, and xenstore_unregister_watch()
	 * uses an exclusive lock of this mutex to guarantee that no
	 * callbacks of the just unregistered watch are pending
	 * before returning to its caller.
	 */
	struct sx xenwatch_mutex;

	/**
	 * The HVM guest pseudo-physical frame number.  This is Xen's mapping
	 * of the true machine frame number into our "physical address space".
	 */
	unsigned long gpfn;

	/**
	 * The event channel for communicating with the
	 * XenStore service.
	 */
	int evtchn;

	/** Handle for XenStore interrupts. */
	xen_intr_handle_t xen_intr_handle;

	/**
	 * Interrupt driven config hook allowing us to defer
	 * attaching children until interrupts (and thus communication
	 * with the XenStore service) are available.
	 */
	struct intr_config_hook xs_attachcb;

	/**
	 * Xenstore is a user-space process that usually runs in Dom0,
	 * so if this domain is booting as Dom0, xenstore won't be accessible,
	 * and we have to defer the initialization of xenstore related
	 * devices to later (when xenstore is started).
	 *
	 * xs_intr() sets this flag the first time it runs with the flag
	 * clear, and queues xs_late_init at that point.
	 */
	bool initialized;

	/**
	 * Task to run when xenstore is initialized (Dom0 only), will
	 * take care of attaching xenstore related devices.
	 */
	struct task xs_late_init;
};

/*-------------------------------- Global Data ------------------------------*/
/* The single, file-scope instance of the XenStore state. */
static struct xs_softc xs;
272 
273 /*------------------------- Private Utility Functions -----------------------*/
274 
/**
 * Walk a buffer of back-to-back NUL terminated strings, counting them
 * and, when requested, recording where each one starts.
 *
 * The walk advances by strlen() + 1 per string, so the final string in
 * the buffer must be NUL terminated; split() guarantees this before
 * calling.
 *
 * \param strings  Start of the contiguous buffer of NUL terminated strings.
 * \param dest	   Optional array receiving a pointer to each string found.
 *		   Pass NULL to only count.  When non-NULL it must have room
 *		   for every string in the buffer.
 * \param len	   The length, in bytes, of the buffer at strings.
 *
 * \return  The number of strings found (0 when len is 0).
 */
static u_int
extract_strings(const char *strings, const char **dest, u_int len)
{
	const char *cur = strings;
	const char *const end = strings + len;
	u_int count = 0;

	while (cur < end) {
		if (dest != NULL)
			dest[count] = cur;
		count++;

		/* Step over this string and its terminator. */
		cur += strlen(cur) + 1;
	}

	return (count);
}
299 
300 /**
301  * Convert a contiguous buffer containing a series of NUL terminated
302  * strings into an array of pointers to strings.
303  *
304  * The returned pointer references the array of string pointers which
305  * is followed by the storage for the string data.  It is the client's
306  * responsibility to free this storage.
307  *
308  * The storage addressed by strings is free'd prior to split returning.
309  *
310  * \param strings  A pointer to a contiguous buffer of NUL terminated strings.
311  * \param len	   The length of the buffer pointed to by strings.
312  * \param num	   The number of strings found and returned in the strings
313  *                 array.
314  *
315  * \return  An array of pointers to the strings found in the input buffer.
316  */
317 static const char **
318 split(char *strings, u_int len, u_int *num)
319 {
320 	const char **ret;
321 
322 	/* Protect against unterminated buffers. */
323 	if (len > 0)
324 		strings[len - 1] = '\0';
325 
326 	/* Count the strings. */
327 	*num = extract_strings(strings, /*dest*/NULL, len);
328 
329 	/* Transfer to one big alloc for easy freeing by the caller. */
330 	ret = malloc(*num * sizeof(char *) + len, M_XENSTORE, M_WAITOK);
331 	memcpy(&ret[*num], strings, len);
332 	free(strings, M_XENSTORE);
333 
334 	/* Extract pointers to newly allocated array. */
335 	strings = (char *)&ret[*num];
336 	(void)extract_strings(strings, /*dest*/ret, len);
337 
338 	return (ret);
339 }
340 
341 /*------------------------- Public Utility Functions -------------------------*/
342 /*------- API comments for these methods can be found in xenstorevar.h -------*/
/*
 * Build the path "dir/name" in a freshly created auto-extending sbuf.
 * When name is the empty string only dir is copied and no separator is
 * appended.  The sbuf is finished before it is returned.
 */
struct sbuf *
xs_join(const char *dir, const char *name)
{
	struct sbuf *path;

	path = sbuf_new_auto();
	sbuf_cat(path, dir);
	if (*name != '\0') {
		sbuf_putc(path, '/');
		sbuf_cat(path, name);
	}
	sbuf_finish(path);

	return (path);
}
358 
359 /*-------------------- Low Level Communication Management --------------------*/
360 /**
361  * Interrupt handler for the XenStore event channel.
362  *
363  * XenStore reads and writes block on "xen_store" for buffer
364  * space.  Wakeup any blocking operations when the XenStore
365  * service has modified the queues.
366  */
367 static void
368 xs_intr(void * arg __unused /*__attribute__((unused))*/)
369 {
370 
371 	/* If xenstore has not been initialized, initialize it now */
372 	if (!xs.initialized) {
373 		xs.initialized = true;
374 		/*
375 		 * Since this task is probing and attaching devices we
376 		 * have to hold the Giant lock.
377 		 */
378 		taskqueue_enqueue(taskqueue_swi_giant, &xs.xs_late_init);
379 	}
380 
381 	/*
382 	 * Hold ring lock across wakeup so that clients
383 	 * cannot miss a wakeup.
384 	 */
385 	mtx_lock(&xs.ring_lock);
386 	wakeup(xen_store);
387 	mtx_unlock(&xs.ring_lock);
388 }
389 
390 /**
391  * Verify that the indexes for a ring are valid.
392  *
393  * The difference between the producer and consumer cannot
394  * exceed the size of the ring.
395  *
396  * \param cons  The consumer index for the ring to test.
397  * \param prod  The producer index for the ring to test.
398  *
399  * \retval 1  If indexes are in range.
400  * \retval 0  If the indexes are out of range.
401  */
402 static int
403 xs_check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod)
404 {
405 
406 	return ((prod - cons) <= XENSTORE_RING_SIZE);
407 }
408 
409 /**
410  * Return a pointer to, and the length of, the contiguous
411  * free region available for output in a ring buffer.
412  *
413  * \param cons  The consumer index for the ring.
414  * \param prod  The producer index for the ring.
415  * \param buf   The base address of the ring's storage.
416  * \param len   The amount of contiguous storage available.
417  *
418  * \return  A pointer to the start location of the free region.
419  */
420 static void *
421 xs_get_output_chunk(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod,
422     char *buf, uint32_t *len)
423 {
424 
425 	*len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod);
426 	if ((XENSTORE_RING_SIZE - (prod - cons)) < *len)
427 		*len = XENSTORE_RING_SIZE - (prod - cons);
428 	return (buf + MASK_XENSTORE_IDX(prod));
429 }
430 
431 /**
432  * Return a pointer to, and the length of, the contiguous
433  * data available to read from a ring buffer.
434  *
435  * \param cons  The consumer index for the ring.
436  * \param prod  The producer index for the ring.
437  * \param buf   The base address of the ring's storage.
438  * \param len   The amount of contiguous data available to read.
439  *
440  * \return  A pointer to the start location of the available data.
441  */
442 static const void *
443 xs_get_input_chunk(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod,
444     const char *buf, uint32_t *len)
445 {
446 
447 	*len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons);
448 	if ((prod - cons) < *len)
449 		*len = prod - cons;
450 	return (buf + MASK_XENSTORE_IDX(cons));
451 }
452 
453 /**
454  * Transmit data to the XenStore service.
455  *
456  * \param tdata  A pointer to the contiguous data to send.
457  * \param len    The amount of data to send.
458  *
459  * \return  On success 0, otherwise an errno value indicating the
460  *          cause of failure.
461  *
462  * \invariant  Called from thread context.
463  * \invariant  The buffer pointed to by tdata is at least len bytes
464  *             in length.
465  * \invariant  xs.request_mutex exclusively locked.
466  */
467 static int
468 xs_write_store(const void *tdata, unsigned len)
469 {
470 	XENSTORE_RING_IDX cons, prod;
471 	const char *data = (const char *)tdata;
472 	int error;
473 
474 	sx_assert(&xs.request_mutex, SX_XLOCKED);
475 	while (len != 0) {
476 		void *dst;
477 		u_int avail;
478 
479 		/* Hold lock so we can't miss wakeups should we block. */
480 		mtx_lock(&xs.ring_lock);
481 		cons = xen_store->req_cons;
482 		prod = xen_store->req_prod;
483 		if ((prod - cons) == XENSTORE_RING_SIZE) {
484 			/*
485 			 * Output ring is full. Wait for a ring event.
486 			 *
487 			 * Note that the events from both queues
488 			 * are combined, so being woken does not
489 			 * guarantee that data exist in the read
490 			 * ring.
491 			 *
492 			 * To simplify error recovery and the retry,
493 			 * we specify PDROP so our lock is *not* held
494 			 * when msleep returns.
495 			 */
496 			error = msleep(xen_store, &xs.ring_lock, PCATCH|PDROP,
497 			     "xbwrite", /*timeout*/0);
498 			if (error && error != EWOULDBLOCK)
499 				return (error);
500 
501 			/* Try again. */
502 			continue;
503 		}
504 		mtx_unlock(&xs.ring_lock);
505 
506 		/* Verify queue sanity. */
507 		if (!xs_check_indexes(cons, prod)) {
508 			xen_store->req_cons = xen_store->req_prod = 0;
509 			return (EIO);
510 		}
511 
512 		dst = xs_get_output_chunk(cons, prod, xen_store->req, &avail);
513 		if (avail > len)
514 			avail = len;
515 
516 		memcpy(dst, data, avail);
517 		data += avail;
518 		len -= avail;
519 
520 		/*
521 		 * The store to the producer index, which indicates
522 		 * to the other side that new data has arrived, must
523 		 * be visible only after our copy of the data into the
524 		 * ring has completed.
525 		 */
526 		wmb();
527 		xen_store->req_prod += avail;
528 
529 		/*
530 		 * xen_intr_signal() implies mb(). The other side will see
531 		 * the change to req_prod at the time of the interrupt.
532 		 */
533 		xen_intr_signal(xs.xen_intr_handle);
534 	}
535 
536 	return (0);
537 }
538 
539 /**
540  * Receive data from the XenStore service.
541  *
542  * \param tdata  A pointer to the contiguous buffer to receive the data.
543  * \param len    The amount of data to receive.
544  *
545  * \return  On success 0, otherwise an errno value indicating the
546  *          cause of failure.
547  *
548  * \invariant  Called from thread context.
549  * \invariant  The buffer pointed to by tdata is at least len bytes
550  *             in length.
551  *
552  * \note xs_read does not perform any internal locking to guarantee
553  *       serial access to the incoming ring buffer.  However, there
554  *	 is only one context processing reads: xs_rcv_thread().
555  */
556 static int
557 xs_read_store(void *tdata, unsigned len)
558 {
559 	XENSTORE_RING_IDX cons, prod;
560 	char *data = (char *)tdata;
561 	int error;
562 
563 	while (len != 0) {
564 		u_int avail;
565 		const char *src;
566 
567 		/* Hold lock so we can't miss wakeups should we block. */
568 		mtx_lock(&xs.ring_lock);
569 		cons = xen_store->rsp_cons;
570 		prod = xen_store->rsp_prod;
571 		if (cons == prod) {
572 			/*
573 			 * Nothing to read. Wait for a ring event.
574 			 *
575 			 * Note that the events from both queues
576 			 * are combined, so being woken does not
577 			 * guarantee that data exist in the read
578 			 * ring.
579 			 *
580 			 * To simplify error recovery and the retry,
581 			 * we specify PDROP so our lock is *not* held
582 			 * when msleep returns.
583 			 */
584 			error = msleep(xen_store, &xs.ring_lock, PCATCH|PDROP,
585 			    "xbread", /*timeout*/0);
586 			if (error && error != EWOULDBLOCK)
587 				return (error);
588 			continue;
589 		}
590 		mtx_unlock(&xs.ring_lock);
591 
592 		/* Verify queue sanity. */
593 		if (!xs_check_indexes(cons, prod)) {
594 			xen_store->rsp_cons = xen_store->rsp_prod = 0;
595 			return (EIO);
596 		}
597 
598 		src = xs_get_input_chunk(cons, prod, xen_store->rsp, &avail);
599 		if (avail > len)
600 			avail = len;
601 
602 		/*
603 		 * Insure the data we read is related to the indexes
604 		 * we read above.
605 		 */
606 		rmb();
607 
608 		memcpy(data, src, avail);
609 		data += avail;
610 		len -= avail;
611 
612 		/*
613 		 * Insure that the producer of this ring does not see
614 		 * the ring space as free until after we have copied it
615 		 * out.
616 		 */
617 		mb();
618 		xen_store->rsp_cons += avail;
619 
620 		/*
621 		 * xen_intr_signal() implies mb(). The producer will see
622 		 * the updated consumer index when the event is delivered.
623 		 */
624 		xen_intr_signal(xs.xen_intr_handle);
625 	}
626 
627 	return (0);
628 }
629 
630 /*----------------------- Received Message Processing ------------------------*/
631 /**
632  * Block reading the next message from the XenStore service and
633  * process the result.
634  *
635  * \param type  The returned type of the XenStore message received.
636  *
637  * \return  0 on success.  Otherwise an errno value indicating the
638  *          type of failure encountered.
639  */
640 static int
641 xs_process_msg(enum xsd_sockmsg_type *type)
642 {
643 	struct xs_stored_msg *msg;
644 	char *body;
645 	int error;
646 
647 	msg = malloc(sizeof(*msg), M_XENSTORE, M_WAITOK);
648 	error = xs_read_store(&msg->hdr, sizeof(msg->hdr));
649 	if (error) {
650 		free(msg, M_XENSTORE);
651 		return (error);
652 	}
653 
654 	body = malloc(msg->hdr.len + 1, M_XENSTORE, M_WAITOK);
655 	error = xs_read_store(body, msg->hdr.len);
656 	if (error) {
657 		free(body, M_XENSTORE);
658 		free(msg, M_XENSTORE);
659 		return (error);
660 	}
661 	body[msg->hdr.len] = '\0';
662 
663 	*type = msg->hdr.type;
664 	if (msg->hdr.type == XS_WATCH_EVENT) {
665 		msg->u.watch.vec = split(body, msg->hdr.len,
666 		    &msg->u.watch.vec_size);
667 
668 		mtx_lock(&xs.registered_watches_lock);
669 		msg->u.watch.handle = find_watch(
670 		    msg->u.watch.vec[XS_WATCH_TOKEN]);
671 		if (msg->u.watch.handle != NULL) {
672 			mtx_lock(&xs.watch_events_lock);
673 			TAILQ_INSERT_TAIL(&xs.watch_events, msg, list);
674 			wakeup(&xs.watch_events);
675 			mtx_unlock(&xs.watch_events_lock);
676 		} else {
677 			free(msg->u.watch.vec, M_XENSTORE);
678 			free(msg, M_XENSTORE);
679 		}
680 		mtx_unlock(&xs.registered_watches_lock);
681 	} else {
682 		msg->u.reply.body = body;
683 		mtx_lock(&xs.reply_lock);
684 		TAILQ_INSERT_TAIL(&xs.reply_list, msg, list);
685 		wakeup(&xs.reply_list);
686 		mtx_unlock(&xs.reply_lock);
687 	}
688 
689 	return (0);
690 }
691 
692 /**
693  * Thread body of the XenStore receive thread.
694  *
695  * This thread blocks waiting for data from the XenStore service
696  * and processes and received messages.
697  */
698 static void
699 xs_rcv_thread(void *arg __unused)
700 {
701 	int error;
702 	enum xsd_sockmsg_type type;
703 
704 	for (;;) {
705 		error = xs_process_msg(&type);
706 		if (error)
707 			printf("XENSTORE error %d while reading message\n",
708 			    error);
709 	}
710 }
711 
712 /*---------------- XenStore Message Request/Reply Processing -----------------*/
713 /**
714  * Filter invoked before transmitting any message to the XenStore service.
715  *
716  * The role of the filter may expand, but currently serves to manage
717  * the interactions of messages with transaction state.
718  *
719  * \param request_msg_type  The message type for the request.
720  */
721 static inline void
722 xs_request_filter(uint32_t request_msg_type)
723 {
724 	if (request_msg_type == XS_TRANSACTION_START)
725 		sx_slock(&xs.suspend_mutex);
726 }
727 
728 /**
729  * Filter invoked after transmitting any message to the XenStore service.
730  *
731  * The role of the filter may expand, but currently serves to manage
732  * the interactions of messages with transaction state.
733  *
734  * \param request_msg_type     The message type for the original request.
735  * \param reply_msg_type       The message type for any received reply.
736  * \param request_reply_error  The error status from the attempt to send
737  *                             the request or retrieve the reply.
738  */
739 static inline void
740 xs_reply_filter(uint32_t request_msg_type,
741     uint32_t reply_msg_type, int request_reply_error)
742 {
743 	/*
744 	 * The count of transactions drops if we attempted
745 	 * to end a transaction (even if that attempt fails
746 	 * in error), we receive a transaction end acknowledgement,
747 	 * or if our attempt to begin a transaction fails.
748 	 */
749 	if (request_msg_type == XS_TRANSACTION_END
750 	 || (request_reply_error == 0 && reply_msg_type == XS_TRANSACTION_END)
751 	 || (request_msg_type == XS_TRANSACTION_START
752 	  && (request_reply_error != 0 || reply_msg_type == XS_ERROR)))
753 		sx_sunlock(&xs.suspend_mutex);
754 
755 }
756 
757 #define xsd_error_count	(sizeof(xsd_errors) / sizeof(xsd_errors[0]))
758 
759 /**
760  * Convert a XenStore error string into an errno number.
761  *
762  * \param errorstring  The error string to convert.
763  *
764  * \return  The errno best matching the input string.
765  *
766  * \note Unknown error strings are converted to EINVAL.
767  */
768 static int
769 xs_get_error(const char *errorstring)
770 {
771 	u_int i;
772 
773 	for (i = 0; i < xsd_error_count; i++) {
774 		if (!strcmp(errorstring, xsd_errors[i].errstring))
775 			return (xsd_errors[i].errnum);
776 	}
777 	log(LOG_WARNING, "XENSTORE xen store gave: unknown error %s",
778 	    errorstring);
779 	return (EINVAL);
780 }
781 
782 /**
783  * Block waiting for a reply to a message request.
784  *
785  * \param type	  The returned type of the reply.
786  * \param len	  The returned body length of the reply.
787  * \param result  The returned body of the reply.
788  *
789  * \return  0 on success.  Otherwise an errno indicating the
790  *          cause of failure.
791  */
792 static int
793 xs_read_reply(enum xsd_sockmsg_type *type, u_int *len, void **result)
794 {
795 	struct xs_stored_msg *msg;
796 	char *body;
797 	int error;
798 
799 	mtx_lock(&xs.reply_lock);
800 	while (TAILQ_EMPTY(&xs.reply_list)) {
801 		error = mtx_sleep(&xs.reply_list, &xs.reply_lock,
802 		    PCATCH, "xswait", hz/10);
803 		if (error && error != EWOULDBLOCK) {
804 			mtx_unlock(&xs.reply_lock);
805 			return (error);
806 		}
807 	}
808 	msg = TAILQ_FIRST(&xs.reply_list);
809 	TAILQ_REMOVE(&xs.reply_list, msg, list);
810 	mtx_unlock(&xs.reply_lock);
811 
812 	*type = msg->hdr.type;
813 	if (len)
814 		*len = msg->hdr.len;
815 	body = msg->u.reply.body;
816 
817 	free(msg, M_XENSTORE);
818 	*result = body;
819 	return (0);
820 }
821 
822 /**
823  * Pass-thru interface for XenStore access by userland processes
824  * via the XenStore device.
825  *
826  * Reply type and length data are returned by overwriting these
827  * fields in the passed in request message.
828  *
829  * \param msg	  A properly formatted message to transmit to
830  *		  the XenStore service.
831  * \param result  The returned body of the reply.
832  *
833  * \return  0 on success.  Otherwise an errno indicating the cause
834  *          of failure.
835  *
836  * \note The returned result is provided in malloced storage and thus
837  *       must be free'd by the caller with 'free(result, M_XENSTORE);
838  */
839 int
840 xs_dev_request_and_reply(struct xsd_sockmsg *msg, void **result)
841 {
842 	uint32_t request_type;
843 	int error;
844 
845 	request_type = msg->type;
846 	xs_request_filter(request_type);
847 
848 	sx_xlock(&xs.request_mutex);
849 	if ((error = xs_write_store(msg, sizeof(*msg) + msg->len)) == 0)
850 		error = xs_read_reply(&msg->type, &msg->len, result);
851 	sx_xunlock(&xs.request_mutex);
852 
853 	xs_reply_filter(request_type, msg->type, error);
854 
855 	return (error);
856 }
857 
858 /**
859  * Send a message with an optionally muti-part body to the XenStore service.
860  *
861  * \param t              The transaction to use for this request.
862  * \param request_type   The type of message to send.
863  * \param iovec          Pointers to the body sections of the request.
864  * \param num_vecs       The number of body sections in the request.
865  * \param len            The returned length of the reply.
866  * \param result         The returned body of the reply.
867  *
868  * \return  0 on success.  Otherwise an errno indicating
869  *          the cause of failure.
870  *
871  * \note The returned result is provided in malloced storage and thus
872  *       must be free'd by the caller with 'free(*result, M_XENSTORE);
873  */
874 static int
875 xs_talkv(struct xs_transaction t, enum xsd_sockmsg_type request_type,
876     const struct iovec *iovec, u_int num_vecs, u_int *len, void **result)
877 {
878 	struct xsd_sockmsg msg;
879 	void *ret = NULL;
880 	u_int i;
881 	int error;
882 
883 	msg.tx_id = t.id;
884 	msg.req_id = 0;
885 	msg.type = request_type;
886 	msg.len = 0;
887 	for (i = 0; i < num_vecs; i++)
888 		msg.len += iovec[i].iov_len;
889 
890 	xs_request_filter(request_type);
891 
892 	sx_xlock(&xs.request_mutex);
893 	error = xs_write_store(&msg, sizeof(msg));
894 	if (error) {
895 		printf("xs_talkv failed %d\n", error);
896 		goto error_lock_held;
897 	}
898 
899 	for (i = 0; i < num_vecs; i++) {
900 		error = xs_write_store(iovec[i].iov_base, iovec[i].iov_len);
901 		if (error) {
902 			printf("xs_talkv failed %d\n", error);
903 			goto error_lock_held;
904 		}
905 	}
906 
907 	error = xs_read_reply(&msg.type, len, &ret);
908 
909 error_lock_held:
910 	sx_xunlock(&xs.request_mutex);
911 	xs_reply_filter(request_type, msg.type, error);
912 	if (error)
913 		return (error);
914 
915 	if (msg.type == XS_ERROR) {
916 		error = xs_get_error(ret);
917 		free(ret, M_XENSTORE);
918 		return (error);
919 	}
920 
921 	/* Reply is either error or an echo of our request message type. */
922 	KASSERT(msg.type == request_type, ("bad xenstore message type"));
923 
924 	if (result)
925 		*result = ret;
926 	else
927 		free(ret, M_XENSTORE);
928 
929 	return (0);
930 }
931 
932 /**
933  * Wrapper for xs_talkv allowing easy transmission of a message with
934  * a single, contiguous, message body.
935  *
936  * \param t              The transaction to use for this request.
937  * \param request_type   The type of message to send.
938  * \param body           The body of the request.
939  * \param len            The returned length of the reply.
940  * \param result         The returned body of the reply.
941  *
942  * \return  0 on success.  Otherwise an errno indicating
943  *          the cause of failure.
944  *
945  * \note The returned result is provided in malloced storage and thus
946  *       must be free'd by the caller with 'free(*result, M_XENSTORE);
947  */
948 static int
949 xs_single(struct xs_transaction t, enum xsd_sockmsg_type request_type,
950     const char *body, u_int *len, void **result)
951 {
952 	struct iovec iovec;
953 
954 	iovec.iov_base = (void *)(uintptr_t)body;
955 	iovec.iov_len = strlen(body) + 1;
956 
957 	return (xs_talkv(t, request_type, &iovec, 1, len, result));
958 }
959 
960 /*------------------------- XenStore Watch Support ---------------------------*/
961 /**
962  * Transmit a watch request to the XenStore service.
963  *
964  * \param path    The path in the XenStore to watch.
965  * \param tocken  A unique identifier for this watch.
966  *
967  * \return  0 on success.  Otherwise an errno indicating the
968  *          cause of failure.
969  */
970 static int
971 xs_watch(const char *path, const char *token)
972 {
973 	struct iovec iov[2];
974 
975 	iov[0].iov_base = (void *)(uintptr_t) path;
976 	iov[0].iov_len = strlen(path) + 1;
977 	iov[1].iov_base = (void *)(uintptr_t) token;
978 	iov[1].iov_len = strlen(token) + 1;
979 
980 	return (xs_talkv(XST_NIL, XS_WATCH, iov, 2, NULL, NULL));
981 }
982 
983 /**
984  * Transmit an uwatch request to the XenStore service.
985  *
986  * \param path    The path in the XenStore to watch.
987  * \param tocken  A unique identifier for this watch.
988  *
989  * \return  0 on success.  Otherwise an errno indicating the
990  *          cause of failure.
991  */
992 static int
993 xs_unwatch(const char *path, const char *token)
994 {
995 	struct iovec iov[2];
996 
997 	iov[0].iov_base = (void *)(uintptr_t) path;
998 	iov[0].iov_len = strlen(path) + 1;
999 	iov[1].iov_base = (void *)(uintptr_t) token;
1000 	iov[1].iov_len = strlen(token) + 1;
1001 
1002 	return (xs_talkv(XST_NIL, XS_UNWATCH, iov, 2, NULL, NULL));
1003 }
1004 
1005 /**
1006  * Convert from watch token (unique identifier) to the associated
1007  * internal tracking structure for this watch.
1008  *
1009  * \param tocken  The unique identifier for the watch to find.
1010  *
1011  * \return  A pointer to the found watch structure or NULL.
1012  */
1013 static struct xs_watch *
1014 find_watch(const char *token)
1015 {
1016 	struct xs_watch *i, *cmp;
1017 
1018 	cmp = (void *)strtoul(token, NULL, 16);
1019 
1020 	LIST_FOREACH(i, &xs.registered_watches, list)
1021 		if (i == cmp)
1022 			return (i);
1023 
1024 	return (NULL);
1025 }
1026 
1027 /**
1028  * Thread body of the XenStore watch event dispatch thread.
1029  */
1030 static void
1031 xenwatch_thread(void *unused)
1032 {
1033 	struct xs_stored_msg *msg;
1034 
1035 	for (;;) {
1036 
1037 		mtx_lock(&xs.watch_events_lock);
1038 		while (TAILQ_EMPTY(&xs.watch_events))
1039 			mtx_sleep(&xs.watch_events,
1040 			    &xs.watch_events_lock,
1041 			    PWAIT | PCATCH, "waitev", hz/10);
1042 
1043 		mtx_unlock(&xs.watch_events_lock);
1044 		sx_xlock(&xs.xenwatch_mutex);
1045 
1046 		mtx_lock(&xs.watch_events_lock);
1047 		msg = TAILQ_FIRST(&xs.watch_events);
1048 		if (msg)
1049 			TAILQ_REMOVE(&xs.watch_events, msg, list);
1050 		mtx_unlock(&xs.watch_events_lock);
1051 
1052 		if (msg != NULL) {
1053 			/*
1054 			 * XXX There are messages coming in with a NULL
1055 			 * XXX callback.  This deserves further investigation;
1056 			 * XXX the workaround here simply prevents the kernel
1057 			 * XXX from panic'ing on startup.
1058 			 */
1059 			if (msg->u.watch.handle->callback != NULL)
1060 				msg->u.watch.handle->callback(
1061 					msg->u.watch.handle,
1062 					(const char **)msg->u.watch.vec,
1063 					msg->u.watch.vec_size);
1064 			free(msg->u.watch.vec, M_XENSTORE);
1065 			free(msg, M_XENSTORE);
1066 		}
1067 
1068 		sx_xunlock(&xs.xenwatch_mutex);
1069 	}
1070 }
1071 
1072 /*----------- XenStore Configuration, Initialization, and Control ------------*/
1073 /**
1074  * Setup communication channels with the XenStore service.
1075  *
1076  * \return  On success, 0. Otherwise an errno value indicating the
1077  *          type of failure.
1078  */
1079 static int
1080 xs_init_comms(void)
1081 {
1082 	int error;
1083 
1084 	if (xen_store->rsp_prod != xen_store->rsp_cons) {
1085 		log(LOG_WARNING, "XENSTORE response ring is not quiescent "
1086 		    "(%08x:%08x): fixing up\n",
1087 		    xen_store->rsp_cons, xen_store->rsp_prod);
1088 		xen_store->rsp_cons = xen_store->rsp_prod;
1089 	}
1090 
1091 	xen_intr_unbind(&xs.xen_intr_handle);
1092 
1093 	error = xen_intr_bind_local_port(xs.xs_dev, xs.evtchn,
1094 	    /*filter*/NULL, xs_intr, /*arg*/NULL, INTR_TYPE_NET|INTR_MPSAFE,
1095 	    &xs.xen_intr_handle);
1096 	if (error) {
1097 		log(LOG_WARNING, "XENSTORE request irq failed %i\n", error);
1098 		return (error);
1099 	}
1100 
1101 	return (0);
1102 }
1103 
1104 /*------------------ Private Device Attachment Functions  --------------------*/
1105 static void
1106 xs_identify(driver_t *driver, device_t parent)
1107 {
1108 
1109 	BUS_ADD_CHILD(parent, 0, "xenstore", 0);
1110 }
1111 
1112 /**
1113  * Probe for the existance of the XenStore.
1114  *
1115  * \param dev
1116  */
1117 static int
1118 xs_probe(device_t dev)
1119 {
1120 	/*
1121 	 * We are either operating within a PV kernel or being probed
1122 	 * as the child of the successfully attached xenpci device.
1123 	 * Thus we are in a Xen environment and there will be a XenStore.
1124 	 * Unconditionally return success.
1125 	 */
1126 	device_set_desc(dev, "XenStore");
1127 	return (0);
1128 }
1129 
1130 static void
1131 xs_attach_deferred(void *arg)
1132 {
1133 	xs_dev_init();
1134 
1135 	bus_generic_probe(xs.xs_dev);
1136 	bus_generic_attach(xs.xs_dev);
1137 
1138 	config_intrhook_disestablish(&xs.xs_attachcb);
1139 }
1140 
1141 static void
1142 xs_attach_late(void *arg, int pending)
1143 {
1144 
1145 	KASSERT((pending == 1), ("xs late attach queued several times"));
1146 	bus_generic_probe(xs.xs_dev);
1147 	bus_generic_attach(xs.xs_dev);
1148 }
1149 
1150 /**
1151  * Attach to the XenStore.
1152  *
1153  * This routine also prepares for the probe/attach of drivers that rely
1154  * on the XenStore.
1155  */
1156 static int
1157 xs_attach(device_t dev)
1158 {
1159 	int error;
1160 
1161 	/* Allow us to get device_t from softc and vice-versa. */
1162 	xs.xs_dev = dev;
1163 	device_set_softc(dev, &xs);
1164 
1165 	/* Initialize the interface to xenstore. */
1166 	struct proc *p;
1167 
1168 	xs.initialized = false;
1169 	if (xen_hvm_domain()) {
1170 		xs.evtchn = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN);
1171 		xs.gpfn = hvm_get_parameter(HVM_PARAM_STORE_PFN);
1172 		xen_store = pmap_mapdev(xs.gpfn * PAGE_SIZE, PAGE_SIZE);
1173 		xs.initialized = true;
1174 	} else if (xen_pv_domain()) {
1175 		if (HYPERVISOR_start_info->store_evtchn == 0) {
1176 			struct evtchn_alloc_unbound alloc_unbound;
1177 
1178 			/* Allocate a local event channel for xenstore */
1179 			alloc_unbound.dom = DOMID_SELF;
1180 			alloc_unbound.remote_dom = DOMID_SELF;
1181 			error = HYPERVISOR_event_channel_op(
1182 			    EVTCHNOP_alloc_unbound, &alloc_unbound);
1183 			if (error != 0)
1184 				panic(
1185 				   "unable to alloc event channel for Dom0: %d",
1186 				    error);
1187 
1188 			HYPERVISOR_start_info->store_evtchn =
1189 			    alloc_unbound.port;
1190 			xs.evtchn = alloc_unbound.port;
1191 
1192 			/* Allocate memory for the xs shared ring */
1193 			xen_store = malloc(PAGE_SIZE, M_XENSTORE,
1194 			    M_WAITOK | M_ZERO);
1195 		} else {
1196 			xs.evtchn = HYPERVISOR_start_info->store_evtchn;
1197 			xs.initialized = true;
1198 		}
1199 	} else {
1200 		panic("Unknown domain type, cannot initialize xenstore.");
1201 	}
1202 
1203 	TAILQ_INIT(&xs.reply_list);
1204 	TAILQ_INIT(&xs.watch_events);
1205 
1206 	mtx_init(&xs.ring_lock, "ring lock", NULL, MTX_DEF);
1207 	mtx_init(&xs.reply_lock, "reply lock", NULL, MTX_DEF);
1208 	sx_init(&xs.xenwatch_mutex, "xenwatch");
1209 	sx_init(&xs.request_mutex, "xenstore request");
1210 	sx_init(&xs.suspend_mutex, "xenstore suspend");
1211 	mtx_init(&xs.registered_watches_lock, "watches", NULL, MTX_DEF);
1212 	mtx_init(&xs.watch_events_lock, "watch events", NULL, MTX_DEF);
1213 
1214 	/* Initialize the shared memory rings to talk to xenstored */
1215 	error = xs_init_comms();
1216 	if (error)
1217 		return (error);
1218 
1219 	error = kproc_create(xenwatch_thread, NULL, &p, RFHIGHPID,
1220 	    0, "xenwatch");
1221 	if (error)
1222 		return (error);
1223 	xs.xenwatch_pid = p->p_pid;
1224 
1225 	error = kproc_create(xs_rcv_thread, NULL, NULL,
1226 	    RFHIGHPID, 0, "xenstore_rcv");
1227 
1228 	xs.xs_attachcb.ich_func = xs_attach_deferred;
1229 	xs.xs_attachcb.ich_arg = NULL;
1230 	if (xs.initialized) {
1231 		config_intrhook_establish(&xs.xs_attachcb);
1232 	} else {
1233 		TASK_INIT(&xs.xs_late_init, 0, xs_attach_late, NULL);
1234 	}
1235 
1236 	return (error);
1237 }
1238 
1239 /**
1240  * Prepare for suspension of this VM by halting XenStore access after
1241  * all transactions and individual requests have completed.
1242  */
1243 static int
1244 xs_suspend(device_t dev)
1245 {
1246 	int error;
1247 
1248 	/* Suspend child Xen devices. */
1249 	error = bus_generic_suspend(dev);
1250 	if (error != 0)
1251 		return (error);
1252 
1253 	sx_xlock(&xs.suspend_mutex);
1254 	sx_xlock(&xs.request_mutex);
1255 
1256 	return (0);
1257 }
1258 
1259 /**
1260  * Resume XenStore operations after this VM is resumed.
1261  */
1262 static int
1263 xs_resume(device_t dev __unused)
1264 {
1265 	struct xs_watch *watch;
1266 	char token[sizeof(watch) * 2 + 1];
1267 
1268 	xs_init_comms();
1269 
1270 	sx_xunlock(&xs.request_mutex);
1271 
1272 	/*
1273 	 * No need for registered_watches_lock: the suspend_mutex
1274 	 * is sufficient.
1275 	 */
1276 	LIST_FOREACH(watch, &xs.registered_watches, list) {
1277 		sprintf(token, "%lX", (long)watch);
1278 		xs_watch(watch->node, token);
1279 	}
1280 
1281 	sx_xunlock(&xs.suspend_mutex);
1282 
1283 	/* Resume child Xen devices. */
1284 	bus_generic_resume(dev);
1285 
1286 	return (0);
1287 }
1288 
1289 /*-------------------- Private Device Attachment Data  -----------------------*/
1290 static device_method_t xenstore_methods[] = {
1291 	/* Device interface */
1292 	DEVMETHOD(device_identify,	xs_identify),
1293 	DEVMETHOD(device_probe,         xs_probe),
1294 	DEVMETHOD(device_attach,        xs_attach),
1295 	DEVMETHOD(device_detach,        bus_generic_detach),
1296 	DEVMETHOD(device_shutdown,      bus_generic_shutdown),
1297 	DEVMETHOD(device_suspend,       xs_suspend),
1298 	DEVMETHOD(device_resume,        xs_resume),
1299 
1300 	/* Bus interface */
1301 	DEVMETHOD(bus_add_child,        bus_generic_add_child),
1302 	DEVMETHOD(bus_alloc_resource,   bus_generic_alloc_resource),
1303 	DEVMETHOD(bus_release_resource, bus_generic_release_resource),
1304 	DEVMETHOD(bus_activate_resource, bus_generic_activate_resource),
1305 	DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
1306 
1307 	DEVMETHOD_END
1308 };
1309 
1310 DEFINE_CLASS_0(xenstore, xenstore_driver, xenstore_methods, 0);
1311 static devclass_t xenstore_devclass;
1312 
1313 DRIVER_MODULE(xenstore, xenpv, xenstore_driver, xenstore_devclass, 0, 0);
1314 
1315 /*------------------------------- Sysctl Data --------------------------------*/
1316 /* XXX Shouldn't the node be somewhere else? */
1317 SYSCTL_NODE(_dev, OID_AUTO, xen, CTLFLAG_RD, NULL, "Xen");
1318 SYSCTL_INT(_dev_xen, OID_AUTO, xsd_port, CTLFLAG_RD, &xs.evtchn, 0, "");
1319 SYSCTL_ULONG(_dev_xen, OID_AUTO, xsd_kva, CTLFLAG_RD, (u_long *) &xen_store, 0, "");
1320 
1321 /*-------------------------------- Public API --------------------------------*/
1322 /*------- API comments for these methods can be found in xenstorevar.h -------*/
1323 int
1324 xs_directory(struct xs_transaction t, const char *dir, const char *node,
1325     u_int *num, const char ***result)
1326 {
1327 	struct sbuf *path;
1328 	char *strings;
1329 	u_int len = 0;
1330 	int error;
1331 
1332 	path = xs_join(dir, node);
1333 	error = xs_single(t, XS_DIRECTORY, sbuf_data(path), &len,
1334 	    (void **)&strings);
1335 	sbuf_delete(path);
1336 	if (error)
1337 		return (error);
1338 
1339 	*result = split(strings, len, num);
1340 
1341 	return (0);
1342 }
1343 
1344 int
1345 xs_exists(struct xs_transaction t, const char *dir, const char *node)
1346 {
1347 	const char **d;
1348 	int error, dir_n;
1349 
1350 	error = xs_directory(t, dir, node, &dir_n, &d);
1351 	if (error)
1352 		return (0);
1353 	free(d, M_XENSTORE);
1354 	return (1);
1355 }
1356 
1357 int
1358 xs_read(struct xs_transaction t, const char *dir, const char *node,
1359     u_int *len, void **result)
1360 {
1361 	struct sbuf *path;
1362 	void *ret;
1363 	int error;
1364 
1365 	path = xs_join(dir, node);
1366 	error = xs_single(t, XS_READ, sbuf_data(path), len, &ret);
1367 	sbuf_delete(path);
1368 	if (error)
1369 		return (error);
1370 	*result = ret;
1371 	return (0);
1372 }
1373 
1374 int
1375 xs_write(struct xs_transaction t, const char *dir, const char *node,
1376     const char *string)
1377 {
1378 	struct sbuf *path;
1379 	struct iovec iovec[2];
1380 	int error;
1381 
1382 	path = xs_join(dir, node);
1383 
1384 	iovec[0].iov_base = (void *)(uintptr_t) sbuf_data(path);
1385 	iovec[0].iov_len = sbuf_len(path) + 1;
1386 	iovec[1].iov_base = (void *)(uintptr_t) string;
1387 	iovec[1].iov_len = strlen(string);
1388 
1389 	error = xs_talkv(t, XS_WRITE, iovec, 2, NULL, NULL);
1390 	sbuf_delete(path);
1391 
1392 	return (error);
1393 }
1394 
1395 int
1396 xs_mkdir(struct xs_transaction t, const char *dir, const char *node)
1397 {
1398 	struct sbuf *path;
1399 	int ret;
1400 
1401 	path = xs_join(dir, node);
1402 	ret = xs_single(t, XS_MKDIR, sbuf_data(path), NULL, NULL);
1403 	sbuf_delete(path);
1404 
1405 	return (ret);
1406 }
1407 
1408 int
1409 xs_rm(struct xs_transaction t, const char *dir, const char *node)
1410 {
1411 	struct sbuf *path;
1412 	int ret;
1413 
1414 	path = xs_join(dir, node);
1415 	ret = xs_single(t, XS_RM, sbuf_data(path), NULL, NULL);
1416 	sbuf_delete(path);
1417 
1418 	return (ret);
1419 }
1420 
1421 int
1422 xs_rm_tree(struct xs_transaction xbt, const char *base, const char *node)
1423 {
1424 	struct xs_transaction local_xbt;
1425 	struct sbuf *root_path_sbuf;
1426 	struct sbuf *cur_path_sbuf;
1427 	char *root_path;
1428 	char *cur_path;
1429 	const char **dir;
1430 	int error;
1431 	int empty;
1432 
1433 retry:
1434 	root_path_sbuf = xs_join(base, node);
1435 	cur_path_sbuf  = xs_join(base, node);
1436 	root_path      = sbuf_data(root_path_sbuf);
1437 	cur_path       = sbuf_data(cur_path_sbuf);
1438 	dir            = NULL;
1439 	local_xbt.id   = 0;
1440 
1441 	if (xbt.id == 0) {
1442 		error = xs_transaction_start(&local_xbt);
1443 		if (error != 0)
1444 			goto out;
1445 		xbt = local_xbt;
1446 	}
1447 
1448 	empty = 0;
1449 	while (1) {
1450 		u_int count;
1451 		u_int i;
1452 
1453 		error = xs_directory(xbt, cur_path, "", &count, &dir);
1454 		if (error)
1455 			goto out;
1456 
1457 		for (i = 0; i < count; i++) {
1458 			error = xs_rm(xbt, cur_path, dir[i]);
1459 			if (error == ENOTEMPTY) {
1460 				struct sbuf *push_dir;
1461 
1462 				/*
1463 				 * Descend to clear out this sub directory.
1464 				 * We'll return to cur_dir once push_dir
1465 				 * is empty.
1466 				 */
1467 				push_dir = xs_join(cur_path, dir[i]);
1468 				sbuf_delete(cur_path_sbuf);
1469 				cur_path_sbuf = push_dir;
1470 				cur_path = sbuf_data(cur_path_sbuf);
1471 				break;
1472 			} else if (error != 0) {
1473 				goto out;
1474 			}
1475 		}
1476 
1477 		free(dir, M_XENSTORE);
1478 		dir = NULL;
1479 
1480 		if (i == count) {
1481 			char *last_slash;
1482 
1483 			/* Directory is empty.  It is now safe to remove. */
1484 			error = xs_rm(xbt, cur_path, "");
1485 			if (error != 0)
1486 				goto out;
1487 
1488 			if (!strcmp(cur_path, root_path))
1489 				break;
1490 
1491 			/* Return to processing the parent directory. */
1492 			last_slash = strrchr(cur_path, '/');
1493 			KASSERT(last_slash != NULL,
1494 				("xs_rm_tree: mangled path %s", cur_path));
1495 			*last_slash = '\0';
1496 		}
1497 	}
1498 
1499 out:
1500 	sbuf_delete(cur_path_sbuf);
1501 	sbuf_delete(root_path_sbuf);
1502 	if (dir != NULL)
1503 		free(dir, M_XENSTORE);
1504 
1505 	if (local_xbt.id != 0) {
1506 		int terror;
1507 
1508 		terror = xs_transaction_end(local_xbt, /*abort*/error != 0);
1509 		xbt.id = 0;
1510 		if (terror == EAGAIN && error == 0)
1511 			goto retry;
1512 	}
1513 	return (error);
1514 }
1515 
1516 int
1517 xs_transaction_start(struct xs_transaction *t)
1518 {
1519 	char *id_str;
1520 	int error;
1521 
1522 	error = xs_single(XST_NIL, XS_TRANSACTION_START, "", NULL,
1523 	    (void **)&id_str);
1524 	if (error == 0) {
1525 		t->id = strtoul(id_str, NULL, 0);
1526 		free(id_str, M_XENSTORE);
1527 	}
1528 	return (error);
1529 }
1530 
1531 int
1532 xs_transaction_end(struct xs_transaction t, int abort)
1533 {
1534 	char abortstr[2];
1535 
1536 	if (abort)
1537 		strcpy(abortstr, "F");
1538 	else
1539 		strcpy(abortstr, "T");
1540 
1541 	return (xs_single(t, XS_TRANSACTION_END, abortstr, NULL, NULL));
1542 }
1543 
1544 int
1545 xs_scanf(struct xs_transaction t, const char *dir, const char *node,
1546      int *scancountp, const char *fmt, ...)
1547 {
1548 	va_list ap;
1549 	int error, ns;
1550 	char *val;
1551 
1552 	error = xs_read(t, dir, node, NULL, (void **) &val);
1553 	if (error)
1554 		return (error);
1555 
1556 	va_start(ap, fmt);
1557 	ns = vsscanf(val, fmt, ap);
1558 	va_end(ap);
1559 	free(val, M_XENSTORE);
1560 	/* Distinctive errno. */
1561 	if (ns == 0)
1562 		return (ERANGE);
1563 	if (scancountp)
1564 		*scancountp = ns;
1565 	return (0);
1566 }
1567 
1568 int
1569 xs_vprintf(struct xs_transaction t,
1570     const char *dir, const char *node, const char *fmt, va_list ap)
1571 {
1572 	struct sbuf *sb;
1573 	int error;
1574 
1575 	sb = sbuf_new_auto();
1576 	sbuf_vprintf(sb, fmt, ap);
1577 	sbuf_finish(sb);
1578 	error = xs_write(t, dir, node, sbuf_data(sb));
1579 	sbuf_delete(sb);
1580 
1581 	return (error);
1582 }
1583 
1584 int
1585 xs_printf(struct xs_transaction t, const char *dir, const char *node,
1586      const char *fmt, ...)
1587 {
1588 	va_list ap;
1589 	int error;
1590 
1591 	va_start(ap, fmt);
1592 	error = xs_vprintf(t, dir, node, fmt, ap);
1593 	va_end(ap);
1594 
1595 	return (error);
1596 }
1597 
1598 int
1599 xs_gather(struct xs_transaction t, const char *dir, ...)
1600 {
1601 	va_list ap;
1602 	const char *name;
1603 	int error;
1604 
1605 	va_start(ap, dir);
1606 	error = 0;
1607 	while (error == 0 && (name = va_arg(ap, char *)) != NULL) {
1608 		const char *fmt = va_arg(ap, char *);
1609 		void *result = va_arg(ap, void *);
1610 		char *p;
1611 
1612 		error = xs_read(t, dir, name, NULL, (void **) &p);
1613 		if (error)
1614 			break;
1615 
1616 		if (fmt) {
1617 			if (sscanf(p, fmt, result) == 0)
1618 				error = EINVAL;
1619 			free(p, M_XENSTORE);
1620 		} else
1621 			*(char **)result = p;
1622 	}
1623 	va_end(ap);
1624 
1625 	return (error);
1626 }
1627 
1628 int
1629 xs_register_watch(struct xs_watch *watch)
1630 {
1631 	/* Pointer in ascii is the token. */
1632 	char token[sizeof(watch) * 2 + 1];
1633 	int error;
1634 
1635 	sprintf(token, "%lX", (long)watch);
1636 
1637 	sx_slock(&xs.suspend_mutex);
1638 
1639 	mtx_lock(&xs.registered_watches_lock);
1640 	KASSERT(find_watch(token) == NULL, ("watch already registered"));
1641 	LIST_INSERT_HEAD(&xs.registered_watches, watch, list);
1642 	mtx_unlock(&xs.registered_watches_lock);
1643 
1644 	error = xs_watch(watch->node, token);
1645 
1646 	/* Ignore errors due to multiple registration. */
1647 	if (error == EEXIST)
1648 		error = 0;
1649 
1650 	if (error != 0) {
1651 		mtx_lock(&xs.registered_watches_lock);
1652 		LIST_REMOVE(watch, list);
1653 		mtx_unlock(&xs.registered_watches_lock);
1654 	}
1655 
1656 	sx_sunlock(&xs.suspend_mutex);
1657 
1658 	return (error);
1659 }
1660 
1661 void
1662 xs_unregister_watch(struct xs_watch *watch)
1663 {
1664 	struct xs_stored_msg *msg, *tmp;
1665 	char token[sizeof(watch) * 2 + 1];
1666 	int error;
1667 
1668 	sprintf(token, "%lX", (long)watch);
1669 
1670 	sx_slock(&xs.suspend_mutex);
1671 
1672 	mtx_lock(&xs.registered_watches_lock);
1673 	if (find_watch(token) == NULL) {
1674 		mtx_unlock(&xs.registered_watches_lock);
1675 		sx_sunlock(&xs.suspend_mutex);
1676 		return;
1677 	}
1678 	LIST_REMOVE(watch, list);
1679 	mtx_unlock(&xs.registered_watches_lock);
1680 
1681 	error = xs_unwatch(watch->node, token);
1682 	if (error)
1683 		log(LOG_WARNING, "XENSTORE Failed to release watch %s: %i\n",
1684 		    watch->node, error);
1685 
1686 	sx_sunlock(&xs.suspend_mutex);
1687 
1688 	/* Cancel pending watch events. */
1689 	mtx_lock(&xs.watch_events_lock);
1690 	TAILQ_FOREACH_SAFE(msg, &xs.watch_events, list, tmp) {
1691 		if (msg->u.watch.handle != watch)
1692 			continue;
1693 		TAILQ_REMOVE(&xs.watch_events, msg, list);
1694 		free(msg->u.watch.vec, M_XENSTORE);
1695 		free(msg, M_XENSTORE);
1696 	}
1697 	mtx_unlock(&xs.watch_events_lock);
1698 
1699 	/* Flush any currently-executing callback, unless we are it. :-) */
1700 	if (curproc->p_pid != xs.xenwatch_pid) {
1701 		sx_xlock(&xs.xenwatch_mutex);
1702 		sx_xunlock(&xs.xenwatch_mutex);
1703 	}
1704 }
1705