xref: /qemu/hw/i386/kvm/xenstore_impl.c (revision 370ed600)
1 /*
2  * QEMU Xen emulation: The actual implementation of XenStore
3  *
4  * Copyright © 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
5  *
6  * Authors: David Woodhouse <dwmw2@infradead.org>, Paul Durrant <paul@xen.org>
7  *
8  * This work is licensed under the terms of the GNU GPL, version 2 or later.
9  * See the COPYING file in the top-level directory.
10  */
11 
12 #include "qemu/osdep.h"
13 #include "qom/object.h"
14 
15 #include "hw/xen/xen.h"
16 
17 #include "xen_xenstore.h"
18 #include "xenstore_impl.h"
19 
20 #include "hw/xen/interface/io/xs_wire.h"
21 
22 #define XS_MAX_WATCHES          128
23 #define XS_MAX_DOMAIN_NODES     1000
24 #define XS_MAX_NODE_SIZE        2048
25 #define XS_MAX_TRANSACTIONS     10
26 #define XS_MAX_PERMS_PER_NODE   5
27 
28 #define XS_VALID_CHARS "abcdefghijklmnopqrstuvwxyz" \
29                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
30                        "0123456789-/_"
31 
32 typedef struct XsNode {
33     uint32_t ref;
34     GByteArray *content;
35     GList *perms;
36     GHashTable *children;
37     uint64_t gencnt;
38     bool deleted_in_tx;
39     bool modified_in_tx;
40     unsigned int serialized_tx;
41 #ifdef XS_NODE_UNIT_TEST
42     gchar *name; /* debug only */
43 #endif
44 } XsNode;
45 
46 typedef struct XsWatch {
47     struct XsWatch *next;
48     xs_impl_watch_fn *cb;
49     void *cb_opaque;
50     char *token;
51     unsigned int dom_id;
52     int rel_prefix;
53 } XsWatch;
54 
55 typedef struct XsTransaction {
56     XsNode *root;
57     unsigned int nr_nodes;
58     unsigned int base_tx;
59     unsigned int tx_id;
60     unsigned int dom_id;
61 } XsTransaction;
62 
63 struct XenstoreImplState {
64     XsNode *root;
65     unsigned int nr_nodes;
66     GHashTable *watches;
67     unsigned int nr_domu_watches;
68     GHashTable *transactions;
69     unsigned int nr_domu_transactions;
70     unsigned int root_tx;
71     unsigned int last_tx;
72     bool serialized;
73 };
74 
75 
76 static void nobble_tx(gpointer key, gpointer value, gpointer user_data)
77 {
78     unsigned int *new_tx_id = user_data;
79     XsTransaction *tx = value;
80 
81     if (tx->base_tx == *new_tx_id) {
82         /* Transactions based on XBT_NULL will always fail */
83         tx->base_tx = XBT_NULL;
84     }
85 }
86 
87 static inline unsigned int next_tx(struct XenstoreImplState *s)
88 {
89     unsigned int tx_id;
90 
91     /* Find the next TX id which isn't either XBT_NULL or in use. */
92     do {
93         tx_id = ++s->last_tx;
94     } while (tx_id == XBT_NULL || tx_id == s->root_tx ||
95              g_hash_table_lookup(s->transactions, GINT_TO_POINTER(tx_id)));
96 
97     /*
98      * It is vanishingly unlikely, but ensure that no outstanding transaction
99      * is based on the (previous incarnation of the) newly-allocated TX id.
100      */
101     g_hash_table_foreach(s->transactions, nobble_tx, &tx_id);
102 
103     return tx_id;
104 }
105 
106 static inline XsNode *xs_node_new(void)
107 {
108     XsNode *n = g_new0(XsNode, 1);
109     n->ref = 1;
110 
111 #ifdef XS_NODE_UNIT_TEST
112     nr_xs_nodes++;
113     xs_node_list = g_list_prepend(xs_node_list, n);
114 #endif
115     return n;
116 }
117 
118 static inline XsNode *xs_node_ref(XsNode *n)
119 {
120     /* With just 10 transactions, it can never get anywhere near this. */
121     g_assert(n->ref < INT_MAX);
122 
123     g_assert(n->ref);
124     n->ref++;
125     return n;
126 }
127 
128 static inline void xs_node_unref(XsNode *n)
129 {
130     if (!n) {
131         return;
132     }
133     g_assert(n->ref);
134     if (--n->ref) {
135         return;
136     }
137 
138     if (n->content) {
139         g_byte_array_unref(n->content);
140     }
141     if (n->perms) {
142         g_list_free_full(n->perms, g_free);
143     }
144     if (n->children) {
145         g_hash_table_unref(n->children);
146     }
147 #ifdef XS_NODE_UNIT_TEST
148     g_free(n->name);
149     nr_xs_nodes--;
150     xs_node_list = g_list_remove(xs_node_list, n);
151 #endif
152     g_free(n);
153 }
154 
155 char *xs_perm_as_string(unsigned int perm, unsigned int domid)
156 {
157     char letter;
158 
159     switch (perm) {
160     case XS_PERM_READ | XS_PERM_WRITE:
161         letter = 'b';
162         break;
163     case XS_PERM_READ:
164         letter = 'r';
165         break;
166     case XS_PERM_WRITE:
167         letter = 'w';
168         break;
169     case XS_PERM_NONE:
170     default:
171         letter = 'n';
172         break;
173     }
174 
175     return g_strdup_printf("%c%u", letter, domid);
176 }
177 
178 static gpointer do_perm_copy(gconstpointer src, gpointer user_data)
179 {
180     return g_strdup(src);
181 }
182 
183 static XsNode *xs_node_create(const char *name, GList *perms)
184 {
185     XsNode *n = xs_node_new();
186 
187 #ifdef XS_NODE_UNIT_TEST
188     if (name) {
189         n->name = g_strdup(name);
190     }
191 #endif
192 
193     n->perms = g_list_copy_deep(perms, do_perm_copy, NULL);
194 
195     return n;
196 }
197 
198 /* For copying from one hash table to another using g_hash_table_foreach() */
199 static void do_child_insert(gpointer key, gpointer value, gpointer user_data)
200 {
201     g_hash_table_insert(user_data, g_strdup(key), xs_node_ref(value));
202 }
203 
204 static XsNode *xs_node_copy(XsNode *old)
205 {
206     XsNode *n = xs_node_new();
207 
208     n->gencnt = old->gencnt;
209 
210 #ifdef XS_NODE_UNIT_TEST
211     if (n->name) {
212         n->name = g_strdup(old->name);
213     }
214 #endif
215 
216     assert(old);
217     if (old->children) {
218         n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
219                                             (GDestroyNotify)xs_node_unref);
220         g_hash_table_foreach(old->children, do_child_insert, n->children);
221     }
222     if (old->perms) {
223         n->perms = g_list_copy_deep(old->perms, do_perm_copy, NULL);
224     }
225     if (old->content) {
226         n->content = g_byte_array_ref(old->content);
227     }
228     return n;
229 }
230 
231 /* Returns true if it made a change to the hash table */
232 static bool xs_node_add_child(XsNode *n, const char *path_elem, XsNode *child)
233 {
234     assert(!strchr(path_elem, '/'));
235 
236     if (!child) {
237         assert(n->children);
238         return g_hash_table_remove(n->children, path_elem);
239     }
240 
241 #ifdef XS_NODE_UNIT_TEST
242     g_free(child->name);
243     child->name = g_strdup(path_elem);
244 #endif
245     if (!n->children) {
246         n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
247                                             (GDestroyNotify)xs_node_unref);
248     }
249 
250     /*
251      * The documentation for g_hash_table_insert() says that it "returns a
252      * boolean value to indicate whether the newly added value was already
253      * in the hash table or not."
254      *
255      * It could perhaps be clearer that returning TRUE means it wasn't,
256      */
257     return g_hash_table_insert(n->children, g_strdup(path_elem), child);
258 }
259 
260 struct walk_op {
261     struct XenstoreImplState *s;
262     char path[XENSTORE_ABS_PATH_MAX + 2]; /* Two NUL terminators */
263     int (*op_fn)(XsNode **n, struct walk_op *op);
264     void *op_opaque;
265     void *op_opaque2;
266 
267     GList *watches;
268     unsigned int dom_id;
269     unsigned int tx_id;
270 
271     /* The number of nodes which will exist in the tree if this op succeeds. */
272     unsigned int new_nr_nodes;
273 
274     /*
275      * This is maintained on the way *down* the walk to indicate
276      * whether nodes can be modified in place or whether COW is
277      * required. It starts off being true, as we're always going to
278      * replace the root node. If we walk into a shared subtree it
279      * becomes false. If we start *creating* new nodes for a write,
280      * it becomes true again.
281      *
282      * Do not use it on the way back up.
283      */
284     bool inplace;
285     bool mutating;
286     bool create_dirs;
287     bool in_transaction;
288 
289     /* Tracking during recursion so we know which is first. */
290     bool deleted_in_tx;
291 };
292 
293 static void fire_watches(struct walk_op *op, bool parents)
294 {
295     GList *l = NULL;
296     XsWatch *w;
297 
298     if (!op->mutating || op->in_transaction) {
299         return;
300     }
301 
302     if (parents) {
303         l = op->watches;
304     }
305 
306     w = g_hash_table_lookup(op->s->watches, op->path);
307     while (w || l) {
308         if (!w) {
309             /* Fire the parent nodes from 'op' if asked to */
310             w = l->data;
311             l = l->next;
312             continue;
313         }
314 
315         assert(strlen(op->path) > w->rel_prefix);
316         w->cb(w->cb_opaque, op->path + w->rel_prefix, w->token);
317 
318         w = w->next;
319     }
320 }
321 
322 static int xs_node_add_content(XsNode **n, struct walk_op *op)
323 {
324     GByteArray *data = op->op_opaque;
325 
326     if (op->dom_id) {
327         /*
328          * The real XenStored includes permissions and names of child nodes
329          * in the calculated datasize but life's too short. For a single
330          * tenant internal XenStore, we don't have to be quite as pedantic.
331          */
332         if (data->len > XS_MAX_NODE_SIZE) {
333             return E2BIG;
334         }
335     }
336     /* We *are* the node to be written. Either this or a copy. */
337     if (!op->inplace) {
338         XsNode *old = *n;
339         *n = xs_node_copy(old);
340         xs_node_unref(old);
341     }
342 
343     if ((*n)->content) {
344         g_byte_array_unref((*n)->content);
345     }
346     (*n)->content = g_byte_array_ref(data);
347     if (op->tx_id != XBT_NULL) {
348         (*n)->modified_in_tx = true;
349     }
350     return 0;
351 }
352 
353 static int xs_node_get_content(XsNode **n, struct walk_op *op)
354 {
355     GByteArray *data = op->op_opaque;
356     GByteArray *node_data;
357 
358     assert(op->inplace);
359     assert(*n);
360 
361     node_data = (*n)->content;
362     if (node_data) {
363         g_byte_array_append(data, node_data->data, node_data->len);
364     }
365 
366     return 0;
367 }
368 
369 static int node_rm_recurse(gpointer key, gpointer value, gpointer user_data)
370 {
371     struct walk_op *op = user_data;
372     int path_len = strlen(op->path);
373     int key_len = strlen(key);
374     XsNode *n = value;
375     bool this_inplace = op->inplace;
376 
377     if (n->ref != 1) {
378         op->inplace = 0;
379     }
380 
381     assert(key_len + path_len + 2 <= sizeof(op->path));
382     op->path[path_len] = '/';
383     memcpy(op->path + path_len + 1, key, key_len + 1);
384 
385     if (n->children) {
386         g_hash_table_foreach_remove(n->children, node_rm_recurse, op);
387     }
388     op->new_nr_nodes--;
389 
390     /*
391      * Fire watches on *this* node but not the parents because they are
392      * going to be deleted too, so the watch will fire for them anyway.
393      */
394     fire_watches(op, false);
395     op->path[path_len] = '\0';
396 
397     /*
398      * Actually deleting the child here is just an optimisation; if we
399      * don't then the final unref on the topmost victim will just have
400      * to cascade down again repeating all the g_hash_table_foreach()
401      * calls.
402      */
403     return this_inplace;
404 }
405 
406 static XsNode *xs_node_copy_deleted(XsNode *old, struct walk_op *op);
407 static void copy_deleted_recurse(gpointer key, gpointer value,
408                                  gpointer user_data)
409 {
410     struct walk_op *op = user_data;
411     GHashTable *siblings = op->op_opaque2;
412     XsNode *n = xs_node_copy_deleted(value, op);
413 
414     /*
415      * Reinsert the deleted_in_tx copy of the node into the parent's
416      * 'children' hash table. Having stashed it from op->op_opaque2
417      * before the recursive call to xs_node_copy_deleted() scribbled
418      * over it.
419      */
420     g_hash_table_insert(siblings, g_strdup(key), n);
421 }
422 
423 static XsNode *xs_node_copy_deleted(XsNode *old, struct walk_op *op)
424 {
425     XsNode *n = xs_node_new();
426 
427     n->gencnt = old->gencnt;
428 
429 #ifdef XS_NODE_UNIT_TEST
430     if (old->name) {
431         n->name = g_strdup(old->name);
432     }
433 #endif
434 
435     if (old->children) {
436         n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
437                                             (GDestroyNotify)xs_node_unref);
438         op->op_opaque2 = n->children;
439         g_hash_table_foreach(old->children, copy_deleted_recurse, op);
440     }
441     if (old->perms) {
442         n->perms = g_list_copy_deep(old->perms, do_perm_copy, NULL);
443     }
444     n->deleted_in_tx = true;
445     /* If it gets resurrected we only fire a watch if it lost its content */
446     if (old->content) {
447         n->modified_in_tx = true;
448     }
449     op->new_nr_nodes--;
450     return n;
451 }
452 
453 static int xs_node_rm(XsNode **n, struct walk_op *op)
454 {
455     bool this_inplace = op->inplace;
456 
457     if (op->tx_id != XBT_NULL) {
458         /* It's not trivial to do inplace handling for this one */
459         XsNode *old = *n;
460         *n = xs_node_copy_deleted(old, op);
461         xs_node_unref(old);
462         return 0;
463     }
464 
465     /* Fire watches for, and count, nodes in the subtree which get deleted */
466     if ((*n)->children) {
467         g_hash_table_foreach_remove((*n)->children, node_rm_recurse, op);
468     }
469     op->new_nr_nodes--;
470 
471     if (this_inplace) {
472         xs_node_unref(*n);
473     }
474     *n = NULL;
475     return 0;
476 }
477 
478 static int xs_node_get_perms(XsNode **n, struct walk_op *op)
479 {
480     GList **perms = op->op_opaque;
481 
482     assert(op->inplace);
483     assert(*n);
484 
485     *perms = g_list_copy_deep((*n)->perms, do_perm_copy, NULL);
486     return 0;
487 }
488 
489 static void parse_perm(const char *perm, char *letter, unsigned int *dom_id)
490 {
491     unsigned int n = sscanf(perm, "%c%u", letter, dom_id);
492 
493     assert(n == 2);
494 }
495 
496 static bool can_access(unsigned int dom_id, GList *perms, const char *letters)
497 {
498     unsigned int i, n;
499     char perm_letter;
500     unsigned int perm_dom_id;
501     bool access;
502 
503     if (dom_id == 0) {
504         return true;
505     }
506 
507     n = g_list_length(perms);
508     assert(n >= 1);
509 
510     /*
511      * The dom_id of the first perm is the owner, and the owner always has
512      * read-write access.
513      */
514     parse_perm(g_list_nth_data(perms, 0), &perm_letter, &perm_dom_id);
515     if (dom_id == perm_dom_id) {
516         return true;
517     }
518 
519     /*
520      * The letter of the first perm specified the default access for all other
521      * domains.
522      */
523     access = !!strchr(letters, perm_letter);
524     for (i = 1; i < n; i++) {
525         parse_perm(g_list_nth_data(perms, i), &perm_letter, &perm_dom_id);
526         if (dom_id != perm_dom_id) {
527             continue;
528         }
529         access = !!strchr(letters, perm_letter);
530     }
531 
532     return access;
533 }
534 
535 static int xs_node_set_perms(XsNode **n, struct walk_op *op)
536 {
537     GList *perms = op->op_opaque;
538 
539     if (op->dom_id) {
540         unsigned int perm_dom_id;
541         char perm_letter;
542 
543         /* A guest may not change permissions on nodes it does not own */
544         if (!can_access(op->dom_id, (*n)->perms, "")) {
545             return EPERM;
546         }
547 
548         /* A guest may not change the owner of a node it owns. */
549         parse_perm(perms->data, &perm_letter, &perm_dom_id);
550         if (perm_dom_id != op->dom_id) {
551             return EPERM;
552         }
553 
554         if (g_list_length(perms) > XS_MAX_PERMS_PER_NODE) {
555             return ENOSPC;
556         }
557     }
558 
559     /* We *are* the node to be written. Either this or a copy. */
560     if (!op->inplace) {
561         XsNode *old = *n;
562         *n = xs_node_copy(old);
563         xs_node_unref(old);
564     }
565 
566     if ((*n)->perms) {
567         g_list_free_full((*n)->perms, g_free);
568     }
569     (*n)->perms = g_list_copy_deep(perms, do_perm_copy, NULL);
570     if (op->tx_id != XBT_NULL) {
571         (*n)->modified_in_tx = true;
572     }
573     return 0;
574 }
575 
576 /*
577  * Passed a full reference in *n which it may free if it needs to COW.
578  *
579  * When changing the tree, the op->inplace flag indicates whether this
580  * node may be modified in place (i.e. it and all its parents had a
581  * refcount of one). If walking down the tree we find a node whose
582  * refcount is higher, we must clear op->inplace and COW from there
583  * down. Unless we are creating new nodes as scaffolding for a write
584  * (which works like 'mkdir -p' does). In which case those newly
585  * created nodes can (and must) be modified in place again.
586  */
587 static int xs_node_walk(XsNode **n, struct walk_op *op)
588 {
589     char *child_name = NULL;
590     size_t namelen;
591     XsNode *old = *n, *child = NULL;
592     bool stole_child = false;
593     bool this_inplace;
594     XsWatch *watch;
595     int err;
596 
597     namelen = strlen(op->path);
598     watch = g_hash_table_lookup(op->s->watches, op->path);
599 
600     /* Is there a child, or do we hit the double-NUL termination? */
601     if (op->path[namelen + 1]) {
602         char *slash;
603         child_name = op->path + namelen + 1;
604         slash = strchr(child_name, '/');
605         if (slash) {
606             *slash = '\0';
607         }
608         op->path[namelen] = '/';
609     }
610 
611     /* If we walk into a subtree which is shared, we must COW */
612     if (op->mutating && old->ref != 1) {
613         op->inplace = false;
614     }
615 
616     if (!child_name) {
617         const char *letters = op->mutating ? "wb" : "rb";
618 
619         if (!can_access(op->dom_id, old->perms, letters)) {
620             err = EACCES;
621             goto out;
622         }
623 
624         /* This is the actual node on which the operation shall be performed */
625         err = op->op_fn(n, op);
626         if (!err) {
627             fire_watches(op, true);
628         }
629         goto out;
630     }
631 
632     /* op->inplace will be further modified during the recursion */
633     this_inplace = op->inplace;
634 
635     if (old && old->children) {
636         child = g_hash_table_lookup(old->children, child_name);
637         /* This is a *weak* reference to 'child', owned by the hash table */
638     }
639 
640     if (child) {
641         if (child->deleted_in_tx) {
642             assert(child->ref == 1);
643             /* Cannot actually set child->deleted_in_tx = false until later */
644         }
645         xs_node_ref(child);
646         /*
647          * Now we own it too. But if we can modify inplace, that's going to
648          * foil the check and force it to COW. We want to be the *only* owner
649          * so that it can be modified in place, so remove it from the hash
650          * table in that case. We'll add it (or its replacement) back later.
651          */
652         if (op->mutating && this_inplace) {
653             g_hash_table_remove(old->children, child_name);
654             stole_child = true;
655         }
656     } else if (op->create_dirs) {
657         assert(op->mutating);
658 
659         if (!can_access(op->dom_id, old->perms, "wb")) {
660             err = EACCES;
661             goto out;
662         }
663 
664         if (op->dom_id && op->new_nr_nodes >= XS_MAX_DOMAIN_NODES) {
665             err = ENOSPC;
666             goto out;
667         }
668 
669         child = xs_node_create(child_name, old->perms);
670         op->new_nr_nodes++;
671 
672         /*
673          * If we're creating a new child, we can clearly modify it (and its
674          * children) in place from here on down.
675          */
676         op->inplace = true;
677     } else {
678         err = ENOENT;
679         goto out;
680     }
681 
682     /*
683      * If there's a watch on this node, add it to the list to be fired
684      * (with the correct full pathname for the modified node) at the end.
685      */
686     if (watch) {
687         op->watches = g_list_append(op->watches, watch);
688     }
689 
690     /*
691      * Except for the temporary child-stealing as noted, our node has not
692      * changed yet. We don't yet know the overall operation will complete.
693      */
694     err = xs_node_walk(&child, op);
695 
696     if (watch) {
697         op->watches = g_list_remove(op->watches, watch);
698     }
699 
700     if (err || !op->mutating) {
701         if (stole_child) {
702             /* Put it back as it was. */
703             g_hash_table_replace(old->children, g_strdup(child_name), child);
704         } else {
705             xs_node_unref(child);
706         }
707         goto out;
708     }
709 
710     /*
711      * Now we know the operation has completed successfully and we're on
712      * the way back up. Make the change, substituting 'child' in the
713      * node at our level.
714      */
715     if (!this_inplace) {
716         *n = xs_node_copy(old);
717         xs_node_unref(old);
718     }
719 
720     /*
721      * If we resurrected a deleted_in_tx node, we can mark it as no longer
722      * deleted now that we know the overall operation has succeeded.
723      */
724     if (op->create_dirs && child && child->deleted_in_tx) {
725         op->new_nr_nodes++;
726         child->deleted_in_tx = false;
727     }
728 
729     /*
730      * The child may be NULL here, for a remove operation. Either way,
731      * xs_node_add_child() will do the right thing and return a value
732      * indicating whether it changed the parent's hash table or not.
733      *
734      * We bump the parent gencnt if it adds a child that we *didn't*
735      * steal from it in the first place, or if child==NULL and was
736      * thus removed (whether we stole it earlier and didn't put it
737      * back, or xs_node_add_child() actually removed it now).
738      */
739     if ((xs_node_add_child(*n, child_name, child) && !stole_child) || !child) {
740         (*n)->gencnt++;
741     }
742 
743  out:
744     op->path[namelen] = '\0';
745     if (!namelen) {
746         assert(!op->watches);
747         /*
748          * On completing the recursion back up the path walk and reaching the
749          * top, assign the new node count if the operation was successful. If
750          * the main tree was changed, bump its tx ID so that outstanding
751          * transactions correctly fail. But don't bump it every time; only
752          * if it makes a difference.
753          */
754         if (!err && op->mutating) {
755             if (!op->in_transaction) {
756                 if (op->s->root_tx != op->s->last_tx) {
757                     op->s->root_tx = next_tx(op->s);
758                 }
759                 op->s->nr_nodes = op->new_nr_nodes;
760             } else {
761                 XsTransaction *tx = g_hash_table_lookup(op->s->transactions,
762                                                         GINT_TO_POINTER(op->tx_id));
763                 assert(tx);
764                 tx->nr_nodes = op->new_nr_nodes;
765             }
766         }
767     }
768     return err;
769 }
770 
771 static void append_directory_item(gpointer key, gpointer value,
772                                   gpointer user_data)
773 {
774     GList **items = user_data;
775 
776     *items = g_list_insert_sorted(*items, g_strdup(key), (GCompareFunc)strcmp);
777 }
778 
779 /* Populates items with char * names which caller must free. */
780 static int xs_node_directory(XsNode **n, struct walk_op *op)
781 {
782     GList **items = op->op_opaque;
783 
784     assert(op->inplace);
785     assert(*n);
786 
787     if ((*n)->children) {
788         g_hash_table_foreach((*n)->children, append_directory_item, items);
789     }
790 
791     if (op->op_opaque2) {
792         *(uint64_t *)op->op_opaque2 = (*n)->gencnt;
793     }
794 
795     return 0;
796 }
797 
798 static int validate_path(char *outpath, const char *userpath,
799                          unsigned int dom_id)
800 {
801     size_t i, pathlen = strlen(userpath);
802 
803     if (!pathlen || userpath[pathlen] == '/' || strstr(userpath, "//")) {
804         return EINVAL;
805     }
806     for (i = 0; i < pathlen; i++) {
807         if (!strchr(XS_VALID_CHARS, userpath[i])) {
808             return EINVAL;
809         }
810     }
811     if (userpath[0] == '/') {
812         if (pathlen > XENSTORE_ABS_PATH_MAX) {
813             return E2BIG;
814         }
815         memcpy(outpath, userpath, pathlen + 1);
816     } else {
817         if (pathlen > XENSTORE_REL_PATH_MAX) {
818             return E2BIG;
819         }
820         snprintf(outpath, XENSTORE_ABS_PATH_MAX, "/local/domain/%u/%s", dom_id,
821                  userpath);
822     }
823     return 0;
824 }
825 
826 
827 static int init_walk_op(XenstoreImplState *s, struct walk_op *op,
828                         xs_transaction_t tx_id, unsigned int dom_id,
829                         const char *path, XsNode ***rootp)
830 {
831     int ret = validate_path(op->path, path, dom_id);
832     if (ret) {
833         return ret;
834     }
835 
836     /*
837      * We use *two* NUL terminators at the end of the path, as during the walk
838      * we will temporarily turn each '/' into a NUL to allow us to use that
839      * path element for the lookup.
840      */
841     op->path[strlen(op->path) + 1] = '\0';
842     op->watches = NULL;
843     op->path[0] = '\0';
844     op->inplace = true;
845     op->mutating = false;
846     op->create_dirs = false;
847     op->in_transaction = false;
848     op->dom_id = dom_id;
849     op->tx_id = tx_id;
850     op->s = s;
851 
852     if (tx_id == XBT_NULL) {
853         *rootp = &s->root;
854         op->new_nr_nodes = s->nr_nodes;
855     } else {
856         XsTransaction *tx = g_hash_table_lookup(s->transactions,
857                                                 GINT_TO_POINTER(tx_id));
858         if (!tx) {
859             return ENOENT;
860         }
861         *rootp = &tx->root;
862         op->new_nr_nodes = tx->nr_nodes;
863         op->in_transaction = true;
864     }
865 
866     return 0;
867 }
868 
869 int xs_impl_read(XenstoreImplState *s, unsigned int dom_id,
870                  xs_transaction_t tx_id, const char *path, GByteArray *data)
871 {
872     /*
873      * The data GByteArray shall exist, and will be freed by caller.
874      * Just g_byte_array_append() to it.
875      */
876     struct walk_op op;
877     XsNode **n;
878     int ret;
879 
880     ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
881     if (ret) {
882         return ret;
883     }
884     op.op_fn = xs_node_get_content;
885     op.op_opaque = data;
886     return xs_node_walk(n, &op);
887 }
888 
889 int xs_impl_write(XenstoreImplState *s, unsigned int dom_id,
890                   xs_transaction_t tx_id, const char *path, GByteArray *data)
891 {
892     /*
893      * The data GByteArray shall exist, will be freed by caller. You are
894      * free to use g_byte_array_steal() and keep the data. Or just ref it.
895      */
896     struct walk_op op;
897     XsNode **n;
898     int ret;
899 
900     ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
901     if (ret) {
902         return ret;
903     }
904     op.op_fn = xs_node_add_content;
905     op.op_opaque = data;
906     op.mutating = true;
907     op.create_dirs = true;
908     return xs_node_walk(n, &op);
909 }
910 
911 int xs_impl_directory(XenstoreImplState *s, unsigned int dom_id,
912                       xs_transaction_t tx_id, const char *path,
913                       uint64_t *gencnt, GList **items)
914 {
915     /*
916      * The items are (char *) to be freed by caller. Although it's consumed
917      * immediately so if you want to change it to (const char *) and keep
918      * them, go ahead and change the caller.
919      */
920     struct walk_op op;
921     XsNode **n;
922     int ret;
923 
924     ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
925     if (ret) {
926         return ret;
927     }
928     op.op_fn = xs_node_directory;
929     op.op_opaque = items;
930     op.op_opaque2 = gencnt;
931     return xs_node_walk(n, &op);
932 }
933 
934 int xs_impl_transaction_start(XenstoreImplState *s, unsigned int dom_id,
935                               xs_transaction_t *tx_id)
936 {
937     XsTransaction *tx;
938 
939     if (*tx_id != XBT_NULL) {
940         return EINVAL;
941     }
942 
943     if (dom_id && s->nr_domu_transactions >= XS_MAX_TRANSACTIONS) {
944         return ENOSPC;
945     }
946 
947     tx = g_new0(XsTransaction, 1);
948 
949     tx->nr_nodes = s->nr_nodes;
950     tx->tx_id = next_tx(s);
951     tx->base_tx = s->root_tx;
952     tx->root = xs_node_ref(s->root);
953     tx->dom_id = dom_id;
954 
955     g_hash_table_insert(s->transactions, GINT_TO_POINTER(tx->tx_id), tx);
956     if (dom_id) {
957         s->nr_domu_transactions++;
958     }
959     *tx_id = tx->tx_id;
960     return 0;
961 }
962 
963 static gboolean tx_commit_walk(gpointer key, gpointer value,
964                                gpointer user_data)
965 {
966     struct walk_op *op = user_data;
967     int path_len = strlen(op->path);
968     int key_len = strlen(key);
969     bool fire_parents = true;
970     XsWatch *watch;
971     XsNode *n = value;
972 
973     if (n->ref != 1) {
974         return false;
975     }
976 
977     if (n->deleted_in_tx) {
978         /*
979          * We fire watches on our parents if we are the *first* node
980          * to be deleted (the topmost one). This matches the behaviour
981          * when deleting in the live tree.
982          */
983         fire_parents = !op->deleted_in_tx;
984 
985         /* Only used on the way down so no need to clear it later */
986         op->deleted_in_tx = true;
987     }
988 
989     assert(key_len + path_len + 2 <= sizeof(op->path));
990     op->path[path_len] = '/';
991     memcpy(op->path + path_len + 1, key, key_len + 1);
992 
993     watch = g_hash_table_lookup(op->s->watches, op->path);
994     if (watch) {
995         op->watches = g_list_append(op->watches, watch);
996     }
997 
998     if (n->children) {
999         g_hash_table_foreach_remove(n->children, tx_commit_walk, op);
1000     }
1001 
1002     if (watch) {
1003         op->watches = g_list_remove(op->watches, watch);
1004     }
1005 
1006     /*
1007      * Don't fire watches if this node was only copied because a
1008      * descendent was changed. The modified_in_tx flag indicates the
1009      * ones which were really changed.
1010      */
1011     if (n->modified_in_tx || n->deleted_in_tx) {
1012         fire_watches(op, fire_parents);
1013         n->modified_in_tx = false;
1014     }
1015     op->path[path_len] = '\0';
1016 
1017     /* Deleted nodes really do get expunged when we commit */
1018     return n->deleted_in_tx;
1019 }
1020 
1021 static int transaction_commit(XenstoreImplState *s, XsTransaction *tx)
1022 {
1023     struct walk_op op;
1024     XsNode **n;
1025 
1026     if (s->root_tx != tx->base_tx) {
1027         return EAGAIN;
1028     }
1029     xs_node_unref(s->root);
1030     s->root = tx->root;
1031     tx->root = NULL;
1032     s->root_tx = tx->tx_id;
1033     s->nr_nodes = tx->nr_nodes;
1034 
1035     init_walk_op(s, &op, XBT_NULL, tx->dom_id, "/", &n);
1036     op.deleted_in_tx = false;
1037     op.mutating = true;
1038 
1039     /*
1040      * Walk the new root and fire watches on any node which has a
1041      * refcount of one (which is therefore unique to this transaction).
1042      */
1043     if (s->root->children) {
1044         g_hash_table_foreach_remove(s->root->children, tx_commit_walk, &op);
1045     }
1046 
1047     return 0;
1048 }
1049 
1050 int xs_impl_transaction_end(XenstoreImplState *s, unsigned int dom_id,
1051                             xs_transaction_t tx_id, bool commit)
1052 {
1053     int ret = 0;
1054     XsTransaction *tx = g_hash_table_lookup(s->transactions,
1055                                             GINT_TO_POINTER(tx_id));
1056 
1057     if (!tx || tx->dom_id != dom_id) {
1058         return ENOENT;
1059     }
1060 
1061     if (commit) {
1062         ret = transaction_commit(s, tx);
1063     }
1064 
1065     g_hash_table_remove(s->transactions, GINT_TO_POINTER(tx_id));
1066     if (dom_id) {
1067         assert(s->nr_domu_transactions);
1068         s->nr_domu_transactions--;
1069     }
1070     return ret;
1071 }
1072 
1073 int xs_impl_rm(XenstoreImplState *s, unsigned int dom_id,
1074                xs_transaction_t tx_id, const char *path)
1075 {
1076     struct walk_op op;
1077     XsNode **n;
1078     int ret;
1079 
1080     ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
1081     if (ret) {
1082         return ret;
1083     }
1084     op.op_fn = xs_node_rm;
1085     op.mutating = true;
1086     return xs_node_walk(n, &op);
1087 }
1088 
1089 int xs_impl_get_perms(XenstoreImplState *s, unsigned int dom_id,
1090                       xs_transaction_t tx_id, const char *path, GList **perms)
1091 {
1092     struct walk_op op;
1093     XsNode **n;
1094     int ret;
1095 
1096     ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
1097     if (ret) {
1098         return ret;
1099     }
1100     op.op_fn = xs_node_get_perms;
1101     op.op_opaque = perms;
1102     return xs_node_walk(n, &op);
1103 }
1104 
1105 static void is_valid_perm(gpointer data, gpointer user_data)
1106 {
1107     char *perm = data;
1108     bool *valid = user_data;
1109     char letter;
1110     unsigned int dom_id;
1111 
1112     if (!*valid) {
1113         return;
1114     }
1115 
1116     if (sscanf(perm, "%c%u", &letter, &dom_id) != 2) {
1117         *valid = false;
1118         return;
1119     }
1120 
1121     switch (letter) {
1122     case 'n':
1123     case 'r':
1124     case 'w':
1125     case 'b':
1126         break;
1127 
1128     default:
1129         *valid = false;
1130         break;
1131     }
1132 }
1133 
1134 int xs_impl_set_perms(XenstoreImplState *s, unsigned int dom_id,
1135                       xs_transaction_t tx_id, const char *path, GList *perms)
1136 {
1137     struct walk_op op;
1138     XsNode **n;
1139     bool valid = true;
1140     int ret;
1141 
1142     if (!g_list_length(perms)) {
1143         return EINVAL;
1144     }
1145 
1146     g_list_foreach(perms, is_valid_perm, &valid);
1147     if (!valid) {
1148         return EINVAL;
1149     }
1150 
1151     ret = init_walk_op(s, &op, tx_id, dom_id, path, &n);
1152     if (ret) {
1153         return ret;
1154     }
1155     op.op_fn = xs_node_set_perms;
1156     op.op_opaque = perms;
1157     op.mutating = true;
1158     return xs_node_walk(n, &op);
1159 }
1160 
1161 static int do_xs_impl_watch(XenstoreImplState *s, unsigned int dom_id,
1162                             const char *path, const char *token,
1163                             xs_impl_watch_fn fn, void *opaque)
1164 
1165 {
1166     char abspath[XENSTORE_ABS_PATH_MAX + 1];
1167     XsWatch *w, *l;
1168     int ret;
1169 
1170     ret = validate_path(abspath, path, dom_id);
1171     if (ret) {
1172         return ret;
1173     }
1174 
1175     /* Check for duplicates */
1176     l = w = g_hash_table_lookup(s->watches, abspath);
1177     while (w) {
1178         if (!g_strcmp0(token, w->token) &&  opaque == w->cb_opaque &&
1179             fn == w->cb && dom_id == w->dom_id) {
1180             return EEXIST;
1181         }
1182         w = w->next;
1183     }
1184 
1185     if (dom_id && s->nr_domu_watches >= XS_MAX_WATCHES) {
1186         return E2BIG;
1187     }
1188 
1189     w = g_new0(XsWatch, 1);
1190     w->token = g_strdup(token);
1191     w->cb = fn;
1192     w->cb_opaque = opaque;
1193     w->dom_id = dom_id;
1194     w->rel_prefix = strlen(abspath) - strlen(path);
1195 
1196     /* l was looked up above when checking for duplicates */
1197     if (l) {
1198         w->next = l->next;
1199         l->next = w;
1200     } else {
1201         g_hash_table_insert(s->watches, g_strdup(abspath), w);
1202     }
1203     if (dom_id) {
1204         s->nr_domu_watches++;
1205     }
1206 
1207     return 0;
1208 }
1209 
1210 int xs_impl_watch(XenstoreImplState *s, unsigned int dom_id, const char *path,
1211                   const char *token, xs_impl_watch_fn fn, void *opaque)
1212 {
1213     int ret = do_xs_impl_watch(s, dom_id, path, token, fn, opaque);
1214 
1215     if (!ret) {
1216         /* A new watch should fire immediately */
1217         fn(opaque, path, token);
1218     }
1219 
1220     return ret;
1221 }
1222 
1223 static XsWatch *free_watch(XenstoreImplState *s, XsWatch *w)
1224 {
1225     XsWatch *next = w->next;
1226 
1227     if (w->dom_id) {
1228         assert(s->nr_domu_watches);
1229         s->nr_domu_watches--;
1230     }
1231 
1232     g_free(w->token);
1233     g_free(w);
1234 
1235     return next;
1236 }
1237 
1238 int xs_impl_unwatch(XenstoreImplState *s, unsigned int dom_id,
1239                     const char *path, const char *token,
1240                     xs_impl_watch_fn fn, void *opaque)
1241 {
1242     char abspath[XENSTORE_ABS_PATH_MAX + 1];
1243     XsWatch *w, **l;
1244     int ret;
1245 
1246     ret = validate_path(abspath, path, dom_id);
1247     if (ret) {
1248         return ret;
1249     }
1250 
1251     w = g_hash_table_lookup(s->watches, abspath);
1252     if (!w) {
1253         return ENOENT;
1254     }
1255 
1256     /*
1257      * The hash table contains the first element of a list of
1258      * watches. Removing the first element in the list is a
1259      * special case because we have to update the hash table to
1260      * point to the next (or remove it if there's nothing left).
1261      */
1262     if (!g_strcmp0(token, w->token) && fn == w->cb && opaque == w->cb_opaque &&
1263         dom_id == w->dom_id) {
1264         if (w->next) {
1265             /* Insert the previous 'next' into the hash table */
1266             g_hash_table_insert(s->watches, g_strdup(abspath), w->next);
1267         } else {
1268             /* Nothing left; remove from hash table */
1269             g_hash_table_remove(s->watches, abspath);
1270         }
1271         free_watch(s, w);
1272         return 0;
1273     }
1274 
1275     /*
1276      * We're all done messing with the hash table because the element
1277      * it points to has survived the cull. Now it's just a simple
1278      * linked list removal operation.
1279      */
1280     for (l = &w->next; *l; l = &w->next) {
1281         w = *l;
1282 
1283         if (!g_strcmp0(token, w->token) && fn == w->cb &&
1284             opaque != w->cb_opaque && dom_id == w->dom_id) {
1285             *l = free_watch(s, w);
1286             return 0;
1287         }
1288     }
1289 
1290     return ENOENT;
1291 }
1292 
1293 int xs_impl_reset_watches(XenstoreImplState *s, unsigned int dom_id)
1294 {
1295     char **watch_paths;
1296     guint nr_watch_paths;
1297     guint i;
1298 
1299     watch_paths = (char **)g_hash_table_get_keys_as_array(s->watches,
1300                                                           &nr_watch_paths);
1301 
1302     for (i = 0; i < nr_watch_paths; i++) {
1303         XsWatch *w1 = g_hash_table_lookup(s->watches, watch_paths[i]);
1304         XsWatch *w2, *w, **l;
1305 
1306         /*
1307          * w1 is the original list. The hash table has this pointer.
1308          * w2 is the head of our newly-filtered list.
1309          * w and l are temporary for processing. w is somewhat redundant
1310          * with *l but makes my eyes bleed less.
1311          */
1312 
1313         w = w2 = w1;
1314         l = &w;
1315         while (w) {
1316             if (w->dom_id == dom_id) {
1317                 /* If we're freeing the head of the list, bump w2 */
1318                 if (w2 == w) {
1319                     w2 = w->next;
1320                 }
1321                 *l = free_watch(s, w);
1322             } else {
1323                 l = &w->next;
1324             }
1325             w = *l;
1326         }
1327         /*
1328          * If the head of the list survived the cull, we don't need to
1329          * touch the hash table and we're done with this path. Else...
1330          */
1331         if (w1 != w2) {
1332             g_hash_table_steal(s->watches, watch_paths[i]);
1333 
1334             /*
1335              * It was already freed. (Don't worry, this whole thing is
1336              * single-threaded and nobody saw it in the meantime). And
1337              * having *stolen* it, we now own the watch_paths[i] string
1338              * so if we don't give it back to the hash table, we need
1339              * to free it.
1340              */
1341             if (w2) {
1342                 g_hash_table_insert(s->watches, watch_paths[i], w2);
1343             } else {
1344                 g_free(watch_paths[i]);
1345             }
1346         }
1347     }
1348     g_free(watch_paths);
1349     return 0;
1350 }
1351 
1352 static void xs_tx_free(void *_tx)
1353 {
1354     XsTransaction *tx = _tx;
1355     if (tx->root) {
1356         xs_node_unref(tx->root);
1357     }
1358     g_free(tx);
1359 }
1360 
1361 XenstoreImplState *xs_impl_create(unsigned int dom_id)
1362 {
1363     XenstoreImplState *s = g_new0(XenstoreImplState, 1);
1364     GList *perms;
1365 
1366     s->watches = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL);
1367     s->transactions = g_hash_table_new_full(g_direct_hash, g_direct_equal,
1368                                             NULL, xs_tx_free);
1369 
1370     perms = g_list_append(NULL, xs_perm_as_string(XS_PERM_NONE, 0));
1371     s->root = xs_node_create("/", perms);
1372     g_list_free_full(perms, g_free);
1373     s->nr_nodes = 1;
1374 
1375     s->root_tx = s->last_tx = 1;
1376     return s;
1377 }
1378 
1379 
1380 static void clear_serialized_tx(gpointer key, gpointer value, gpointer opaque)
1381 {
1382     XsNode *n = value;
1383 
1384     n->serialized_tx = XBT_NULL;
1385     if (n->children) {
1386         g_hash_table_foreach(n->children, clear_serialized_tx, NULL);
1387     }
1388 }
1389 
1390 static void clear_tx_serialized_tx(gpointer key, gpointer value,
1391                                    gpointer opaque)
1392 {
1393     XsTransaction *t = value;
1394 
1395     clear_serialized_tx(NULL, t->root, NULL);
1396 }
1397 
1398 static void write_be32(GByteArray *save, uint32_t val)
1399 {
1400     uint32_t be = htonl(val);
1401     g_byte_array_append(save, (void *)&be, sizeof(be));
1402 }
1403 
1404 
1405 struct save_state {
1406     GByteArray *bytes;
1407     unsigned int tx_id;
1408 };
1409 
1410 #define MODIFIED_IN_TX  (1U << 0)
1411 #define DELETED_IN_TX   (1U << 1)
1412 #define NODE_REF        (1U << 2)
1413 
1414 static void save_node(gpointer key, gpointer value, gpointer opaque)
1415 {
1416     struct save_state *ss = opaque;
1417     XsNode *n = value;
1418     char *name = key;
1419     uint8_t flag = 0;
1420 
1421     /* Child nodes (i.e. anything but the root) have a name */
1422     if (name) {
1423         g_byte_array_append(ss->bytes, key, strlen(key) + 1);
1424     }
1425 
1426     /*
1427      * If we already wrote this node, refer to the previous copy.
1428      * There's no rename/move in XenStore, so all we need to find
1429      * it is the tx_id of the transation in which it exists. Which
1430      * may be the root tx.
1431      */
1432     if (n->serialized_tx != XBT_NULL) {
1433         flag = NODE_REF;
1434         g_byte_array_append(ss->bytes, &flag, 1);
1435         write_be32(ss->bytes, n->serialized_tx);
1436     } else {
1437         GList *l;
1438         n->serialized_tx = ss->tx_id;
1439 
1440         if (n->modified_in_tx) {
1441             flag |= MODIFIED_IN_TX;
1442         }
1443         if (n->deleted_in_tx) {
1444             flag |= DELETED_IN_TX;
1445         }
1446         g_byte_array_append(ss->bytes, &flag, 1);
1447 
1448         if (n->content) {
1449             write_be32(ss->bytes, n->content->len);
1450             g_byte_array_append(ss->bytes, n->content->data, n->content->len);
1451         } else {
1452             write_be32(ss->bytes, 0);
1453         }
1454 
1455         for (l = n->perms; l; l = l->next) {
1456             g_byte_array_append(ss->bytes, l->data, strlen(l->data) + 1);
1457         }
1458         /* NUL termination after perms */
1459         g_byte_array_append(ss->bytes, (void *)"", 1);
1460 
1461         if (n->children) {
1462             g_hash_table_foreach(n->children, save_node, ss);
1463         }
1464         /* NUL termination after children (child name is NUL) */
1465         g_byte_array_append(ss->bytes, (void *)"", 1);
1466     }
1467 }
1468 
1469 static void save_tree(struct save_state *ss, uint32_t tx_id, XsNode *root)
1470 {
1471     write_be32(ss->bytes, tx_id);
1472     ss->tx_id = tx_id;
1473     save_node(NULL, root, ss);
1474 }
1475 
1476 static void save_tx(gpointer key, gpointer value, gpointer opaque)
1477 {
1478     uint32_t tx_id = GPOINTER_TO_INT(key);
1479     struct save_state *ss = opaque;
1480     XsTransaction *n = value;
1481 
1482     write_be32(ss->bytes, n->base_tx);
1483     write_be32(ss->bytes, n->dom_id);
1484 
1485     save_tree(ss, tx_id, n->root);
1486 }
1487 
1488 static void save_watch(gpointer key, gpointer value, gpointer opaque)
1489 {
1490     struct save_state *ss = opaque;
1491     XsWatch *w = value;
1492 
1493     /* We only save the *guest* watches. */
1494     if (w->dom_id) {
1495         gpointer relpath = key + w->rel_prefix;
1496         g_byte_array_append(ss->bytes, relpath, strlen(relpath) + 1);
1497         g_byte_array_append(ss->bytes, (void *)w->token, strlen(w->token) + 1);
1498     }
1499 }
1500 
1501 GByteArray *xs_impl_serialize(XenstoreImplState *s)
1502 {
1503     struct save_state ss;
1504 
1505     ss.bytes = g_byte_array_new();
1506 
1507     /*
1508      * node = flags [ real_node / node_ref ]
1509      *   flags = uint8_t (MODIFIED_IN_TX | DELETED_IN_TX | NODE_REF)
1510      *   node_ref = tx_id (in which the original version of this node exists)
1511      *   real_node = content perms child* NUL
1512      *     content = len data
1513      *       len = uint32_t
1514      *       data = uint8_t{len}
1515      *     perms = perm* NUL
1516      *       perm = asciiz
1517      *   child = name node
1518      *     name = asciiz
1519      *
1520      * tree = tx_id node
1521      *   tx_id = uint32_t
1522      *
1523      * transaction = base_tx_id dom_id tree
1524      *   base_tx_id = uint32_t
1525      *   dom_id = uint32_t
1526      *
1527      * tx_list = tree transaction* XBT_NULL
1528      *
1529      * watch = path token
1530      *   path = asciiz
1531      *   token = asciiz
1532      *
1533      * watch_list = watch* NUL
1534      *
1535      * xs_serialize_stream = last_tx tx_list watch_list
1536      *   last_tx = uint32_t
1537      */
1538 
1539     /* Clear serialized_tx in every node. */
1540     if (s->serialized) {
1541         clear_serialized_tx(NULL, s->root, NULL);
1542         g_hash_table_foreach(s->transactions, clear_tx_serialized_tx, NULL);
1543     }
1544 
1545     s->serialized = true;
1546 
1547     write_be32(ss.bytes, s->last_tx);
1548     save_tree(&ss, s->root_tx, s->root);
1549     g_hash_table_foreach(s->transactions, save_tx, &ss);
1550 
1551     write_be32(ss.bytes, XBT_NULL);
1552 
1553     g_hash_table_foreach(s->watches, save_watch, &ss);
1554     g_byte_array_append(ss.bytes, (void *)"", 1);
1555 
1556     return ss.bytes;
1557 }
1558 
1559 struct unsave_state {
1560     char path[XENSTORE_ABS_PATH_MAX + 1];
1561     XenstoreImplState *s;
1562     GByteArray *bytes;
1563     uint8_t *d;
1564     size_t l;
1565     bool root_walk;
1566 };
1567 
1568 static int consume_be32(struct unsave_state *us, unsigned int *val)
1569 {
1570     uint32_t d;
1571 
1572     if (us->l < sizeof(d)) {
1573         return -EINVAL;
1574     }
1575     memcpy(&d, us->d, sizeof(d));
1576     *val = ntohl(d);
1577     us->d += sizeof(d);
1578     us->l -= sizeof(d);
1579     return 0;
1580 }
1581 
1582 static int consume_string(struct unsave_state *us, char **str, size_t *len)
1583 {
1584     size_t l;
1585 
1586     if (!us->l) {
1587         return -EINVAL;
1588     }
1589 
1590     l = strnlen((void *)us->d, us->l);
1591     if (l == us->l) {
1592         return -EINVAL;
1593     }
1594 
1595     if (str) {
1596         *str = (void *)us->d;
1597     }
1598     if (len) {
1599         *len = l;
1600     }
1601 
1602     us->d += l + 1;
1603     us->l -= l + 1;
1604     return 0;
1605 }
1606 
1607 static XsNode *lookup_node(XsNode *n, char *path)
1608 {
1609     char *slash = strchr(path, '/');
1610     XsNode *child;
1611 
1612     if (path[0] == '\0') {
1613         return n;
1614     }
1615 
1616     if (slash) {
1617         *slash = '\0';
1618     }
1619 
1620     if (!n->children) {
1621         return NULL;
1622     }
1623     child = g_hash_table_lookup(n->children, path);
1624     if (!slash) {
1625         return child;
1626     }
1627 
1628     *slash = '/';
1629     if (!child) {
1630         return NULL;
1631     }
1632     return lookup_node(child, slash + 1);
1633 }
1634 
1635 static XsNode *lookup_tx_node(struct unsave_state *us, unsigned int tx_id)
1636 {
1637     XsTransaction *t;
1638     if (tx_id == us->s->root_tx) {
1639         return lookup_node(us->s->root, us->path + 1);
1640     }
1641 
1642     t = g_hash_table_lookup(us->s->transactions, GINT_TO_POINTER(tx_id));
1643     if (!t) {
1644         return NULL;
1645     }
1646     g_assert(t->root);
1647     return lookup_node(t->root, us->path + 1);
1648 }
1649 
1650 static void count_child_nodes(gpointer key, gpointer value, gpointer user_data)
1651 {
1652     unsigned int *nr_nodes = user_data;
1653     XsNode *n = value;
1654 
1655     (*nr_nodes)++;
1656 
1657     if (n->children) {
1658         g_hash_table_foreach(n->children, count_child_nodes, nr_nodes);
1659     }
1660 }
1661 
1662 static int consume_node(struct unsave_state *us, XsNode **nodep,
1663                         unsigned int *nr_nodes)
1664 {
1665     XsNode *n = NULL;
1666     uint8_t flags;
1667     int ret;
1668 
1669     if (us->l < 1) {
1670         return -EINVAL;
1671     }
1672     flags = us->d[0];
1673     us->d++;
1674     us->l--;
1675 
1676     if (flags == NODE_REF) {
1677         unsigned int tx;
1678 
1679         ret = consume_be32(us, &tx);
1680         if (ret) {
1681             return ret;
1682         }
1683 
1684         n = lookup_tx_node(us, tx);
1685         if (!n) {
1686             return -EINVAL;
1687         }
1688         n->ref++;
1689         if (n->children) {
1690             g_hash_table_foreach(n->children, count_child_nodes, nr_nodes);
1691         }
1692     } else {
1693         uint32_t datalen;
1694 
1695         if (flags & ~(DELETED_IN_TX | MODIFIED_IN_TX)) {
1696             return -EINVAL;
1697         }
1698         n = xs_node_new();
1699 
1700         if (flags & DELETED_IN_TX) {
1701             n->deleted_in_tx = true;
1702         }
1703         if (flags & MODIFIED_IN_TX) {
1704             n->modified_in_tx = true;
1705         }
1706         ret = consume_be32(us, &datalen);
1707         if (ret) {
1708             xs_node_unref(n);
1709             return -EINVAL;
1710         }
1711         if (datalen) {
1712             if (datalen > us->l) {
1713                 xs_node_unref(n);
1714                 return -EINVAL;
1715             }
1716 
1717             GByteArray *node_data = g_byte_array_new();
1718             g_byte_array_append(node_data, us->d, datalen);
1719             us->d += datalen;
1720             us->l -= datalen;
1721             n->content = node_data;
1722 
1723             if (us->root_walk) {
1724                 n->modified_in_tx = true;
1725             }
1726         }
1727         while (1) {
1728             char *perm = NULL;
1729             size_t permlen = 0;
1730 
1731             ret = consume_string(us, &perm, &permlen);
1732             if (ret) {
1733                 xs_node_unref(n);
1734                 return ret;
1735             }
1736 
1737             if (!permlen) {
1738                 break;
1739             }
1740 
1741             n->perms = g_list_append(n->perms, g_strdup(perm));
1742         }
1743 
1744         /* Now children */
1745         while (1) {
1746             size_t childlen;
1747             char *childname;
1748             char *pathend;
1749             XsNode *child = NULL;
1750 
1751             ret = consume_string(us, &childname, &childlen);
1752             if (ret) {
1753                 xs_node_unref(n);
1754                 return ret;
1755             }
1756 
1757             if (!childlen) {
1758                 break;
1759             }
1760 
1761             pathend = us->path + strlen(us->path);
1762             strncat(us->path, "/", sizeof(us->path) - 1);
1763             strncat(us->path, childname, sizeof(us->path) - 1);
1764 
1765             ret = consume_node(us, &child, nr_nodes);
1766             *pathend = '\0';
1767             if (ret) {
1768                 xs_node_unref(n);
1769                 return ret;
1770             }
1771             g_assert(child);
1772             xs_node_add_child(n, childname, child);
1773         }
1774 
1775         /*
1776          * If the node has no data and no children we still want to fire
1777          * a watch on it.
1778          */
1779         if (us->root_walk && !n->children) {
1780             n->modified_in_tx = true;
1781         }
1782     }
1783 
1784     if (!n->deleted_in_tx) {
1785         (*nr_nodes)++;
1786     }
1787 
1788     *nodep = n;
1789     return 0;
1790 }
1791 
1792 static int consume_tree(struct unsave_state *us, XsTransaction *t)
1793 {
1794     int ret;
1795 
1796     ret = consume_be32(us, &t->tx_id);
1797     if (ret) {
1798         return ret;
1799     }
1800 
1801     if (t->tx_id > us->s->last_tx) {
1802         return -EINVAL;
1803     }
1804 
1805     us->path[0] = '\0';
1806 
1807     return consume_node(us, &t->root, &t->nr_nodes);
1808 }
1809 
1810 int xs_impl_deserialize(XenstoreImplState *s, GByteArray *bytes,
1811                         unsigned int dom_id, xs_impl_watch_fn watch_fn,
1812                         void *watch_opaque)
1813 {
1814     struct unsave_state us;
1815     XsTransaction base_t = { 0 };
1816     int ret;
1817 
1818     us.s = s;
1819     us.bytes = bytes;
1820     us.d = bytes->data;
1821     us.l = bytes->len;
1822 
1823     xs_impl_reset_watches(s, dom_id);
1824     g_hash_table_remove_all(s->transactions);
1825 
1826     xs_node_unref(s->root);
1827     s->root = NULL;
1828     s->root_tx = s->last_tx = XBT_NULL;
1829 
1830     ret = consume_be32(&us, &s->last_tx);
1831     if (ret) {
1832         return ret;
1833     }
1834 
1835     /*
1836      * Consume the base tree into a transaction so that watches can be
1837      * fired as we commit it. By setting us.root_walk we cause the nodes
1838      * to be marked as 'modified_in_tx' as they are created, so that the
1839      * watches are triggered on them.
1840      */
1841     base_t.dom_id = dom_id;
1842     base_t.base_tx = XBT_NULL;
1843     us.root_walk = true;
1844     ret = consume_tree(&us, &base_t);
1845     if (ret) {
1846         return ret;
1847     }
1848     us.root_walk = false;
1849 
1850     /*
1851      * Commit the transaction now while the refcount on all nodes is 1.
1852      * Note that we haven't yet reinstated the *guest* watches but that's
1853      * OK because we don't want the guest to see any changes. Even any
1854      * backend nodes which get recreated should be *precisely* as they
1855      * were before the migration. Back ends may have been instantiated
1856      * already, and will see the frontend magically blink into existence
1857      * now (well, from the aio_bh which fires the watches). It's their
1858      * responsibility to rebuild everything precisely as it was before.
1859      */
1860     ret = transaction_commit(s, &base_t);
1861     if (ret) {
1862         return ret;
1863     }
1864 
1865     while (1) {
1866         unsigned int base_tx;
1867         XsTransaction *t;
1868 
1869         ret = consume_be32(&us, &base_tx);
1870         if (ret) {
1871             return ret;
1872         }
1873         if (base_tx == XBT_NULL) {
1874             break;
1875         }
1876 
1877         t = g_new0(XsTransaction, 1);
1878         t->base_tx = base_tx;
1879 
1880         ret = consume_be32(&us, &t->dom_id);
1881         if (!ret) {
1882             ret = consume_tree(&us, t);
1883         }
1884         if (ret) {
1885             g_free(t);
1886             return ret;
1887         }
1888         g_assert(t->root);
1889         if (t->dom_id) {
1890             s->nr_domu_transactions++;
1891         }
1892         g_hash_table_insert(s->transactions, GINT_TO_POINTER(t->tx_id), t);
1893     }
1894 
1895     while (1) {
1896         char *path, *token;
1897         size_t pathlen, toklen;
1898 
1899         ret = consume_string(&us, &path, &pathlen);
1900         if (ret) {
1901             return ret;
1902         }
1903         if (!pathlen) {
1904             break;
1905         }
1906 
1907         ret = consume_string(&us, &token, &toklen);
1908         if (ret) {
1909             return ret;
1910         }
1911 
1912         if (!watch_fn) {
1913             continue;
1914         }
1915 
1916         ret = do_xs_impl_watch(s, dom_id, path, token, watch_fn, watch_opaque);
1917         if (ret) {
1918             return ret;
1919         }
1920     }
1921 
1922     if (us.l) {
1923         return -EINVAL;
1924     }
1925 
1926     return 0;
1927 }
1928