1 /*
2  * Copyright (C) 2010-2016 Red Hat, Inc.
3  * Copyright IBM Corp. 2008
4  *
5  * lxc_controller.c: linux container process controller
6  *
7  * This library is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * This library is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with this library.  If not, see
19  * <http://www.gnu.org/licenses/>.
20  */
21 
22 #include <config.h>
23 
24 #include <sys/epoll.h>
25 #include <sys/wait.h>
26 
27 #ifdef __linux__
28 # include <sys/sysmacros.h>
29 #endif
30 
31 #include <sys/personality.h>
32 #include <unistd.h>
33 #include <fcntl.h>
34 #include <signal.h>
35 #include <getopt.h>
36 #include <sys/mount.h>
37 #include <grp.h>
38 #include <sys/stat.h>
39 #include <time.h>
40 
41 #if WITH_CAPNG
42 # include <cap-ng.h>
43 #endif
44 
45 #include "virerror.h"
46 #include "virlog.h"
47 
48 #include "lxc_conf.h"
49 #include "lxc_container.h"
50 #include "lxc_cgroup.h"
51 #include "lxc_monitor_protocol.h"
52 #include "lxc_fuse.h"
53 #include "virnetdev.h"
54 #include "virnetdevveth.h"
55 #include "viralloc.h"
56 #include "virfile.h"
57 #include "virgdbus.h"
58 #include "virpidfile.h"
59 #include "vircommand.h"
60 #include "virhostcpu.h"
61 #include "virrandom.h"
62 #include "virprocess.h"
63 #include "virnuma.h"
64 #include "rpc/virnetdaemon.h"
65 #include "virstring.h"
66 #include "virgettext.h"
67 #include "virsocket.h"
68 #include "virutil.h"
69 
70 #define VIR_FROM_THIS VIR_FROM_LXC
71 
72 VIR_LOG_INIT("lxc.lxc_controller");
73 
typedef struct _virLXCControllerConsole virLXCControllerConsole;
/*
 * State kept by the controller for a single console: a PTY file
 * descriptor on the host side, one on the container side, and the
 * buffers holding data in flight between them.
 */
struct _virLXCControllerConsole {
    /* Event handle watch for hostFd; -1 while no watch is registered */
    int hostWatch;
    int hostFd;  /* PTY FD in the host OS */
    /* NOTE(review): not referenced in this part of the file; presumably
     * set once the host side has hit EOF/hangup - confirm */
    bool hostClosed;
    /* NOTE(review): presumably the epoll event mask registered for
     * hostFd - confirm against the console I/O code */
    int hostEpoll;

    /* Event handle watch for contFd; -1 while no watch is registered */
    int contWatch;
    int contFd;  /* PTY FD in the container */
    /* NOTE(review): container-side counterpart of hostClosed - confirm */
    bool contClosed;
    /* NOTE(review): container-side counterpart of hostEpoll - confirm */
    int contEpoll;

    /* Event handle watch for epollFd; -1 while no watch is registered */
    int epollWatch;
    int epollFd; /* epoll FD for dealing with EOF */

    /* NOTE(review): the *Len fields presumably count the bytes currently
     * held in the matching fixed-size buffer - confirm */
    size_t fromHostLen;
    char fromHostBuf[1024];
    size_t fromContLen;
    char fromContBuf[1024];

    /* Copied from the owning controller's daemon pointer when the
     * console is added */
    virNetDaemon *daemon;
};
96 
typedef struct _virLXCController virLXCController;
/*
 * Top level state of the controller process for one container.
 */
struct _virLXCController {
    /* Container name; used to locate the config file and the monitor
     * socket under LXC_STATE_DIR */
    char *name;
    /* Domain object parsed from the state directory config file */
    virDomainObj *vm;
    /* Shortcut to vm->def; not separately owned */
    virDomainDef *def;

    int handshakeFds[2]; /* { read FD, write FD } */

    /* PID of the container's init process; 0 when none is running */
    pid_t initpid;

    /* PIDs collected for NBD devices; they are added to the
     * container's cgroup */
    size_t nnbdpids;
    pid_t *nbdpids;

    /* veth names; the count must match the number of <interface>
     * entries in the domain definition */
    size_t nveths;
    char **veths;

    /* ifindexes of the host side interfaces, handed to cgroup creation */
    size_t nnicindexes;
    int *nicindexes;

    /* File descriptors closed when the controller is freed.
     * NOTE(review): presumably FDs passed through to the container -
     * confirm */
    size_t npassFDs;
    int *passFDs;

    /* NOTE(review): presumably namespace FDs; only the array itself is
     * released in the visible code - confirm who closes the FDs */
    int *nsFDs;

    size_t nconsoles;
    virLXCControllerConsole *consoles;
    /* NOTE(review): presumably the path of the container's ptmx device -
     * confirm */
    char *devptmx;

    /* Open FDs of the loop devices backing file based filesystems/disks */
    size_t nloopDevs;
    int *loopDevFds;

    virSecurityManager *securityManager;

    /* RPC daemon serving the monitor socket */
    virNetDaemon *daemon;
    /* True until the first monitor client has connected */
    bool firstClient;
    /* Currently connected monitor client, or NULL */
    virNetServerClient *client;
    /* RPC program for the LXC monitor protocol */
    virNetServerProgram *prog;
    /* When set, a client disconnect arms timerShutdown */
    bool inShutdown;
    /* Timeout id whose callback quits the event loop; -1 if unset */
    int timerShutdown;

    virCgroup *cgroup;

    struct virLXCFuse *fuse;
};
141 
142 #include "lxc_controller_dispatch.h"
143 
/* Forward declarations for functions that are referenced before the
 * point where they are defined. */
static void virLXCControllerFree(virLXCController *ctrl);
static int virLXCControllerEventSendInit(virLXCController *ctrl,
                                         pid_t initpid);
147 
/*
 * Timeout callback: tells the controller's daemon to quit.
 * @opaque is the virLXCController the timer was registered for.
 */
static void virLXCControllerQuitTimer(int timer G_GNUC_UNUSED, void *opaque)
{
    virLXCController *controller = opaque;

    VIR_DEBUG("Triggering event loop quit");
    virNetDaemonQuit(controller->daemon);
}
155 
156 
157 static virLXCDriver *
virLXCControllerDriverNew(void)158 virLXCControllerDriverNew(void)
159 {
160     virLXCDriver *driver = g_new0(virLXCDriver, 1);
161 
162     if (virMutexInit(&driver->lock) < 0) {
163         virReportError(VIR_ERR_INTERNAL_ERROR,
164                        "%s", _("cannot initialize mutex"));
165         g_free(driver);
166         return NULL;
167     }
168 
169     driver->caps = virLXCDriverCapsInit(NULL);
170     driver->xmlopt = lxcDomainXMLConfInit(driver, NULL);
171 
172     return driver;
173 }
174 
175 
176 static void
virLXCControllerDriverFree(virLXCDriver * driver)177 virLXCControllerDriverFree(virLXCDriver *driver)
178 {
179     if (!driver)
180         return;
181     virObjectUnref(driver->xmlopt);
182     virObjectUnref(driver->caps);
183     virMutexDestroy(&driver->lock);
184     g_free(driver);
185 }
186 
187 
virLXCControllerNew(const char * name)188 static virLXCController *virLXCControllerNew(const char *name)
189 {
190     virLXCController *ctrl = g_new0(virLXCController, 1);
191     virLXCDriver *driver = NULL;
192     g_autofree char *configFile = NULL;
193 
194     ctrl->timerShutdown = -1;
195     ctrl->firstClient = true;
196     ctrl->name = g_strdup(name);
197     ctrl->handshakeFds[0] = -1;
198     ctrl->handshakeFds[1] = -1;
199 
200     if (!(driver = virLXCControllerDriverNew()))
201         goto error;
202 
203     if ((configFile = virDomainConfigFile(LXC_STATE_DIR,
204                                           ctrl->name)) == NULL)
205         goto error;
206 
207     if ((ctrl->vm = virDomainObjParseFile(configFile,
208                                           driver->xmlopt,
209                                           0)) == NULL)
210         goto error;
211     ctrl->def = ctrl->vm->def;
212 
213     if ((ctrl->timerShutdown = virEventAddTimeout(-1,
214                                                   virLXCControllerQuitTimer, ctrl,
215                                                   NULL)) < 0)
216         goto error;
217 
218  cleanup:
219     virLXCControllerDriverFree(driver);
220     return ctrl;
221 
222  error:
223     virLXCControllerFree(ctrl);
224     ctrl = NULL;
225     goto cleanup;
226 }
227 
228 
/*
 * Close every loop device file descriptor held by @ctrl.
 * The FD array itself is left allocated. Always returns 0.
 */
static int virLXCControllerCloseLoopDevices(virLXCController *ctrl)
{
    size_t idx = 0;

    while (idx < ctrl->nloopDevs) {
        VIR_FORCE_CLOSE(ctrl->loopDevFds[idx]);
        idx++;
    }

    return 0;
}
238 
239 
/*
 * If an init process is recorded, close the loop devices, abort
 * that process and forget its PID. Does nothing otherwise.
 */
static void virLXCControllerStopInit(virLXCController *ctrl)
{
    if (ctrl->initpid != 0) {
        virLXCControllerCloseLoopDevices(ctrl);
        virProcessAbort(ctrl->initpid);
        ctrl->initpid = 0;
    }
}
249 
250 
/*
 * Tear down one console: for the host PTY, the container PTY and
 * the epoll descriptor in turn, drop the event watch (if one is
 * registered) and close the file descriptor.
 */
static void virLXCControllerConsoleClose(virLXCControllerConsole *console)
{
    /* Host side */
    if (console->hostWatch != -1) {
        virEventRemoveHandle(console->hostWatch);
    }
    VIR_FORCE_CLOSE(console->hostFd);

    /* Container side */
    if (console->contWatch != -1) {
        virEventRemoveHandle(console->contWatch);
    }
    VIR_FORCE_CLOSE(console->contFd);

    /* EOF detection */
    if (console->epollWatch != -1) {
        virEventRemoveHandle(console->epollWatch);
    }
    VIR_FORCE_CLOSE(console->epollFd);
}
265 
266 
267 static void
virLXCControllerFreeFuse(virLXCController * ctrl)268 virLXCControllerFreeFuse(virLXCController *ctrl)
269 {
270     return lxcFreeFuse(&ctrl->fuse);
271 }
272 
273 
/*
 * Release everything owned by @ctrl, including @ctrl itself.
 * Passing NULL is a no-op.
 *
 * Any init process still recorded is aborted first. The handshake
 * file descriptors are deliberately closed last.
 */
static void virLXCControllerFree(virLXCController *ctrl)
{
    size_t i;

    if (!ctrl)
        return;

    virLXCControllerStopInit(ctrl);

    /* StopInit only closes the loop devices when an init process was
     * recorded, and never releases the array. Closing again is harmless
     * since VIR_FORCE_CLOSE leaves already closed FDs alone. */
    virLXCControllerCloseLoopDevices(ctrl);
    g_free(ctrl->loopDevFds);

    virObjectUnref(ctrl->securityManager);

    for (i = 0; i < ctrl->nveths; i++)
        g_free(ctrl->veths[i]);
    g_free(ctrl->veths);
    g_free(ctrl->nicindexes);

    for (i = 0; i < ctrl->npassFDs; i++)
        VIR_FORCE_CLOSE(ctrl->passFDs[i]);
    g_free(ctrl->passFDs);

    for (i = 0; i < ctrl->nconsoles; i++)
        virLXCControllerConsoleClose(&(ctrl->consoles[i]));
    g_free(ctrl->consoles);

    g_free(ctrl->devptmx);

    virDomainObjEndAPI(&ctrl->vm);
    g_free(ctrl->name);

    if (ctrl->timerShutdown != -1)
        virEventRemoveTimeout(ctrl->timerShutdown);

    virObjectUnref(ctrl->daemon);
    virLXCControllerFreeFuse(ctrl);

    g_free(ctrl->nbdpids);

    g_free(ctrl->nsFDs);
    virCgroupFree(ctrl->cgroup);

    /* This must always be the last thing to be closed */
    for (i = 0; i < G_N_ELEMENTS(ctrl->handshakeFds); i++)
        VIR_FORCE_CLOSE(ctrl->handshakeFds[i]);
    g_free(ctrl);
}
319 
320 
/*
 * Append a console entry for the host PTY @hostFd to @ctrl.
 *
 * The container side FD, the epoll FD and all event watches start
 * out as -1. Always returns 0.
 */
static int virLXCControllerAddConsole(virLXCController *ctrl,
                                      int hostFd)
{
    size_t last;

    VIR_EXPAND_N(ctrl->consoles, ctrl->nconsoles, 1);
    last = ctrl->nconsoles - 1;

    ctrl->consoles[last].daemon = ctrl->daemon;

    ctrl->consoles[last].hostFd = hostFd;
    ctrl->consoles[last].contFd = -1;
    ctrl->consoles[last].epollFd = -1;

    ctrl->consoles[last].hostWatch = -1;
    ctrl->consoles[last].contWatch = -1;
    ctrl->consoles[last].epollWatch = -1;

    return 0;
}
336 
337 
/*
 * Switch both PTY file descriptors of @console to non-blocking mode,
 * host side first.
 *
 * Returns 0 on success, -1 (with an error reported) as soon as one
 * of them fails.
 */
static int virLXCControllerConsoleSetNonblocking(virLXCControllerConsole *console)
{
    if (virSetBlocking(console->hostFd, false) >= 0 &&
        virSetBlocking(console->contFd, false) >= 0)
        return 0;

    virReportSystemError(errno, "%s",
                         _("Unable to set console file descriptor non-blocking"));
    return -1;
}
349 
350 
/*
 * Send the "continue" message over the write end of the handshake
 * pipe.
 *
 * Returns 0 on success, -1 with an error reported otherwise.
 */
static int virLXCControllerDaemonHandshakeCont(virLXCController *ctrl)
{
    if (lxcContainerSendContinue(ctrl->handshakeFds[1]) >= 0)
        return 0;

    virReportSystemError(errno, "%s",
                         _("error sending continue signal to daemon"));
    return -1;
}
360 
/*
 * Wait for the "continue" message on the read end of the handshake
 * pipe.
 *
 * Returns 0 on success, -1 with an error reported otherwise.
 */
static int virLXCControllerDaemonHandshakeWait(virLXCController *ctrl)
{
    if (lxcContainerWaitForContinue(ctrl->handshakeFds[0]) >= 0)
        return 0;

    virReportSystemError(errno, "%s",
                         _("error waiting for continue signal from daemon"));
    return -1;
}
370 
/*
 * Check that the number of veths given to the controller equals the
 * number of network interfaces in the domain definition.
 *
 * Returns 0 if they match, -1 with an error reported otherwise.
 */
static int virLXCControllerValidateNICs(virLXCController *ctrl)
{
    if (ctrl->def->nnets == ctrl->nveths)
        return 0;

    virReportError(VIR_ERR_INTERNAL_ERROR,
                   _("expecting %zu veths, but got %zu"),
                   ctrl->def->nnets, ctrl->nveths);
    return -1;
}
382 
383 
/*
 * Collect the interface indexes of the host side network devices
 * into ctrl->nicindexes / ctrl->nnicindexes.
 *
 * Returns 0 on success, -1 if an index lookup fails or the domain
 * definition contains a network type that is not handled here.
 */
static int virLXCControllerGetNICIndexes(virLXCController *ctrl)
{
    size_t i;

    /* Gather the ifindexes of the "parent" veths for all interfaces
     * implemented with a veth pair. These will be used when calling
     * virCgroupNewMachine (and eventually the dbus method
     * CreateMachineWithNetwork). ifindexes for the child veths, and
     * for macvlan interfaces, *should not* be in this list, as they
     * will be moved into the container. Only the interfaces that will
     * remain outside the container, but are used for communication
     * with the container, should be added to the list.
     */

    VIR_DEBUG("Getting nic indexes");
    for (i = 0; i < ctrl->def->nnets; i++) {
        int nicindex = -1;
        virDomainNetType actualType = virDomainNetGetActualType(ctrl->def->nets[i]);

        switch (actualType) {
        case VIR_DOMAIN_NET_TYPE_BRIDGE:
        case VIR_DOMAIN_NET_TYPE_NETWORK:
        case VIR_DOMAIN_NET_TYPE_ETHERNET:
            /* Interfaces without a name cannot be looked up; skip them */
            if (ctrl->def->nets[i]->ifname == NULL)
                continue;
            if (virNetDevGetIndex(ctrl->def->nets[i]->ifname,
                                  &nicindex) < 0)
                return -1;
            VIR_EXPAND_N(ctrl->nicindexes, ctrl->nnicindexes, 1);
            VIR_DEBUG("Index %d for %s", nicindex,
                      ctrl->def->nets[i]->ifname);
            ctrl->nicindexes[ctrl->nnicindexes-1] = nicindex;
            break;

        /* Accepted, but contributes no index to the list */
        case VIR_DOMAIN_NET_TYPE_DIRECT:
           break;

        /* All remaining known types are rejected */
        case VIR_DOMAIN_NET_TYPE_USER:
        case VIR_DOMAIN_NET_TYPE_VHOSTUSER:
        case VIR_DOMAIN_NET_TYPE_SERVER:
        case VIR_DOMAIN_NET_TYPE_CLIENT:
        case VIR_DOMAIN_NET_TYPE_MCAST:
        case VIR_DOMAIN_NET_TYPE_UDP:
        case VIR_DOMAIN_NET_TYPE_INTERNAL:
        case VIR_DOMAIN_NET_TYPE_HOSTDEV:
        case VIR_DOMAIN_NET_TYPE_VDPA:
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           _("Unsupported net type %s"),
                           virDomainNetTypeToString(actualType));
            return -1;
        case VIR_DOMAIN_NET_TYPE_LAST:
        default:
            virReportEnumRangeError(virDomainNetType, actualType);
            return -1;
        }
    }

    return 0;
}
443 
444 
/*
 * Check that the number of console FDs given to the controller equals
 * the number of consoles in the domain definition.
 *
 * Returns 0 if they match, -1 with an error reported otherwise.
 */
static int virLXCControllerValidateConsoles(virLXCController *ctrl)
{
    if (ctrl->def->nconsoles == ctrl->nconsoles)
        return 0;

    virReportError(VIR_ERR_INTERNAL_ERROR,
                   _("expecting %zu consoles, but got %zu tty file handlers"),
                   ctrl->def->nconsoles, ctrl->nconsoles);
    return -1;
}
456 
457 
/*
 * Associate the file backing @fs with a loop device and rewrite @fs
 * so that it refers to that device as a block filesystem.
 *
 * Returns the FD obtained from the loop device association on
 * success, -1 on failure.
 */
static int virLXCControllerSetupLoopDeviceFS(virDomainFSDef *fs)
{
    char *loopdev = NULL;
    int fd = virFileLoopDeviceAssociate(fs->src->path, &loopdev);

    if (fd < 0)
        return -1;

    VIR_DEBUG("Changing fs %s to use type=block for dev %s",
              fs->src->path, loopdev);

    /* From here on the filesystem is treated as a plain block device,
     * so the remaining container setup needs no special casing. */
    fs->type = VIR_DOMAIN_FS_TYPE_BLOCK;
    g_free(fs->src->path);
    fs->src->path = loopdev;
    loopdev = NULL;

    return fd;
}
478 
479 
/*
 * Associate the file backing @disk with a loop device and rewrite
 * @disk so that it refers to that device as a block disk.
 *
 * Returns the FD obtained from the loop device association on
 * success, -1 on failure.
 */
static int virLXCControllerSetupLoopDeviceDisk(virDomainDiskDef *disk)
{
    g_autofree char *loopdev = NULL;
    const char *path = virDomainDiskGetSource(disk);
    int fd = virFileLoopDeviceAssociate(path, &loopdev);

    if (fd < 0)
        return -1;

    VIR_DEBUG("Changing disk %s to use type=block for dev %s",
              path, loopdev);

    /* From here on the disk is treated as a plain block device,
     * so the remaining container setup needs no special casing. */
    virDomainDiskSetType(disk, VIR_STORAGE_TYPE_BLOCK);
    virDomainDiskSetSource(disk, loopdev);

    return fd;
}
502 
503 
/*
 * Associate the image backing @fs with an NBD device and rewrite @fs
 * so that it refers to that device as a block filesystem.
 *
 * An explicit image format is mandatory.
 *
 * Returns 0 on success, -1 on failure.
 */
static int virLXCControllerSetupNBDDeviceFS(virDomainFSDef *fs)
{
    char *nbddev;
    int rc;

    if (fs->format <= VIR_STORAGE_FILE_NONE) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("An explicit disk format must be specified"));
        return -1;
    }

    rc = virFileNBDDeviceAssociate(fs->src->path,
                                   virStorageFileFormatTypeToString(fs->format),
                                   fs->readonly,
                                   &nbddev);
    if (rc < 0)
        return -1;

    VIR_DEBUG("Changing fs %s to use type=block for dev %s",
              fs->src->path, nbddev);

    /* From here on the filesystem is treated as a plain block device,
     * so the remaining container setup needs no special casing. */
    fs->type = VIR_DOMAIN_FS_TYPE_BLOCK;
    g_free(fs->src->path);
    fs->src->path = nbddev;

    return 0;
}
532 
533 
/*
 * Associate the image backing @disk with an NBD device and rewrite
 * @disk so that it refers to that device as a block disk.
 *
 * An explicit image format is mandatory.
 *
 * Returns 0 on success, -1 on failure.
 */
static int virLXCControllerSetupNBDDeviceDisk(virDomainDiskDef *disk)
{
    g_autofree char *nbddev = NULL;
    const char *path = virDomainDiskGetSource(disk);
    int fmt = virDomainDiskGetFormat(disk);
    int rc;

    if (fmt <= VIR_STORAGE_FILE_NONE) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("An explicit disk format must be specified"));
        return -1;
    }

    rc = virFileNBDDeviceAssociate(path,
                                   virStorageFileFormatTypeToString(fmt),
                                   disk->src->readonly,
                                   &nbddev);
    if (rc < 0)
        return -1;

    VIR_DEBUG("Changing disk %s to use type=block for dev %s",
              path, nbddev);

    /* From here on the disk is treated as a plain block device,
     * so the remaining container setup needs no special casing. */
    virDomainDiskSetType(disk, VIR_STORAGE_TYPE_BLOCK);
    virDomainDiskSetSource(disk, nbddev);

    return 0;
}
563 
/**
 * virLXCControllerAppendNBDPids:
 * @ctrl: the controller state
 * @dev: path of the NBD device; must start with "/dev/"
 *
 * Reads the PID published for @dev in
 * /sys/devices/virtual/block/<name>/pid, asks virProcessGetPids()
 * for the PIDs belonging to it and appends all of them to
 * ctrl->nbdpids.
 *
 * The sysfs file may take a moment to show up, so its existence is
 * polled at 100ms intervals, at most 10 times.
 *
 * Returns 0 on success, -1 with an error reported otherwise.
 */
static int virLXCControllerAppendNBDPids(virLXCController *ctrl,
                                         const char *dev)
{
    g_autofree char *pidpath = NULL;
    g_autofree pid_t *pids = NULL;
    size_t npids = 0;
    size_t i;
    size_t loops = 0;
    pid_t pid;

    /* Do not fail silently: callers only see -1 and rely on an error
     * having been reported. */
    if (!STRPREFIX(dev, "/dev/")) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("NBD device path %s is not under /dev/"),
                       dev);
        return -1;
    }

    /* dev + 5 skips the "/dev/" prefix, leaving the device name */
    pidpath = g_strdup_printf("/sys/devices/virtual/block/%s/pid", dev + 5);

    /* Wait for the pid file to appear */
    while (!virFileExists(pidpath)) {
        /* wait for 100ms before checking again, but don't do it for ever */
        if (errno == ENOENT && loops < 10) {
            g_usleep(100 * 1000);
            loops++;
        } else {
            virReportSystemError(errno,
                                 _("Cannot check NBD device %s pid"),
                                 dev + 5);
            return -1;
        }
    }

    if (virPidFileReadPath(pidpath, &pid) < 0)
        return -1;

    if (virProcessGetPids(pid, &npids, &pids) < 0)
        return -1;

    for (i = 0; i < npids; i++) {
        VIR_APPEND_ELEMENT(ctrl->nbdpids, ctrl->nnbdpids, pids[i]);
    }

    return 0;
}
605 
/*
 * Attach every file backed filesystem and disk of the domain
 * definition to either a loop device or an NBD device, rewriting the
 * definition entries to point at the resulting block device.
 *
 * Loop device FDs are stored in ctrl->loopDevFds; for NBD devices the
 * associated PIDs are stored in ctrl->nbdpids.
 *
 * Returns 0 on success, -1 on the first failure or unsupported
 * configuration.
 */
static int virLXCControllerSetupLoopDevices(virLXCController *ctrl)
{
    size_t i;

    VIR_DEBUG("Setting up loop devices for filesystems");

    for (i = 0; i < ctrl->def->nfss; i++) {
        virDomainFSDef *fs = ctrl->def->fss[i];
        int fd;

        /* Only file backed filesystems need a device */
        if (fs->type != VIR_DOMAIN_FS_TYPE_FILE)
            continue;

        /* No driver requested: pick loop for raw/unspecified formats,
         * NBD for everything else. The choice is stored in the
         * definition and dispatched on below. */
        if (fs->fsdriver == VIR_DOMAIN_FS_DRIVER_TYPE_DEFAULT) {
            if (fs->format == VIR_STORAGE_FILE_RAW ||
                fs->format == VIR_STORAGE_FILE_NONE)
                fs->fsdriver = VIR_DOMAIN_FS_DRIVER_TYPE_LOOP;
            else
                fs->fsdriver = VIR_DOMAIN_FS_DRIVER_TYPE_NBD;
        }

        if (fs->fsdriver == VIR_DOMAIN_FS_DRIVER_TYPE_LOOP) {
            /* Reachable with another format only when the loop driver
             * was requested explicitly */
            if (fs->format != VIR_STORAGE_FILE_RAW &&
                fs->format != VIR_STORAGE_FILE_NONE) {
                virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                               _("fs format %s is not supported"),
                               virStorageFileFormatTypeToString(fs->format));
                return -1;
            }

            fd = virLXCControllerSetupLoopDeviceFS(fs);
            if (fd < 0)
                return -1;

            VIR_DEBUG("Saving loop fd %d", fd);
            VIR_EXPAND_N(ctrl->loopDevFds, ctrl->nloopDevs, 1);
            ctrl->loopDevFds[ctrl->nloopDevs - 1] = fd;
        } else if (fs->fsdriver == VIR_DOMAIN_FS_DRIVER_TYPE_NBD) {
            if (virLXCControllerSetupNBDDeviceFS(fs) < 0)
                return -1;

            /* The NBD device is cleaned up when the cgroup goes away.
             * For this we need to remember the qemu-nbd pid and add it to
             * the cgroup */
            if (virLXCControllerAppendNBDPids(ctrl, fs->src->path) < 0)
                return -1;
        } else {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           _("fs driver %s is not supported"),
                           virDomainFSDriverTypeToString(fs->fsdriver));
            return -1;
        }
    }

    VIR_DEBUG("Setting up loop devices for disks");

    for (i = 0; i < ctrl->def->ndisks; i++) {
        virDomainDiskDef *disk = ctrl->def->disks[i];
        int fd;
        const char *driver = virDomainDiskGetDriver(disk);
        int format = virDomainDiskGetFormat(disk);

        /* Only file backed disks need a device */
        if (virDomainDiskGetType(disk) != VIR_STORAGE_TYPE_FILE)
            continue;

        /* If no driverName is set, we prefer 'loop' for
         * dealing with raw or undefined formats, otherwise
         * we use 'nbd'.
         */
        if (STREQ_NULLABLE(driver, "loop") ||
            (!driver &&
             (format == VIR_STORAGE_FILE_RAW ||
              format == VIR_STORAGE_FILE_NONE))) {
            /* Reachable with another format only when driver "loop"
             * was requested explicitly */
            if (format != VIR_STORAGE_FILE_RAW &&
                format != VIR_STORAGE_FILE_NONE) {
                virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                               _("disk format %s is not supported"),
                               virStorageFileFormatTypeToString(format));
                return -1;
            }

            /* We treat 'none' as meaning 'raw' since we
             * don't want to go into the auto-probing
             * business for security reasons
             */
            fd = virLXCControllerSetupLoopDeviceDisk(disk);
            if (fd < 0)
                return -1;

            VIR_DEBUG("Saving loop fd %d", fd);
            VIR_EXPAND_N(ctrl->loopDevFds, ctrl->nloopDevs, 1);
            ctrl->loopDevFds[ctrl->nloopDevs - 1] = fd;
        } else if (!driver || STREQ(driver, "nbd")) {
            /* Only the default and disabled cache modes are accepted */
            if (disk->cachemode != VIR_DOMAIN_DISK_CACHE_DEFAULT &&
                disk->cachemode != VIR_DOMAIN_DISK_CACHE_DISABLE) {
                virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                               _("Disk cache mode %s is not supported"),
                               virDomainDiskCacheTypeToString(disk->cachemode));
                return -1;
            }
            if (virLXCControllerSetupNBDDeviceDisk(disk) < 0)
                return -1;

            /* The NBD device is cleaned up when the cgroup goes away.
             * For this we need to remember the qemu-nbd pid and add it to
             * the cgroup */
            if (virLXCControllerAppendNBDPids(ctrl, virDomainDiskGetSource(disk)) < 0)
                return -1;
        } else {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           _("disk driver %s is not supported"),
                           driver);
            return -1;
        }
    }

    VIR_DEBUG("Setup all loop devices");

    return 0;
}
726 
727 
728 /*
729  * To be run while still single threaded
730  */
virLXCControllerSetupCpuAffinity(virLXCController * ctrl)731 static int virLXCControllerSetupCpuAffinity(virLXCController *ctrl)
732 {
733     int hostcpus, maxcpu = CPU_SETSIZE;
734     virBitmap *cpumap;
735     virBitmap *cpumapToSet;
736 
737     VIR_DEBUG("Setting CPU affinity");
738 
739     /* setaffinity fails if you set bits for CPUs which
740      * aren't present, so we have to limit ourselves */
741     if ((hostcpus = virHostCPUGetCount()) < 0)
742         return -1;
743 
744     if (maxcpu > hostcpus)
745         maxcpu = hostcpus;
746 
747     cpumap = virBitmapNew(maxcpu);
748     cpumapToSet = cpumap;
749 
750     if (ctrl->def->cpumask) {
751         cpumapToSet = ctrl->def->cpumask;
752     } else {
753         /* You may think this is redundant, but we can't assume libvirtd
754          * itself is running on all pCPUs, so we need to explicitly set
755          * the spawned LXC instance to all pCPUs if no map is given in
756          * its config file */
757         virBitmapSetAll(cpumap);
758     }
759 
760     /* We are presuming we are running between fork/exec of LXC
761      * so use '0' to indicate our own process ID. No threads are
762      * running at this point
763      */
764     if (virProcessSetAffinity(0 /* Self */, cpumapToSet, false) < 0) {
765         virBitmapFree(cpumap);
766         return -1;
767     }
768     virBitmapFree(cpumap);
769 
770     return 0;
771 }
772 
773 
/*
 * Ask numad for a placement nodeset when the domain definition needs
 * placement advice (i.e. 'placement' of <vcpu> or <numatune> is
 * 'auto').
 *
 * On success *@mask receives the parsed nodeset, or NULL when no
 * advice is needed. Returns 0 on success, -1 on failure.
 */
static int virLXCControllerGetNumadAdvice(virLXCController *ctrl,
                                          virBitmap **mask)
{
    virBitmap *parsed = NULL;
    g_autofree char *advice = NULL;

    if (!virDomainDefNeedsPlacementAdvice(ctrl->def)) {
        *mask = NULL;
        return 0;
    }

    advice = virNumaGetAutoPlacementAdvice(virDomainDefGetVcpus(ctrl->def),
                                           ctrl->def->mem.cur_balloon);
    if (!advice)
        return -1;

    VIR_DEBUG("Nodeset returned from numad: %s", advice);

    if (virBitmapParse(advice, &parsed, VIR_DOMAIN_CPUMASK_LEN) < 0)
        return -1;

    *mask = parsed;
    return 0;
}
799 
800 
801 /**
802  * virLXCControllerSetupResourceLimits
803  * @ctrl: the controller state
804  *
805  * Sets up the non-cgroup based resource limits that need
806  * to be inherited by the child process across clone()/exec().
807  * The cgroup limits are setup later
808  *
809  * Returns 0 on success or -1 in case of error
810  */
virLXCControllerSetupResourceLimits(virLXCController * ctrl)811 static int virLXCControllerSetupResourceLimits(virLXCController *ctrl)
812 {
813     virBitmap *auto_nodeset = NULL;
814     int ret = -1;
815     virBitmap *nodeset = NULL;
816     virDomainNumatuneMemMode mode;
817 
818     if (virDomainNumatuneGetMode(ctrl->def->numa, -1, &mode) == 0) {
819         if (mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT &&
820             virCgroupControllerAvailable(VIR_CGROUP_CONTROLLER_CPUSET)) {
821             /* Use virNuma* API iff necessary. Once set and child is exec()-ed,
822              * there's no way for us to change it. Rely on cgroups (if available
823              * and enabled in the config) rather than virNuma*. */
824             VIR_DEBUG("Relying on CGroups for memory binding");
825         } else {
826 
827             VIR_DEBUG("Setting up process resource limits");
828 
829             if (virLXCControllerGetNumadAdvice(ctrl, &auto_nodeset) < 0)
830                 goto cleanup;
831 
832             nodeset = virDomainNumatuneGetNodeset(ctrl->def->numa, auto_nodeset, -1);
833 
834             if (virNumaSetupMemoryPolicy(mode, nodeset) < 0)
835                 goto cleanup;
836         }
837     }
838 
839     if (virLXCControllerSetupCpuAffinity(ctrl) < 0)
840         goto cleanup;
841 
842     ret = 0;
843  cleanup:
844     virBitmapFree(auto_nodeset);
845     return ret;
846 }
847 
848 
849 /*
850  * Creates the cgroup and sets up the various limits associated
851  * with it
852  */
virLXCControllerSetupCgroupLimits(virLXCController * ctrl)853 static int virLXCControllerSetupCgroupLimits(virLXCController *ctrl)
854 {
855     virBitmap *auto_nodeset = NULL;
856     int ret = -1;
857     virBitmap *nodeset = NULL;
858     size_t i;
859 
860     VIR_DEBUG("Setting up cgroup resource limits");
861 
862     if (virLXCControllerGetNumadAdvice(ctrl, &auto_nodeset) < 0)
863         goto cleanup;
864 
865     nodeset = virDomainNumatuneGetNodeset(ctrl->def->numa, auto_nodeset, -1);
866 
867     if (!(ctrl->cgroup = virLXCCgroupCreate(ctrl->def,
868                                             getpid(),
869                                             ctrl->nnicindexes,
870                                             ctrl->nicindexes)))
871         goto cleanup;
872 
873     if (virCgroupAddMachineProcess(ctrl->cgroup, ctrl->initpid) < 0)
874         goto cleanup;
875 
876     /* Add all qemu-nbd tasks to the cgroup */
877     for (i = 0; i < ctrl->nnbdpids; i++) {
878         if (virCgroupAddMachineProcess(ctrl->cgroup, ctrl->nbdpids[i]) < 0)
879             goto cleanup;
880     }
881 
882     if (virLXCCgroupSetup(ctrl->def, ctrl->cgroup, nodeset) < 0)
883         goto cleanup;
884 
885     ret = 0;
886  cleanup:
887     virBitmapFree(auto_nodeset);
888     return ret;
889 }
890 
891 
/*
 * Close hook for monitor clients: forgets @client if it is the one
 * currently tracked by the controller and, while shutting down, arms
 * the timer that quits the event loop.
 */
static void virLXCControllerClientCloseHook(virNetServerClient *client)
{
    virLXCController *controller = virNetServerClientGetPrivateData(client);

    VIR_DEBUG("Client %p has closed", client);

    if (controller->client == client) {
        controller->client = NULL;
    }

    if (!controller->inShutdown)
        return;

    VIR_DEBUG("Arm timer to quit event loop");
    virEventUpdateTimeout(controller->timerShutdown, 0);
}
904 
/*
 * Free callback for the client private data. The data is the
 * controller itself, so nothing is released here; the call is only
 * logged.
 */
static void virLXCControllerClientPrivateFree(void *data)
{
    VIR_DEBUG("Got private data free %p", data);
}
910 
/*
 * Private data constructor for a new monitor client.
 *
 * Installs the close hook, records @client as the current client
 * and, for the first client only, sends the init event if an init
 * PID is already known. The controller (@opaque) itself is returned
 * as the client's private data.
 */
static void *virLXCControllerClientPrivateNew(virNetServerClient *client,
                                              void *opaque)
{
    virLXCController *controller = opaque;

    virNetServerClientSetCloseHook(client, virLXCControllerClientCloseHook);
    VIR_DEBUG("Got new client %p", client);
    controller->client = client;

    if (controller->firstClient && controller->initpid) {
        virLXCControllerEventSendInit(controller, controller->initpid);
    }
    controller->firstClient = false;

    return controller;
}
926 
927 
/*
 * Create the RPC server, its UNIX socket service under
 * LXC_STATE_DIR, the monitor RPC program and the daemon owning the
 * server, storing the program and daemon in @ctrl.
 *
 * The socket is created while the security manager's socket label
 * is in effect.
 *
 * Returns 0 on success, -1 on failure (ctrl->daemon is then NULL).
 */
static int virLXCControllerSetupServer(virLXCController *ctrl)
{
    virNetServer *server = NULL;
    virNetServerService *service = NULL;
    g_autofree char *sockpath = NULL;

    sockpath = g_strdup_printf("%s/%s.sock", LXC_STATE_DIR, ctrl->name);

    server = virNetServerNew("LXC", 1,
                             0, 0, 0, 1,
                             0, -1, 0,
                             virLXCControllerClientPrivateNew,
                             NULL,
                             virLXCControllerClientPrivateFree,
                             ctrl);
    if (!server)
        goto failure;

    if (virSecurityManagerSetSocketLabel(ctrl->securityManager, ctrl->def) < 0)
        goto failure;

    service = virNetServerServiceNewUNIX(sockpath,
                                         0700,
                                         0,
                                         0,
                                         NULL,
                                         false,
                                         0,
                                         5);
    if (!service)
        goto failure;

    if (virSecurityManagerClearSocketLabel(ctrl->securityManager, ctrl->def) < 0)
        goto failure;

    if (virNetServerAddService(server, service) < 0)
        goto failure;

    /* Drop the local reference so the failure path does not
     * unref the service a second time. */
    virObjectUnref(service);
    service = NULL;

    ctrl->prog = virNetServerProgramNew(VIR_LXC_MONITOR_PROGRAM,
                                        VIR_LXC_MONITOR_PROGRAM_VERSION,
                                        virLXCMonitorProcs,
                                        virLXCMonitorNProcs);
    if (!ctrl->prog)
        goto failure;

    ctrl->daemon = virNetDaemonNew();
    if (!ctrl->daemon)
        goto failure;

    if (virNetDaemonAddServer(ctrl->daemon, server) < 0)
        goto failure;

    virNetDaemonUpdateServices(ctrl->daemon, true);
    return 0;

 failure:
    virObjectUnref(server);
    virObjectUnref(ctrl->daemon);
    ctrl->daemon = NULL;
    virObjectUnref(service);
    return -1;
}
986 
987 
/*
 * Drop every capability of the controller process.
 *
 * With libcap-ng support compiled in, the capability state is cleared
 * and applied for CAPNG_SELECT_BOTH. Without it only a warning is logged.
 *
 * Returns 0 on success (or when libcap-ng is unavailable), -1 if applying
 * the cleared capability set failed.
 */
static int lxcControllerClearCapabilities(void)
{
#if WITH_CAPNG
    int rc;

    capng_clear(CAPNG_SELECT_BOTH);

    rc = capng_apply(CAPNG_SELECT_BOTH);
    if (rc < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("failed to apply capabilities: %d"), rc);
        return -1;
    }

    return 0;
#else
    VIR_WARN("libcap-ng support not compiled in, unable to clear capabilities");
    return 0;
#endif
}
1005 
/* Set by the SIGCHLD handler when the container's init process was
 * terminated by SIGHUP; virLXCControllerMain() returns 1 when it is set. */
static bool wantReboot;
/* Held while wantReboot is written and by the console I/O and epoll
 * event callbacks for their whole duration. */
static virMutex lock = VIR_MUTEX_INITIALIZER;
1008 
1009 
/*
 * SIGCHLD handler registered with the daemon.
 *
 * @dmn: daemon to stop once the container's init has exited
 * @info: unused
 * @opaque: the virLXCController
 *
 * Reaps one child without blocking. If it is the container's init
 * process the daemon is asked to quit, and termination by SIGHUP is
 * recorded in wantReboot.
 */
static void virLXCControllerSignalChildIO(virNetDaemon *dmn,
                                          siginfo_t *info G_GNUC_UNUSED,
                                          void *opaque)
{
    virLXCController *controller = opaque;
    int status;
    int reaped = waitpid(-1, &status, WNOHANG);

    VIR_DEBUG("Got sig child %d vs %lld", reaped, (long long)controller->initpid);

    /* Only the container's init process is of interest */
    if (reaped != controller->initpid)
        return;

    virNetDaemonQuit(dmn);

    virMutexLock(&lock);
    if (WIFSIGNALED(status) && WTERMSIG(status) == SIGHUP) {
        VIR_DEBUG("Status indicates reboot");
        wantReboot = true;
    }
    virMutexUnlock(&lock);
}
1031 
1032 
/*
 * Bring the epoll registration of one console endpoint in line with its
 * state.
 *
 * @console: console owning the epoll FD and the daemon
 * @side: label used in debug messages ("host" or "container")
 * @closed: whether the endpoint is currently flagged as closed
 * @fd: the endpoint's PTY FD
 * @pending: number of buffered bytes waiting to be written to @fd
 * @registered: event mask currently registered for @fd on
 *              console->epollFd (0 means not registered); updated here
 *
 * While the endpoint is closed, @fd is registered edge-triggered for
 * EPOLLIN (plus EPOLLOUT when data is pending) so that we notice when it
 * becomes accessible again. Once it is no longer closed the registration
 * is removed.
 *
 * Returns 0 on success. On failure the error is reported, the daemon is
 * asked to quit and -1 is returned.
 */
static int
virLXCControllerConsoleSyncEpoll(virLXCControllerConsole *console,
                                 const char *side,
                                 bool closed,
                                 int fd,
                                 size_t pending,
                                 int *registered)
{
    if (closed) {
        struct epoll_event event;
        int events = EPOLLIN | EPOLLET;
        int action;

        if (pending)
            events |= EPOLLOUT;

        /* Already registered with exactly this mask */
        if (events == *registered)
            return 0;

        action = *registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;

        VIR_DEBUG("%s: newEvents=%x oldEvents=%x", side, events, *registered);

        event.events = events;
        event.data.fd = fd;
        if (epoll_ctl(console->epollFd, action, fd, &event) < 0) {
            virReportSystemError(errno, "%s",
                                 _("Unable to add epoll fd"));
            VIR_DEBUG(":fail");
            virNetDaemonQuit(console->daemon);
            return -1;
        }

        *registered = events;
        return 0;
    }

    if (*registered) {
        VIR_DEBUG("%s: stop epoll oldEvents=%x", side, *registered);
        if (epoll_ctl(console->epollFd, EPOLL_CTL_DEL, fd, NULL) < 0) {
            virReportSystemError(errno, "%s",
                                 _("Unable to remove epoll fd"));
            VIR_DEBUG(":fail");
            virNetDaemonQuit(console->daemon);
            return -1;
        }
        *registered = 0;
    }

    return 0;
}

/*
 * Recompute which events are watched on both ends of @console.
 *
 * For each end that is not closed, the event-loop handle asks for
 * READABLE while its receive buffer has room and for WRITABLE while the
 * opposite buffer holds data to forward. For each end that is closed the
 * event-loop handle is given an empty mask and the FD is tracked through
 * the console's epoll FD instead.
 *
 * If updating the host epoll registration fails the container side is
 * left untouched; the daemon has already been asked to quit by then.
 */
static void virLXCControllerConsoleUpdateWatch(virLXCControllerConsole *console)
{
    int hostEvents = 0;
    int contEvents = 0;

    /* If host console is open, then we can look to read/write */
    if (!console->hostClosed) {
        if (console->fromHostLen < sizeof(console->fromHostBuf))
            hostEvents |= VIR_EVENT_HANDLE_READABLE;
        if (console->fromContLen)
            hostEvents |= VIR_EVENT_HANDLE_WRITABLE;
    }

    /* If cont console is open, then we can look to read/write */
    if (!console->contClosed) {
        if (console->fromContLen < sizeof(console->fromContBuf))
            contEvents |= VIR_EVENT_HANDLE_READABLE;
        if (console->fromHostLen)
            contEvents |= VIR_EVENT_HANDLE_WRITABLE;
    }

    VIR_DEBUG("Container watch=%d, events=%d closed=%d; host watch=%d events=%d closed=%d",
              console->contWatch, contEvents, console->contClosed,
              console->hostWatch, hostEvents, console->hostClosed);
    virEventUpdateHandle(console->contWatch, contEvents);
    virEventUpdateHandle(console->hostWatch, hostEvents);

    /* Data destined for the host comes from the container buffer */
    if (virLXCControllerConsoleSyncEpoll(console, "host",
                                         console->hostClosed,
                                         console->hostFd,
                                         console->fromContLen,
                                         &console->hostEpoll) < 0)
        return;

    /* Data destined for the container comes from the host buffer */
    if (virLXCControllerConsoleSyncEpoll(console, "container",
                                         console->contClosed,
                                         console->contFd,
                                         console->fromHostLen,
                                         &console->contEpoll) < 0)
        return;
}
1136 
1137 
/*
 * Event-loop callback for a console's epoll FD.
 *
 * @watch: event-loop watch id (only logged)
 * @fd: FD that triggered the callback (only logged)
 * @events: event-loop event mask (only logged)
 * @opaque: the virLXCControllerConsole
 *
 * Polls the console's epoll FD without blocking. The first event that
 * carries EPOLLIN or EPOLLOUT clears the "closed" flag of the matching
 * side (host if the FD equals hostFd, container otherwise), refreshes
 * the watches and stops. Events without either bit are skipped. The
 * whole callback runs with the global lock held.
 */
static void virLXCControllerConsoleEPoll(int watch, int fd, int events, void *opaque)
{
    virLXCControllerConsole *console = opaque;

    virMutexLock(&lock);
    VIR_DEBUG("IO event watch=%d fd=%d events=%d fromHost=%zu fromcont=%zu",
              watch, fd, events,
              console->fromHostLen,
              console->fromContLen);

    while (1) {
        struct epoll_event event;
        int ret;
        /* Zero timeout: never block inside the event callback */
        ret = epoll_wait(console->epollFd, &event, 1, 0);
        if (ret < 0) {
            if (errno == EINTR)
                continue;
            virReportSystemError(errno, "%s",
                                 _("Unable to wait on epoll"));
            virNetDaemonQuit(console->daemon);
            goto cleanup;
        }

        /* No more pending events */
        if (ret == 0)
            break;

        VIR_DEBUG("fd=%d hostFd=%d contFd=%d hostEpoll=%x contEpoll=%x",
                  event.data.fd, console->hostFd, console->contFd,
                  console->hostEpoll, console->contEpoll);

        /* If we get HUP+dead PID, we just re-enable the main loop
         * which will see the PID has died and exit */
        if ((event.events & (EPOLLIN|EPOLLOUT))) {
            if (event.data.fd == console->hostFd) {
                console->hostClosed = false;
            } else {
                console->contClosed = false;
            }
            virLXCControllerConsoleUpdateWatch(console);
            break;
        }
    }

 cleanup:
    virMutexUnlock(&lock);
}
1184 
/*
 * Event-loop callback shuttling data between the host and container
 * ends of a console.
 *
 * @watch: watch id that fired; compared with console->hostWatch to tell
 *         which end the event is for
 * @fd: FD to read from / write to
 * @events: VIR_EVENT_HANDLE_* mask
 * @opaque: the virLXCControllerConsole
 *
 * READABLE: read into the free tail of this end's receive buffer.
 * WRITABLE: write out the opposite end's buffered data, then shift any
 *           unwritten remainder to the front of the buffer.
 * HANGUP:   flag this end as closed.
 *
 * Afterwards the watches are recomputed. A read or write error other
 * than EINTR/EAGAIN removes both watches and asks the daemon to quit.
 * The whole callback runs with the global lock held.
 */
static void virLXCControllerConsoleIO(int watch, int fd, int events, void *opaque)
{
    virLXCControllerConsole *console = opaque;

    virMutexLock(&lock);
    VIR_DEBUG("IO event watch=%d fd=%d events=%d fromHost=%zu fromcont=%zu",
              watch, fd, events,
              console->fromHostLen,
              console->fromContLen);
    if (events & VIR_EVENT_HANDLE_READABLE) {
        char *buf;
        size_t *len;
        size_t avail;
        ssize_t done;
        /* Incoming data is stored in the buffer named after its origin */
        if (watch == console->hostWatch) {
            buf = console->fromHostBuf;
            len = &console->fromHostLen;
            avail = sizeof(console->fromHostBuf) - *len;
        } else {
            buf = console->fromContBuf;
            len = &console->fromContLen;
            avail = sizeof(console->fromContBuf) - *len;
        }
     reread:
        done = read(fd, buf + *len, avail);
        if (done == -1 && errno == EINTR)
            goto reread;
        if (done == -1 && errno != EAGAIN) {
            virReportSystemError(errno, "%s",
                                 _("Unable to read container pty"));
            goto error;
        }
        if (done > 0) {
            *len += done;
        } else {
            /* EOF (0) or EAGAIN: nothing was buffered */
            VIR_DEBUG("Read fd %d done %d errno %d", fd, (int)done, errno);
        }
    }

    if (events & VIR_EVENT_HANDLE_WRITABLE) {
        char *buf;
        size_t *len;
        ssize_t done;
        /* Outgoing data comes from the opposite end's buffer */
        if (watch == console->hostWatch) {
            buf = console->fromContBuf;
            len = &console->fromContLen;
        } else {
            buf = console->fromHostBuf;
            len = &console->fromHostLen;
        }

     rewrite:
        done = write(fd, buf, *len);
        if (done == -1 && errno == EINTR)
            goto rewrite;
        if (done == -1 && errno != EAGAIN) {
            virReportSystemError(errno, "%s",
                                 _("Unable to write to container pty"));
            goto error;
        }
        if (done > 0) {
            /* Keep whatever was not written at the start of the buffer */
            memmove(buf, buf + done, (*len - done));
            *len -= done;
        } else {
            VIR_DEBUG("Write fd %d done %d errno %d", fd, (int)done, errno);
        }
    }

    if (events & VIR_EVENT_HANDLE_HANGUP) {
        if (watch == console->hostWatch) {
            console->hostClosed = true;
        } else {
            console->contClosed = true;
        }
        VIR_DEBUG("Got EOF on %d %d", watch, fd);
    }

    virLXCControllerConsoleUpdateWatch(console);
    virMutexUnlock(&lock);
    return;

 error:
    /* Fatal I/O error: stop watching both ends and shut the daemon down */
    virEventRemoveHandle(console->contWatch);
    virEventRemoveHandle(console->hostWatch);
    console->contWatch = console->hostWatch = -1;
    virNetDaemonQuit(console->daemon);
    virMutexUnlock(&lock);
}
1273 
1274 
1275 /**
1276  * lxcControllerMain
1277  * @serverFd: server socket fd to accept client requests
1278  * @clientFd: initial client which is the libvirtd daemon
1279  *
1280  * Processes I/O on consoles and the monitor
1281  *
1282  * Returns 0 on success or -1 in case of error
1283  */
virLXCControllerMain(virLXCController * ctrl)1284 static int virLXCControllerMain(virLXCController *ctrl)
1285 {
1286     int rc = -1;
1287     size_t i;
1288 
1289     if (virNetDaemonAddSignalHandler(ctrl->daemon,
1290                                      SIGCHLD,
1291                                      virLXCControllerSignalChildIO,
1292                                      ctrl) < 0)
1293         goto cleanup;
1294 
1295     virResetLastError();
1296 
1297     for (i = 0; i < ctrl->nconsoles; i++) {
1298         if ((ctrl->consoles[i].epollFd = epoll_create1(EPOLL_CLOEXEC)) < 0) {
1299             virReportSystemError(errno, "%s",
1300                                  _("Unable to create epoll fd"));
1301             goto cleanup;
1302         }
1303 
1304         if ((ctrl->consoles[i].epollWatch = virEventAddHandle(ctrl->consoles[i].epollFd,
1305                                                               VIR_EVENT_HANDLE_READABLE,
1306                                                               virLXCControllerConsoleEPoll,
1307                                                               &(ctrl->consoles[i]),
1308                                                               NULL)) < 0) {
1309             virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
1310                            _("Unable to watch epoll FD"));
1311             goto cleanup;
1312         }
1313 
1314         if ((ctrl->consoles[i].hostWatch = virEventAddHandle(ctrl->consoles[i].hostFd,
1315                                                              VIR_EVENT_HANDLE_READABLE,
1316                                                              virLXCControllerConsoleIO,
1317                                                              &(ctrl->consoles[i]),
1318                                                              NULL)) < 0) {
1319             virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
1320                            _("Unable to watch host console PTY"));
1321             goto cleanup;
1322         }
1323 
1324         if ((ctrl->consoles[i].contWatch = virEventAddHandle(ctrl->consoles[i].contFd,
1325                                                              VIR_EVENT_HANDLE_READABLE,
1326                                                              virLXCControllerConsoleIO,
1327                                                              &(ctrl->consoles[i]),
1328                                                              NULL)) < 0) {
1329             virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
1330                            _("Unable to watch host console PTY"));
1331             goto cleanup;
1332         }
1333     }
1334 
1335     virNetDaemonRun(ctrl->daemon);
1336 
1337     if (virGetLastErrorCode() == VIR_ERR_OK)
1338         rc = wantReboot ? 1 : 0;
1339 
1340  cleanup:
1341     for (i = 0; i < ctrl->nconsoles; i++)
1342         virLXCControllerConsoleClose(&(ctrl->consoles[i]));
1343 
1344     return rc;
1345 }
1346 
1347 static unsigned int
virLXCControllerLookupUsernsMap(virDomainIdMapEntry * map,int num,unsigned int src)1348 virLXCControllerLookupUsernsMap(virDomainIdMapEntry *map,
1349                                 int num,
1350                                 unsigned int src)
1351 {
1352     size_t i;
1353 
1354     for (i = 0; i < num; i++) {
1355         if (src > map[i].start && src < map[i].start + map[i].count)
1356             return map[i].target + (src - map[i].start);
1357     }
1358 
1359     return src;
1360 }
1361 
1362 static int
virLXCControllerSetupUsernsMap(virDomainIdMapEntry * map,int num,char * path)1363 virLXCControllerSetupUsernsMap(virDomainIdMapEntry *map,
1364                                int num,
1365                                char *path)
1366 {
1367     g_auto(virBuffer) map_value = VIR_BUFFER_INITIALIZER;
1368     size_t i;
1369 
1370     /* The kernel supports up to 340 lines in /proc/<pid>/{g,u}id_map */
1371     if (num > 340) {
1372         virReportError(VIR_ERR_INVALID_ARG, "%s",
1373                        _("Too many id mappings defined."));
1374         return -1;
1375     }
1376 
1377     for (i = 0; i < num; i++)
1378         virBufferAsprintf(&map_value, "%u %u %u\n",
1379                           map[i].start, map[i].target, map[i].count);
1380 
1381     VIR_DEBUG("Set '%s' to '%s'", path, virBufferCurrentContent(&map_value));
1382 
1383     if (virFileWriteStr(path, virBufferCurrentContent(&map_value), 0) < 0) {
1384         virReportSystemError(errno, _("unable write to %s"), path);
1385         return -1;
1386     }
1387 
1388     return 0;
1389 }
1390 
1391 /**
1392  * virLXCControllerSetupUserns
1393  *
1394  * Set proc files for user namespace
1395  *
1396  * Returns 0 on success or -1 in case of error
1397  */
virLXCControllerSetupUserns(virLXCController * ctrl)1398 static int virLXCControllerSetupUserns(virLXCController *ctrl)
1399 {
1400     g_autofree char *uid_map = NULL;
1401     g_autofree char *gid_map = NULL;
1402 
1403     /* User namespace is disabled for container */
1404     if (ctrl->def->idmap.nuidmap == 0) {
1405         VIR_DEBUG("No uid map, skipping userns setup");
1406         return 0;
1407     }
1408 
1409     VIR_DEBUG("Setting up userns maps");
1410     uid_map = g_strdup_printf("/proc/%d/uid_map", ctrl->initpid);
1411 
1412     if (virLXCControllerSetupUsernsMap(ctrl->def->idmap.uidmap,
1413                                        ctrl->def->idmap.nuidmap,
1414                                        uid_map) < 0)
1415         return -1;
1416 
1417     gid_map = g_strdup_printf("/proc/%d/gid_map", ctrl->initpid);
1418 
1419     if (virLXCControllerSetupUsernsMap(ctrl->def->idmap.gidmap,
1420                                        ctrl->def->idmap.ngidmap,
1421                                        gid_map) < 0)
1422         return -1;
1423 
1424     return 0;
1425 }
1426 
/*
 * Create the private /dev tree of the container.
 *
 * @ctrl: controller holding the domain definition and security manager
 *
 * Sets up LXC_STATE_DIR/<name>.dev via virFileSetupDev() with a 64kb
 * size limit plus whatever mount options the security manager supplies,
 * then hands its ownership to the container with lxcContainerChown().
 *
 * Returns 0 on success, -1 on failure.
 */
static int virLXCControllerSetupDev(virLXCController *ctrl)
{
    g_autofree char *mount_options = NULL;
    g_autofree char *opts = NULL;
    g_autofree char *dev = NULL;

    VIR_DEBUG("Setting up /dev/ for container");

    mount_options = virSecurityManagerGetMountOptions(ctrl->securityManager,
                                                      ctrl->def);
    /* A NULL result must not reach the "%s" below. It is treated as a
     * failure here; NOTE(review): confirm that no security driver uses
     * NULL to mean "no extra options". */
    if (!mount_options) {
        if (virGetLastErrorCode() == VIR_ERR_OK) {
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("Unable to determine security mount options"));
        }
        return -1;
    }

    dev = g_strdup_printf("/%s/%s.dev", LXC_STATE_DIR, ctrl->def->name);

    /*
     * tmpfs is limited to 64kb, since we only have device nodes in there
     * and don't want to DOS the entire OS RAM usage
     */

    opts = g_strdup_printf("mode=755,size=65536%s", mount_options);

    if (virFileSetupDev(dev, opts) < 0)
        return -1;

    if (lxcContainerChown(ctrl->def, dev) < 0)
        return -1;

    return 0;
}
1455 
/*
 * Set up the container's private /dev and create the basic device nodes.
 *
 * @ctrl: controller holding the domain definition
 *
 * After virLXCControllerSetupDev() succeeds, the character devices null,
 * zero, full, random, urandom and tty are created beneath
 * LXC_STATE_DIR/<name>.dev with mode 0666 and handed to the container
 * with lxcContainerChown().
 *
 * Returns 0 on success, -1 on the first failure.
 */
static int virLXCControllerPopulateDevices(virLXCController *ctrl)
{
    const struct {
        int majorNum;
        int minorNum;
        mode_t perms;
        const char *node;
    } nodes[] = {
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_NULL, 0666, "/null" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_ZERO, 0666, "/zero" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_FULL, 0666, "/full" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_RANDOM, 0666, "/random" },
        { LXC_DEV_MAJ_MEMORY, LXC_DEV_MIN_URANDOM, 0666, "/urandom" },
        { LXC_DEV_MAJ_TTY, LXC_DEV_MIN_TTY, 0666, "/tty" },
    };
    size_t idx;

    if (virLXCControllerSetupDev(ctrl) < 0)
        return -1;

    /* Populate /dev/ with a few important bits */
    for (idx = 0; idx < G_N_ELEMENTS(nodes); idx++) {
        g_autofree char *nodePath = g_strdup_printf("/%s/%s.dev/%s",
                                                    LXC_STATE_DIR,
                                                    ctrl->def->name,
                                                    nodes[idx].node);
        dev_t devnum = makedev(nodes[idx].majorNum, nodes[idx].minorNum);

        /* The node is created without permission bits; chmod sets the
         * final mode */
        if (mknod(nodePath, S_IFCHR, devnum) < 0 ||
            chmod(nodePath, nodes[idx].perms)) {
            virReportSystemError(errno,
                                 _("Failed to make device %s"),
                                 nodePath);
            return -1;
        }

        if (lxcContainerChown(ctrl->def, nodePath) < 0)
            return -1;
    }

    return 0;
}
1499 
1500 
1501 static int
virLXCControllerSetupTimers(virLXCController * ctrl)1502 virLXCControllerSetupTimers(virLXCController *ctrl)
1503 {
1504     virDomainDef *def = ctrl->def;
1505     size_t i;
1506 
1507     /* Not sync'ed with Host clock */
1508     if (def->clock.offset != VIR_DOMAIN_CLOCK_OFFSET_LOCALTIME)
1509         return 0;
1510 
1511     for (i = 0; i < def->clock.ntimers; i++) {
1512         virDomainTimerDef *timer = def->clock.timers[i];
1513         g_autofree char *path = NULL;
1514         const char *timer_dev = NULL;
1515         struct stat sb;
1516         dev_t dev;
1517 
1518         /* Check if "present" is set to "no" otherwise enable it. */
1519         if (!timer->present)
1520             continue;
1521 
1522         switch ((virDomainTimerNameType)timer->name) {
1523         case VIR_DOMAIN_TIMER_NAME_PLATFORM:
1524         case VIR_DOMAIN_TIMER_NAME_TSC:
1525         case VIR_DOMAIN_TIMER_NAME_KVMCLOCK:
1526         case VIR_DOMAIN_TIMER_NAME_HYPERVCLOCK:
1527         case VIR_DOMAIN_TIMER_NAME_PIT:
1528         case VIR_DOMAIN_TIMER_NAME_ARMVTIMER:
1529         case VIR_DOMAIN_TIMER_NAME_LAST:
1530             virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
1531                            _("unsupported timer type (name) '%s'"),
1532                            virDomainTimerNameTypeToString(timer->name));
1533             return -1;
1534         case VIR_DOMAIN_TIMER_NAME_RTC:
1535             timer_dev = "/dev/rtc0";
1536             path = g_strdup_printf("/%s/%s.dev/%s", LXC_STATE_DIR,
1537                                    def->name, "/rtc");
1538             break;
1539         case VIR_DOMAIN_TIMER_NAME_HPET:
1540             timer_dev = "/dev/hpet";
1541             path = g_strdup_printf("/%s/%s.dev/%s", LXC_STATE_DIR,
1542                                    ctrl->def->name, "/hpet");
1543             break;
1544         }
1545 
1546         if (!timer_dev)
1547             continue;
1548 
1549         if (stat(timer_dev, &sb) < 0) {
1550             virReportSystemError(errno, _("Unable to access %s"),
1551                                  timer_dev);
1552             return -1;
1553         }
1554 
1555         dev = makedev(major(sb.st_rdev), minor(sb.st_rdev));
1556         if (mknod(path, S_IFCHR, dev) < 0 ||
1557             chmod(path, sb.st_mode)) {
1558             virReportSystemError(errno,
1559                                  _("Failed to make device %s"),
1560                                  path);
1561             return -1;
1562         }
1563 
1564         if (lxcContainerChown(def, path) < 0)
1565             return -1;
1566     }
1567 
1568     return 0;
1569 }
1570 
1571 
1572 static int
virLXCControllerSetupHostdevSubsysUSB(virDomainDef * vmDef,virDomainHostdevDef * def,virSecurityManager * securityDriver)1573 virLXCControllerSetupHostdevSubsysUSB(virDomainDef *vmDef,
1574                                       virDomainHostdevDef *def,
1575                                       virSecurityManager *securityDriver)
1576 {
1577     g_autofree char *src = NULL;
1578     g_autofree char *dstdir = NULL;
1579     g_autofree char *dstfile = NULL;
1580     g_autofree char *vroot = NULL;
1581     struct stat sb;
1582     mode_t mode;
1583     virDomainHostdevSubsysUSB *usbsrc = &def->source.subsys.u.usb;
1584 
1585     src = g_strdup_printf(USB_DEVFS "/%03d/%03d", usbsrc->bus, usbsrc->device);
1586 
1587     vroot = g_strdup_printf("/%s/%s.dev/bus/usb/", LXC_STATE_DIR, vmDef->name);
1588 
1589     dstdir = g_strdup_printf("%s/%03d/", vroot, usbsrc->bus);
1590 
1591     dstfile = g_strdup_printf("%s/%03d", dstdir, usbsrc->device);
1592 
1593     if (stat(src, &sb) < 0) {
1594         virReportSystemError(errno,
1595                              _("Unable to access %s"), src);
1596         return -1;
1597     }
1598 
1599     if (!S_ISCHR(sb.st_mode)) {
1600         virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
1601                        _("USB source %s was not a character device"),
1602                        src);
1603         return -1;
1604     }
1605 
1606     mode = 0700 | S_IFCHR;
1607 
1608     if (g_mkdir_with_parents(dstdir, 0777) < 0) {
1609         virReportSystemError(errno,
1610                              _("Unable to create %s"), dstdir);
1611         return -1;
1612     }
1613 
1614     VIR_DEBUG("Creating dev %s (%d,%d)",
1615               dstfile, major(sb.st_rdev), minor(sb.st_rdev));
1616     if (mknod(dstfile, mode, sb.st_rdev) < 0) {
1617         virReportSystemError(errno,
1618                              _("Unable to create device %s"),
1619                              dstfile);
1620         return -1;
1621     }
1622 
1623     if (lxcContainerChown(vmDef, dstfile) < 0)
1624         return -1;
1625 
1626     if (virSecurityManagerSetHostdevLabel(securityDriver,
1627                                           vmDef, def, vroot) < 0)
1628         return -1;
1629 
1630     return 0;
1631 }
1632 
1633 
1634 static int
virLXCControllerSetupHostdevCapsStorage(virDomainDef * vmDef,virDomainHostdevDef * def,virSecurityManager * securityDriver)1635 virLXCControllerSetupHostdevCapsStorage(virDomainDef *vmDef,
1636                                         virDomainHostdevDef *def,
1637                                         virSecurityManager *securityDriver)
1638 {
1639     g_autofree char *dst = NULL;
1640     g_autofree char *path = NULL;
1641     int len = 0;
1642     int ret = -1;
1643     struct stat sb;
1644     mode_t mode;
1645     char *dev = def->source.caps.u.storage.block;
1646 
1647     if (dev == NULL) {
1648         virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
1649                        _("Missing storage host block path"));
1650         goto cleanup;
1651     }
1652 
1653     path = g_strdup(dev);
1654 
1655     while (*(path + len) == '/')
1656         len++;
1657 
1658     dst = g_strdup_printf("/%s/%s.dev/%s", LXC_STATE_DIR, vmDef->name,
1659                           strchr(path + len, '/'));
1660 
1661     if (stat(dev, &sb) < 0) {
1662         virReportSystemError(errno,
1663                              _("Unable to access %s"),
1664                              dev);
1665         goto cleanup;
1666     }
1667 
1668     if (!S_ISBLK(sb.st_mode)) {
1669         virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
1670                        _("Storage source %s must be a block device"),
1671                        dev);
1672         goto cleanup;
1673     }
1674 
1675     if (lxcContainerSetupHostdevCapsMakePath(dst) < 0) {
1676         virReportError(errno,
1677                        _("Failed to create directory for device %s"),
1678                        dev);
1679         goto cleanup;
1680     }
1681 
1682     mode = 0700 | S_IFBLK;
1683 
1684     VIR_DEBUG("Creating dev %s (%d,%d)", dst,
1685               major(sb.st_rdev), minor(sb.st_rdev));
1686     if (mknod(dst, mode, sb.st_rdev) < 0) {
1687         virReportSystemError(errno,
1688                              _("Unable to create device %s"),
1689                              dst);
1690         goto cleanup;
1691     }
1692 
1693     if (lxcContainerChown(vmDef, dst) < 0)
1694         goto cleanup;
1695 
1696     def->source.caps.u.storage.block = dst;
1697     if (virSecurityManagerSetHostdevLabel(securityDriver, vmDef, def, NULL) < 0)
1698         goto cleanup;
1699 
1700     ret = 0;
1701 
1702  cleanup:
1703     def->source.caps.u.storage.block = dev;
1704     return ret;
1705 }
1706 
1707 
1708 static int
virLXCControllerSetupHostdevCapsMisc(virDomainDef * vmDef,virDomainHostdevDef * def,virSecurityManager * securityDriver)1709 virLXCControllerSetupHostdevCapsMisc(virDomainDef *vmDef,
1710                                      virDomainHostdevDef *def,
1711                                      virSecurityManager *securityDriver)
1712 {
1713     g_autofree char *dst = NULL;
1714     g_autofree char *path = NULL;
1715     int len = 0;
1716     int ret = -1;
1717     struct stat sb;
1718     mode_t mode;
1719     char *dev = def->source.caps.u.misc.chardev;
1720 
1721     if (dev == NULL) {
1722         virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
1723                        _("Missing storage host block path"));
1724         goto cleanup;
1725     }
1726 
1727     path = g_strdup(dev);
1728 
1729     while (*(path + len) == '/')
1730         len++;
1731 
1732     dst = g_strdup_printf("/%s/%s.dev/%s", LXC_STATE_DIR, vmDef->name,
1733                           strchr(path + len, '/'));
1734 
1735     if (stat(dev, &sb) < 0) {
1736         virReportSystemError(errno,
1737                              _("Unable to access %s"),
1738                              dev);
1739         goto cleanup;
1740     }
1741 
1742     if (!S_ISCHR(sb.st_mode)) {
1743         virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
1744                        _("Storage source %s must be a character device"),
1745                        dev);
1746         goto cleanup;
1747     }
1748 
1749     if (lxcContainerSetupHostdevCapsMakePath(dst) < 0) {
1750         virReportError(errno,
1751                        _("Failed to create directory for device %s"),
1752                        dst);
1753         goto cleanup;
1754     }
1755 
1756     mode = 0700 | S_IFCHR;
1757 
1758     VIR_DEBUG("Creating dev %s (%d,%d)", dst,
1759               major(sb.st_rdev), minor(sb.st_rdev));
1760     if (mknod(dst, mode, sb.st_rdev) < 0) {
1761         virReportSystemError(errno,
1762                              _("Unable to create device %s"),
1763                              dev);
1764         goto cleanup;
1765     }
1766 
1767     if (lxcContainerChown(vmDef, dst) < 0)
1768         goto cleanup;
1769 
1770     def->source.caps.u.misc.chardev = dst;
1771     if (virSecurityManagerSetHostdevLabel(securityDriver, vmDef, def, NULL) < 0)
1772         goto cleanup;
1773 
1774     ret = 0;
1775 
1776  cleanup:
1777     def->source.caps.u.misc.chardev = dev;
1778     return ret;
1779 }
1780 
1781 static int
virLXCControllerSetupHostdevSubsys(virDomainDef * vmDef,virDomainHostdevDef * def,virSecurityManager * securityDriver)1782 virLXCControllerSetupHostdevSubsys(virDomainDef *vmDef,
1783                                    virDomainHostdevDef *def,
1784                                    virSecurityManager *securityDriver)
1785 {
1786     switch (def->source.subsys.type) {
1787     case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB:
1788         return virLXCControllerSetupHostdevSubsysUSB(vmDef,
1789                                                      def,
1790                                                      securityDriver);
1791 
1792     default:
1793         virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
1794                        _("Unsupported host device mode %s"),
1795                        virDomainHostdevSubsysTypeToString(def->source.subsys.type));
1796         return -1;
1797     }
1798 }
1799 
1800 
1801 static int
virLXCControllerSetupHostdevCaps(virDomainDef * vmDef,virDomainHostdevDef * def,virSecurityManager * securityDriver)1802 virLXCControllerSetupHostdevCaps(virDomainDef *vmDef,
1803                                  virDomainHostdevDef *def,
1804                                  virSecurityManager *securityDriver)
1805 {
1806     switch (def->source.subsys.type) {
1807     case VIR_DOMAIN_HOSTDEV_CAPS_TYPE_STORAGE:
1808         return virLXCControllerSetupHostdevCapsStorage(vmDef,
1809                                                        def,
1810                                                        securityDriver);
1811 
1812     case VIR_DOMAIN_HOSTDEV_CAPS_TYPE_MISC:
1813         return virLXCControllerSetupHostdevCapsMisc(vmDef,
1814                                                     def,
1815                                                     securityDriver);
1816 
1817     case VIR_DOMAIN_HOSTDEV_CAPS_TYPE_NET:
1818         return 0; /* case is handled in virLXCControllerMoveInterfaces */
1819 
1820     default:
1821         virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
1822                        _("Unsupported host device mode %s"),
1823                        virDomainHostdevCapsTypeToString(def->source.subsys.type));
1824         return -1;
1825     }
1826 }
1827 
1828 
1829 static int
virLXCControllerSetupAllHostdevs(virLXCController * ctrl)1830 virLXCControllerSetupAllHostdevs(virLXCController *ctrl)
1831 {
1832     size_t i;
1833     virDomainDef *vmDef = ctrl->def;
1834     virSecurityManager *securityDriver = ctrl->securityManager;
1835     VIR_DEBUG("Setting up hostdevs");
1836 
1837     for (i = 0; i < vmDef->nhostdevs; i++) {
1838         virDomainHostdevDef *def = vmDef->hostdevs[i];
1839         switch (def->mode) {
1840         case VIR_DOMAIN_HOSTDEV_MODE_SUBSYS:
1841             if (virLXCControllerSetupHostdevSubsys(vmDef,
1842                                                    def,
1843                                                    securityDriver) < 0)
1844                 return -1;
1845             break;
1846         case VIR_DOMAIN_HOSTDEV_MODE_CAPABILITIES:
1847             if (virLXCControllerSetupHostdevCaps(vmDef,
1848                                                  def,
1849                                                  securityDriver) < 0)
1850                 return -1;
1851             break;
1852         default:
1853             virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
1854                            _("Unsupported host device mode %s"),
1855                            virDomainHostdevModeTypeToString(def->mode));
1856             return -1;
1857         }
1858     }
1859 
1860     VIR_DEBUG("Setup all hostdevs");
1861     return 0;
1862 }
1863 
1864 
/**
 * virLXCControllerSetupDisk:
 * @ctrl: the LXC controller
 * @def: disk definition; must be a block-type disk with a source path
 * @securityDriver: security manager used to label the created node
 *
 * Creates a device node named after the disk target inside the
 * container's "<name>.dev" directory under LXC_STATE_DIR, using the
 * device number of the disk source. The node is then chowned via
 * lxcContainerChown() and labelled.
 *
 * Returns 0 on success, -1 on error. The source path of @def is
 * always restored to its original value before returning.
 */
static int virLXCControllerSetupDisk(virLXCController *ctrl,
                                     virDomainDiskDef *def,
                                     virSecurityManager *securityDriver)
{
    g_autofree char *devnode = NULL;
    char *origsrc = def->src->path;
    struct stat st;
    mode_t devmode;
    int rc = -1;

    if (virDomainDiskGetType(def) != VIR_STORAGE_TYPE_BLOCK) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Can't setup disk for non-block device"));
        goto restore;
    }

    if (origsrc == NULL) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                       _("Can't setup disk without media"));
        goto restore;
    }

    devnode = g_strdup_printf("/%s/%s.dev/%s", LXC_STATE_DIR, ctrl->def->name,
                              def->dst);

    if (stat(origsrc, &st) < 0) {
        virReportSystemError(errno,
                             _("Unable to access %s"), origsrc);
        goto restore;
    }

    if (!(S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                       _("Disk source %s must be a character/block device"),
                       origsrc);
        goto restore;
    }

    /* Owner-only permissions, same node type as the source */
    devmode = 0700 | (S_ISCHR(st.st_mode) ? S_IFCHR : S_IFBLK);

    /* The node name in the container need not match what the
     * major:minor pair would normally imply; there is no better
     * option, so containerized apps are expected to cope. */
    VIR_DEBUG("Creating dev %s (%d,%d) from %s",
              devnode, major(st.st_rdev), minor(st.st_rdev), origsrc);
    if (mknod(devnode, devmode, st.st_rdev) < 0) {
        virReportSystemError(errno,
                             _("Unable to create device %s"),
                             devnode);
        goto restore;
    }

    if (lxcContainerChown(ctrl->def, devnode) < 0)
        goto restore;

    /* Labelling acts on the source path, but the node to label is the
     * one just created, so point the config at it temporarily. */
    def->src->path = devnode;
    if (virSecurityManagerSetImageLabel(securityDriver, ctrl->def, def->src,
                                        VIR_SECURITY_DOMAIN_IMAGE_LABEL_BACKING_CHAIN) < 0)
        goto restore;

    rc = 0;

 restore:
    def->src->path = origsrc;
    return rc;
}
1940 
/**
 * virLXCControllerSetupAllDisks:
 * @ctrl: the LXC controller
 *
 * Sets up every disk of the controller's domain definition in order,
 * stopping at the first one that fails.
 *
 * Returns 0 on success, -1 on the first failure.
 */
static int virLXCControllerSetupAllDisks(virLXCController *ctrl)
{
    size_t idx;

    VIR_DEBUG("Setting up disks");

    for (idx = 0; idx < ctrl->def->ndisks; idx++) {
        int rc = virLXCControllerSetupDisk(ctrl,
                                           ctrl->def->disks[idx],
                                           ctrl->securityManager);

        if (rc < 0)
            return -1;
    }

    VIR_DEBUG("Setup all disks");
    return 0;
}
1955 
1956 
1957 
1958 /**
1959  * virLXCControllerMoveInterfaces
1960  * @nveths: number of interfaces
1961  * @veths: interface names
1962  * @container: pid of container
1963  *
1964  * Moves network interfaces into a container's namespace
1965  *
1966  * Returns 0 on success or -1 in case of error
1967  */
virLXCControllerMoveInterfaces(virLXCController * ctrl)1968 static int virLXCControllerMoveInterfaces(virLXCController *ctrl)
1969 {
1970     size_t i;
1971     virDomainDef *def = ctrl->def;
1972 
1973     for (i = 0; i < ctrl->nveths; i++) {
1974         if (virNetDevSetNamespace(ctrl->veths[i], ctrl->initpid) < 0)
1975             return -1;
1976     }
1977 
1978     for (i = 0; i < def->nhostdevs; i ++) {
1979         virDomainHostdevDef *hdev = def->hostdevs[i];
1980         virDomainHostdevCaps hdcaps;
1981 
1982         if (hdev->mode != VIR_DOMAIN_HOSTDEV_MODE_CAPABILITIES)
1983             continue;
1984 
1985         hdcaps = hdev->source.caps;
1986 
1987         if (hdcaps.type != VIR_DOMAIN_HOSTDEV_CAPS_TYPE_NET)
1988             continue;
1989 
1990         if (virNetDevSetNamespace(hdcaps.u.net.ifname, ctrl->initpid) < 0)
1991             return -1;
1992     }
1993 
1994     return 0;
1995 }
1996 
1997 
1998 /**
1999  * virLXCControllerDeleteInterfaces:
2000  * @ctrl: the LXC controller
2001  *
2002  * Cleans up the container interfaces by deleting the veth device pairs.
2003  *
2004  * Returns 0 on success or -1 in case of error
2005  */
virLXCControllerDeleteInterfaces(virLXCController * ctrl)2006 static int virLXCControllerDeleteInterfaces(virLXCController *ctrl)
2007 {
2008     size_t i;
2009     int ret = 0;
2010 
2011     for (i = 0; i < ctrl->nveths; i++) {
2012         if (virNetDevVethDelete(ctrl->veths[i]) < 0)
2013             ret = -1;
2014     }
2015 
2016     return ret;
2017 }
2018 
2019 
/**
 * lxcSetPersonality:
 * @def: domain definition whose guest architecture is examined
 *
 * If the host has an alternate 32-bit architecture and the guest is
 * configured for exactly that architecture, switches the calling
 * process to the PER_LINUX32 personality. Otherwise nothing is done.
 *
 * Returns 0 on success (including when no change is needed), -1 if
 * the personality() call fails.
 */
static int lxcSetPersonality(virDomainDef *def)
{
    virArch altArch;

    VIR_DEBUG("Checking for 32-bit personality");
    altArch = lxcContainerGetAlt32bitArch(virArchFromHost());

    /* No 32-bit variant of the host arch, or guest does not use it */
    if (!altArch || def->os.arch != altArch)
        return 0;

    VIR_DEBUG("Setting personality to %s",
              virArchToString(altArch));

    if (personality(PER_LINUX32) >= 0)
        return 0;

    virReportSystemError(errno, _("Unable to request personality for %s on %s"),
                         virArchToString(altArch),
                         virArchToString(virArchFromHost()));
    return -1;
}
2039 
2040 /* Create a private tty using the private devpts at PTMX, returning
2041  * the primary in @ttyprimary and the name of the secondary, _from the
2042  * perspective of the guest after remounting file systems_, in
2043  * @ttyName.  Heavily borrowed from glibc, but doesn't require that
2044  * devpts == "/dev/pts" */
2045 static int
lxcCreateTty(virLXCController * ctrl,int * ttyprimary,char ** ttyName,char ** ttyHostPath)2046 lxcCreateTty(virLXCController *ctrl, int *ttyprimary,
2047              char **ttyName, char **ttyHostPath)
2048 {
2049     int ret = -1;
2050     int ptyno;
2051     int unlock = 0;
2052 
2053     if ((*ttyprimary = open(ctrl->devptmx, O_RDWR|O_NOCTTY|O_NONBLOCK)) < 0)
2054         goto cleanup;
2055 
2056     if (ioctl(*ttyprimary, TIOCSPTLCK, &unlock) < 0)
2057         goto cleanup;
2058 
2059     if (ioctl(*ttyprimary, TIOCGPTN, &ptyno) < 0)
2060         goto cleanup;
2061 
2062     /* If mount() succeeded at honoring newinstance, then the kernel
2063      * was new enough to also honor the mode=0620,gid=5 options, which
2064      * guarantee that the new pty already has correct permissions; so
2065      * while glibc has to fstat(), fchmod(), and fchown() for older
2066      * kernels, we can skip those steps.  ptyno shouldn't currently be
2067      * anything other than 0, but let's play it safe.  */
2068     *ttyName = g_strdup_printf("/dev/pts/%d", ptyno);
2069     *ttyHostPath = g_strdup_printf("/%s/%s.devpts/%d", LXC_STATE_DIR, ctrl->def->name, ptyno);
2070 
2071     ret = 0;
2072 
2073  cleanup:
2074     if (ret != 0) {
2075         VIR_FORCE_CLOSE(*ttyprimary);
2076         g_free(*ttyName);
2077         *ttyName = NULL;
2078     }
2079 
2080     return ret;
2081 }
2082 
2083 
/*
 * virLXCControllerSetupPrivateNS:
 *
 * For a chroot style setup a private /dev/pts has to be prepared for
 * the child now; the child later moves it into position.
 *
 * The difficulty is that 'virsh console' must use /dev/pts from the
 * host OS while the guest OS must use its own /dev/pts. The
 * controller (libvirt_lxc) therefore has to see and use both
 * instances, yet it runs in the host OS context and must not expose
 * the guest's /dev/pts there.
 *
 * The solution is a private mount namespace: the guest's new
 * /dev/pts is visible to the controller without becoming visible to
 * the host OS, and mount propagation out of the root FS is disabled
 * in case it was bi-directional. All of that is delegated to
 * virProcessSetupPrivateMountNS() (presumably via
 * unshare(CLONE_NEWNS); the helper is not visible here).
 *
 * Returns whatever virProcessSetupPrivateMountNS() returns.
 */
static int
virLXCControllerSetupPrivateNS(void)
{
    return virProcessSetupPrivateMountNS();
}
2110 
2111 
2112 static int
virLXCControllerSetupDevPTS(virLXCController * ctrl)2113 virLXCControllerSetupDevPTS(virLXCController *ctrl)
2114 {
2115     g_autofree char *mount_options = NULL;
2116     g_autofree char *opts = NULL;
2117     g_autofree char *devpts = NULL;
2118     gid_t ptsgid = 5;
2119 
2120     VIR_DEBUG("Setting up private /dev/pts");
2121 
2122     mount_options = virSecurityManagerGetMountOptions(ctrl->securityManager,
2123                                                       ctrl->def);
2124 
2125     devpts = g_strdup_printf("%s/%s.devpts", LXC_STATE_DIR, ctrl->def->name);
2126     ctrl->devptmx = g_strdup_printf("%s/%s.devpts/ptmx", LXC_STATE_DIR, ctrl->def->name);
2127 
2128     if (g_mkdir_with_parents(devpts, 0777) < 0) {
2129         virReportSystemError(errno,
2130                              _("Failed to make path %s"),
2131                              devpts);
2132         return -1;
2133     }
2134 
2135     if (ctrl->def->idmap.ngidmap)
2136         ptsgid = virLXCControllerLookupUsernsMap(ctrl->def->idmap.gidmap,
2137                                                  ctrl->def->idmap.ngidmap,
2138                                                  ptsgid);
2139 
2140     /* XXX should we support gid=X for X!=5 for distros which use
2141      * a different gid for tty?  */
2142     opts = g_strdup_printf("newinstance,ptmxmode=0666,mode=0620,gid=%u%s", ptsgid,
2143                            NULLSTR_EMPTY(mount_options));
2144 
2145     VIR_DEBUG("Mount devpts on %s type=tmpfs flags=0x%x, opts=%s",
2146               devpts, MS_NOSUID, opts);
2147     if (mount("devpts", devpts, "devpts", MS_NOSUID, opts) < 0) {
2148         virReportSystemError(errno,
2149                              _("Failed to mount devpts on %s"),
2150                              devpts);
2151         return -1;
2152     }
2153 
2154     if (access(ctrl->devptmx, R_OK) < 0) {
2155         virReportSystemError(ENOSYS, "%s",
2156                              _("Kernel does not support private devpts"));
2157         return -1;
2158     }
2159 
2160     if ((lxcContainerChown(ctrl->def, ctrl->devptmx) < 0) ||
2161         (lxcContainerChown(ctrl->def, devpts) < 0))
2162         return -1;
2163 
2164     return 0;
2165 }
2166 
2167 
2168 static int
virLXCControllerSetupFuse(virLXCController * ctrl)2169 virLXCControllerSetupFuse(virLXCController *ctrl)
2170 {
2171     return lxcSetupFuse(&ctrl->fuse, ctrl->def);
2172 }
2173 
2174 static int
virLXCControllerStartFuse(virLXCController * ctrl)2175 virLXCControllerStartFuse(virLXCController *ctrl)
2176 {
2177     return lxcStartFuse(ctrl->fuse);
2178 }
2179 
2180 static int
virLXCControllerSetupConsoles(virLXCController * ctrl,char ** containerTTYPaths)2181 virLXCControllerSetupConsoles(virLXCController *ctrl,
2182                               char **containerTTYPaths)
2183 {
2184     size_t i;
2185 
2186     for (i = 0; i < ctrl->nconsoles; i++) {
2187         g_autofree char *ttyHostPath = NULL;
2188 
2189         VIR_DEBUG("Opening tty on private %s", ctrl->devptmx);
2190         if (lxcCreateTty(ctrl,
2191                          &ctrl->consoles[i].contFd,
2192                          &containerTTYPaths[i], &ttyHostPath) < 0) {
2193             virReportSystemError(errno, "%s",
2194                                  _("Failed to allocate tty"));
2195             return -1;
2196         }
2197 
2198         /* Change the owner of tty device to the root user of container */
2199         if (lxcContainerChown(ctrl->def, ttyHostPath) < 0)
2200             return -1;
2201     }
2202 
2203     return 0;
2204 }
2205 
2206 
2207 static void
virLXCControllerEventSend(virLXCController * ctrl,int procnr,xdrproc_t proc,void * data)2208 virLXCControllerEventSend(virLXCController *ctrl,
2209                           int procnr,
2210                           xdrproc_t proc,
2211                           void *data)
2212 {
2213     virNetMessage *msg;
2214 
2215     if (!ctrl->client) {
2216         VIR_WARN("Dropping event %d because libvirtd is not connected", procnr);
2217         return;
2218     }
2219 
2220     VIR_DEBUG("Send event %d client=%p", procnr, ctrl->client);
2221     if (!(msg = virNetMessageNew(false)))
2222         goto error;
2223 
2224     msg->header.prog = virNetServerProgramGetID(ctrl->prog);
2225     msg->header.vers = virNetServerProgramGetVersion(ctrl->prog);
2226     msg->header.proc = procnr;
2227     msg->header.type = VIR_NET_MESSAGE;
2228     msg->header.serial = 1;
2229     msg->header.status = VIR_NET_OK;
2230 
2231     if (virNetMessageEncodeHeader(msg) < 0)
2232         goto error;
2233 
2234     if (virNetMessageEncodePayload(msg, proc, data) < 0)
2235         goto error;
2236 
2237     VIR_DEBUG("Queue event %d %zu", procnr, msg->bufferLength);
2238     if (virNetServerClientSendMessage(ctrl->client, msg) < 0)
2239         goto error;
2240 
2241     xdr_free(proc, data);
2242     return;
2243 
2244  error:
2245     virNetMessageFree(msg);
2246     xdr_free(proc, data);
2247 }
2248 
2249 
2250 static int
virLXCControllerEventSendExit(virLXCController * ctrl,int exitstatus)2251 virLXCControllerEventSendExit(virLXCController *ctrl,
2252                               int exitstatus)
2253 {
2254     virLXCMonitorExitEventMsg msg;
2255 
2256     VIR_DEBUG("Exit status %d (client=%p)", exitstatus, ctrl->client);
2257     memset(&msg, 0, sizeof(msg));
2258     switch (exitstatus) {
2259     case 0:
2260         msg.status = VIR_LXC_MONITOR_EXIT_STATUS_SHUTDOWN;
2261         break;
2262     case 1:
2263         msg.status = VIR_LXC_MONITOR_EXIT_STATUS_REBOOT;
2264         break;
2265     default:
2266         msg.status = VIR_LXC_MONITOR_EXIT_STATUS_ERROR;
2267         break;
2268     }
2269 
2270     virLXCControllerEventSend(ctrl,
2271                               VIR_LXC_MONITOR_PROC_EXIT_EVENT,
2272                               (xdrproc_t)xdr_virLXCMonitorExitEventMsg,
2273                               (void*)&msg);
2274 
2275     if (ctrl->client) {
2276         VIR_DEBUG("Waiting for client to complete dispatch");
2277         ctrl->inShutdown = true;
2278         virNetServerClientDelayedClose(ctrl->client);
2279         virNetDaemonRun(ctrl->daemon);
2280     }
2281     VIR_DEBUG("Client has gone away");
2282     return 0;
2283 }
2284 
2285 
2286 static int
virLXCControllerEventSendInit(virLXCController * ctrl,pid_t initpid)2287 virLXCControllerEventSendInit(virLXCController *ctrl,
2288                               pid_t initpid)
2289 {
2290     virLXCMonitorInitEventMsg msg;
2291 
2292     VIR_DEBUG("Init pid %lld", (long long)initpid);
2293     memset(&msg, 0, sizeof(msg));
2294     msg.initpid = initpid;
2295 
2296     virLXCControllerEventSend(ctrl,
2297                               VIR_LXC_MONITOR_PROC_INIT_EVENT,
2298                               (xdrproc_t)xdr_virLXCMonitorInitEventMsg,
2299                               (void*)&msg);
2300     return 0;
2301 }
2302 
2303 
2304 static int
virLXCControllerRun(virLXCController * ctrl)2305 virLXCControllerRun(virLXCController *ctrl)
2306 {
2307     int rc = -1;
2308     int control[2] = { -1, -1};
2309     int containerhandshake[2] = { -1, -1 };
2310     char **containerTTYPaths = g_new0(char *, ctrl->nconsoles);
2311     size_t i;
2312 
2313     if (socketpair(PF_UNIX, SOCK_STREAM, 0, control) < 0) {
2314         virReportSystemError(errno, "%s",
2315                              _("sockpair failed"));
2316         goto cleanup;
2317     }
2318 
2319     if (socketpair(PF_UNIX, SOCK_STREAM, 0, containerhandshake) < 0) {
2320         virReportSystemError(errno, "%s",
2321                              _("socketpair failed"));
2322         goto cleanup;
2323     }
2324 
2325     if (virLXCControllerSetupPrivateNS() < 0)
2326         goto cleanup;
2327 
2328     if (virLXCControllerSetupLoopDevices(ctrl) < 0)
2329         goto cleanup;
2330 
2331     if (virLXCControllerSetupResourceLimits(ctrl) < 0)
2332         goto cleanup;
2333 
2334     if (virLXCControllerSetupDevPTS(ctrl) < 0)
2335         goto cleanup;
2336 
2337     if (virLXCControllerPopulateDevices(ctrl) < 0)
2338         goto cleanup;
2339 
2340     if (virLXCControllerSetupTimers(ctrl) < 0)
2341         goto cleanup;
2342 
2343     if (virLXCControllerSetupAllDisks(ctrl) < 0)
2344         goto cleanup;
2345 
2346     if (virLXCControllerSetupAllHostdevs(ctrl) < 0)
2347         goto cleanup;
2348 
2349     if (virLXCControllerSetupFuse(ctrl) < 0)
2350         goto cleanup;
2351 
2352     if (virLXCControllerSetupConsoles(ctrl, containerTTYPaths) < 0)
2353         goto cleanup;
2354 
2355     if (lxcSetPersonality(ctrl->def) < 0)
2356         goto cleanup;
2357 
2358     if ((ctrl->initpid = lxcContainerStart(ctrl->def,
2359                                            ctrl->securityManager,
2360                                            ctrl->nveths,
2361                                            ctrl->veths,
2362                                            ctrl->npassFDs,
2363                                            ctrl->passFDs,
2364                                            control[1],
2365                                            containerhandshake[1],
2366                                            ctrl->nsFDs,
2367                                            ctrl->nconsoles,
2368                                            containerTTYPaths)) < 0)
2369         goto cleanup;
2370     VIR_FORCE_CLOSE(control[1]);
2371     VIR_FORCE_CLOSE(containerhandshake[1]);
2372 
2373     for (i = 0; i < ctrl->npassFDs; i++)
2374         VIR_FORCE_CLOSE(ctrl->passFDs[i]);
2375 
2376     if (ctrl->nsFDs)
2377         for (i = 0; i < VIR_LXC_DOMAIN_NAMESPACE_LAST; i++)
2378             VIR_FORCE_CLOSE(ctrl->nsFDs[i]);
2379 
2380     if (virLXCControllerSetupCgroupLimits(ctrl) < 0)
2381         goto cleanup;
2382 
2383     /* Allow daemon to detect CGroups. */
2384     if (virLXCControllerDaemonHandshakeCont(ctrl) < 0 ||
2385         virLXCControllerDaemonHandshakeWait(ctrl) < 0)
2386         goto cleanup;
2387 
2388     if (virLXCControllerSetupUserns(ctrl) < 0)
2389         goto cleanup;
2390 
2391     if (virLXCControllerMoveInterfaces(ctrl) < 0)
2392         goto cleanup;
2393 
2394     if (virLXCControllerStartFuse(ctrl) < 0)
2395         goto cleanup;
2396 
2397     if (lxcContainerSendContinue(control[0]) < 0) {
2398         virReportSystemError(errno, "%s",
2399                              _("Unable to send container continue message"));
2400         goto cleanup;
2401     }
2402 
2403     if (lxcContainerWaitForContinue(containerhandshake[0]) < 0) {
2404         virReportSystemError(errno, "%s",
2405                              _("error receiving signal from container"));
2406         goto cleanup;
2407     }
2408 
2409     /* ...and reduce our privileges */
2410     if (lxcControllerClearCapabilities() < 0)
2411         goto cleanup;
2412 
2413     for (i = 0; i < ctrl->nconsoles; i++)
2414         if (virLXCControllerConsoleSetNonblocking(&(ctrl->consoles[i])) < 0)
2415             goto cleanup;
2416 
2417     /* Allow daemon to connect to the monitor. */
2418     if (virLXCControllerDaemonHandshakeCont(ctrl) < 0)
2419         goto cleanup;
2420 
2421     /* and preemptively close handshakeFds */
2422     for (i = 0; i < G_N_ELEMENTS(ctrl->handshakeFds); i++)
2423         VIR_FORCE_CLOSE(ctrl->handshakeFds[i]);
2424 
2425     /* We must not hold open a dbus connection for life
2426      * of LXC instance, since dbus-daemon is limited to
2427      * only a few 100 connections by default
2428      */
2429     virGDBusCloseSystemBus();
2430 
2431     rc = virLXCControllerMain(ctrl);
2432 
2433     virLXCControllerEventSendExit(ctrl, rc);
2434 
2435  cleanup:
2436     VIR_FORCE_CLOSE(control[0]);
2437     VIR_FORCE_CLOSE(control[1]);
2438     VIR_FORCE_CLOSE(containerhandshake[0]);
2439     VIR_FORCE_CLOSE(containerhandshake[1]);
2440 
2441     for (i = 0; i < ctrl->nconsoles; i++)
2442         g_free(containerTTYPaths[i]);
2443     g_free(containerTTYPaths);
2444 
2445     virLXCControllerStopInit(ctrl);
2446 
2447     return rc;
2448 }
2449 
2450 
2451 static int
parseFDPair(const char * arg,int (* fd)[2])2452 parseFDPair(const char *arg,
2453             int (*fd)[2])
2454 {
2455     g_auto(GStrv) fds = NULL;
2456 
2457     fds = g_strsplit(arg, ":", 0);
2458 
2459     if (fds[0] == NULL || fds[1] == NULL || fds[2] != NULL ||
2460         virStrToLong_i(fds[0], NULL, 10, &(*fd)[0]) < 0 ||
2461         virStrToLong_i(fds[1], NULL, 10, &(*fd)[1]) < 0) {
2462         fprintf(stderr, "malformed --handshakefds argument '%s'",
2463                 optarg);
2464         return -1;
2465     }
2466 
2467     return 0;
2468 }
2469 
2470 
main(int argc,char * argv[])2471 int main(int argc, char *argv[])
2472 {
2473     pid_t pid;
2474     int rc = -1;
2475     const char *name = NULL;
2476     size_t nveths = 0;
2477     char **veths = NULL;
2478     int ns_fd[VIR_LXC_DOMAIN_NAMESPACE_LAST];
2479     int handshakeFds[2] = { -1, -1 };
2480     bool bg = false;
2481     const struct option options[] = {
2482         { "background", 0, NULL, 'b' },
2483         { "name",   1, NULL, 'n' },
2484         { "veth",   1, NULL, 'v' },
2485         { "console", 1, NULL, 'c' },
2486         { "passfd", 1, NULL, 'p' },
2487         { "handshakefds", 1, NULL, 's' },
2488         { "security", 1, NULL, 'S' },
2489         { "share-net", 1, NULL, 'N' },
2490         { "share-ipc", 1, NULL, 'I' },
2491         { "share-uts", 1, NULL, 'U' },
2492         { "help", 0, NULL, 'h' },
2493         { 0, 0, 0, 0 },
2494     };
2495     g_autofree int *ttyFDs = NULL;
2496     size_t nttyFDs = 0;
2497     g_autofree int *passFDs = NULL;
2498     size_t npassFDs = 0;
2499     virLXCController *ctrl = NULL;
2500     size_t i;
2501     const char *securityDriver = "none";
2502 
2503     for (i = 0; i < VIR_LXC_DOMAIN_NAMESPACE_LAST; i++)
2504         ns_fd[i] = -1;
2505 
2506     if (virGettextInitialize() < 0 ||
2507         virErrorInitialize() < 0) {
2508         fprintf(stderr, _("%s: initialization failed\n"), argv[0]);
2509         exit(EXIT_FAILURE);
2510     }
2511 
2512     /* Initialize logging */
2513     virLogSetFromEnv();
2514 
2515     while (1) {
2516         int c;
2517 
2518         c = getopt_long(argc, argv, "dn:v:p:m:c:s:h:S:N:I:U:",
2519                         options, NULL);
2520 
2521         if (c == -1)
2522             break;
2523 
2524         switch (c) {
2525         case 'b':
2526             bg = true;
2527             break;
2528 
2529         case 'n':
2530             name = optarg;
2531             break;
2532 
2533         case 'v':
2534             veths = g_renew(char *, veths, nveths+1);
2535             veths[nveths++] = g_strdup(optarg);
2536             break;
2537 
2538         case 'c':
2539             ttyFDs = g_renew(int, ttyFDs, nttyFDs + 1);
2540             if (virStrToLong_i(optarg, NULL, 10, &ttyFDs[nttyFDs++]) < 0) {
2541                 fprintf(stderr, "malformed --console argument '%s'", optarg);
2542                 goto cleanup;
2543             }
2544             break;
2545 
2546         case 'p':
2547             passFDs = g_renew(int, passFDs, npassFDs + 1);
2548             if (virStrToLong_i(optarg, NULL, 10, &passFDs[npassFDs++]) < 0) {
2549                 fprintf(stderr, "malformed --passfd argument '%s'", optarg);
2550                 goto cleanup;
2551             }
2552             break;
2553 
2554         case 's':
2555             if (parseFDPair(optarg, &handshakeFds) < 0)
2556                 goto cleanup;
2557             break;
2558 
2559         case 'N':
2560             if (virStrToLong_i(optarg, NULL, 10, &ns_fd[VIR_LXC_DOMAIN_NAMESPACE_SHARENET]) < 0) {
2561                 fprintf(stderr, "malformed --share-net argument '%s'",
2562                         optarg);
2563                 goto cleanup;
2564             }
2565             break;
2566 
2567         case 'I':
2568             if (virStrToLong_i(optarg, NULL, 10, &ns_fd[VIR_LXC_DOMAIN_NAMESPACE_SHAREIPC]) < 0) {
2569                 fprintf(stderr, "malformed --share-ipc argument '%s'",
2570                         optarg);
2571                 goto cleanup;
2572             }
2573             break;
2574 
2575         case 'U':
2576             if (virStrToLong_i(optarg, NULL, 10, &ns_fd[VIR_LXC_DOMAIN_NAMESPACE_SHAREUTS]) < 0) {
2577                 fprintf(stderr, "malformed --share-uts argument '%s'",
2578                         optarg);
2579                 goto cleanup;
2580             }
2581             break;
2582 
2583         case 'S':
2584             securityDriver = optarg;
2585             break;
2586 
2587         case 'h':
2588         case '?':
2589             fprintf(stderr, "\n");
2590             fprintf(stderr, "syntax: %s [OPTIONS]\n", argv[0]);
2591             fprintf(stderr, "\n");
2592             fprintf(stderr, "Options\n");
2593             fprintf(stderr, "\n");
2594             fprintf(stderr, "  -b, --background\n");
2595             fprintf(stderr, "  -n NAME, --name NAME\n");
2596             fprintf(stderr, "  -c FD, --console FD\n");
2597             fprintf(stderr, "  -v VETH, --veth VETH\n");
2598             fprintf(stderr, "  -s FD:FD, --handshakefds FD:FD (read:write)\n");
2599             fprintf(stderr, "  -S NAME, --security NAME\n");
2600             fprintf(stderr, "  -N FD, --share-net FD\n");
2601             fprintf(stderr, "  -I FD, --share-ipc FD\n");
2602             fprintf(stderr, "  -U FD, --share-uts FD\n");
2603             fprintf(stderr, "  -h, --help\n");
2604             fprintf(stderr, "\n");
2605             rc = 0;
2606             goto cleanup;
2607         }
2608     }
2609 
2610     if (name == NULL) {
2611         fprintf(stderr, "%s: missing --name argument for configuration\n", argv[0]);
2612         goto cleanup;
2613     }
2614 
2615     if (handshakeFds[0] < 0 || handshakeFds[1] < 0) {
2616         fprintf(stderr, "%s: missing --handshakefds argument for container PTY\n",
2617                 argv[0]);
2618         goto cleanup;
2619     }
2620 
2621     if (geteuid() != 0) {
2622         fprintf(stderr, "%s: must be run as the 'root' user\n", argv[0]);
2623         goto cleanup;
2624     }
2625 
2626     virEventRegisterDefaultImpl();
2627 
2628     virGDBusSetSharedBus(false);
2629 
2630     if (!(ctrl = virLXCControllerNew(name)))
2631         goto cleanup;
2632 
2633     memcpy(&ctrl->handshakeFds, &handshakeFds, sizeof(handshakeFds));
2634 
2635     if (!(ctrl->securityManager = virSecurityManagerNew(securityDriver,
2636                                                         LXC_DRIVER_NAME, 0)))
2637         goto cleanup;
2638 
2639     if (ctrl->def->seclabels) {
2640         VIR_DEBUG("Security model %s type %s label %s imagelabel %s",
2641                   NULLSTR(ctrl->def->seclabels[0]->model),
2642                   virDomainSeclabelTypeToString(ctrl->def->seclabels[0]->type),
2643                   NULLSTR(ctrl->def->seclabels[0]->label),
2644                   NULLSTR(ctrl->def->seclabels[0]->imagelabel));
2645     } else {
2646         VIR_DEBUG("Security model not initialized");
2647     }
2648 
2649     ctrl->veths = veths;
2650     ctrl->nveths = nveths;
2651 
2652     ctrl->passFDs = passFDs;
2653     ctrl->npassFDs = npassFDs;
2654 
2655     for (i = 0; i < VIR_LXC_DOMAIN_NAMESPACE_LAST; i++) {
2656         if (ns_fd[i] != -1) {
2657             if (!ctrl->nsFDs) {/*allocate only once */
2658                 size_t j = 0;
2659                 ctrl->nsFDs = g_new0(int, VIR_LXC_DOMAIN_NAMESPACE_LAST);
2660                 for (j = 0; j < VIR_LXC_DOMAIN_NAMESPACE_LAST; j++)
2661                     ctrl->nsFDs[j] = -1;
2662             }
2663             ctrl->nsFDs[i] = ns_fd[i];
2664         }
2665     }
2666 
2667     for (i = 0; i < nttyFDs; i++) {
2668         if (virLXCControllerAddConsole(ctrl, ttyFDs[i]) < 0)
2669             goto cleanup;
2670         ttyFDs[i] = -1;
2671     }
2672 
2673     if (virLXCControllerValidateNICs(ctrl) < 0)
2674         goto cleanup;
2675 
2676     if (virLXCControllerGetNICIndexes(ctrl) < 0)
2677         goto cleanup;
2678 
2679     if (virLXCControllerValidateConsoles(ctrl) < 0)
2680         goto cleanup;
2681 
2682     if (virLXCControllerSetupServer(ctrl) < 0)
2683         goto cleanup;
2684 
2685     if (bg) {
2686         if ((pid = fork()) < 0)
2687             goto cleanup;
2688 
2689         if (pid > 0) {
2690             if ((rc = virPidFileWrite(LXC_STATE_DIR, name, pid)) < 0) {
2691                 virReportSystemError(-rc,
2692                                      _("Unable to write pid file '%s/%s.pid'"),
2693                                      LXC_STATE_DIR, name);
2694                 _exit(1);
2695             }
2696 
2697             /* First child now exits, allowing original caller
2698              * (ie libvirtd's LXC driver to complete their
2699              * waitpid & continue */
2700             _exit(0);
2701         }
2702 
2703         /* Don't hold on to any cwd we inherit from libvirtd either */
2704         if (chdir("/") < 0) {
2705             virReportSystemError(errno, "%s",
2706                                  _("Unable to change to root dir"));
2707             goto cleanup;
2708         }
2709 
2710         if (setsid() < 0) {
2711             virReportSystemError(errno, "%s",
2712                                  _("Unable to become session leader"));
2713             goto cleanup;
2714         }
2715     }
2716 
2717     rc = virLXCControllerRun(ctrl);
2718 
2719  cleanup:
2720     if (rc < 0) {
2721         fprintf(stderr,
2722                 _("Failure in libvirt_lxc startup: %s\n"),
2723                 virGetLastErrorMessage());
2724     }
2725 
2726     virPidFileDelete(LXC_STATE_DIR, name);
2727     if (ctrl)
2728         virLXCControllerDeleteInterfaces(ctrl);
2729     for (i = 0; i < nttyFDs; i++)
2730         VIR_FORCE_CLOSE(ttyFDs[i]);
2731     for (i = 0; i < npassFDs; i++)
2732         VIR_FORCE_CLOSE(passFDs[i]);
2733 
2734     virLXCControllerFree(ctrl);
2735 
2736     return rc < 0? EXIT_FAILURE : EXIT_SUCCESS;
2737 }
2738