1 /*
2  * virprocess.c: interaction with processes
3  *
4  * Copyright (C) 2010-2015 Red Hat, Inc.
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library.  If not, see
18  * <http://www.gnu.org/licenses/>.
19  *
20  */
21 
22 
23 #include <config.h>
24 
25 #include <fcntl.h>
26 #include <signal.h>
27 #ifndef WIN32
28 # include <sys/wait.h>
29 #endif
30 #include <unistd.h>
31 #if WITH_SYS_MOUNT_H
32 # include <sys/mount.h>
33 #endif
34 #if WITH_SETRLIMIT
35 # include <sys/time.h>
36 # include <sys/resource.h>
37 #endif
38 #if WITH_SCHED_H
39 # include <sched.h>
40 #endif
41 
42 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || WITH_BSD_CPU_AFFINITY
43 # include <sys/param.h>
44 #endif
45 
46 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
47 # include <sys/sysctl.h>
48 # include <sys/user.h>
49 #endif
50 
51 #if WITH_BSD_CPU_AFFINITY
52 # include <sys/cpuset.h>
53 #endif
54 
55 #ifdef WIN32
56 # define WIN32_LEAN_AND_MEAN
57 # include <windows.h>
58 #endif
59 
60 #include "virprocess.h"
61 #include "virerror.h"
62 #include "viralloc.h"
63 #include "virfile.h"
64 #include "virlog.h"
65 #include "virutil.h"
66 #include "virstring.h"
67 #include "vircommand.h"
68 
69 #define VIR_FROM_THIS VIR_FROM_NONE
70 
71 VIR_LOG_INIT("util.process");
72 
#ifdef __linux__
/*
 * Work around older glibc. The kernel may support the setns
 * syscall while the glibc wrapper does not exist. When that is
 * the case we provide our own wrapper, which needs the raw
 * syscall number of the architecture being built.
 */
# ifndef __NR_setns
#  if defined(__x86_64__)
#   define __NR_setns 308
#  elif defined(__i386__)
#   define __NR_setns 346
#  elif defined(__arm__)
#   define __NR_setns 375
#  elif defined(__aarch64__)
/* arm64 uses the generic syscall table, where setns is 268;
 * 375 is the 32-bit ARM number and is not valid on arm64. */
#   define __NR_setns 268
#  elif defined(__powerpc__)
#   define __NR_setns 350
#  elif defined(__s390__)
#   define __NR_setns 339
#  endif
# endif

# ifndef WITH_SETNS
#  if defined(__NR_setns)
#   include <sys/syscall.h>

/*
 * setns:
 * @fd: file descriptor referring to a namespace
 * @nstype: namespace type constraint passed through to the kernel
 *
 * Fallback used when the C library lacks a setns() wrapper: invokes
 * the syscall directly and returns whatever syscall() returns.
 */
static inline int setns(int fd, int nstype)
{
    return syscall(__NR_setns, fd, nstype);
}
#  else /* !__NR_setns */
#   error Please determine the syscall number for setns on your architecture
#  endif
# endif
#else /* !__linux__ */
/*
 * setns:
 *
 * Stand-in for platforms other than Linux: always reports ENOSYS
 * through virReportSystemError() and returns -1.
 */
static inline int setns(int fd G_GNUC_UNUSED, int nstype G_GNUC_UNUSED)
{
    virReportSystemError(ENOSYS, "%s",
                         _("Namespaces are not supported on this platform."));
    return -1;
}
#endif
115 
/* String names for the scheduler policies, generated via VIR_ENUM_IMPL.
 * NOTE(review): the strings are presumably listed in the same order as
 * the enum values that end with VIR_PROC_POLICY_LAST -- confirm against
 * the header before reordering or adding entries. */
VIR_ENUM_IMPL(virProcessSchedPolicy,
              VIR_PROC_POLICY_LAST,
              "none",
              "batch",
              "idle",
              "fifo",
              "rr",
);
124 
125 
126 #ifndef WIN32
/**
 * virProcessTranslateStatus:
 * @status: child exit status to translate
 *
 * Produce a newly allocated, human readable description of a wait
 * status: the exit code for a normal exit, the signal number for a
 * death by signal, or the raw value when it is neither.  Suitable for
 * the status reported by virCommandRun(), virCommandWait(),
 * virProcessWait() or a raw waitpid().
 *
 * The caller owns the returned string.
 */
char *
virProcessTranslateStatus(int status)
{
    if (WIFEXITED(status))
        return g_strdup_printf(_("exit status %d"), WEXITSTATUS(status));

    if (WIFSIGNALED(status))
        return g_strdup_printf(_("fatal signal %d"), WTERMSIG(status));

    return g_strdup_printf(_("invalid value %d"), status);
}
150 
151 
152 /**
153  * virProcessAbort:
154  * @pid: child process to kill
155  *
156  * Abort a child process if PID is positive and that child is still
157  * running, without issuing any errors or affecting errno.  Designed
158  * for error paths where some but not all paths to the cleanup code
159  * might have started the child process.  If @pid is 0 or negative,
160  * this does nothing.
161  */
162 void
virProcessAbort(pid_t pid)163 virProcessAbort(pid_t pid)
164 {
165     int saved_errno;
166     int ret;
167     int status;
168     g_autofree char *tmp = NULL;
169 
170     if (pid <= 0)
171         return;
172 
173     /* See if intermediate process has exited; if not, try a nice
174      * SIGTERM followed by a more severe SIGKILL.
175      */
176     saved_errno = errno;
177     VIR_DEBUG("aborting child process %d", pid);
178     while ((ret = waitpid(pid, &status, WNOHANG)) == -1 &&
179            errno == EINTR);
180     if (ret == pid) {
181         tmp = virProcessTranslateStatus(status);
182         VIR_DEBUG("process has ended: %s", tmp);
183         goto cleanup;
184     } else if (ret == 0) {
185         VIR_DEBUG("trying SIGTERM to child process %d", pid);
186         kill(pid, SIGTERM);
187         g_usleep(10 * 1000);
188         while ((ret = waitpid(pid, &status, WNOHANG)) == -1 &&
189                errno == EINTR);
190         if (ret == pid) {
191             tmp = virProcessTranslateStatus(status);
192             VIR_DEBUG("process has ended: %s", tmp);
193             goto cleanup;
194         } else if (ret == 0) {
195             VIR_DEBUG("trying SIGKILL to child process %d", pid);
196             kill(pid, SIGKILL);
197             while ((ret = waitpid(pid, &status, 0)) == -1 &&
198                    errno == EINTR);
199             if (ret == pid) {
200                 tmp = virProcessTranslateStatus(status);
201                 VIR_DEBUG("process has ended: %s", tmp);
202                 goto cleanup;
203             }
204         }
205     }
206     VIR_DEBUG("failed to reap child %lld, abandoning it", (long long) pid);
207 
208  cleanup:
209     errno = saved_errno;
210 }
211 
212 
213 /**
214  * virProcessWait:
215  * @pid: child to wait on
216  * @exitstatus: optional status collection
217  * @raw: whether to pass non-normal status back to caller
218  *
219  * Wait for a child process to complete.  If @pid is -1, do nothing, but
220  * return -1 (useful for error cleanup, and assumes an earlier message was
221  * already issued).  All other pids issue an error message on failure.
222  *
223  * If @exitstatus is NULL, then the child must exit normally with status 0.
224  * Otherwise, if @raw is false, the child must exit normally, and
225  * @exitstatus will contain the final exit status (no need for the caller
226  * to use WEXITSTATUS()).  If @raw is true, then the result of waitpid() is
227  * returned in @exitstatus, and the caller must use WIFEXITED() and friends
228  * to decipher the child's status.
229  *
230  * Returns 0 on a successful wait.  Returns -1 on any error waiting for
231  * completion, or if the command completed with a status that cannot be
232  * reflected via the choice of @exitstatus and @raw.
233  */
234 int
virProcessWait(pid_t pid,int * exitstatus,bool raw)235 virProcessWait(pid_t pid, int *exitstatus, bool raw)
236 {
237     int ret;
238     int status;
239     g_autofree char *st = NULL;
240 
241     if (pid <= 0) {
242         if (pid != -1)
243             virReportSystemError(EINVAL, _("unable to wait for process %lld"),
244                                  (long long) pid);
245         return -1;
246     }
247 
248     /* Wait for intermediate process to exit */
249     while ((ret = waitpid(pid, &status, 0)) == -1 &&
250            errno == EINTR);
251 
252     if (ret == -1) {
253         virReportSystemError(errno, _("unable to wait for process %lld"),
254                              (long long) pid);
255         return -1;
256     }
257 
258     if (exitstatus == NULL) {
259         if (status != 0)
260             goto error;
261     } else if (raw) {
262         *exitstatus = status;
263     } else if (WIFEXITED(status)) {
264         *exitstatus = WEXITSTATUS(status);
265     } else {
266         goto error;
267     }
268 
269     return 0;
270 
271  error:
272     st = virProcessTranslateStatus(status);
273     virReportError(VIR_ERR_INTERNAL_ERROR,
274                    _("Child process (%lld) unexpected %s"),
275                    (long long) pid, NULLSTR(st));
276     return -1;
277 }
278 
279 #else /* WIN32 */
280 
/* Win32 variant: there is no wait status to decode here, so every
 * value is described as invalid.  The caller owns the returned string. */
char *
virProcessTranslateStatus(int status)
{
    char *desc = g_strdup_printf(_("invalid value %d"), status);

    return desc;
}
286 
287 
/* Win32 variant: reaping a child is not implemented (never ported to
 * mingw), so all this does is log that the child is being abandoned. */
void
virProcessAbort(pid_t pid)
{
    long long child = (long long)pid;

    VIR_DEBUG("failed to reap child %lld, abandoning it", child);
}
294 
295 
296 int
virProcessWait(pid_t pid,int * exitstatus G_GNUC_UNUSED,bool raw G_GNUC_UNUSED)297 virProcessWait(pid_t pid, int *exitstatus G_GNUC_UNUSED, bool raw G_GNUC_UNUSED)
298 {
299     virReportSystemError(ENOSYS, _("unable to wait for process %lld"),
300                          (long long) pid);
301     return -1;
302 }
303 
304 #endif /* WIN32 */
305 
306 
/*
 * virProcessKill:
 * @pid: process to signal
 * @sig: signal number (0 only probes for existence)
 *
 * Send a signal to a single process.  A @pid of 1 or less is refused
 * with errno set to ESRCH, so neither init nor a process group /
 * "every process" target can be hit by accident.
 *
 * On Win32 only a rough emulation exists: SIGINT and SIGTERM are
 * mapped to console control events, anything else terminates the
 * process outright (or just checks it can be opened when @sig is 0).
 *
 * Returns 0 on success, -1 with errno set on failure.
 */
int virProcessKill(pid_t pid, int sig)
{
    if (pid <= 1) {
        errno = ESRCH;
        return -1;
    }

#ifdef WIN32
    /* Mingw / Windows don't have many signals (AFAIK) */
    switch (sig) {
    case SIGINT:
        /* This does a Ctrl+C equiv */
        if (!GenerateConsoleCtrlEvent(CTRL_C_EVENT, pid)) {
            errno = ESRCH;
            return -1;
        }
        break;

    case SIGTERM:
        /* Since TerminateProcess is closer to SIG_KILL, we do
         * a Ctrl+Break equiv which is more pleasant like the
         * good old unix SIGTERM/HUP
         */
        if (!GenerateConsoleCtrlEvent(CTRL_BREAK_EVENT, pid)) {
            errno = ESRCH;
            return -1;
        }
        break;

    default:
    {
        HANDLE proc;
        BOOL terminated = TRUE;

        proc = OpenProcess(PROCESS_TERMINATE, FALSE, pid);
        if (!proc) {
            errno = ESRCH; /* Not entirely accurate, but close enough */
            return -1;
        }

        /*
         * TerminateProcess is more or less equiv to SIG_KILL, in that
         * a process can't trap / block it
         */
        if (sig != 0)
            terminated = TerminateProcess(proc, sig);

        /* Release the handle on every path, including failure,
         * so that a failed termination does not leak it */
        CloseHandle(proc);

        if (!terminated) {
            errno = ESRCH;
            return -1;
        }
    }
    }
    return 0;
#else
    return kill(pid, sig);
#endif
}
362 
363 
364 /* send signal to a process group */
virProcessGroupKill(pid_t pid,int sig G_GNUC_UNUSED)365 int virProcessGroupKill(pid_t pid, int sig G_GNUC_UNUSED)
366 {
367     if (pid <= 1) {
368         errno = ESRCH;
369         return -1;
370     }
371 
372 #ifdef WIN32
373     errno = ENOSYS;
374     return -1;
375 #else
376     return killpg(pid, sig);
377 #endif
378 }
379 
380 
/*
 * virProcessGroupGet:
 * @pid: process to query
 *
 * Look up the process group a process belongs to.  A @pid of 1 or
 * less is refused with ESRCH.  Not available on Win32, where ENOSYS
 * is set.
 *
 * Returns the result of getpgid(), or -1 with errno set.
 */
pid_t virProcessGroupGet(pid_t pid)
{
    if (pid > 1) {
#ifdef WIN32
        errno = ENOSYS;
        return -1;
#else
        return getpgid(pid);
#endif
    }

    errno = ESRCH;
    return -1;
}
396 
397 
/*
 * virProcessKillPainfullyDelay:
 * @pid: process (or process group, see @group) to terminate
 * @force: whether to escalate to SIGKILL after the grace period
 * @extradelay: extra seconds to wait on top of the default timeout
 * @group: signal the whole process group instead of a single process
 *
 * Try to kill the process and verify it has exited
 *
 * Returns 0 if it was killed gracefully, 1 if it
 * was killed forcibly, -1 if it is still alive,
 * or another error occurred.
 *
 * Callers can provide an extra delay in seconds to
 * wait longer than the default.
 */
int
virProcessKillPainfullyDelay(pid_t pid, bool force, unsigned int extradelay, bool group)
{
    size_t i;
    /* This is in 1/5th seconds since polling is on a 0.2s interval */
    unsigned int polldelay = (force ? 200 : 75) + (extradelay*5);
    const char *signame = "TERM";

    VIR_DEBUG("vpid=%lld force=%d extradelay=%u group=%d",
              (long long)pid, force, extradelay, group);

    /* This loop sends SIGTERM, then waits a few iterations (10 seconds)
     * to see if it dies. If the process still hasn't exited, and
     * @force is requested, a SIGKILL will be sent, and this will
     * wait up to 30 seconds more for the process to exit before
     * returning.
     *
     * An extra delay can be passed by the caller for cases that are
     * expected to clean up slower than usual.
     *
     * Note that setting @force could result in dataloss for the process.
     */
    for (i = 0; i < polldelay; i++) {
        int signum;
        int rc;

        if (i == 0) {
            signum = SIGTERM; /* kindly suggest it should exit */
        } else if (i == 50 && force) {
            VIR_DEBUG("Timed out waiting after SIGTERM to process %lld, "
                      "sending SIGKILL", (long long)pid);
            /* No SIGKILL kill on Win32 ! Use SIGABRT instead which our
             * virProcessKill proc will handle more or less like SIGKILL */
#ifdef WIN32
            signum = SIGABRT; /* kill it after a grace period */
            signame = "ABRT";
#else
            signum = SIGKILL; /* kill it after a grace period */
            signame = "KILL";
#endif
        } else {
            signum = 0; /* Just check for existence */
        }

        if (group)
            rc = virProcessGroupKill(pid, signum);
        else
            rc = virProcessKill(pid, signum);

        if (rc < 0) {
            if (errno != ESRCH) {
                virReportSystemError(errno,
                                     _("Failed to terminate process %lld with SIG%s"),
                                     (long long)pid, signame);
                return -1;
            }
            /* ESRCH: the target no longer exists. Only a miss on the
             * very first (SIGTERM) attempt yields 0; any later poll
             * that finds it gone yields 1. */
            return signum == SIGTERM ? 0 : 1;
        }

        g_usleep(200 * 1000);
    }

    /* Every poll still found the process: it outlived the whole
     * timeout, which the contract above defines as a failure */
    virReportSystemError(EBUSY,
                         _("Failed to terminate process %lld with SIG%s"),
                         (long long)pid, signame);

    return -1;
}
476 
477 
/* Convenience form of virProcessKillPainfullyDelay() for a single
 * process (not a group) with no extra delay beyond the default. */
int virProcessKillPainfully(pid_t pid, bool force)
{
    const unsigned int no_extra_delay = 0;
    const bool whole_group = false;

    return virProcessKillPainfullyDelay(pid, force, no_extra_delay, whole_group);
}
482 
483 #if WITH_DECL_CPU_SET_T && defined(__linux__)
484 
/*
 * virProcessSetAffinity:
 * @pid: process whose CPU affinity is changed
 * @map: bitmap with one bit set per CPU the process may run on
 * @quiet: when true, a failure is only logged at debug level and
 *         0 is still returned; when false it is reported as an error
 *
 * Applies @map through sched_setaffinity().  Where CPU_ALLOC is
 * available the mask is allocated dynamically and enlarged on EINVAL;
 * otherwise a fixed size cpu_set_t is used.
 *
 * Returns 0 on success (or on a silenced failure), -1 on a reported
 * failure.
 */
int virProcessSetAffinity(pid_t pid, virBitmap *map, bool quiet)
{
    size_t i;
#ifndef CPU_ALLOC
    /* Legacy method uses a fixed size cpu mask, only allows up to 1024 cpus */
    cpu_set_t mask;

    CPU_ZERO(&mask);
    for (i = 0; i < virBitmapSize(map); i++) {
        if (virBitmapIsBitSet(map, i))
            CPU_SET(i, &mask);
    }

    if (sched_setaffinity(pid, sizeof(mask), &mask) < 0) {
        /* Honor @quiet here exactly like the dynamically sized path */
        if (quiet) {
            VIR_DEBUG("cannot set CPU affinity on process %d: %s",
                      pid, g_strerror(errno));
        } else {
            virReportSystemError(errno,
                                 _("cannot set CPU affinity on process %d"), pid);
            return -1;
        }
    }
#else
    int numcpus = 1024;
    size_t masklen;
    cpu_set_t *mask;
    int saved_errno;
    int rv = -1;

    /* New method dynamically allocates cpu mask, allowing unlimited cpus */
    VIR_DEBUG("Set process affinity on %lld", (long long)pid);

    /* Not only may the statically allocated cpu_set_t be too small,
     * but there is no way to ask the kernel what size is large enough.
     * So you have no option but to pick a size, try, catch EINVAL,
     * enlarge, and re-try.
     *
     * https://lkml.org/lkml/2009/7/28/620
     */
 realloc:
    masklen = CPU_ALLOC_SIZE(numcpus);
    mask = CPU_ALLOC(numcpus);

    if (!mask)
        abort();

    CPU_ZERO_S(masklen, mask);
    for (i = 0; i < virBitmapSize(map); i++) {
        if (virBitmapIsBitSet(map, i))
            CPU_SET_S(i, masklen, mask);
    }

    rv = sched_setaffinity(pid, masklen, mask);
    /* Releasing the mask must not disturb the errno examined below */
    saved_errno = errno;
    CPU_FREE(mask);
    errno = saved_errno;

    if (rv < 0) {
        if (errno == EINVAL &&
            numcpus < (1024 << 8)) { /* 262144 cpus ought to be enough for anyone */
            numcpus = numcpus << 2;
            goto realloc;
        }

        if (quiet) {
            VIR_DEBUG("cannot set CPU affinity on process %d: %s",
                      pid, g_strerror(errno));
        } else {
            virReportSystemError(errno,
                                 _("cannot set CPU affinity on process %d"), pid);
            return -1;
        }
    }
#endif

    return 0;
}
556 
/*
 * virProcessGetAffinity:
 * @pid: process whose CPU affinity is queried
 *
 * Reads the affinity mask of @pid with sched_getaffinity() and
 * converts it into a newly created bitmap with one bit per CPU.
 * With CPU_ALLOC a dynamically allocated mask covering 1024 << 8
 * CPUs is used; otherwise a fixed cpu_set_t, of which only the first
 * 256 bits are copied into the result.
 *
 * Returns the bitmap, or NULL (with an error reported) when
 * sched_getaffinity() fails.
 */
virBitmap *
virProcessGetAffinity(pid_t pid)
{
    size_t i;
#ifdef CPU_ALLOC
    cpu_set_t *mask;
#else
	cpu_set_t maskt;
#endif
    size_t masklen;
    size_t ncpus;
    virBitmap *ret = NULL;

#ifdef CPU_ALLOC
    /* 262144 cpus ought to be enough for anyone */
    ncpus = 1024 << 8;
    masklen = CPU_ALLOC_SIZE(ncpus);
    mask = CPU_ALLOC(ncpus);

    /* Allocation failure is treated as fatal */
    if (!mask)
        abort();

    CPU_ZERO_S(masklen, mask);
#else
    /* NOTE(review): 256 is smaller than what a cpu_set_t can hold, so
     * higher numbered CPUs are never reported here -- confirm intent */
    ncpus = 256; /* XXX */
    masklen = sizeof(maskt);
    CPU_ZERO(&maskt);
# endif

    /* Both preprocessor branches open the same if statement; they only
     * differ in which mask object is handed to the kernel */
# ifdef CPU_ALLOC
     if (sched_getaffinity(pid, masklen, mask) < 0) {
# else
    if (sched_getaffinity(pid, masklen, &maskt) < 0) {
#endif
        virReportSystemError(errno,
                             _("cannot get CPU affinity of process %d"), pid);
        goto cleanup;
    }

    ret = virBitmapNew(ncpus);

    /* Mirror every CPU present in the kernel mask into the bitmap */
    for (i = 0; i < ncpus; i++) {
#ifdef CPU_ALLOC
        if (CPU_ISSET_S(i, masklen, mask))
            ignore_value(virBitmapSetBit(ret, i));
#else
        if (CPU_ISSET(i, &maskt))
            ignore_value(virBitmapSetBit(ret, i));
# endif
    }

 cleanup:
    /* Only the dynamically allocated mask needs releasing */
#ifdef CPU_ALLOC
    CPU_FREE(mask);
#endif

    return ret;
}
615 
616 #elif defined(WITH_BSD_CPU_AFFINITY)
617 
618 int virProcessSetAffinity(pid_t pid,
619                           virBitmap *map,
620                           bool quiet)
621 {
622     size_t i;
623     cpuset_t mask;
624 
625     CPU_ZERO(&mask);
626     for (i = 0; i < virBitmapSize(map); i++) {
627         if (virBitmapIsBitSet(map, i))
628             CPU_SET(i, &mask);
629     }
630 
631     if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, pid,
632                            sizeof(mask), &mask) != 0) {
633         if (quiet) {
634             VIR_DEBUG("cannot set CPU affinity on process %d: %s",
635                       pid, g_strerror(errno));
636         } else {
637             virReportSystemError(errno,
638                                  _("cannot set CPU affinity on process %d"), pid);
639             return -1;
640         }
641     }
642 
643     return 0;
644 }
645 
646 virBitmap *
647 virProcessGetAffinity(pid_t pid)
648 {
649     size_t i;
650     cpuset_t mask;
651     virBitmap *ret = NULL;
652 
653     CPU_ZERO(&mask);
654     if (cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, pid,
655                            sizeof(mask), &mask) != 0) {
656         virReportSystemError(errno,
657                              _("cannot get CPU affinity of process %d"), pid);
658         return NULL;
659     }
660 
661     ret = virBitmapNew(sizeof(mask) * 8);
662 
663     for (i = 0; i < sizeof(mask) * 8; i++)
664         if (CPU_ISSET(i, &mask))
665             ignore_value(virBitmapSetBit(ret, i));
666 
667     return ret;
668 }
669 
670 #else /* WITH_DECL_CPU_SET_T */
671 
672 int virProcessSetAffinity(pid_t pid G_GNUC_UNUSED,
673                           virBitmap *map G_GNUC_UNUSED,
674                           bool quiet G_GNUC_UNUSED)
675 {
676     /* The @quiet parameter is ignored here, it is used only for silencing
677      * actual failures. */
678     virReportSystemError(ENOSYS, "%s",
679                          _("Process CPU affinity is not supported on this platform"));
680     return -1;
681 }
682 
683 virBitmap *
684 virProcessGetAffinity(pid_t pid G_GNUC_UNUSED)
685 {
686     virReportSystemError(ENOSYS, "%s",
687                          _("Process CPU affinity is not supported on this platform"));
688     return NULL;
689 }
690 #endif /* WITH_DECL_CPU_SET_T */
691 
692 
693 int virProcessGetPids(pid_t pid, size_t *npids, pid_t **pids)
694 {
695     int ret = -1;
696     g_autoptr(DIR) dir = NULL;
697     int value;
698     struct dirent *ent;
699     g_autofree char *taskPath = NULL;
700 
701     *npids = 0;
702     *pids = NULL;
703 
704     taskPath = g_strdup_printf("/proc/%llu/task", (long long)pid);
705 
706     if (virDirOpen(&dir, taskPath) < 0)
707         goto cleanup;
708 
709     while ((value = virDirRead(dir, &ent, taskPath)) > 0) {
710         long long tmp;
711         pid_t tmp_pid;
712 
713         if (virStrToLong_ll(ent->d_name, NULL, 10, &tmp) < 0)
714             goto cleanup;
715         tmp_pid = tmp;
716 
717         VIR_APPEND_ELEMENT(*pids, *npids, tmp_pid);
718     }
719 
720     if (value < 0)
721         goto cleanup;
722 
723     ret = 0;
724 
725  cleanup:
726     if (ret < 0)
727         VIR_FREE(*pids);
728     return ret;
729 }
730 
731 
732 int virProcessGetNamespaces(pid_t pid,
733                             size_t *nfdlist,
734                             int **fdlist)
735 {
736     size_t i = 0;
737     const char *ns[] = { "user", "ipc", "uts", "net", "pid", "mnt" };
738 
739     *nfdlist = 0;
740     *fdlist = NULL;
741 
742     for (i = 0; i < G_N_ELEMENTS(ns); i++) {
743         int fd;
744         g_autofree char *nsfile = NULL;
745 
746         nsfile = g_strdup_printf("/proc/%llu/ns/%s", (long long)pid, ns[i]);
747 
748         if ((fd = open(nsfile, O_RDONLY)) >= 0) {
749             VIR_EXPAND_N(*fdlist, *nfdlist, 1);
750             (*fdlist)[(*nfdlist)-1] = fd;
751         }
752     }
753 
754     return 0;
755 }
756 
757 
/*
 * virProcessSetNamespaces:
 * @nfdlist: number of entries in @fdlist, must not be 0
 * @fdlist: namespace file descriptors; negative entries are skipped
 *
 * Joins each namespace in @fdlist via setns().
 *
 * Returns 0 on success, -1 with an error reported otherwise.
 */
int virProcessSetNamespaces(size_t nfdlist,
                            int *fdlist)
{
    size_t idx;

    if (nfdlist == 0) {
        virReportInvalidArg(nfdlist, "%s",
                            _("Expected at least one file descriptor"));
        return -1;
    }

    for (idx = 0; idx < nfdlist; idx++) {
        int nsfd = fdlist[idx];

        if (nsfd < 0)
            continue;

        /* We get EINVAL if new NS is same as the current
         * NS, or if the fd namespace doesn't match the
         * type passed to setns()'s second param. Since we
         * pass 0, we know the EINVAL is harmless
         */
        if (setns(nsfd, 0) >= 0 || errno == EINVAL)
            continue;

        virReportSystemError(errno, "%s",
                             _("Unable to join domain namespace"));
        return -1;
    }

    return 0;
}
786 
787 #if WITH_PRLIMIT
/* Thin wrapper forwarding all arguments to prlimit() and handing
 * back its return value unchanged. */
static int
virProcessPrLimit(pid_t pid,
                  int resource,
                  const struct rlimit *new_limit,
                  struct rlimit *old_limit)
{
    int rc = prlimit(pid, resource, new_limit, old_limit);

    return rc;
}
796 #elif WITH_SETRLIMIT
797 static int
798 virProcessPrLimit(pid_t pid G_GNUC_UNUSED,
799                   int resource G_GNUC_UNUSED,
800                   const struct rlimit *new_limit G_GNUC_UNUSED,
801                   struct rlimit *old_limit G_GNUC_UNUSED)
802 {
803     errno = ENOSYS;
804     return -1;
805 }
806 #endif
807 
808 #if WITH_GETRLIMIT
/* Query a resource limit of the calling process; forwards to
 * getrlimit() and returns its result unchanged. */
static int
virProcessGetRLimit(int resource,
                    struct rlimit *old_limit)
{
    int rc = getrlimit(resource, old_limit);

    return rc;
}
815 #endif /* WITH_GETRLIMIT */
816 
817 #if WITH_SETRLIMIT
/* Change a resource limit of the calling process; forwards to
 * setrlimit() and returns its result unchanged. */
static int
virProcessSetRLimit(int resource,
                    const struct rlimit *new_limit)
{
    int rc = setrlimit(resource, new_limit);

    return rc;
}
824 #endif /* WITH_SETRLIMIT */
825 
826 #if WITH_GETRLIMIT
/* Map an RLIMIT_* resource id to the label looked up in
 * /proc/<pid>/limits.  Returns NULL for any resource that has no
 * label here. */
static const char*
virProcessLimitResourceToLabel(int resource)
{
# if defined(RLIMIT_MEMLOCK)
    if (resource == RLIMIT_MEMLOCK)
        return "Max locked memory";
# endif /* defined(RLIMIT_MEMLOCK) */

# if defined(RLIMIT_NPROC)
    if (resource == RLIMIT_NPROC)
        return "Max processes";
# endif /* defined(RLIMIT_NPROC) */

# if defined(RLIMIT_NOFILE)
    if (resource == RLIMIT_NOFILE)
        return "Max open files";
# endif /* defined(RLIMIT_NOFILE) */

# if defined(RLIMIT_CORE)
    if (resource == RLIMIT_CORE)
        return "Max core file size";
# endif /* defined(RLIMIT_CORE) */

    return NULL;
}
855 
856 # if defined(__linux__)
857 static int
858 virProcessGetLimitFromProc(pid_t pid,
859                            int resource,
860                            struct rlimit *limit)
861 {
862     g_autofree char *procfile = NULL;
863     g_autofree char *buf = NULL;
864     g_auto(GStrv) lines = NULL;
865     const char *label;
866     size_t i;
867 
868     if (!(label = virProcessLimitResourceToLabel(resource))) {
869         errno = EINVAL;
870         return -1;
871     }
872 
873     procfile = g_strdup_printf("/proc/%lld/limits", (long long)pid);
874 
875     if (virFileReadAllQuiet(procfile, 2048, &buf) < 0) {
876         /* virFileReadAllQuiet() already sets errno, so don't overwrite
877          * that and return immediately instead */
878         return -1;
879     }
880 
881     lines = g_strsplit(buf, "\n", 0);
882 
883     for (i = 0; lines[i]; i++) {
884         g_autofree char *softLimit = NULL;
885         g_autofree char *hardLimit = NULL;
886         char *line = lines[i];
887         unsigned long long tmp;
888 
889         if (!(line = STRSKIP(line, label)))
890             continue;
891 
892         if (sscanf(line, "%ms %ms %*s", &softLimit, &hardLimit) < 2)
893             goto error;
894 
895         if (STREQ(softLimit, "unlimited")) {
896             limit->rlim_cur = RLIM_INFINITY;
897         } else {
898             if (virStrToLong_ull(softLimit, NULL, 10, &tmp) < 0)
899                 goto error;
900             limit->rlim_cur = tmp;
901         }
902         if (STREQ(hardLimit, "unlimited")) {
903             limit->rlim_max = RLIM_INFINITY;
904         } else {
905             if (virStrToLong_ull(hardLimit, NULL, 10, &tmp) < 0)
906                 goto error;
907             limit->rlim_max = tmp;
908         }
909     }
910 
911     return 0;
912 
913  error:
914     errno = EIO;
915     return -1;
916 }
917 # else /* !defined(__linux__) */
918 static int
919 virProcessGetLimitFromProc(pid_t pid G_GNUC_UNUSED,
920                            int resource G_GNUC_UNUSED,
921                            struct rlimit *limit G_GNUC_UNUSED)
922 {
923     errno = ENOSYS;
924     return -1;
925 }
926 # endif /* !defined(__linux__) */
927 
/* Obtain the current limit of @resource for @pid, trying in order:
 * virProcessPrLimit(), the /proc based lookup, and finally, only
 * when @pid is the calling process, virProcessGetRLimit().
 *
 * Returns 0 as soon as one method succeeds, -1 when all fail. */
static int
virProcessGetLimit(pid_t pid,
                   int resource,
                   struct rlimit *old_limit)
{
    const bool is_self = (pid == getpid());

    if (virProcessPrLimit(pid, resource, NULL, old_limit) == 0)
        return 0;

    /* For whatever reason, using prlimit() on another process - even
     * when it's just to obtain the current limit rather than changing
     * it - requires CAP_SYS_RESOURCE, which we might not have in a
     * containerized environment; on the other hand, no particular
     * permission is needed to poke around /proc, so try that if going
     * through the syscall didn't work */
    if (virProcessGetLimitFromProc(pid, resource, old_limit) == 0)
        return 0;

    /* The last resort only knows how to query ourselves */
    if (!is_self)
        return -1;

    if (virProcessGetRLimit(resource, old_limit) != 0)
        return -1;

    return 0;
}
953 #endif /* WITH_GETRLIMIT */
954 
955 #if WITH_SETRLIMIT
/* Apply @new_limit for @resource to @pid, first through
 * virProcessPrLimit() and then, only when @pid is the calling
 * process, through virProcessSetRLimit().
 *
 * Returns 0 as soon as one method succeeds, -1 when all fail. */
static int
virProcessSetLimit(pid_t pid,
                   int resource,
                   const struct rlimit *new_limit)
{
    const bool is_self = (pid == getpid());

    if (virProcessPrLimit(pid, resource, new_limit, NULL) == 0)
        return 0;

    /* The fallback only knows how to change our own limits */
    if (!is_self)
        return -1;

    if (virProcessSetRLimit(resource, new_limit) != 0)
        return -1;

    return 0;
}
972 #endif /* WITH_SETRLIMIT */
973 
974 #if WITH_SETRLIMIT && defined(RLIMIT_MEMLOCK)
/**
 * virProcessSetMaxMemLock:
 * @pid: process to be changed
 * @bytes: new limit
 *
 * Sets a new limit on the amount of locked memory for a process.
 * Both the soft and the hard limit are set to the same value.
 *
 * Returns: 0 on success, <0 on failure.
 */
int
virProcessSetMaxMemLock(pid_t pid, unsigned long long bytes)
{
    struct rlimit rlim;

    /* We use VIR_DOMAIN_MEMORY_PARAM_UNLIMITED internally to represent
     * unlimited memory amounts, but setrlimit() and prlimit() use
     * RLIM_INFINITY for the same purpose, so we need to translate between
     * the two conventions */
    if (virMemoryLimitIsSet(bytes))
        rlim.rlim_cur = rlim.rlim_max = bytes;
    else
        rlim.rlim_cur = rlim.rlim_max = RLIM_INFINITY;

    if (virProcessSetLimit(pid, RLIMIT_MEMLOCK, &rlim) < 0) {
        virReportSystemError(errno,
                             _("cannot limit locked memory "
                               "of process %lld to %llu"),
                             (long long int)pid, bytes);
        /* Propagate the failure instead of logging success below */
        return -1;
    }

    VIR_DEBUG("Locked memory for process %lld limited to %llu bytes",
              (long long int) pid, bytes);

    return 0;
}
1010 #else /* ! (WITH_SETRLIMIT && defined(RLIMIT_MEMLOCK)) */
1011 int
1012 virProcessSetMaxMemLock(pid_t pid G_GNUC_UNUSED,
1013                         unsigned long long bytes G_GNUC_UNUSED)
1014 {
1015     virReportSystemError(ENOSYS, "%s", _("Not supported on this platform"));
1016     return -1;
1017 }
1018 #endif /* ! (WITH_SETRLIMIT && defined(RLIMIT_MEMLOCK)) */
1019 
1020 #if WITH_GETRLIMIT && defined(RLIMIT_MEMLOCK)
1021 /**
1022  * virProcessGetMaxMemLock:
1023  * @pid: process to be queried
1024  * @bytes: return location for the limit
1025  *
1026  * Obtain the current limit on the amount of locked memory for a process.
1027  *
1028  * Returns: 0 on success, <0 on failure.
1029  */
1030 int
1031 virProcessGetMaxMemLock(pid_t pid,
1032                         unsigned long long *bytes)
1033 {
1034     struct rlimit rlim;
1035 
1036     if (!bytes)
1037         return 0;
1038 
1039     if (virProcessGetLimit(pid, RLIMIT_MEMLOCK, &rlim) < 0) {
1040         virReportSystemError(errno,
1041                              _("cannot get locked memory limit "
1042                                "of process %lld"),
1043                              (long long int) pid);
1044         return -1;
1045     }
1046 
1047     /* virProcessSetMaxMemLock() sets both rlim_cur and rlim_max to the
1048      * same value, so we can retrieve just rlim_max here. We use
1049      * VIR_DOMAIN_MEMORY_PARAM_UNLIMITED internally to represent unlimited
1050      * memory amounts, but setrlimit() and prlimit() use RLIM_INFINITY for the
1051      * same purpose, so we need to translate between the two conventions */
1052     if (rlim.rlim_max == RLIM_INFINITY)
1053         *bytes = VIR_DOMAIN_MEMORY_PARAM_UNLIMITED;
1054     else
1055         *bytes = rlim.rlim_max;
1056 
1057     return 0;
1058 }
1059 #else /* ! (WITH_GETRLIMIT && defined(RLIMIT_MEMLOCK)) */
1060 int
1061 virProcessGetMaxMemLock(pid_t pid G_GNUC_UNUSED,
1062                         unsigned long long *bytes G_GNUC_UNUSED)
1063 {
1064     virReportSystemError(ENOSYS, "%s", _("Not supported on this platform"));
1065     return -1;
1066 }
1067 #endif /* ! (WITH_GETRLIMIT && defined(RLIMIT_MEMLOCK)) */
1068 
1069 #if WITH_SETRLIMIT && defined(RLIMIT_NPROC)
/**
 * virProcessSetMaxProcesses:
 * @pid: process to be changed
 * @procs: new limit
 *
 * Set both the soft and the hard RLIMIT_NPROC limit of @pid to
 * @procs, i.e. limit the amount of processes for the user the
 * process is running as.
 *
 * Returns: 0 on success, <0 on failure.
 */
int
virProcessSetMaxProcesses(pid_t pid, unsigned int procs)
{
    struct rlimit limit;

    limit.rlim_max = procs;
    limit.rlim_cur = limit.rlim_max;

    if (virProcessSetLimit(pid, RLIMIT_NPROC, &limit) >= 0)
        return 0;

    virReportSystemError(errno,
                         _("cannot limit number of subprocesses "
                           "of process %lld to %u"),
                         (long long int)pid, procs);
    return -1;
}
1096 #else /* ! (WITH_SETRLIMIT && defined(RLIMIT_NPROC)) */
1097 int
1098 virProcessSetMaxProcesses(pid_t pid G_GNUC_UNUSED,
1099                           unsigned int procs G_GNUC_UNUSED)
1100 {
1101     virReportSystemError(ENOSYS, "%s", _("Not supported on this platform"));
1102     return -1;
1103 }
1104 #endif /* ! (WITH_SETRLIMIT && defined(RLIMIT_NPROC)) */
1105 
1106 #if WITH_SETRLIMIT && defined(RLIMIT_NOFILE)
/**
 * virProcessSetMaxFiles:
 * @pid: process to be changed
 * @files: new limit
 *
 * Sets a new limit on the number of opened files for a process.
 * Both the soft and the hard RLIMIT_NOFILE limit are set to
 * @files + 1 (see the comment in the body for the reasoning).
 *
 * Returns: 0 on success, <0 on failure.
 */
int
virProcessSetMaxFiles(pid_t pid, unsigned int files)
{
    struct rlimit rlim;

   /* Max number of opened files is one greater than actual limit. See
    * man setrlimit.
    *
    * NB: That indicates to me that we would want the following code
    * to say "files - 1", but the original of this code in
    * qemu_process.c also had files + 1, so this preserves current
    * behavior.
    *
    * The addition is carried out in rlim_t rather than unsigned int,
    * so that (where rlim_t is wider than unsigned int) passing
    * UINT_MAX does not wrap around and request a limit of 0.
    */
    rlim.rlim_cur = rlim.rlim_max = (rlim_t)files + 1;

    if (virProcessSetLimit(pid, RLIMIT_NOFILE, &rlim) < 0) {
        virReportSystemError(errno,
                             _("cannot limit number of open files "
                               "of process %lld to %u"),
                             (long long int)pid, files);
        return -1;
    }

    return 0;
}
1141 #else /* ! (WITH_SETRLIMIT && defined(RLIMIT_NOFILE)) */
1142 int
1143 virProcessSetMaxFiles(pid_t pid G_GNUC_UNUSED,
1144                       unsigned int files G_GNUC_UNUSED)
1145 {
1146     virReportSystemError(ENOSYS, "%s", _("Not supported on this platform"));
1147     return -1;
1148 }
1149 #endif /* ! (WITH_SETRLIMIT && defined(RLIMIT_NOFILE)) */
1150 
1151 #if WITH_SETRLIMIT && defined(RLIMIT_CORE)
/**
 * virProcessSetMaxCoreSize:
 * @pid: process to be changed
 * @bytes: new limit (0 to disable core dumps)
 *
 * Set both the soft and the hard RLIMIT_CORE limit of @pid to
 * @bytes, i.e. limit the size of core dumps for the process.
 *
 * Returns: 0 on success, <0 on failure.
 */
int
virProcessSetMaxCoreSize(pid_t pid, unsigned long long bytes)
{
    struct rlimit limit;

    limit.rlim_max = bytes;
    limit.rlim_cur = limit.rlim_max;

    if (virProcessSetLimit(pid, RLIMIT_CORE, &limit) >= 0)
        return 0;

    virReportSystemError(errno,
                         _("cannot limit core file size "
                           "of process %lld to %llu"),
                         (long long int)pid, bytes);
    return -1;
}
1178 #else /* ! (WITH_SETRLIMIT && defined(RLIMIT_CORE)) */
1179 int
1180 virProcessSetMaxCoreSize(pid_t pid G_GNUC_UNUSED,
1181                          unsigned long long bytes G_GNUC_UNUSED)
1182 {
1183     virReportSystemError(ENOSYS, "%s", _("Not supported on this platform"));
1184     return -1;
1185 }
1186 #endif /* ! (WITH_SETRLIMIT && defined(RLIMIT_CORE)) */
1187 
1188 
1189 #ifdef __linux__
1190 /*
1191  * Port of code from polkitunixprocess.c under terms
1192  * of the LGPLv2+
1193  */
1194 int virProcessGetStartTime(pid_t pid,
1195                            unsigned long long *timestamp)
1196 {
1197     g_auto(GStrv) proc_stat = virProcessGetStat(pid, 0);
1198     const char *starttime_str = NULL;
1199 
1200     if (!proc_stat || g_strv_length(proc_stat) < 22) {
1201         virReportError(VIR_ERR_INTERNAL_ERROR,
1202                        _("Cannot find start time for pid %d"), (int)pid);
1203         return -1;
1204     }
1205 
1206     starttime_str = proc_stat[VIR_PROCESS_STAT_STARTTIME];
1207     if (virStrToLong_ull(starttime_str, NULL, 10, timestamp) < 0) {
1208         virReportError(VIR_ERR_INTERNAL_ERROR,
1209                        _("Cannot parse start time %s for pid %d"),
1210                        starttime_str, (int)pid);
1211         return -1;
1212     }
1213     return 0;
1214 }
1215 #elif defined(__FreeBSD__) && ! defined __DragonFly__
1216 int virProcessGetStartTime(pid_t pid,
1217                            unsigned long long *timestamp)
1218 {
1219     struct kinfo_proc p;
1220     int mib[4];
1221     size_t len = 4;
1222 
1223     sysctlnametomib("kern.proc.pid", mib, &len);
1224 
1225     len = sizeof(struct kinfo_proc);
1226     mib[3] = pid;
1227 
1228     if (sysctl(mib, 4, &p, &len, NULL, 0) < 0) {
1229         virReportSystemError(errno, "%s",
1230                              _("Unable to query process ID start time"));
1231         return -1;
1232     }
1233 
1234     *timestamp = (unsigned long long)p.ki_start.tv_sec;
1235 
1236     return 0;
1237 
1238 }
1239 #else
/*
 * Fallback for platforms without a way to query the start time:
 * @timestamp is always set to 0 and 0 is returned. A warning is
 * logged when the internal call counter was still zero.
 */
int virProcessGetStartTime(pid_t pid,
                           unsigned long long *timestamp)
{
    static int callCount;
    int previousCalls = g_atomic_int_add(&callCount, 1);

    *timestamp = 0;

    if (previousCalls == 0) {
        VIR_WARN("Process start time of pid %lld not available on this platform",
                 (long long) pid);
    }

    return 0;
}
1251 #endif
1252 
1253 
1254 #ifdef __linux__
/* Arguments handed to virProcessNamespaceHelper() through the opaque
 * pointer of virProcessRunInFork(). */
typedef struct _virProcessNamespaceHelperData virProcessNamespaceHelperData;
struct _virProcessNamespaceHelperData {
    pid_t pid; /* process whose mount namespace is entered */
    virProcessNamespaceCallback cb; /* callback invoked once inside the namespace */
    void *opaque; /* caller data passed through to @cb */
};
1261 
1262 static int virProcessNamespaceHelper(pid_t pid G_GNUC_UNUSED,
1263                                      void *opaque)
1264 {
1265     virProcessNamespaceHelperData *data = opaque;
1266     int fd = -1;
1267     int ret = -1;
1268     g_autofree char *path = NULL;
1269 
1270     path = g_strdup_printf("/proc/%lld/ns/mnt", (long long)data->pid);
1271 
1272     if ((fd = open(path, O_RDONLY)) < 0) {
1273         virReportSystemError(errno, "%s",
1274                              _("Kernel does not provide mount namespace"));
1275         goto cleanup;
1276     }
1277 
1278     if (setns(fd, 0) < 0) {
1279         virReportSystemError(errno, "%s",
1280                              _("Unable to enter mount namespace"));
1281         goto cleanup;
1282     }
1283 
1284     ret = data->cb(data->pid, data->opaque);
1285 
1286  cleanup:
1287     VIR_FORCE_CLOSE(fd);
1288     return ret;
1289 }
1290 
1291 /* Run cb(opaque) in the mount namespace of pid.  Return -1 with error
1292  * message raised if we fail to run the child, if the child dies from
1293  * a signal, or if the child has status EXIT_CANCELED; otherwise return
1294  * value is the retval of the callback. The callback will be run in a child
1295  * process so must be careful to only use async signal safe functions.
1296  */
1297 int
1298 virProcessRunInMountNamespace(pid_t pid,
1299                               virProcessNamespaceCallback cb,
1300                               void *opaque)
1301 {
1302     virProcessNamespaceHelperData data = {.pid = pid, .cb = cb, .opaque = opaque};
1303 
1304     return virProcessRunInFork(virProcessNamespaceHelper, &data);
1305 }
1306 
1307 #else /* ! __linux__ */
1308 
1309 int
1310 virProcessRunInMountNamespace(pid_t pid G_GNUC_UNUSED,
1311                               virProcessNamespaceCallback cb G_GNUC_UNUSED,
1312                               void *opaque G_GNUC_UNUSED)
1313 {
1314     virReportSystemError(ENOSYS, "%s",
1315                          _("Namespaces are not supported on this platform"));
1316     return -1;
1317 }
1318 
1319 #endif /* ! __linux__ */
1320 
1321 
1322 #ifndef WIN32
/* We assume that error messages will fit into 1024 chars */
# define VIR_PROCESS_ERROR_MAX_LENGTH 1024
/* Fixed-size snapshot of the fields of a virError. The child of
 * virProcessRunInFork() fills one in and writes it to a pipe; the
 * parent reads it back and re-raises the error. All strings are
 * stored inline in fixed-size buffers. */
typedef struct {
    int code;
    int domain;
    char message[VIR_PROCESS_ERROR_MAX_LENGTH];
    virErrorLevel level;
    char str1[VIR_PROCESS_ERROR_MAX_LENGTH];
    char str2[VIR_PROCESS_ERROR_MAX_LENGTH];
    char str3[VIR_PROCESS_ERROR_MAX_LENGTH];
    int int1;
    int int2;
} errorData;

/* Same data viewed either as structured fields or as the raw bytes
 * that travel over the pipe. */
typedef union {
    errorData data;
    char bindata[sizeof(errorData)];
} errorDataBin;
1341 
1342 static int
1343 virProcessRunInForkHelper(int errfd,
1344                           pid_t ppid,
1345                           virProcessForkCallback cb,
1346                           void *opaque)
1347 {
1348     int ret = 0;
1349 
1350     if ((ret = cb(ppid, opaque)) < 0) {
1351         virErrorPtr err = virGetLastError();
1352 
1353         if (err) {
1354             g_autofree errorDataBin *bin = g_new0(errorDataBin, 1);
1355 
1356             bin->data.code = err->code;
1357             bin->data.domain = err->domain;
1358             virStrcpyStatic(bin->data.message, err->message);
1359             bin->data.level = err->level;
1360             if (err->str1)
1361                 virStrcpyStatic(bin->data.str1, err->str1);
1362             if (err->str2)
1363                 virStrcpyStatic(bin->data.str2, err->str2);
1364             if (err->str3)
1365                 virStrcpyStatic(bin->data.str3, err->str3);
1366             bin->data.int1 = err->int1;
1367             bin->data.int2 = err->int2;
1368 
1369             ignore_value(safewrite(errfd, bin->bindata, sizeof(*bin)));
1370         }
1371 
1372         return -1;
1373     }
1374 
1375     return ret;
1376 }
1377 
1378 
1379 /**
1380  * virProcessRunInFork:
1381  * @cb: callback to run
1382  * @opaque: opaque data to @cb
1383  *
1384  * Do the fork and run @cb in the child. This can be used when
1385  * @cb does something thread unsafe, for instance.  All signals
1386  * will be reset to have their platform default handlers and
1387  * unmasked. @cb must only use async signal safe functions. In
1388  * particular no mutexes should be used in @cb, unless steps were
1389  * taken before forking to guarantee a predictable state. @cb
1390  * must not exec any external binaries, instead
1391  * virCommand should be used for that purpose.
1392  *
1393  * On return, the returned value is either -1 with error message
1394  * reported if something went bad in the parent, if child has
1395  * died from a signal or if the child returned EXIT_CANCELED.
1396  * Otherwise the returned value is the retval of the callback.
1397  */
1398 int
1399 virProcessRunInFork(virProcessForkCallback cb,
1400                     void *opaque)
1401 {
1402     int ret = -1;
1403     pid_t child = -1;
1404     pid_t parent = getpid();
1405     int errfd[2] = { -1, -1 };
1406 
1407     if (virPipe(errfd) < 0)
1408         return -1;
1409 
1410     if ((child = virFork()) < 0)
1411         goto cleanup;
1412 
1413     if (child == 0) {
1414         VIR_FORCE_CLOSE(errfd[0]);
1415         ret = virProcessRunInForkHelper(errfd[1], parent, cb, opaque);
1416         VIR_FORCE_CLOSE(errfd[1]);
1417         _exit(ret < 0 ? EXIT_CANCELED : ret);
1418     } else {
1419         int status;
1420         g_autofree char *buf = NULL;
1421         g_autofree errorDataBin *bin = NULL;
1422         int nread;
1423 
1424         VIR_FORCE_CLOSE(errfd[1]);
1425         nread = virFileReadHeaderFD(errfd[0], sizeof(*bin), &buf);
1426         ret = virProcessWait(child, &status, false);
1427         if (ret == 0) {
1428             ret = status == EXIT_CANCELED ? -1 : status;
1429             if (ret < 0) {
1430                 if (nread == sizeof(*bin)) {
1431                     bin = g_new0(errorDataBin, 1);
1432                     memcpy(bin->bindata, buf, sizeof(*bin));
1433 
1434                     virReportError(VIR_ERR_INTERNAL_ERROR,
1435                                    _("child reported (status=%d): %s"),
1436                                    status, NULLSTR(bin->data.message));
1437 
1438                     virRaiseErrorFull(__FILE__, __FUNCTION__, __LINE__,
1439                                       bin->data.domain,
1440                                       bin->data.code,
1441                                       bin->data.level,
1442                                       bin->data.str1,
1443                                       bin->data.str2,
1444                                       bin->data.str3,
1445                                       bin->data.int1,
1446                                       bin->data.int2,
1447                                       "%s", bin->data.message);
1448                 } else {
1449                     virReportError(VIR_ERR_INTERNAL_ERROR,
1450                                    _("child didn't write error (status=%d)"),
1451                                    status);
1452                 }
1453             }
1454         }
1455     }
1456 
1457  cleanup:
1458     VIR_FORCE_CLOSE(errfd[0]);
1459     VIR_FORCE_CLOSE(errfd[1]);
1460     return ret;
1461 }
1462 
1463 #else /* WIN32 */
1464 
1465 int
1466 virProcessRunInFork(virProcessForkCallback cb G_GNUC_UNUSED,
1467                     void *opaque G_GNUC_UNUSED)
1468 {
1469     virReportSystemError(ENOSYS, "%s",
1470                          _("Process spawning is not supported on this platform"));
1471     return -1;
1472 }
1473 
1474 #endif /* WIN32 */
1475 
1476 
1477 #if defined(__linux__)
1478 int
1479 virProcessSetupPrivateMountNS(void)
1480 {
1481     if (unshare(CLONE_NEWNS) < 0) {
1482         virReportSystemError(errno, "%s",
1483                              _("Cannot unshare mount namespace"));
1484         return -1;
1485     }
1486 
1487     if (mount("", "/", "none", MS_SLAVE|MS_REC, NULL) < 0) {
1488         virReportSystemError(errno, "%s",
1489                              _("Failed disable mount propagation out of the root filesystem"));
1490         return -1;
1491     }
1492 
1493     return 0;
1494 }
1495 
1496 
1497 G_GNUC_NORETURN static int
1498 virProcessDummyChild(void *argv G_GNUC_UNUSED)
1499 {
1500     _exit(0);
1501 }
1502 
1503 
1504 /**
1505  * virProcessNamespaceAvailable:
1506  * @ns: what namespaces to check (bitwise-OR of virProcessNamespaceFlags)
1507  *
1508  * Check if given list of namespaces (@ns) is available.
1509  * If not, appropriate error message is produced.
1510  *
1511  * Returns: 0 on success (all the namespaces from @flags are available),
1512  *         -1 on error (with error message reported).
1513  */
1514 int
1515 virProcessNamespaceAvailable(unsigned int ns)
1516 {
1517     int flags = 0;
1518     int cpid;
1519     char *childStack;
1520     int stacksize = getpagesize() * 4;
1521     g_autofree char *stack = NULL;
1522 
1523     if (ns & VIR_PROCESS_NAMESPACE_MNT)
1524         flags |= CLONE_NEWNS;
1525     if (ns & VIR_PROCESS_NAMESPACE_IPC)
1526         flags |= CLONE_NEWIPC;
1527     if (ns & VIR_PROCESS_NAMESPACE_NET)
1528         flags |= CLONE_NEWNET;
1529     if (ns & VIR_PROCESS_NAMESPACE_PID)
1530         flags |= CLONE_NEWPID;
1531     if (ns & VIR_PROCESS_NAMESPACE_USER)
1532         flags |= CLONE_NEWUSER;
1533     if (ns & VIR_PROCESS_NAMESPACE_UTS)
1534         flags |= CLONE_NEWUTS;
1535 
1536     /* Signal parent as soon as the child dies. RIP. */
1537     flags |= SIGCHLD;
1538 
1539     stack = g_new0(char, stacksize);
1540 
1541     childStack = stack + stacksize;
1542 
1543     cpid = clone(virProcessDummyChild, childStack, flags, NULL);
1544 
1545     if (cpid < 0) {
1546         VIR_DEBUG("clone call returned %s, container support is not enabled",
1547                   g_strerror(errno));
1548         return -1;
1549     } else if (virProcessWait(cpid, NULL, false) < 0) {
1550         return -1;
1551     }
1552 
1553     VIR_DEBUG("All namespaces (%x) are enabled", ns);
1554     return 0;
1555 }
1556 
1557 #else /* !defined(__linux__) */
1558 
/* Non-Linux fallback: always fails with ENOSYS. */
int
virProcessSetupPrivateMountNS(void)
{
    virReportSystemError(ENOSYS,
                         "%s", _("Namespaces are not supported on this platform."));

    return -1;
}
1566 
1567 int
1568 virProcessNamespaceAvailable(unsigned int ns G_GNUC_UNUSED)
1569 {
1570     virReportSystemError(ENOSYS, "%s",
1571                          _("Namespaces are not supported on this platform."));
1572     return -1;
1573 }
1574 
1575 #endif /* !defined(__linux__) */
1576 
1577 /**
1578  * virProcessExitWithStatus:
1579  * @status: raw status to be reproduced when this process dies
1580  *
1581  * Given a raw status obtained by waitpid() or similar, attempt to
1582  * make this process exit in the same manner.  If the child died by
1583  * signal, reset that signal handler to default and raise the same
1584  * signal; if that doesn't kill this process, then exit with 128 +
1585  * signal number.  If @status can't be deciphered, use
1586  * EXIT_CANNOT_INVOKE.
1587  *
1588  * Never returns.
1589  */
1590 void
1591 virProcessExitWithStatus(int status)
1592 {
1593     int value = EXIT_CANNOT_INVOKE;
1594 
1595 #ifndef WIN32
1596     if (WIFEXITED(status)) {
1597         value = WEXITSTATUS(status);
1598     } else if (WIFSIGNALED(status)) {
1599         struct sigaction act;
1600         sigset_t sigs;
1601 
1602         if (sigemptyset(&sigs) == 0 &&
1603             sigaddset(&sigs, WTERMSIG(status)) == 0)
1604             sigprocmask(SIG_UNBLOCK, &sigs, NULL);
1605         memset(&act, 0, sizeof(act));
1606         act.sa_handler = SIG_DFL;
1607         sigfillset(&act.sa_mask);
1608         sigaction(WTERMSIG(status), &act, NULL);
1609         raise(WTERMSIG(status));
1610         value = 128 + WTERMSIG(status);
1611     }
1612 #else /* WIN32 */
1613     (void)status;
1614 #endif /* WIN32 */
1615     exit(value);
1616 }
1617 
1618 #if WITH_SCHED_SETSCHEDULER
1619 
1620 static int
1621 virProcessSchedTranslatePolicy(virProcessSchedPolicy policy)
1622 {
1623     switch (policy) {
1624     case VIR_PROC_POLICY_NONE:
1625         return SCHED_OTHER;
1626 
1627     case VIR_PROC_POLICY_BATCH:
1628 # ifdef SCHED_BATCH
1629         return SCHED_BATCH;
1630 # else
1631         return -1;
1632 # endif
1633 
1634     case VIR_PROC_POLICY_IDLE:
1635 # ifdef SCHED_IDLE
1636         return SCHED_IDLE;
1637 # else
1638         return -1;
1639 # endif
1640 
1641     case VIR_PROC_POLICY_FIFO:
1642         return SCHED_FIFO;
1643 
1644     case VIR_PROC_POLICY_RR:
1645         return SCHED_RR;
1646 
1647     case VIR_PROC_POLICY_LAST:
1648         /* nada */
1649         break;
1650     }
1651 
1652     return -1;
1653 }
1654 
1655 int
1656 virProcessSetScheduler(pid_t pid,
1657                        virProcessSchedPolicy policy,
1658                        int priority)
1659 {
1660     struct sched_param param = {0};
1661     int pol = virProcessSchedTranslatePolicy(policy);
1662 
1663     VIR_DEBUG("pid=%lld, policy=%d, priority=%u",
1664               (long long) pid, policy, priority);
1665 
1666     if (!policy)
1667         return 0;
1668 
1669     if (pol < 0) {
1670         virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
1671                        _("Scheduler '%s' is not supported on this platform"),
1672                        virProcessSchedPolicyTypeToString(policy));
1673         return -1;
1674     }
1675 
1676     if (pol == SCHED_FIFO || pol == SCHED_RR) {
1677         int min = 0;
1678         int max = 0;
1679 
1680         if ((min = sched_get_priority_min(pol)) < 0) {
1681             virReportSystemError(errno, "%s",
1682                                  _("Cannot get minimum scheduler "
1683                                    "priority value"));
1684             return -1;
1685         }
1686 
1687         if ((max = sched_get_priority_max(pol)) < 0) {
1688             virReportSystemError(errno, "%s",
1689                                  _("Cannot get maximum scheduler "
1690                                    "priority value"));
1691             return -1;
1692         }
1693 
1694         if (priority < min || priority > max) {
1695             virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
1696                            _("Scheduler priority %d out of range [%d, %d]"),
1697                            priority, min, max);
1698             return -1;
1699         }
1700 
1701         param.sched_priority = priority;
1702     }
1703 
1704     if (sched_setscheduler(pid, pol, &param) < 0) {
1705         virReportSystemError(errno,
1706                              _("Cannot set scheduler parameters for pid %lld"),
1707                              (long long) pid);
1708         return -1;
1709     }
1710 
1711     return 0;
1712 }
1713 
1714 #else /* ! WITH_SCHED_SETSCHEDULER */
1715 
1716 int
1717 virProcessSetScheduler(pid_t pid G_GNUC_UNUSED,
1718                        virProcessSchedPolicy policy,
1719                        int priority G_GNUC_UNUSED)
1720 {
1721     if (!policy)
1722         return 0;
1723 
1724     virReportSystemError(ENOSYS, "%s",
1725                          _("Process CPU scheduling is not supported "
1726                            "on this platform"));
1727     return -1;
1728 }
1729 
1730 #endif /* !WITH_SCHED_SETSCHEDULER */
1731 
1732 /*
1733  * Get all stat fields for a process based on pid and tid:
1734  * - pid == 0 && tid == 0 => /proc/self/stat
1735  * - pid != 0 && tid == 0 => /proc/<pid>/stat
1736  * - pid == 0 && tid != 0 => /proc/self/task/<tid>/stat
1737  * - pid != 0 && tid != 0 => /proc/<pid>/task/<tid>/stat
1738  * and return them as array of strings.
1739  */
1740 GStrv
1741 virProcessGetStat(pid_t pid,
1742                   pid_t tid)
1743 {
1744     int len = 10 * 1024;  /* 10kB ought to be enough for everyone */
1745     g_autofree char *buf = NULL;
1746     g_autofree char *path = NULL;
1747     GStrv rest = NULL;
1748     GStrv ret = NULL;
1749     char *comm = NULL;
1750     char *rparen = NULL;
1751     size_t nrest = 0;
1752 
1753     if (pid) {
1754         if (tid)
1755             path = g_strdup_printf("/proc/%d/task/%d/stat", (int)pid, (int)tid);
1756         else
1757             path = g_strdup_printf("/proc/%d/stat", (int)pid);
1758     } else {
1759         if (tid)
1760             path = g_strdup_printf("/proc/self/task/%d/stat", (int)tid);
1761         else
1762             path = g_strdup("/proc/self/stat");
1763     }
1764 
1765     len = virFileReadAllQuiet(path, len, &buf);
1766     if (len < 0)
1767         return NULL;
1768 
1769     /* eliminate trailing spaces */
1770     while (len > 0 && g_ascii_isspace(buf[--len]))
1771            buf[len] = '\0';
1772 
1773     /* Find end of the first field */
1774     if (!(comm = strchr(buf, ' ')))
1775         return NULL;
1776     *comm = '\0';
1777 
1778     /* Check start of the second field (filename of the executable, in
1779      * parentheses) */
1780     comm++;
1781     if (*comm != '(')
1782         return NULL;
1783     comm++;
1784 
1785     /* Check end of the second field (last closing parenthesis) */
1786     rparen = strrchr(comm, ')');
1787     if (!rparen)
1788         return NULL;
1789     *rparen = '\0';
1790 
1791     /* We need to check that the next char is not '\0', but why not just opt in
1792      * for the safer way of checking whether it is ' ' (space) instead */
1793     if (rparen[1] != ' ')
1794         return NULL;
1795 
1796     rest = g_strsplit(rparen + 2, " ", 0);
1797     nrest = g_strv_length(rest);
1798     ret = g_new0(char *, nrest + 3);
1799     ret[0] = g_strdup(buf);
1800     ret[1] = g_strdup(comm);
1801     memcpy(ret + 2, rest, nrest * sizeof(char *));
1802 
1803     /* Do not use g_strfreev() as individual elements they were moved to @ret. */
1804     VIR_FREE(rest);
1805 
1806     return ret;
1807 }
1808