1 /*
2 * virprocess.c: interaction with processes
3 *
4 * Copyright (C) 2010-2015 Red Hat, Inc.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library. If not, see
18 * <http://www.gnu.org/licenses/>.
19 *
20 */
21
22
23 #include <config.h>
24
25 #include <fcntl.h>
26 #include <signal.h>
27 #ifndef WIN32
28 # include <sys/wait.h>
29 #endif
30 #include <unistd.h>
31 #if WITH_SYS_MOUNT_H
32 # include <sys/mount.h>
33 #endif
34 #if WITH_SETRLIMIT
35 # include <sys/time.h>
36 # include <sys/resource.h>
37 #endif
38 #if WITH_SCHED_H
39 # include <sched.h>
40 #endif
41
42 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || WITH_BSD_CPU_AFFINITY
43 # include <sys/param.h>
44 #endif
45
46 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
47 # include <sys/sysctl.h>
48 # include <sys/user.h>
49 #endif
50
51 #if WITH_BSD_CPU_AFFINITY
52 # include <sys/cpuset.h>
53 #endif
54
55 #ifdef WIN32
56 # define WIN32_LEAN_AND_MEAN
57 # include <windows.h>
58 #endif
59
60 #include "virprocess.h"
61 #include "virerror.h"
62 #include "viralloc.h"
63 #include "virfile.h"
64 #include "virlog.h"
65 #include "virutil.h"
66 #include "virstring.h"
67 #include "vircommand.h"
68
69 #define VIR_FROM_THIS VIR_FROM_NONE
70
71 VIR_LOG_INIT("util.process");
72
#ifdef __linux__
/*
 * Workaround older glibc. While kernel may support the setns
 * syscall, the glibc wrapper might not exist. If that's the
 * case, use our own.
 *
 * The ladder below supplies a raw syscall number for a handful of
 * architectures, and is consulted only when the system headers did
 * not already define __NR_setns.
 *
 * NOTE(review): arm and aarch64 share the value 375 here; the generic
 * syscall table used by aarch64 is believed to number setns as 268 -
 * confirm before relying on this fallback on aarch64.
 */
# ifndef __NR_setns
# if defined(__x86_64__)
# define __NR_setns 308
# elif defined(__i386__)
# define __NR_setns 346
# elif defined(__arm__)
# define __NR_setns 375
# elif defined(__aarch64__)
# define __NR_setns 375
# elif defined(__powerpc__)
# define __NR_setns 350
# elif defined(__s390__)
# define __NR_setns 339
# endif
# endif

/* Presumably WITH_SETNS is defined by the build system when libc already
 * provides setns() - TODO confirm. Otherwise define a local wrapper. */
# ifndef WITH_SETNS
# if defined(__NR_setns)
# include <sys/syscall.h>

/* Local setns() replacement: forwards both arguments to the raw syscall */
static inline int setns(int fd, int nstype)
{
    return syscall(__NR_setns, fd, nstype);
}
# else /* !__NR_setns */
# error Please determine the syscall number for setns on your architecture
# endif
# endif
#else /* !__linux__ */
/* Non-Linux stub: reports ENOSYS through the libvirt error machinery and
 * always returns -1; both arguments are ignored. */
static inline int setns(int fd G_GNUC_UNUSED, int nstype G_GNUC_UNUSED)
{
    virReportSystemError(ENOSYS, "%s",
                         _("Namespaces are not supported on this platform."));
    return -1;
}
#endif
115
/* String names for the scheduler policies, bounded by VIR_PROC_POLICY_LAST.
 * NOTE(review): the order of the strings presumably has to match the order
 * of the corresponding enum declared in the header - confirm before adding
 * or reordering entries. */
VIR_ENUM_IMPL(virProcessSchedPolicy,
              VIR_PROC_POLICY_LAST,
              "none",
              "batch",
              "idle",
              "fifo",
              "rr",
);
124
125
126 #ifndef WIN32
/**
 * virProcessTranslateStatus:
 * @status: wait status of a child, as filled in by waitpid()
 *
 * Build a translated, human-readable description of @status. Usable
 * with the status reported by virCommandRun(), virCommandWait(),
 * virProcessWait() or a raw waitpid() call.
 *
 * Returns a newly allocated string describing a normal exit, a fatal
 * signal, or (for anything else) the raw invalid value.
 */
char *
virProcessTranslateStatus(int status)
{
    if (WIFEXITED(status))
        return g_strdup_printf(_("exit status %d"), WEXITSTATUS(status));

    if (WIFSIGNALED(status))
        return g_strdup_printf(_("fatal signal %d"), WTERMSIG(status));

    return g_strdup_printf(_("invalid value %d"), status);
}
150
151
152 /**
153 * virProcessAbort:
154 * @pid: child process to kill
155 *
156 * Abort a child process if PID is positive and that child is still
157 * running, without issuing any errors or affecting errno. Designed
158 * for error paths where some but not all paths to the cleanup code
159 * might have started the child process. If @pid is 0 or negative,
160 * this does nothing.
161 */
162 void
virProcessAbort(pid_t pid)163 virProcessAbort(pid_t pid)
164 {
165 int saved_errno;
166 int ret;
167 int status;
168 g_autofree char *tmp = NULL;
169
170 if (pid <= 0)
171 return;
172
173 /* See if intermediate process has exited; if not, try a nice
174 * SIGTERM followed by a more severe SIGKILL.
175 */
176 saved_errno = errno;
177 VIR_DEBUG("aborting child process %d", pid);
178 while ((ret = waitpid(pid, &status, WNOHANG)) == -1 &&
179 errno == EINTR);
180 if (ret == pid) {
181 tmp = virProcessTranslateStatus(status);
182 VIR_DEBUG("process has ended: %s", tmp);
183 goto cleanup;
184 } else if (ret == 0) {
185 VIR_DEBUG("trying SIGTERM to child process %d", pid);
186 kill(pid, SIGTERM);
187 g_usleep(10 * 1000);
188 while ((ret = waitpid(pid, &status, WNOHANG)) == -1 &&
189 errno == EINTR);
190 if (ret == pid) {
191 tmp = virProcessTranslateStatus(status);
192 VIR_DEBUG("process has ended: %s", tmp);
193 goto cleanup;
194 } else if (ret == 0) {
195 VIR_DEBUG("trying SIGKILL to child process %d", pid);
196 kill(pid, SIGKILL);
197 while ((ret = waitpid(pid, &status, 0)) == -1 &&
198 errno == EINTR);
199 if (ret == pid) {
200 tmp = virProcessTranslateStatus(status);
201 VIR_DEBUG("process has ended: %s", tmp);
202 goto cleanup;
203 }
204 }
205 }
206 VIR_DEBUG("failed to reap child %lld, abandoning it", (long long) pid);
207
208 cleanup:
209 errno = saved_errno;
210 }
211
212
213 /**
214 * virProcessWait:
215 * @pid: child to wait on
216 * @exitstatus: optional status collection
217 * @raw: whether to pass non-normal status back to caller
218 *
219 * Wait for a child process to complete. If @pid is -1, do nothing, but
220 * return -1 (useful for error cleanup, and assumes an earlier message was
221 * already issued). All other pids issue an error message on failure.
222 *
223 * If @exitstatus is NULL, then the child must exit normally with status 0.
224 * Otherwise, if @raw is false, the child must exit normally, and
225 * @exitstatus will contain the final exit status (no need for the caller
226 * to use WEXITSTATUS()). If @raw is true, then the result of waitpid() is
227 * returned in @exitstatus, and the caller must use WIFEXITED() and friends
228 * to decipher the child's status.
229 *
230 * Returns 0 on a successful wait. Returns -1 on any error waiting for
231 * completion, or if the command completed with a status that cannot be
232 * reflected via the choice of @exitstatus and @raw.
233 */
234 int
virProcessWait(pid_t pid,int * exitstatus,bool raw)235 virProcessWait(pid_t pid, int *exitstatus, bool raw)
236 {
237 int ret;
238 int status;
239 g_autofree char *st = NULL;
240
241 if (pid <= 0) {
242 if (pid != -1)
243 virReportSystemError(EINVAL, _("unable to wait for process %lld"),
244 (long long) pid);
245 return -1;
246 }
247
248 /* Wait for intermediate process to exit */
249 while ((ret = waitpid(pid, &status, 0)) == -1 &&
250 errno == EINTR);
251
252 if (ret == -1) {
253 virReportSystemError(errno, _("unable to wait for process %lld"),
254 (long long) pid);
255 return -1;
256 }
257
258 if (exitstatus == NULL) {
259 if (status != 0)
260 goto error;
261 } else if (raw) {
262 *exitstatus = status;
263 } else if (WIFEXITED(status)) {
264 *exitstatus = WEXITSTATUS(status);
265 } else {
266 goto error;
267 }
268
269 return 0;
270
271 error:
272 st = virProcessTranslateStatus(status);
273 virReportError(VIR_ERR_INTERNAL_ERROR,
274 _("Child process (%lld) unexpected %s"),
275 (long long) pid, NULLSTR(st));
276 return -1;
277 }
278
279 #else /* WIN32 */
280
/* WIN32 variant: no wait-status decoding is attempted here, so every
 * @status is rendered as an "invalid value" string. Returns a newly
 * allocated string. */
char *
virProcessTranslateStatus(int status)
{
    return g_strdup_printf(_("invalid value %d"), status);
}
286
287
/* WIN32 variant: no attempt is made to terminate or reap @pid; the
 * function only logs that the child is being abandoned. */
void
virProcessAbort(pid_t pid)
{
    /* Not yet ported to mingw. Any volunteers? */
    VIR_DEBUG("failed to reap child %lld, abandoning it", (long long)pid);
}
294
295
/* WIN32 variant: waiting is not implemented. Always reports ENOSYS
 * for @pid and returns -1; @exitstatus and @raw are ignored. */
int
virProcessWait(pid_t pid, int *exitstatus G_GNUC_UNUSED, bool raw G_GNUC_UNUSED)
{
    virReportSystemError(ENOSYS, _("unable to wait for process %lld"),
                         (long long) pid);
    return -1;
}
303
304 #endif /* WIN32 */
305
306
/**
 * virProcessKill:
 * @pid: process to signal; values of 1 or below are rejected
 * @sig: signal to send
 *
 * Send a signal to a single process.
 *
 * On WIN32, SIGINT and SIGTERM are mapped to console Ctrl+C and
 * Ctrl+Break events; any other non-zero @sig terminates the process,
 * and a @sig of 0 only checks that the process can be opened.
 * Elsewhere this is a plain kill().
 *
 * Returns 0 on success, -1 with errno set on failure (ESRCH when
 * @pid is rejected).
 */
int virProcessKill(pid_t pid, int sig)
{
    if (pid <= 1) {
        errno = ESRCH;
        return -1;
    }

#ifdef WIN32
    /* Mingw / Windows don't have many signals (AFAIK) */
    switch (sig) {
    case SIGINT:
        /* This does a Ctrl+C equiv */
        if (!GenerateConsoleCtrlEvent(CTRL_C_EVENT, pid)) {
            errno = ESRCH;
            return -1;
        }
        break;

    case SIGTERM:
        /* Since TerminateProcess is closer to SIG_KILL, we do
         * a Ctrl+Break equiv which is more pleasant like the
         * good old unix SIGTERM/HUP
         */
        if (!GenerateConsoleCtrlEvent(CTRL_BREAK_EVENT, pid)) {
            errno = ESRCH;
            return -1;
        }
        break;

    default:
    {
        HANDLE proc;
        proc = OpenProcess(PROCESS_TERMINATE, FALSE, pid);
        if (!proc) {
            errno = ESRCH; /* Not entirely accurate, but close enough */
            return -1;
        }

        /*
         * TerminateProcess is more or less equiv to SIG_KILL, in that
         * a process can't trap / block it
         */
        if (sig != 0 && !TerminateProcess(proc, sig)) {
            /* Release the handle on the failure path as well */
            CloseHandle(proc);
            errno = ESRCH;
            return -1;
        }
        CloseHandle(proc);
    }
    }
    return 0;
#else
    return kill(pid, sig);
#endif
}
362
363
364 /* send signal to a process group */
virProcessGroupKill(pid_t pid,int sig G_GNUC_UNUSED)365 int virProcessGroupKill(pid_t pid, int sig G_GNUC_UNUSED)
366 {
367 if (pid <= 1) {
368 errno = ESRCH;
369 return -1;
370 }
371
372 #ifdef WIN32
373 errno = ENOSYS;
374 return -1;
375 #else
376 return killpg(pid, sig);
377 #endif
378 }
379
380
381 /* get process group from a pid */
virProcessGroupGet(pid_t pid)382 pid_t virProcessGroupGet(pid_t pid)
383 {
384 if (pid <= 1) {
385 errno = ESRCH;
386 return -1;
387 }
388
389 #ifdef WIN32
390 errno = ENOSYS;
391 return -1;
392 #else
393 return getpgid(pid);
394 #endif
395 }
396
397
/**
 * virProcessKillPainfullyDelay:
 * @pid: process (or process group, see @group) to terminate
 * @force: escalate to SIGKILL if SIGTERM is not enough
 * @extradelay: additional time to wait, in seconds
 * @group: signal the process group of @pid rather than @pid alone
 *
 * Try to kill the process and verify it has exited.
 *
 * Returns 0 if the process was already gone when SIGTERM was
 * attempted, 1 if it was found to be gone on a later poll, and -1
 * if it is still alive when the timeout expires or if another
 * error occurred (an error is reported in both -1 cases).
 */
int
virProcessKillPainfullyDelay(pid_t pid, bool force, unsigned int extradelay, bool group)
{
    size_t i;
    /* This is in 1/5th seconds since polling is on a 0.2s interval */
    unsigned int polldelay = (force ? 200 : 75) + (extradelay*5);
    const char *signame = "TERM";

    VIR_DEBUG("vpid=%lld force=%d extradelay=%u group=%d",
              (long long)pid, force, extradelay, group);

    /* This loop sends SIGTERM, then waits a few iterations (10 seconds)
     * to see if it dies. If the process still hasn't exited, and
     * @force is requested, a SIGKILL will be sent, and this will
     * wait up to 30 seconds more for the process to exit before
     * returning.
     *
     * An extra delay can be passed by the caller for cases that are
     * expected to clean up slower than usual.
     *
     * Note that setting @force could result in dataloss for the process.
     */
    for (i = 0; i < polldelay; i++) {
        int signum;
        int rc;

        if (i == 0) {
            signum = SIGTERM; /* kindly suggest it should exit */
        } else if (i == 50 && force) {
            VIR_DEBUG("Timed out waiting after SIGTERM to process %lld, "
                      "sending SIGKILL", (long long)pid);
            /* There is no SIGKILL on Win32! Use SIGABRT instead, which
             * virProcessKill handles more or less like SIGKILL */
#ifdef WIN32
            signum = SIGABRT; /* kill it after a grace period */
            signame = "ABRT";
#else
            signum = SIGKILL; /* kill it after a grace period */
            signame = "KILL";
#endif
        } else {
            signum = 0; /* Just check for existence */
        }

        if (group)
            rc = virProcessGroupKill(pid, signum);
        else
            rc = virProcessKill(pid, signum);

        if (rc < 0) {
            if (errno != ESRCH) {
                virReportSystemError(errno,
                                     _("Failed to terminate process %lld with SIG%s"),
                                     (long long)pid, signame);
                return -1;
            }
            /* ESRCH: the process is gone */
            return signum == SIGTERM ? 0 : 1;
        }

        g_usleep(200 * 1000);
    }

    /* Every poll still found the process: report the failure and make
     * sure the caller sees it as one */
    virReportSystemError(EBUSY,
                         _("Failed to terminate process %lld with SIG%s"),
                         (long long)pid, signame);

    return -1;
}
476
477
/* Convenience wrapper around virProcessKillPainfullyDelay() with no extra
 * delay and signalling only @pid itself (not its process group). Returns
 * whatever virProcessKillPainfullyDelay() returns. */
int virProcessKillPainfully(pid_t pid, bool force)
{
    return virProcessKillPainfullyDelay(pid, force, 0, false);
}
482
483 #if WITH_DECL_CPU_SET_T && defined(__linux__)
484
/**
 * virProcessSetAffinity:
 * @pid: process whose CPU affinity is changed
 * @map: bitmap of CPUs the process may run on
 * @quiet: only log a failure at debug level instead of reporting it
 *
 * Apply @map as the CPU affinity of @pid using sched_setaffinity().
 *
 * Returns 0 on success, and also when the call failed but @quiet is
 * set; returns -1 with an error reported otherwise.
 */
int virProcessSetAffinity(pid_t pid, virBitmap *map, bool quiet)
{
    size_t i;
    int saved_errno = 0;
# ifndef CPU_ALLOC
    /* Legacy method uses a fixed size cpu mask, only allows up to 1024 cpus */
    cpu_set_t mask;

    CPU_ZERO(&mask);
    for (i = 0; i < virBitmapSize(map); i++) {
        if (virBitmapIsBitSet(map, i))
            CPU_SET(i, &mask);
    }

    if (sched_setaffinity(pid, sizeof(mask), &mask) >= 0)
        return 0;

    saved_errno = errno;
# else
    int numcpus = 1024;
    size_t masklen;
    cpu_set_t *mask;
    int rv;

    /* New method dynamically allocates cpu mask, allowing unlimited cpus */
    VIR_DEBUG("Set process affinity on %lld", (long long)pid);

    /* Not only may the statically allocated cpu_set_t be too small,
     * but there is no way to ask the kernel what size is large enough.
     * So you have no option but to pick a size, try, catch EINVAL,
     * enlarge, and re-try.
     *
     * https://lkml.org/lkml/2009/7/28/620
     */
    for (;;) {
        masklen = CPU_ALLOC_SIZE(numcpus);
        mask = CPU_ALLOC(numcpus);

        if (!mask)
            abort();

        CPU_ZERO_S(masklen, mask);
        for (i = 0; i < virBitmapSize(map); i++) {
            if (virBitmapIsBitSet(map, i))
                CPU_SET_S(i, masklen, mask);
        }

        rv = sched_setaffinity(pid, masklen, mask);
        /* Capture errno before releasing the mask so that the free
         * cannot disturb the value we act on below */
        saved_errno = errno;
        CPU_FREE(mask);

        if (rv >= 0)
            return 0;

        /* 262144 cpus ought to be enough for anyone */
        if (saved_errno != EINVAL || numcpus >= (1024 << 8))
            break;

        numcpus = numcpus << 2;
    }
# endif

    /* Both methods treat a failure the same way, honouring @quiet */
    if (quiet) {
        VIR_DEBUG("cannot set CPU affinity on process %d: %s",
                  pid, g_strerror(saved_errno));
        return 0;
    }

    virReportSystemError(saved_errno,
                         _("cannot set CPU affinity on process %d"), pid);
    return -1;
}
556
557 virBitmap *
virProcessGetAffinity(pid_t pid)558 virProcessGetAffinity(pid_t pid)
559 {
560 size_t i;
561 #ifdef CPU_ALLOC
562 cpu_set_t *mask;
563 #else
564 cpu_set_t maskt;
565 #endif
566 size_t masklen;
567 size_t ncpus;
568 virBitmap *ret = NULL;
569
570 #ifdef CPU_ALLOC
571 /* 262144 cpus ought to be enough for anyone */
572 ncpus = 1024 << 8;
573 masklen = CPU_ALLOC_SIZE(ncpus);
574 mask = CPU_ALLOC(ncpus);
575
576 if (!mask)
577 abort();
578
579 CPU_ZERO_S(masklen, mask);
580 #else
581 ncpus = 256; /* XXX */
582 masklen = sizeof(maskt);
583 CPU_ZERO(&maskt);
584 # endif
585
586 # ifdef CPU_ALLOC
587 if (sched_getaffinity(pid, masklen, mask) < 0) {
588 # else
589 if (sched_getaffinity(pid, masklen, &maskt) < 0) {
590 #endif
591 virReportSystemError(errno,
592 _("cannot get CPU affinity of process %d"), pid);
593 goto cleanup;
594 }
595
596 ret = virBitmapNew(ncpus);
597
598 for (i = 0; i < ncpus; i++) {
599 #ifdef CPU_ALLOC
600 if (CPU_ISSET_S(i, masklen, mask))
601 ignore_value(virBitmapSetBit(ret, i));
602 #else
603 if (CPU_ISSET(i, &maskt))
604 ignore_value(virBitmapSetBit(ret, i));
605 # endif
606 }
607
608 cleanup:
609 #ifdef CPU_ALLOC
610 CPU_FREE(mask);
611 #endif
612
613 return ret;
614 }
615
616 #elif defined(WITH_BSD_CPU_AFFINITY)
617
618 int virProcessSetAffinity(pid_t pid,
619 virBitmap *map,
620 bool quiet)
621 {
622 size_t i;
623 cpuset_t mask;
624
625 CPU_ZERO(&mask);
626 for (i = 0; i < virBitmapSize(map); i++) {
627 if (virBitmapIsBitSet(map, i))
628 CPU_SET(i, &mask);
629 }
630
631 if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, pid,
632 sizeof(mask), &mask) != 0) {
633 if (quiet) {
634 VIR_DEBUG("cannot set CPU affinity on process %d: %s",
635 pid, g_strerror(errno));
636 } else {
637 virReportSystemError(errno,
638 _("cannot set CPU affinity on process %d"), pid);
639 return -1;
640 }
641 }
642
643 return 0;
644 }
645
646 virBitmap *
647 virProcessGetAffinity(pid_t pid)
648 {
649 size_t i;
650 cpuset_t mask;
651 virBitmap *ret = NULL;
652
653 CPU_ZERO(&mask);
654 if (cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, pid,
655 sizeof(mask), &mask) != 0) {
656 virReportSystemError(errno,
657 _("cannot get CPU affinity of process %d"), pid);
658 return NULL;
659 }
660
661 ret = virBitmapNew(sizeof(mask) * 8);
662
663 for (i = 0; i < sizeof(mask) * 8; i++)
664 if (CPU_ISSET(i, &mask))
665 ignore_value(virBitmapSetBit(ret, i));
666
667 return ret;
668 }
669
670 #else /* WITH_DECL_CPU_SET_T */
671
/* Fallback used when neither affinity implementation above is compiled
 * in: always reports ENOSYS and returns -1, ignoring all arguments. */
int virProcessSetAffinity(pid_t pid G_GNUC_UNUSED,
                          virBitmap *map G_GNUC_UNUSED,
                          bool quiet G_GNUC_UNUSED)
{
    /* The @quiet parameter is ignored here, it is used only for silencing
     * actual failures. */
    virReportSystemError(ENOSYS, "%s",
                         _("Process CPU affinity is not supported on this platform"));
    return -1;
}
682
/* Fallback used when neither affinity implementation above is compiled
 * in: always reports ENOSYS and returns NULL, ignoring @pid. */
virBitmap *
virProcessGetAffinity(pid_t pid G_GNUC_UNUSED)
{
    virReportSystemError(ENOSYS, "%s",
                         _("Process CPU affinity is not supported on this platform"));
    return NULL;
}
690 #endif /* WITH_DECL_CPU_SET_T */
691
692
693 int virProcessGetPids(pid_t pid, size_t *npids, pid_t **pids)
694 {
695 int ret = -1;
696 g_autoptr(DIR) dir = NULL;
697 int value;
698 struct dirent *ent;
699 g_autofree char *taskPath = NULL;
700
701 *npids = 0;
702 *pids = NULL;
703
704 taskPath = g_strdup_printf("/proc/%llu/task", (long long)pid);
705
706 if (virDirOpen(&dir, taskPath) < 0)
707 goto cleanup;
708
709 while ((value = virDirRead(dir, &ent, taskPath)) > 0) {
710 long long tmp;
711 pid_t tmp_pid;
712
713 if (virStrToLong_ll(ent->d_name, NULL, 10, &tmp) < 0)
714 goto cleanup;
715 tmp_pid = tmp;
716
717 VIR_APPEND_ELEMENT(*pids, *npids, tmp_pid);
718 }
719
720 if (value < 0)
721 goto cleanup;
722
723 ret = 0;
724
725 cleanup:
726 if (ret < 0)
727 VIR_FREE(*pids);
728 return ret;
729 }
730
731
732 int virProcessGetNamespaces(pid_t pid,
733 size_t *nfdlist,
734 int **fdlist)
735 {
736 size_t i = 0;
737 const char *ns[] = { "user", "ipc", "uts", "net", "pid", "mnt" };
738
739 *nfdlist = 0;
740 *fdlist = NULL;
741
742 for (i = 0; i < G_N_ELEMENTS(ns); i++) {
743 int fd;
744 g_autofree char *nsfile = NULL;
745
746 nsfile = g_strdup_printf("/proc/%llu/ns/%s", (long long)pid, ns[i]);
747
748 if ((fd = open(nsfile, O_RDONLY)) >= 0) {
749 VIR_EXPAND_N(*fdlist, *nfdlist, 1);
750 (*fdlist)[(*nfdlist)-1] = fd;
751 }
752 }
753
754 return 0;
755 }
756
757
/**
 * virProcessSetNamespaces:
 * @nfdlist: number of entries in @fdlist; must not be 0
 * @fdlist: namespace file descriptors to join; negative entries are skipped
 *
 * Call setns() on each descriptor in turn.
 *
 * Returns 0 on success, -1 with an error reported if @nfdlist is 0 or
 * if setns() fails with anything other than EINVAL.
 */
int virProcessSetNamespaces(size_t nfdlist,
                            int *fdlist)
{
    size_t idx;

    if (nfdlist == 0) {
        virReportInvalidArg(nfdlist, "%s",
                            _("Expected at least one file descriptor"));
        return -1;
    }

    for (idx = 0; idx < nfdlist; idx++) {
        int nsfd = fdlist[idx];

        if (nsfd < 0)
            continue;

        if (setns(nsfd, 0) >= 0)
            continue;

        /* EINVAL means the target namespace is the one we are already
         * in, or that the descriptor's type does not match the second
         * argument of setns(). As that argument is 0, EINVAL is
         * harmless and can be ignored. */
        if (errno == EINVAL)
            continue;

        virReportSystemError(errno, "%s",
                             _("Unable to join domain namespace"));
        return -1;
    }

    return 0;
}
786
787 #if WITH_PRLIMIT
/* Thin wrapper around prlimit(): sets @new_limit and/or fetches @old_limit
 * for @resource of process @pid, returning prlimit()'s result unchanged. */
static int
virProcessPrLimit(pid_t pid,
                  int resource,
                  const struct rlimit *new_limit,
                  struct rlimit *old_limit)
{
    return prlimit(pid, resource, new_limit, old_limit);
}
796 #elif WITH_SETRLIMIT
/* Stand-in used when prlimit() is not available: ignores every argument,
 * sets errno to ENOSYS and returns -1. */
static int
virProcessPrLimit(pid_t pid G_GNUC_UNUSED,
                  int resource G_GNUC_UNUSED,
                  const struct rlimit *new_limit G_GNUC_UNUSED,
                  struct rlimit *old_limit G_GNUC_UNUSED)
{
    errno = ENOSYS;
    return -1;
}
806 #endif
807
808 #if WITH_GETRLIMIT
/* Thin wrapper around getrlimit(): stores the limit for @resource of the
 * calling process in @old_limit and returns getrlimit()'s result. */
static int
virProcessGetRLimit(int resource,
                    struct rlimit *old_limit)
{
    return getrlimit(resource, old_limit);
}
815 #endif /* WITH_GETRLIMIT */
816
817 #if WITH_SETRLIMIT
/* Thin wrapper around setrlimit(): applies @new_limit for @resource to the
 * calling process and returns setrlimit()'s result. */
static int
virProcessSetRLimit(int resource,
                    const struct rlimit *new_limit)
{
    return setrlimit(resource, new_limit);
}
824 #endif /* WITH_SETRLIMIT */
825
826 #if WITH_GETRLIMIT
/* Map an RLIMIT_* constant to the label that starts the matching line of
 * /proc/<pid>/limits. Only the resources handled in this file are known;
 * NULL is returned for anything else. */
static const char*
virProcessLimitResourceToLabel(int resource)
{
# if defined(RLIMIT_MEMLOCK)
    if (resource == RLIMIT_MEMLOCK)
        return "Max locked memory";
# endif /* defined(RLIMIT_MEMLOCK) */

# if defined(RLIMIT_NPROC)
    if (resource == RLIMIT_NPROC)
        return "Max processes";
# endif /* defined(RLIMIT_NPROC) */

# if defined(RLIMIT_NOFILE)
    if (resource == RLIMIT_NOFILE)
        return "Max open files";
# endif /* defined(RLIMIT_NOFILE) */

# if defined(RLIMIT_CORE)
    if (resource == RLIMIT_CORE)
        return "Max core file size";
# endif /* defined(RLIMIT_CORE) */

    return NULL;
}
855
856 # if defined(__linux__)
857 static int
858 virProcessGetLimitFromProc(pid_t pid,
859 int resource,
860 struct rlimit *limit)
861 {
862 g_autofree char *procfile = NULL;
863 g_autofree char *buf = NULL;
864 g_auto(GStrv) lines = NULL;
865 const char *label;
866 size_t i;
867
868 if (!(label = virProcessLimitResourceToLabel(resource))) {
869 errno = EINVAL;
870 return -1;
871 }
872
873 procfile = g_strdup_printf("/proc/%lld/limits", (long long)pid);
874
875 if (virFileReadAllQuiet(procfile, 2048, &buf) < 0) {
876 /* virFileReadAllQuiet() already sets errno, so don't overwrite
877 * that and return immediately instead */
878 return -1;
879 }
880
881 lines = g_strsplit(buf, "\n", 0);
882
883 for (i = 0; lines[i]; i++) {
884 g_autofree char *softLimit = NULL;
885 g_autofree char *hardLimit = NULL;
886 char *line = lines[i];
887 unsigned long long tmp;
888
889 if (!(line = STRSKIP(line, label)))
890 continue;
891
892 if (sscanf(line, "%ms %ms %*s", &softLimit, &hardLimit) < 2)
893 goto error;
894
895 if (STREQ(softLimit, "unlimited")) {
896 limit->rlim_cur = RLIM_INFINITY;
897 } else {
898 if (virStrToLong_ull(softLimit, NULL, 10, &tmp) < 0)
899 goto error;
900 limit->rlim_cur = tmp;
901 }
902 if (STREQ(hardLimit, "unlimited")) {
903 limit->rlim_max = RLIM_INFINITY;
904 } else {
905 if (virStrToLong_ull(hardLimit, NULL, 10, &tmp) < 0)
906 goto error;
907 limit->rlim_max = tmp;
908 }
909 }
910
911 return 0;
912
913 error:
914 errno = EIO;
915 return -1;
916 }
917 # else /* !defined(__linux__) */
/* Non-Linux stand-in: limits are not read from /proc here. Ignores every
 * argument, sets errno to ENOSYS and returns -1. */
static int
virProcessGetLimitFromProc(pid_t pid G_GNUC_UNUSED,
                           int resource G_GNUC_UNUSED,
                           struct rlimit *limit G_GNUC_UNUSED)
{
    errno = ENOSYS;
    return -1;
}
926 # endif /* !defined(__linux__) */
927
/**
 * virProcessGetLimit:
 * @pid: process to query
 * @resource: RLIMIT_* constant identifying the limit
 * @old_limit: filled with the current limit on success
 *
 * Fetch a resource limit, trying in order: virProcessPrLimit(),
 * virProcessGetLimitFromProc() and, only when @pid is the calling
 * process, virProcessGetRLimit().
 *
 * Returns 0 as soon as one method succeeds, -1 if all of them fail.
 */
static int
virProcessGetLimit(pid_t pid,
                   int resource,
                   struct rlimit *old_limit)
{
    bool is_self = (pid == getpid());

    if (virProcessPrLimit(pid, resource, NULL, old_limit) == 0)
        return 0;

    /* For whatever reason, using prlimit() on another process - even
     * when it's just to obtain the current limit rather than changing
     * it - requires CAP_SYS_RESOURCE, which we might not have in a
     * containerized environment; on the other hand, no particular
     * permission is needed to poke around /proc, so try that if going
     * through the syscall didn't work */
    if (virProcessGetLimitFromProc(pid, resource, old_limit) == 0)
        return 0;

    /* Last resort, usable only for our own limits */
    if (!is_self)
        return -1;

    return virProcessGetRLimit(resource, old_limit) == 0 ? 0 : -1;
}
953 #endif /* WITH_GETRLIMIT */
954
955 #if WITH_SETRLIMIT
/**
 * virProcessSetLimit:
 * @pid: process to change
 * @resource: RLIMIT_* constant identifying the limit
 * @new_limit: limit to apply
 *
 * Apply a resource limit through virProcessPrLimit(), falling back to
 * virProcessSetRLimit() only when @pid is the calling process.
 *
 * Returns 0 as soon as one method succeeds, -1 otherwise.
 */
static int
virProcessSetLimit(pid_t pid,
                   int resource,
                   const struct rlimit *new_limit)
{
    bool is_self = (pid == getpid());

    if (virProcessPrLimit(pid, resource, new_limit, NULL) == 0)
        return 0;

    /* Fallback, usable only for our own limits */
    if (!is_self)
        return -1;

    return virProcessSetRLimit(resource, new_limit) == 0 ? 0 : -1;
}
972 #endif /* WITH_SETRLIMIT */
973
974 #if WITH_SETRLIMIT && defined(RLIMIT_MEMLOCK)
/**
 * virProcessSetMaxMemLock:
 * @pid: process to be changed
 * @bytes: new limit
 *
 * Sets a new limit on the amount of locked memory for a process.
 * Both the soft and the hard limit are set to the same value.
 *
 * Returns: 0 on success, <0 on failure (with an error reported).
 */
int
virProcessSetMaxMemLock(pid_t pid, unsigned long long bytes)
{
    struct rlimit rlim;

    /* We use VIR_DOMAIN_MEMORY_PARAM_UNLIMITED internally to represent
     * unlimited memory amounts, but setrlimit() and prlimit() use
     * RLIM_INFINITY for the same purpose, so we need to translate between
     * the two conventions */
    if (virMemoryLimitIsSet(bytes))
        rlim.rlim_cur = rlim.rlim_max = bytes;
    else
        rlim.rlim_cur = rlim.rlim_max = RLIM_INFINITY;

    if (virProcessSetLimit(pid, RLIMIT_MEMLOCK, &rlim) < 0) {
        virReportSystemError(errno,
                             _("cannot limit locked memory "
                               "of process %lld to %llu"),
                             (long long int)pid, bytes);
        /* Propagate the failure instead of logging success below */
        return -1;
    }

    VIR_DEBUG("Locked memory for process %lld limited to %llu bytes",
              (long long int) pid, bytes);

    return 0;
}
1010 #else /* ! (WITH_SETRLIMIT && defined(RLIMIT_MEMLOCK)) */
/* Stand-in for builds without setrlimit() or RLIMIT_MEMLOCK: ignores both
 * arguments, reports ENOSYS and returns -1. */
int
virProcessSetMaxMemLock(pid_t pid G_GNUC_UNUSED,
                        unsigned long long bytes G_GNUC_UNUSED)
{
    virReportSystemError(ENOSYS, "%s", _("Not supported on this platform"));
    return -1;
}
1018 #endif /* ! (WITH_SETRLIMIT && defined(RLIMIT_MEMLOCK)) */
1019
1020 #if WITH_GETRLIMIT && defined(RLIMIT_MEMLOCK)
1021 /**
1022 * virProcessGetMaxMemLock:
1023 * @pid: process to be queried
1024 * @bytes: return location for the limit
1025 *
1026 * Obtain the current limit on the amount of locked memory for a process.
1027 *
1028 * Returns: 0 on success, <0 on failure.
1029 */
1030 int
1031 virProcessGetMaxMemLock(pid_t pid,
1032 unsigned long long *bytes)
1033 {
1034 struct rlimit rlim;
1035
1036 if (!bytes)
1037 return 0;
1038
1039 if (virProcessGetLimit(pid, RLIMIT_MEMLOCK, &rlim) < 0) {
1040 virReportSystemError(errno,
1041 _("cannot get locked memory limit "
1042 "of process %lld"),
1043 (long long int) pid);
1044 return -1;
1045 }
1046
1047 /* virProcessSetMaxMemLock() sets both rlim_cur and rlim_max to the
1048 * same value, so we can retrieve just rlim_max here. We use
1049 * VIR_DOMAIN_MEMORY_PARAM_UNLIMITED internally to represent unlimited
1050 * memory amounts, but setrlimit() and prlimit() use RLIM_INFINITY for the
1051 * same purpose, so we need to translate between the two conventions */
1052 if (rlim.rlim_max == RLIM_INFINITY)
1053 *bytes = VIR_DOMAIN_MEMORY_PARAM_UNLIMITED;
1054 else
1055 *bytes = rlim.rlim_max;
1056
1057 return 0;
1058 }
1059 #else /* ! (WITH_GETRLIMIT && defined(RLIMIT_MEMLOCK)) */
/* Stand-in for builds without getrlimit() or RLIMIT_MEMLOCK: ignores both
 * arguments, reports ENOSYS and returns -1. */
int
virProcessGetMaxMemLock(pid_t pid G_GNUC_UNUSED,
                        unsigned long long *bytes G_GNUC_UNUSED)
{
    virReportSystemError(ENOSYS, "%s", _("Not supported on this platform"));
    return -1;
}
1067 #endif /* ! (WITH_GETRLIMIT && defined(RLIMIT_MEMLOCK)) */
1068
1069 #if WITH_SETRLIMIT && defined(RLIMIT_NPROC)
/**
 * virProcessSetMaxProcesses:
 * @pid: process to be changed
 * @procs: new limit
 *
 * Sets a new limit on the amount of processes for the user the
 * process is running as. Soft and hard limit are both set to @procs.
 *
 * Returns: 0 on success, <0 on failure (with an error reported).
 */
int
virProcessSetMaxProcesses(pid_t pid, unsigned int procs)
{
    struct rlimit rlim;

    rlim.rlim_max = procs;
    rlim.rlim_cur = rlim.rlim_max;

    if (virProcessSetLimit(pid, RLIMIT_NPROC, &rlim) >= 0)
        return 0;

    virReportSystemError(errno,
                         _("cannot limit number of subprocesses "
                           "of process %lld to %u"),
                         (long long int)pid, procs);
    return -1;
}
1096 #else /* ! (WITH_SETRLIMIT && defined(RLIMIT_NPROC)) */
/* Stand-in for builds without setrlimit() or RLIMIT_NPROC: ignores both
 * arguments, reports ENOSYS and returns -1. */
int
virProcessSetMaxProcesses(pid_t pid G_GNUC_UNUSED,
                          unsigned int procs G_GNUC_UNUSED)
{
    virReportSystemError(ENOSYS, "%s", _("Not supported on this platform"));
    return -1;
}
1104 #endif /* ! (WITH_SETRLIMIT && defined(RLIMIT_NPROC)) */
1105
1106 #if WITH_SETRLIMIT && defined(RLIMIT_NOFILE)
/**
 * virProcessSetMaxFiles:
 * @pid: process to be changed
 * @files: new limit
 *
 * Sets a new limit on the number of opened files for a process.
 * Soft and hard limit are both set to @files + 1.
 *
 * Returns: 0 on success, <0 on failure (with an error reported).
 */
int
virProcessSetMaxFiles(pid_t pid, unsigned int files)
{
    struct rlimit rlim;

    /* Max number of opened files is one greater than actual limit. See
     * man setrlimit.
     *
     * NB: That indicates to me that we would want the following code
     * to say "files - 1", but the original of this code in
     * qemu_process.c also had files + 1, so this preserves current
     * behavior.
     *
     * The addition is done in rlim_t so that, where rlim_t is wider
     * than unsigned int, a @files of UINT_MAX does not wrap to 0.
     */
    rlim.rlim_cur = rlim.rlim_max = (rlim_t)files + 1;

    if (virProcessSetLimit(pid, RLIMIT_NOFILE, &rlim) < 0) {
        virReportSystemError(errno,
                             _("cannot limit number of open files "
                               "of process %lld to %u"),
                             (long long int)pid, files);
        return -1;
    }

    return 0;
}
1141 #else /* ! (WITH_SETRLIMIT && defined(RLIMIT_NOFILE)) */
/* Stand-in for builds without setrlimit() or RLIMIT_NOFILE: ignores both
 * arguments, reports ENOSYS and returns -1. */
int
virProcessSetMaxFiles(pid_t pid G_GNUC_UNUSED,
                      unsigned int files G_GNUC_UNUSED)
{
    virReportSystemError(ENOSYS, "%s", _("Not supported on this platform"));
    return -1;
}
1149 #endif /* ! (WITH_SETRLIMIT && defined(RLIMIT_NOFILE)) */
1150
1151 #if WITH_SETRLIMIT && defined(RLIMIT_CORE)
/**
 * virProcessSetMaxCoreSize:
 * @pid: process to be changed
 * @bytes: new limit (0 to disable core dumps)
 *
 * Sets a new limit on the size of core dumps for a process. Soft and
 * hard limit are both set to @bytes.
 *
 * Returns: 0 on success, <0 on failure (with an error reported).
 */
int
virProcessSetMaxCoreSize(pid_t pid, unsigned long long bytes)
{
    struct rlimit rlim;

    rlim.rlim_max = bytes;
    rlim.rlim_cur = rlim.rlim_max;

    if (virProcessSetLimit(pid, RLIMIT_CORE, &rlim) >= 0)
        return 0;

    virReportSystemError(errno,
                         _("cannot limit core file size "
                           "of process %lld to %llu"),
                         (long long int)pid, bytes);
    return -1;
}
1178 #else /* ! (WITH_SETRLIMIT && defined(RLIMIT_CORE)) */
/* Stand-in for builds without setrlimit() or RLIMIT_CORE: ignores both
 * arguments, reports ENOSYS and returns -1. */
int
virProcessSetMaxCoreSize(pid_t pid G_GNUC_UNUSED,
                         unsigned long long bytes G_GNUC_UNUSED)
{
    virReportSystemError(ENOSYS, "%s", _("Not supported on this platform"));
    return -1;
}
1186 #endif /* ! (WITH_SETRLIMIT && defined(RLIMIT_CORE)) */
1187
1188
1189 #ifdef __linux__
1190 /*
1191 * Port of code from polkitunixprocess.c under terms
1192 * of the LGPLv2+
1193 */
1194 int virProcessGetStartTime(pid_t pid,
1195 unsigned long long *timestamp)
1196 {
1197 g_auto(GStrv) proc_stat = virProcessGetStat(pid, 0);
1198 const char *starttime_str = NULL;
1199
1200 if (!proc_stat || g_strv_length(proc_stat) < 22) {
1201 virReportError(VIR_ERR_INTERNAL_ERROR,
1202 _("Cannot find start time for pid %d"), (int)pid);
1203 return -1;
1204 }
1205
1206 starttime_str = proc_stat[VIR_PROCESS_STAT_STARTTIME];
1207 if (virStrToLong_ull(starttime_str, NULL, 10, timestamp) < 0) {
1208 virReportError(VIR_ERR_INTERNAL_ERROR,
1209 _("Cannot parse start time %s for pid %d"),
1210 starttime_str, (int)pid);
1211 return -1;
1212 }
1213 return 0;
1214 }
1215 #elif defined(__FreeBSD__) && ! defined __DragonFly__
/*
 * FreeBSD implementation: queries the "kern.proc.pid" sysctl for @pid
 * and stores the seconds part of the process start time (ki_start)
 * in @timestamp.
 *
 * Returns 0 on success, -1 with an error reported if either the MIB
 * lookup or the sysctl query fails.
 */
int virProcessGetStartTime(pid_t pid,
                           unsigned long long *timestamp)
{
    struct kinfo_proc p;
    int mib[4];
    size_t len = 4;

    /* Translate the name into the leading MIB entries. A failure here
     * must not be ignored, otherwise sysctl() below would be called
     * with an uninitialized mib[]. */
    if (sysctlnametomib("kern.proc.pid", mib, &len) < 0) {
        virReportSystemError(errno, "%s",
                             _("Unable to query process ID start time"));
        return -1;
    }

    len = sizeof(struct kinfo_proc);
    /* The last MIB entry selects the process to query */
    mib[3] = pid;

    if (sysctl(mib, 4, &p, &len, NULL, 0) < 0) {
        virReportSystemError(errno, "%s",
                             _("Unable to query process ID start time"));
        return -1;
    }

    *timestamp = (unsigned long long)p.ki_start.tv_sec;

    return 0;
}
1239 #else
/*
 * Fallback for platforms without a start time implementation:
 * emits a warning on the first call only, reports a start time
 * of 0 in @timestamp and returns 0.
 */
int virProcessGetStartTime(pid_t pid,
                           unsigned long long *timestamp)
{
    static int warned;
    int previous = g_atomic_int_add(&warned, 1);

    /* Only the very first caller sees the counter at zero */
    if (previous == 0) {
        VIR_WARN("Process start time of pid %lld not available on this platform",
                 (long long) pid);
    }

    *timestamp = 0;
    return 0;
}
1251 #endif
1252
1253
1254 #ifdef __linux__
1255 typedef struct _virProcessNamespaceHelperData virProcessNamespaceHelperData;
1256 struct _virProcessNamespaceHelperData {
1257 pid_t pid;
1258 virProcessNamespaceCallback cb;
1259 void *opaque;
1260 };
1261
1262 static int virProcessNamespaceHelper(pid_t pid G_GNUC_UNUSED,
1263 void *opaque)
1264 {
1265 virProcessNamespaceHelperData *data = opaque;
1266 int fd = -1;
1267 int ret = -1;
1268 g_autofree char *path = NULL;
1269
1270 path = g_strdup_printf("/proc/%lld/ns/mnt", (long long)data->pid);
1271
1272 if ((fd = open(path, O_RDONLY)) < 0) {
1273 virReportSystemError(errno, "%s",
1274 _("Kernel does not provide mount namespace"));
1275 goto cleanup;
1276 }
1277
1278 if (setns(fd, 0) < 0) {
1279 virReportSystemError(errno, "%s",
1280 _("Unable to enter mount namespace"));
1281 goto cleanup;
1282 }
1283
1284 ret = data->cb(data->pid, data->opaque);
1285
1286 cleanup:
1287 VIR_FORCE_CLOSE(fd);
1288 return ret;
1289 }
1290
1291 /* Run cb(opaque) in the mount namespace of pid. Return -1 with error
1292 * message raised if we fail to run the child, if the child dies from
1293 * a signal, or if the child has status EXIT_CANCELED; otherwise return
1294 * value is the retval of the callback. The callback will be run in a child
1295 * process so must be careful to only use async signal safe functions.
1296 */
1297 int
1298 virProcessRunInMountNamespace(pid_t pid,
1299 virProcessNamespaceCallback cb,
1300 void *opaque)
1301 {
1302 virProcessNamespaceHelperData data = {.pid = pid, .cb = cb, .opaque = opaque};
1303
1304 return virProcessRunInFork(virProcessNamespaceHelper, &data);
1305 }
1306
1307 #else /* ! __linux__ */
1308
1309 int
1310 virProcessRunInMountNamespace(pid_t pid G_GNUC_UNUSED,
1311 virProcessNamespaceCallback cb G_GNUC_UNUSED,
1312 void *opaque G_GNUC_UNUSED)
1313 {
1314 virReportSystemError(ENOSYS, "%s",
1315 _("Namespaces are not supported on this platform"));
1316 return -1;
1317 }
1318
1319 #endif /* ! __linux__ */
1320
1321
1322 #ifndef WIN32
1323 /* We assume that error messages will fit into 1024 chars */
1324 # define VIR_PROCESS_ERROR_MAX_LENGTH 1024
1325 typedef struct {
1326 int code;
1327 int domain;
1328 char message[VIR_PROCESS_ERROR_MAX_LENGTH];
1329 virErrorLevel level;
1330 char str1[VIR_PROCESS_ERROR_MAX_LENGTH];
1331 char str2[VIR_PROCESS_ERROR_MAX_LENGTH];
1332 char str3[VIR_PROCESS_ERROR_MAX_LENGTH];
1333 int int1;
1334 int int2;
1335 } errorData;
1336
1337 typedef union {
1338 errorData data;
1339 char bindata[sizeof(errorData)];
1340 } errorDataBin;
1341
1342 static int
1343 virProcessRunInForkHelper(int errfd,
1344 pid_t ppid,
1345 virProcessForkCallback cb,
1346 void *opaque)
1347 {
1348 int ret = 0;
1349
1350 if ((ret = cb(ppid, opaque)) < 0) {
1351 virErrorPtr err = virGetLastError();
1352
1353 if (err) {
1354 g_autofree errorDataBin *bin = g_new0(errorDataBin, 1);
1355
1356 bin->data.code = err->code;
1357 bin->data.domain = err->domain;
1358 virStrcpyStatic(bin->data.message, err->message);
1359 bin->data.level = err->level;
1360 if (err->str1)
1361 virStrcpyStatic(bin->data.str1, err->str1);
1362 if (err->str2)
1363 virStrcpyStatic(bin->data.str2, err->str2);
1364 if (err->str3)
1365 virStrcpyStatic(bin->data.str3, err->str3);
1366 bin->data.int1 = err->int1;
1367 bin->data.int2 = err->int2;
1368
1369 ignore_value(safewrite(errfd, bin->bindata, sizeof(*bin)));
1370 }
1371
1372 return -1;
1373 }
1374
1375 return ret;
1376 }
1377
1378
1379 /**
1380 * virProcessRunInFork:
1381 * @cb: callback to run
1382 * @opaque: opaque data to @cb
1383 *
1384 * Do the fork and run @cb in the child. This can be used when
1385 * @cb does something thread unsafe, for instance. All signals
1386 * will be reset to have their platform default handlers and
1387 * unmasked. @cb must only use async signal safe functions. In
1388 * particular no mutexes should be used in @cb, unless steps were
1389 * taken before forking to guarantee a predictable state. @cb
1390 * must not exec any external binaries, instead
1391 * virCommand should be used for that purpose.
1392 *
1393 * On return, the returned value is either -1 with error message
1394 * reported if something went bad in the parent, if child has
1395 * died from a signal or if the child returned EXIT_CANCELED.
1396 * Otherwise the returned value is the retval of the callback.
1397 */
1398 int
1399 virProcessRunInFork(virProcessForkCallback cb,
1400 void *opaque)
1401 {
1402 int ret = -1;
1403 pid_t child = -1;
1404 pid_t parent = getpid();
1405 int errfd[2] = { -1, -1 };
1406
1407 if (virPipe(errfd) < 0)
1408 return -1;
1409
1410 if ((child = virFork()) < 0)
1411 goto cleanup;
1412
1413 if (child == 0) {
1414 VIR_FORCE_CLOSE(errfd[0]);
1415 ret = virProcessRunInForkHelper(errfd[1], parent, cb, opaque);
1416 VIR_FORCE_CLOSE(errfd[1]);
1417 _exit(ret < 0 ? EXIT_CANCELED : ret);
1418 } else {
1419 int status;
1420 g_autofree char *buf = NULL;
1421 g_autofree errorDataBin *bin = NULL;
1422 int nread;
1423
1424 VIR_FORCE_CLOSE(errfd[1]);
1425 nread = virFileReadHeaderFD(errfd[0], sizeof(*bin), &buf);
1426 ret = virProcessWait(child, &status, false);
1427 if (ret == 0) {
1428 ret = status == EXIT_CANCELED ? -1 : status;
1429 if (ret < 0) {
1430 if (nread == sizeof(*bin)) {
1431 bin = g_new0(errorDataBin, 1);
1432 memcpy(bin->bindata, buf, sizeof(*bin));
1433
1434 virReportError(VIR_ERR_INTERNAL_ERROR,
1435 _("child reported (status=%d): %s"),
1436 status, NULLSTR(bin->data.message));
1437
1438 virRaiseErrorFull(__FILE__, __FUNCTION__, __LINE__,
1439 bin->data.domain,
1440 bin->data.code,
1441 bin->data.level,
1442 bin->data.str1,
1443 bin->data.str2,
1444 bin->data.str3,
1445 bin->data.int1,
1446 bin->data.int2,
1447 "%s", bin->data.message);
1448 } else {
1449 virReportError(VIR_ERR_INTERNAL_ERROR,
1450 _("child didn't write error (status=%d)"),
1451 status);
1452 }
1453 }
1454 }
1455 }
1456
1457 cleanup:
1458 VIR_FORCE_CLOSE(errfd[0]);
1459 VIR_FORCE_CLOSE(errfd[1]);
1460 return ret;
1461 }
1462
1463 #else /* WIN32 */
1464
1465 int
1466 virProcessRunInFork(virProcessForkCallback cb G_GNUC_UNUSED,
1467 void *opaque G_GNUC_UNUSED)
1468 {
1469 virReportSystemError(ENOSYS, "%s",
1470 _("Process spawning is not supported on this platform"));
1471 return -1;
1472 }
1473
1474 #endif /* WIN32 */
1475
1476
1477 #if defined(__linux__)
1478 int
1479 virProcessSetupPrivateMountNS(void)
1480 {
1481 if (unshare(CLONE_NEWNS) < 0) {
1482 virReportSystemError(errno, "%s",
1483 _("Cannot unshare mount namespace"));
1484 return -1;
1485 }
1486
1487 if (mount("", "/", "none", MS_SLAVE|MS_REC, NULL) < 0) {
1488 virReportSystemError(errno, "%s",
1489 _("Failed disable mount propagation out of the root filesystem"));
1490 return -1;
1491 }
1492
1493 return 0;
1494 }
1495
1496
1497 G_GNUC_NORETURN static int
1498 virProcessDummyChild(void *argv G_GNUC_UNUSED)
1499 {
1500 _exit(0);
1501 }
1502
1503
1504 /**
1505 * virProcessNamespaceAvailable:
1506 * @ns: what namespaces to check (bitwise-OR of virProcessNamespaceFlags)
1507 *
1508 * Check if given list of namespaces (@ns) is available.
1509 * If not, appropriate error message is produced.
1510 *
1511 * Returns: 0 on success (all the namespaces from @flags are available),
1512 * -1 on error (with error message reported).
1513 */
1514 int
1515 virProcessNamespaceAvailable(unsigned int ns)
1516 {
1517 int flags = 0;
1518 int cpid;
1519 char *childStack;
1520 int stacksize = getpagesize() * 4;
1521 g_autofree char *stack = NULL;
1522
1523 if (ns & VIR_PROCESS_NAMESPACE_MNT)
1524 flags |= CLONE_NEWNS;
1525 if (ns & VIR_PROCESS_NAMESPACE_IPC)
1526 flags |= CLONE_NEWIPC;
1527 if (ns & VIR_PROCESS_NAMESPACE_NET)
1528 flags |= CLONE_NEWNET;
1529 if (ns & VIR_PROCESS_NAMESPACE_PID)
1530 flags |= CLONE_NEWPID;
1531 if (ns & VIR_PROCESS_NAMESPACE_USER)
1532 flags |= CLONE_NEWUSER;
1533 if (ns & VIR_PROCESS_NAMESPACE_UTS)
1534 flags |= CLONE_NEWUTS;
1535
1536 /* Signal parent as soon as the child dies. RIP. */
1537 flags |= SIGCHLD;
1538
1539 stack = g_new0(char, stacksize);
1540
1541 childStack = stack + stacksize;
1542
1543 cpid = clone(virProcessDummyChild, childStack, flags, NULL);
1544
1545 if (cpid < 0) {
1546 VIR_DEBUG("clone call returned %s, container support is not enabled",
1547 g_strerror(errno));
1548 return -1;
1549 } else if (virProcessWait(cpid, NULL, false) < 0) {
1550 return -1;
1551 }
1552
1553 VIR_DEBUG("All namespaces (%x) are enabled", ns);
1554 return 0;
1555 }
1556
1557 #else /* !defined(__linux__) */
1558
/* Non-Linux fallback: a private mount namespace cannot be set up,
 * so always fail with ENOSYS. */
int
virProcessSetupPrivateMountNS(void)
{
    virReportSystemError(ENOSYS, "%s",
                         _("Namespaces are not supported on this platform."));
    return -1;
}
1566
1567 int
1568 virProcessNamespaceAvailable(unsigned int ns G_GNUC_UNUSED)
1569 {
1570 virReportSystemError(ENOSYS, "%s",
1571 _("Namespaces are not supported on this platform."));
1572 return -1;
1573 }
1574
1575 #endif /* !defined(__linux__) */
1576
1577 /**
1578 * virProcessExitWithStatus:
1579 * @status: raw status to be reproduced when this process dies
1580 *
1581 * Given a raw status obtained by waitpid() or similar, attempt to
1582 * make this process exit in the same manner. If the child died by
1583 * signal, reset that signal handler to default and raise the same
1584 * signal; if that doesn't kill this process, then exit with 128 +
1585 * signal number. If @status can't be deciphered, use
1586 * EXIT_CANNOT_INVOKE.
1587 *
1588 * Never returns.
1589 */
1590 void
1591 virProcessExitWithStatus(int status)
1592 {
1593 int value = EXIT_CANNOT_INVOKE;
1594
1595 #ifndef WIN32
1596 if (WIFEXITED(status)) {
1597 value = WEXITSTATUS(status);
1598 } else if (WIFSIGNALED(status)) {
1599 struct sigaction act;
1600 sigset_t sigs;
1601
1602 if (sigemptyset(&sigs) == 0 &&
1603 sigaddset(&sigs, WTERMSIG(status)) == 0)
1604 sigprocmask(SIG_UNBLOCK, &sigs, NULL);
1605 memset(&act, 0, sizeof(act));
1606 act.sa_handler = SIG_DFL;
1607 sigfillset(&act.sa_mask);
1608 sigaction(WTERMSIG(status), &act, NULL);
1609 raise(WTERMSIG(status));
1610 value = 128 + WTERMSIG(status);
1611 }
1612 #else /* WIN32 */
1613 (void)status;
1614 #endif /* WIN32 */
1615 exit(value);
1616 }
1617
1618 #if WITH_SCHED_SETSCHEDULER
1619
1620 static int
1621 virProcessSchedTranslatePolicy(virProcessSchedPolicy policy)
1622 {
1623 switch (policy) {
1624 case VIR_PROC_POLICY_NONE:
1625 return SCHED_OTHER;
1626
1627 case VIR_PROC_POLICY_BATCH:
1628 # ifdef SCHED_BATCH
1629 return SCHED_BATCH;
1630 # else
1631 return -1;
1632 # endif
1633
1634 case VIR_PROC_POLICY_IDLE:
1635 # ifdef SCHED_IDLE
1636 return SCHED_IDLE;
1637 # else
1638 return -1;
1639 # endif
1640
1641 case VIR_PROC_POLICY_FIFO:
1642 return SCHED_FIFO;
1643
1644 case VIR_PROC_POLICY_RR:
1645 return SCHED_RR;
1646
1647 case VIR_PROC_POLICY_LAST:
1648 /* nada */
1649 break;
1650 }
1651
1652 return -1;
1653 }
1654
1655 int
1656 virProcessSetScheduler(pid_t pid,
1657 virProcessSchedPolicy policy,
1658 int priority)
1659 {
1660 struct sched_param param = {0};
1661 int pol = virProcessSchedTranslatePolicy(policy);
1662
1663 VIR_DEBUG("pid=%lld, policy=%d, priority=%u",
1664 (long long) pid, policy, priority);
1665
1666 if (!policy)
1667 return 0;
1668
1669 if (pol < 0) {
1670 virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
1671 _("Scheduler '%s' is not supported on this platform"),
1672 virProcessSchedPolicyTypeToString(policy));
1673 return -1;
1674 }
1675
1676 if (pol == SCHED_FIFO || pol == SCHED_RR) {
1677 int min = 0;
1678 int max = 0;
1679
1680 if ((min = sched_get_priority_min(pol)) < 0) {
1681 virReportSystemError(errno, "%s",
1682 _("Cannot get minimum scheduler "
1683 "priority value"));
1684 return -1;
1685 }
1686
1687 if ((max = sched_get_priority_max(pol)) < 0) {
1688 virReportSystemError(errno, "%s",
1689 _("Cannot get maximum scheduler "
1690 "priority value"));
1691 return -1;
1692 }
1693
1694 if (priority < min || priority > max) {
1695 virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
1696 _("Scheduler priority %d out of range [%d, %d]"),
1697 priority, min, max);
1698 return -1;
1699 }
1700
1701 param.sched_priority = priority;
1702 }
1703
1704 if (sched_setscheduler(pid, pol, ¶m) < 0) {
1705 virReportSystemError(errno,
1706 _("Cannot set scheduler parameters for pid %lld"),
1707 (long long) pid);
1708 return -1;
1709 }
1710
1711 return 0;
1712 }
1713
1714 #else /* ! WITH_SCHED_SETSCHEDULER */
1715
1716 int
1717 virProcessSetScheduler(pid_t pid G_GNUC_UNUSED,
1718 virProcessSchedPolicy policy,
1719 int priority G_GNUC_UNUSED)
1720 {
1721 if (!policy)
1722 return 0;
1723
1724 virReportSystemError(ENOSYS, "%s",
1725 _("Process CPU scheduling is not supported "
1726 "on this platform"));
1727 return -1;
1728 }
1729
1730 #endif /* !WITH_SCHED_SETSCHEDULER */
1731
1732 /*
1733 * Get all stat fields for a process based on pid and tid:
1734 * - pid == 0 && tid == 0 => /proc/self/stat
1735 * - pid != 0 && tid == 0 => /proc/<pid>/stat
1736 * - pid == 0 && tid != 0 => /proc/self/task/<tid>/stat
1737 * - pid != 0 && tid != 0 => /proc/<pid>/task/<tid>/stat
1738 * and return them as array of strings.
1739 */
1740 GStrv
1741 virProcessGetStat(pid_t pid,
1742 pid_t tid)
1743 {
1744 int len = 10 * 1024; /* 10kB ought to be enough for everyone */
1745 g_autofree char *buf = NULL;
1746 g_autofree char *path = NULL;
1747 GStrv rest = NULL;
1748 GStrv ret = NULL;
1749 char *comm = NULL;
1750 char *rparen = NULL;
1751 size_t nrest = 0;
1752
1753 if (pid) {
1754 if (tid)
1755 path = g_strdup_printf("/proc/%d/task/%d/stat", (int)pid, (int)tid);
1756 else
1757 path = g_strdup_printf("/proc/%d/stat", (int)pid);
1758 } else {
1759 if (tid)
1760 path = g_strdup_printf("/proc/self/task/%d/stat", (int)tid);
1761 else
1762 path = g_strdup("/proc/self/stat");
1763 }
1764
1765 len = virFileReadAllQuiet(path, len, &buf);
1766 if (len < 0)
1767 return NULL;
1768
1769 /* eliminate trailing spaces */
1770 while (len > 0 && g_ascii_isspace(buf[--len]))
1771 buf[len] = '\0';
1772
1773 /* Find end of the first field */
1774 if (!(comm = strchr(buf, ' ')))
1775 return NULL;
1776 *comm = '\0';
1777
1778 /* Check start of the second field (filename of the executable, in
1779 * parentheses) */
1780 comm++;
1781 if (*comm != '(')
1782 return NULL;
1783 comm++;
1784
1785 /* Check end of the second field (last closing parenthesis) */
1786 rparen = strrchr(comm, ')');
1787 if (!rparen)
1788 return NULL;
1789 *rparen = '\0';
1790
1791 /* We need to check that the next char is not '\0', but why not just opt in
1792 * for the safer way of checking whether it is ' ' (space) instead */
1793 if (rparen[1] != ' ')
1794 return NULL;
1795
1796 rest = g_strsplit(rparen + 2, " ", 0);
1797 nrest = g_strv_length(rest);
1798 ret = g_new0(char *, nrest + 3);
1799 ret[0] = g_strdup(buf);
1800 ret[1] = g_strdup(comm);
1801 memcpy(ret + 2, rest, nrest * sizeof(char *));
1802
1803 /* Do not use g_strfreev() as individual elements they were moved to @ret. */
1804 VIR_FREE(rest);
1805
1806 return ret;
1807 }
1808