xref: /qemu/util/oslib-posix.c (revision ab9056ff)
1 /*
2  * os-posix-lib.c
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2010 Red Hat, Inc.
6  *
7  * QEMU library functions on POSIX which are shared between QEMU and
8  * the QEMU tools.
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include <termios.h>
31 
32 #include <glib/gprintf.h>
33 
34 #include "qemu-common.h"
35 #include "sysemu/sysemu.h"
36 #include "trace.h"
37 #include "qapi/error.h"
38 #include "qemu/sockets.h"
39 #include "qemu/thread.h"
40 #include <libgen.h>
41 #include <sys/signal.h>
42 #include "qemu/cutils.h"
43 
44 #ifdef CONFIG_LINUX
45 #include <sys/syscall.h>
46 #endif
47 
48 #ifdef __FreeBSD__
49 #include <sys/sysctl.h>
50 #include <sys/user.h>
51 #include <libutil.h>
52 #endif
53 
54 #ifdef __NetBSD__
55 #include <sys/sysctl.h>
56 #endif
57 
58 #include "qemu/mmap-alloc.h"
59 
60 #ifdef CONFIG_DEBUG_STACK_USAGE
61 #include "qemu/error-report.h"
62 #endif
63 
/*
 * Upper bound on the number of worker threads used to touch pages during
 * memory preallocation; applied in get_memset_num_threads().
 */
#define MAX_MEM_PREALLOC_THREAD_COUNT 16
65 
/*
 * Work description for one page-touching worker thread.  An array of
 * these is set up by touch_all_pages() and consumed by do_touch_pages().
 */
struct MemsetThread {
    char *addr;          /* first byte this worker touches */
    size_t numpages;     /* how many pages this worker touches */
    size_t hpagesize;    /* distance in bytes between touched locations */
    QemuThread pgthread; /* handle of the worker thread */
    sigjmp_buf env;      /* jump buffer sigbus_handler() longjmps to */
};
typedef struct MemsetThread MemsetThread;
74 
/* Worker array for the preallocation in progress; NULL when none is running. */
static MemsetThread *memset_thread;
/* Number of entries in memset_thread. */
static int memset_num_threads;
/* Set by a worker that was interrupted by SIGBUS while touching pages. */
static bool memset_thread_failed;
78 
/*
 * Return an identifier for the calling thread.
 *
 * On Linux this is the result of the gettid system call; elsewhere the
 * process id is returned instead.
 */
int qemu_get_thread_id(void)
{
    int id;

#if defined(__linux__)
    id = syscall(SYS_gettid);
#else
    id = getpid();
#endif
    return id;
}
87 
/*
 * Thin wrapper around daemon(3).
 *
 * @nochdir and @noclose are forwarded unchanged; the return value of
 * daemon() is passed back to the caller.
 */
int qemu_daemon(int nochdir, int noclose)
{
    int rc = daemon(nochdir, noclose);

    return rc;
}
92 
93 bool qemu_write_pidfile(const char *path, Error **errp)
94 {
95     int fd;
96     char pidstr[32];
97 
98     while (1) {
99         struct stat a, b;
100         struct flock lock = {
101             .l_type = F_WRLCK,
102             .l_whence = SEEK_SET,
103             .l_len = 0,
104         };
105 
106         fd = qemu_open(path, O_CREAT | O_WRONLY, S_IRUSR | S_IWUSR);
107         if (fd == -1) {
108             error_setg_errno(errp, errno, "Cannot open pid file");
109             return false;
110         }
111 
112         if (fstat(fd, &b) < 0) {
113             error_setg_errno(errp, errno, "Cannot stat file");
114             goto fail_close;
115         }
116 
117         if (fcntl(fd, F_SETLK, &lock)) {
118             error_setg_errno(errp, errno, "Cannot lock pid file");
119             goto fail_close;
120         }
121 
122         /*
123          * Now make sure the path we locked is the same one that now
124          * exists on the filesystem.
125          */
126         if (stat(path, &a) < 0) {
127             /*
128              * PID file disappeared, someone else must be racing with
129              * us, so try again.
130              */
131             close(fd);
132             continue;
133         }
134 
135         if (a.st_ino == b.st_ino) {
136             break;
137         }
138 
139         /*
140          * PID file was recreated, someone else must be racing with
141          * us, so try again.
142          */
143         close(fd);
144     }
145 
146     if (ftruncate(fd, 0) < 0) {
147         error_setg_errno(errp, errno, "Failed to truncate pid file");
148         goto fail_unlink;
149     }
150 
151     snprintf(pidstr, sizeof(pidstr), FMT_pid "\n", getpid());
152     if (write(fd, pidstr, strlen(pidstr)) != strlen(pidstr)) {
153         error_setg(errp, "Failed to write pid file");
154         goto fail_unlink;
155     }
156 
157     return true;
158 
159 fail_unlink:
160     unlink(path);
161 fail_close:
162     close(fd);
163     return false;
164 }
165 
/*
 * Pass @ptr through unchanged when it is non-NULL.  A NULL @ptr is
 * treated as an allocation failure: a message including strerror(errno)
 * is printed to stderr and the process aborts.
 */
void *qemu_oom_check(void *ptr)
{
    if (ptr != NULL) {
        return ptr;
    }

    fprintf(stderr, "Failed to allocate memory: %s\n", strerror(errno));
    abort();
}
174 
/*
 * Allocate @size bytes aligned to @alignment, without aborting on failure.
 *
 * Alignments smaller than a pointer are raised to sizeof(void *).  The
 * allocator used depends on the build configuration; note that the
 * CONFIG_BSD branch calls valloc() and does not pass @alignment on.
 *
 * Returns the allocation, or NULL on failure (with errno set from the
 * posix_memalign() result in that configuration).
 */
void *qemu_try_memalign(size_t alignment, size_t size)
{
    void *mem;

    alignment = alignment < sizeof(void *) ? sizeof(void *) : alignment;

#if defined(CONFIG_POSIX_MEMALIGN)
    int err = posix_memalign(&mem, alignment, size);

    if (err != 0) {
        /* posix_memalign() returns the error instead of setting errno */
        errno = err;
        mem = NULL;
    }
#elif defined(CONFIG_BSD)
    mem = valloc(size);
#else
    mem = memalign(alignment, size);
#endif
    trace_qemu_memalign(alignment, size, mem);
    return mem;
}
198 
/*
 * Aligned allocation that never returns NULL: the result of
 * qemu_try_memalign() is run through qemu_oom_check(), which aborts on
 * failure.
 */
void *qemu_memalign(size_t alignment, size_t size)
{
    void *mem = qemu_try_memalign(alignment, size);

    return qemu_oom_check(mem);
}
203 
204 /* alloc shared memory pages */
205 void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared)
206 {
207     size_t align = QEMU_VMALLOC_ALIGN;
208     void *ptr = qemu_ram_mmap(-1, size, align, shared, false);
209 
210     if (ptr == MAP_FAILED) {
211         return NULL;
212     }
213 
214     if (alignment) {
215         *alignment = align;
216     }
217 
218     trace_qemu_anon_ram_alloc(size, ptr);
219     return ptr;
220 }
221 
/*
 * Release memory with free() after emitting a trace event for @ptr.
 */
void qemu_vfree(void *ptr)
{
    trace_qemu_vfree(ptr);

    free(ptr);
}
227 
/*
 * Unmap the @size bytes at @ptr with qemu_ram_munmap(), after emitting
 * a trace event.
 */
void qemu_anon_ram_free(void *ptr, size_t size)
{
    trace_qemu_anon_ram_free(ptr, size);

    qemu_ram_munmap(-1, ptr, size);
}
233 
/*
 * Clear O_NONBLOCK on @fd, leaving the other file status flags as they
 * are.  Both fcntl() calls are asserted to succeed.
 */
void qemu_set_block(int fd)
{
    int flags = fcntl(fd, F_GETFL);
    int rc;

    assert(flags != -1);

    rc = fcntl(fd, F_SETFL, flags & ~O_NONBLOCK);
    assert(rc != -1);
}
242 
/*
 * Set O_NONBLOCK on @fd, leaving the other file status flags as they
 * are.  Failure is asserted against, except for the OpenBSD case
 * described below.
 */
void qemu_set_nonblock(int fd)
{
    int flags = fcntl(fd, F_GETFL);
    int rc;

    assert(flags != -1);

    rc = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
#ifdef __OpenBSD__
    if (rc == -1) {
        /*
         * Before OpenBSD 6.3, fcntl(F_SETFL) is refused on memory
         * devices with ENODEV.  Not being able to set O_NONBLOCK on
         * something like /dev/null is harmless because such devices
         * never block, so only that errno is tolerated.
         */
        assert(errno == ENODEV);
    }
#else
    assert(rc != -1);
#endif
}
263 
/*
 * Enable SO_REUSEADDR on socket @fd.
 *
 * The setsockopt() call is asserted to succeed, so the returned value
 * is 0 whenever assertions are enabled.
 */
int socket_set_fast_reuse(int fd)
{
    int enable = 1;
    int rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
                        (const char *)&enable, sizeof(enable));

    assert(rc == 0);
    return rc;
}
275 
/*
 * Add FD_CLOEXEC to the descriptor flags of @fd.  Both fcntl() calls
 * are asserted to succeed.
 */
void qemu_set_cloexec(int fd)
{
    int fdflags = fcntl(fd, F_GETFD);
    int rc;

    assert(fdflags != -1);

    rc = fcntl(fd, F_SETFD, fdflags | FD_CLOEXEC);
    assert(rc != -1);
}
284 
/*
 * Create a pipe whose two descriptors both have FD_CLOEXEC set.
 *
 * When pipe2() is available it is tried first; plain pipe() followed by
 * qemu_set_cloexec() is used only if pipe2() is not compiled in or
 * fails with ENOSYS.  Returns what pipe2()/pipe() returned.
 */
int qemu_pipe(int pipefd[2])
{
    int rc;

#ifdef CONFIG_PIPE2
    rc = pipe2(pipefd, O_CLOEXEC);
    if (!(rc == -1 && errno == ENOSYS)) {
        /* success, or a failure other than "pipe2 not implemented" */
        return rc;
    }
#endif
    rc = pipe(pipefd);
    if (rc != 0) {
        return rc;
    }

    qemu_set_cloexec(pipefd[0]);
    qemu_set_cloexec(pipefd[1]);
    return rc;
}
306 
307 char *
308 qemu_get_local_state_pathname(const char *relative_pathname)
309 {
310     return g_strdup_printf("%s/%s", CONFIG_QEMU_LOCALSTATEDIR,
311                            relative_pathname);
312 }
313 
/*
 * Switch terminal echo and canonical input processing on or off.
 *
 * @fd:   descriptor of the terminal to change
 * @echo: true sets ECHO, ECHONL, ICANON and IEXTEN in c_lflag;
 *        false clears them
 *
 * This is best effort: if the current attributes cannot be read (for
 * example because @fd is not a terminal) nothing is changed, and a
 * failure to apply the new attributes is not reported.
 */
void qemu_set_tty_echo(int fd, bool echo)
{
    const tcflag_t echo_bits = ECHO | ECHONL | ICANON | IEXTEN;
    struct termios tty;

    if (tcgetattr(fd, &tty) < 0) {
        /*
         * 'tty' was not filled in; modifying it and handing it to
         * tcsetattr() would use indeterminate values.
         */
        return;
    }

    if (echo) {
        tty.c_lflag |= echo_bits;
    } else {
        tty.c_lflag &= ~echo_bits;
    }

    tcsetattr(fd, TCSANOW, &tty);
}
328 
/*
 * Directory of the running executable; filled in by qemu_init_exec_dir()
 * and empty ("") until then or if no path could be determined.
 */
static char exec_dir[PATH_MAX];
330 
/*
 * Determine the directory containing the running executable and store
 * it in exec_dir.
 *
 * @argv0: program path as given on the command line; only used as a
 *         fallback when the platform-specific lookup yields nothing.
 *         May be NULL, in which case the fallback is skipped.
 *
 * Asserts that exec_dir has not been filled in yet.  If no path can be
 * determined, exec_dir is left empty.
 */
void qemu_init_exec_dir(const char *argv0)
{
    char *dir;
    char *p = NULL;
    char buf[PATH_MAX];

    assert(!exec_dir[0]);

#if defined(__linux__)
    {
        int len;
        /* readlink() does not NUL-terminate: reserve one byte for that */
        len = readlink("/proc/self/exe", buf, sizeof(buf) - 1);
        if (len > 0) {
            buf[len] = 0;
            p = buf;
        }
    }
#elif defined(__FreeBSD__) \
      || (defined(__NetBSD__) && defined(KERN_PROC_PATHNAME))
    {
        /* The sysctl MIB naming the executable path differs per BSD */
#if defined(__FreeBSD__)
        static int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1};
#else
        static int mib[4] = {CTL_KERN, KERN_PROC_ARGS, -1, KERN_PROC_PATHNAME};
#endif
        size_t len = sizeof(buf) - 1;

        *buf = '\0';
        if (!sysctl(mib, ARRAY_SIZE(mib), buf, &len, NULL, 0) &&
            *buf) {
            /* force termination regardless of what sysctl() wrote */
            buf[sizeof(buf) - 1] = '\0';
            p = buf;
        }
    }
#endif
    /* If we don't have any way of figuring out the actual executable
       location then try argv[0].  */
    if (!p) {
        if (!argv0) {
            return;
        }
        p = realpath(argv0, buf);
        if (!p) {
            return;
        }
    }
    dir = g_path_get_dirname(p);

    /* bounded copy into the fixed-size exec_dir buffer */
    pstrcpy(exec_dir, sizeof(exec_dir), dir);

    g_free(dir);
}
383 
384 char *qemu_get_exec_dir(void)
385 {
386     return g_strdup(exec_dir);
387 }
388 
389 static void sigbus_handler(int signal)
390 {
391     int i;
392     if (memset_thread) {
393         for (i = 0; i < memset_num_threads; i++) {
394             if (qemu_thread_is_self(&memset_thread[i].pgthread)) {
395                 siglongjmp(memset_thread[i].env, 1);
396             }
397         }
398     }
399 }
400 
/*
 * Thread entry point: touch one byte in each page of the range described
 * by @arg (a MemsetThread).
 *
 * SIGBUS is unblocked for the duration of the work.  If sigbus_handler()
 * jumps back here, the global memset_thread_failed flag is set and the
 * remaining pages are skipped.  The previous signal mask is restored
 * before returning.  Always returns NULL.
 */
static void *do_touch_pages(void *arg)
{
    MemsetThread *memset_args = (MemsetThread *)arg;
    sigset_t set, oldset;

    /* unblock SIGBUS */
    sigemptyset(&set);
    sigaddset(&set, SIGBUS);
    pthread_sigmask(SIG_UNBLOCK, &set, &oldset);

    /* non-zero means we arrived here via siglongjmp() from the handler */
    if (sigsetjmp(memset_args->env, 1)) {
        memset_thread_failed = true;
    } else {
        char *addr = memset_args->addr;
        size_t numpages = memset_args->numpages;
        size_t hpagesize = memset_args->hpagesize;
        size_t i;
        for (i = 0; i < numpages; i++) {
            /*
             * Read & write back the same value, so we don't
             * corrupt existing user/app data that might be
             * stored.
             *
             * 'volatile' to stop compiler optimizing this away
             * to a no-op
             *
             * TODO: get a better solution from kernel so we
             * don't need to write at all so we don't cause
             * wear on the storage backing the region...
             */
            *(volatile char *)addr = *addr;
            addr += hpagesize;
        }
    }
    pthread_sigmask(SIG_SETMASK, &oldset, NULL);
    return NULL;
}
438 
439 static inline int get_memset_num_threads(int smp_cpus)
440 {
441     long host_procs = sysconf(_SC_NPROCESSORS_ONLN);
442     int ret = 1;
443 
444     if (host_procs > 0) {
445         ret = MIN(MIN(host_procs, MAX_MEM_PREALLOC_THREAD_COUNT), smp_cpus);
446     }
447     /* In case sysconf() fails, we fall back to single threaded */
448     return ret;
449 }
450 
451 static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
452                             int smp_cpus)
453 {
454     size_t numpages_per_thread;
455     size_t size_per_thread;
456     char *addr = area;
457     int i = 0;
458 
459     memset_thread_failed = false;
460     memset_num_threads = get_memset_num_threads(smp_cpus);
461     memset_thread = g_new0(MemsetThread, memset_num_threads);
462     numpages_per_thread = (numpages / memset_num_threads);
463     size_per_thread = (hpagesize * numpages_per_thread);
464     for (i = 0; i < memset_num_threads; i++) {
465         memset_thread[i].addr = addr;
466         memset_thread[i].numpages = (i == (memset_num_threads - 1)) ?
467                                     numpages : numpages_per_thread;
468         memset_thread[i].hpagesize = hpagesize;
469         qemu_thread_create(&memset_thread[i].pgthread, "touch_pages",
470                            do_touch_pages, &memset_thread[i],
471                            QEMU_THREAD_JOINABLE);
472         addr += size_per_thread;
473         numpages -= numpages_per_thread;
474     }
475     for (i = 0; i < memset_num_threads; i++) {
476         qemu_thread_join(&memset_thread[i].pgthread);
477     }
478     g_free(memset_thread);
479     memset_thread = NULL;
480 
481     return memset_thread_failed;
482 }
483 
484 void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
485                      Error **errp)
486 {
487     int ret;
488     struct sigaction act, oldact;
489     size_t hpagesize = qemu_fd_getpagesize(fd);
490     size_t numpages = DIV_ROUND_UP(memory, hpagesize);
491 
492     memset(&act, 0, sizeof(act));
493     act.sa_handler = &sigbus_handler;
494     act.sa_flags = 0;
495 
496     ret = sigaction(SIGBUS, &act, &oldact);
497     if (ret) {
498         error_setg_errno(errp, errno,
499             "os_mem_prealloc: failed to install signal handler");
500         return;
501     }
502 
503     /* touch pages simultaneously */
504     if (touch_all_pages(area, hpagesize, numpages, smp_cpus)) {
505         error_setg(errp, "os_mem_prealloc: Insufficient free host memory "
506             "pages available to allocate guest RAM");
507     }
508 
509     ret = sigaction(SIGBUS, &oldact, NULL);
510     if (ret) {
511         /* Terminate QEMU since it can't recover from error */
512         perror("os_mem_prealloc: failed to reinstall signal handler");
513         exit(1);
514     }
515 }
516 
/*
 * Look up a name for process @pid.
 *
 * On FreeBSD the ki_comm field returned by kinfo_getproc() is copied.
 * Elsewhere the contents of /proc/<pid>/cmdline are read.
 *
 * Returns a newly allocated string, or NULL if the lookup failed.
 */
char *qemu_get_pid_name(pid_t pid)
{
    char *name = NULL;

#if defined(__FreeBSD__)
    /* BSDs don't have /proc, but they provide a nice substitute */
    struct kinfo_proc *info = kinfo_getproc(pid);

    if (info != NULL) {
        name = g_strdup(info->ki_comm);
        free(info);
    }
#else
    /* Assume a system with reasonable procfs */
    size_t len;
    char *cmdline_path = g_strdup_printf("/proc/%d/cmdline", pid);

    g_file_get_contents(cmdline_path, &name, &len, NULL);
    g_free(cmdline_path);
#endif

    return name;
}
541 
542 
543 pid_t qemu_fork(Error **errp)
544 {
545     sigset_t oldmask, newmask;
546     struct sigaction sig_action;
547     int saved_errno;
548     pid_t pid;
549 
550     /*
551      * Need to block signals now, so that child process can safely
552      * kill off caller's signal handlers without a race.
553      */
554     sigfillset(&newmask);
555     if (pthread_sigmask(SIG_SETMASK, &newmask, &oldmask) != 0) {
556         error_setg_errno(errp, errno,
557                          "cannot block signals");
558         return -1;
559     }
560 
561     pid = fork();
562     saved_errno = errno;
563 
564     if (pid < 0) {
565         /* attempt to restore signal mask, but ignore failure, to
566          * avoid obscuring the fork failure */
567         (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL);
568         error_setg_errno(errp, saved_errno,
569                          "cannot fork child process");
570         errno = saved_errno;
571         return -1;
572     } else if (pid) {
573         /* parent process */
574 
575         /* Restore our original signal mask now that the child is
576          * safely running. Only documented failures are EFAULT (not
577          * possible, since we are using just-grabbed mask) or EINVAL
578          * (not possible, since we are using correct arguments).  */
579         (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL);
580     } else {
581         /* child process */
582         size_t i;
583 
584         /* Clear out all signal handlers from parent so nothing
585          * unexpected can happen in our child once we unblock
586          * signals */
587         sig_action.sa_handler = SIG_DFL;
588         sig_action.sa_flags = 0;
589         sigemptyset(&sig_action.sa_mask);
590 
591         for (i = 1; i < NSIG; i++) {
592             /* Only possible errors are EFAULT or EINVAL The former
593              * won't happen, the latter we expect, so no need to check
594              * return value */
595             (void)sigaction(i, &sig_action, NULL);
596         }
597 
598         /* Unmask all signals in child, since we've no idea what the
599          * caller's done with their signal mask and don't want to
600          * propagate that to children */
601         sigemptyset(&newmask);
602         if (pthread_sigmask(SIG_SETMASK, &newmask, NULL) != 0) {
603             Error *local_err = NULL;
604             error_setg_errno(&local_err, errno,
605                              "cannot unblock signals");
606             error_report_err(local_err);
607             _exit(1);
608         }
609     }
610     return pid;
611 }
612 
613 void *qemu_alloc_stack(size_t *sz)
614 {
615     void *ptr, *guardpage;
616     int flags;
617 #ifdef CONFIG_DEBUG_STACK_USAGE
618     void *ptr2;
619 #endif
620     size_t pagesz = qemu_real_host_page_size;
621 #ifdef _SC_THREAD_STACK_MIN
622     /* avoid stacks smaller than _SC_THREAD_STACK_MIN */
623     long min_stack_sz = sysconf(_SC_THREAD_STACK_MIN);
624     *sz = MAX(MAX(min_stack_sz, 0), *sz);
625 #endif
626     /* adjust stack size to a multiple of the page size */
627     *sz = ROUND_UP(*sz, pagesz);
628     /* allocate one extra page for the guard page */
629     *sz += pagesz;
630 
631     flags = MAP_PRIVATE | MAP_ANONYMOUS;
632 #if defined(MAP_STACK) && defined(__OpenBSD__)
633     /* Only enable MAP_STACK on OpenBSD. Other OS's such as
634      * Linux/FreeBSD/NetBSD have a flag with the same name
635      * but have differing functionality. OpenBSD will SEGV
636      * if it spots execution with a stack pointer pointing
637      * at memory that was not allocated with MAP_STACK.
638      */
639     flags |= MAP_STACK;
640 #endif
641 
642     ptr = mmap(NULL, *sz, PROT_READ | PROT_WRITE, flags, -1, 0);
643     if (ptr == MAP_FAILED) {
644         perror("failed to allocate memory for stack");
645         abort();
646     }
647 
648 #if defined(HOST_IA64)
649     /* separate register stack */
650     guardpage = ptr + (((*sz - pagesz) / 2) & ~pagesz);
651 #elif defined(HOST_HPPA)
652     /* stack grows up */
653     guardpage = ptr + *sz - pagesz;
654 #else
655     /* stack grows down */
656     guardpage = ptr;
657 #endif
658     if (mprotect(guardpage, pagesz, PROT_NONE) != 0) {
659         perror("failed to set up stack guard page");
660         abort();
661     }
662 
663 #ifdef CONFIG_DEBUG_STACK_USAGE
664     for (ptr2 = ptr + pagesz; ptr2 < ptr + *sz; ptr2 += sizeof(uint32_t)) {
665         *(uint32_t *)ptr2 = 0xdeadbeaf;
666     }
667 #endif
668 
669     return ptr;
670 }
671 
#ifdef CONFIG_DEBUG_STACK_USAGE
/*
 * Per-thread high-water mark of stack usage, in bytes, as measured by
 * qemu_free_stack().
 */
static __thread unsigned int max_stack_usage;
#endif
675 
/*
 * Unmap a stack mapping of @sz bytes starting at @stack.
 *
 * With CONFIG_DEBUG_STACK_USAGE, the stack is first scanned upwards from
 * the end of its first page for the 0xdeadbeaf fill pattern; everything
 * from the first overwritten word to the end of the mapping counts as
 * used, and a new per-thread maximum is reported.
 */
void qemu_free_stack(void *stack, size_t sz)
{
#ifdef CONFIG_DEBUG_STACK_USAGE
    void *probe = stack + qemu_real_host_page_size;
    unsigned int usage;

    /* skip the words that still hold the fill pattern */
    while (probe < stack + sz && *(uint32_t *)probe == 0xdeadbeaf) {
        probe += sizeof(uint32_t);
    }

    usage = sz - (uintptr_t) (probe - stack);
    if (usage > max_stack_usage) {
        error_report("thread %d max stack usage increased from %u to %u",
                     qemu_get_thread_id(), max_stack_usage, usage);
        max_stack_usage = usage;
    }
#endif

    munmap(stack, sz);
}
698 
699 void sigaction_invoke(struct sigaction *action,
700                       struct qemu_signalfd_siginfo *info)
701 {
702     siginfo_t si = {};
703     si.si_signo = info->ssi_signo;
704     si.si_errno = info->ssi_errno;
705     si.si_code = info->ssi_code;
706 
707     /* Convert the minimal set of fields defined by POSIX.
708      * Positive si_code values are reserved for kernel-generated
709      * signals, where the valid siginfo fields are determined by
710      * the signal number.  But according to POSIX, it is unspecified
711      * whether SI_USER and SI_QUEUE have values less than or equal to
712      * zero.
713      */
714     if (info->ssi_code == SI_USER || info->ssi_code == SI_QUEUE ||
715         info->ssi_code <= 0) {
716         /* SIGTERM, etc.  */
717         si.si_pid = info->ssi_pid;
718         si.si_uid = info->ssi_uid;
719     } else if (info->ssi_signo == SIGILL || info->ssi_signo == SIGFPE ||
720                info->ssi_signo == SIGSEGV || info->ssi_signo == SIGBUS) {
721         si.si_addr = (void *)(uintptr_t)info->ssi_addr;
722     } else if (info->ssi_signo == SIGCHLD) {
723         si.si_pid = info->ssi_pid;
724         si.si_status = info->ssi_status;
725         si.si_uid = info->ssi_uid;
726     }
727     action->sa_sigaction(info->ssi_signo, &si, NULL);
728 }
729