1 /*
2  * %CopyrightBegin%
3  *
4  * Copyright Ericsson AB 2002-2018. All Rights Reserved.
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  *     http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  *
18  * %CopyrightEnd%
19  */
20 
21 /*
 * This program is started at erts startup and all forks that
 * have to be done are done here. This is done for a couple
24  * of reasons:
25  *  - Allow usage of fork without a memory explosion.
26  *  -- we do not want to use vfork, as it blocks the VM
27  *     until the execv is done, and if the program that
28  *     is to be executed is on an NFS that is unavailable,
29  *     the execv can block for a very long time.
30  *  -- we cannot do fork inside the VM as that would temporarily
31  *     duplicate the memory usage of the VM per parallel exec.
32  *
33  * Some implementation notes:
34  *  - A single Unix Domain Socket is setup in between the VM and
35  *    this program. Over that UDS the file descriptors that should
36  *    be used to talk to the child program are sent.
37  *    The actual command to execute, together with options and the
38  *    environment, is sent over the pipe represented by the
 *    file descriptors mentioned above. We don't send the
 *    command over the UDS as that would increase the likelihood
 *    that its buffer would be full.
42  *
43  *  - Since it is this program that execv's, it has to take care of
44  *    all the SIGCHLD signals that the child programs generate. The
45  *    signals are received and the pid+exit reason is sent as data
46  *    on the UDS to the VM. The VM is then able to map the pid to the
47  *    port of the child program that just exited and deliver the status
48  *    code if requested.
49  */
50 
51 #ifdef HAVE_CONFIG_H
52 #  include "config.h"
53 #endif
54 
55 #include <stdlib.h>
56 #include <stdio.h>
57 #include <stdarg.h>
58 #include <sys/wait.h>
59 #include <sys/types.h>
60 #include <sys/socket.h>
61 
62 #define WANT_NONBLOCKING
63 
64 #include "erl_driver.h"
65 #include "sys_uds.h"
66 #include "erl_term.h"
67 #include "erl_child_setup.h"
68 
69 #undef ERTS_GLB_INLINE_INCL_FUNC_DEF
70 #define ERTS_GLB_INLINE_INCL_FUNC_DEF 1
71 #include "hash.h"
72 
73 #define SET_CLOEXEC(fd) fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC)
74 
75 #if defined(__ANDROID__)
76 #define SHELL "/system/bin/sh"
77 #else
78 #define SHELL "/bin/sh"
79 #endif /* __ANDROID__ */
80 
81 #if !defined(MSG_DONTWAIT) && defined(MSG_NONBLOCK)
82 #define MSG_DONTWAIT MSG_NONBLOCK
83 #endif
84 
85 //#define HARD_DEBUG
86 #ifdef HARD_DEBUG
87 #define DEBUG_PRINT(fmt, ...) fprintf(stderr, "%d:" fmt "\r\n", getpid(), ##__VA_ARGS__)
88 #else
89 #define DEBUG_PRINT(fmt, ...)
90 #endif
91 
92 #ifdef __clang_analyzer__
93    /* CodeChecker does not seem to understand inline asm in FD_ZERO */
94 #  undef FD_ZERO
95 #  define FD_ZERO(FD_SET_PTR) memset(FD_SET_PTR, 0, sizeof(fd_set))
96 #endif
97 
static char abort_reason[200]; /* for core dump inspection */

/*
 * Format a fatal error message, print it on stderr prefixed with
 * "erl_child_setup: " and terminate the process with abort().
 *
 * The formatted text is kept in the static abort_reason buffer so that
 * it can be found when inspecting a core dump. Messages that do not fit
 * in the buffer are truncated rather than written past its end.
 *
 * fmt  printf-style format string, followed by its arguments.
 *
 * Does not return.
 */
static void ABORT(const char* fmt, ...)
{
    va_list arglist;

    va_start(arglist, fmt);
    /* Bounded formatting: vsnprintf always NUL-terminates within the
       given size, so an oversized message cannot overflow the buffer. */
    vsnprintf(abort_reason, sizeof(abort_reason), fmt, arglist);
    va_end(arglist);

    fprintf(stderr, "erl_child_setup: %s\r\n", abort_reason);
    abort();
}
109 
110 #ifdef DEBUG
/*
 * Report a failed assertion and terminate the process with abort().
 *
 * expr  text of the expression that failed
 * func  name of the function containing the assertion
 * file  source file of the assertion
 * line  source line of the assertion
 *
 * Does not return.
 */
void erl_assert_error(const char* expr, const char* func, const char* file, int line)
{
    FILE *report = stderr;

    /* Push out anything pending on stdout before the report is written. */
    fflush(stdout);

    fprintf(report, "%s:%d:%s() Assertion failed: %s\n", file, line, func, expr);
    fflush(report);

    abort();
}
120 #endif
121 
sys_sigblock(int sig)122 void sys_sigblock(int sig)
123 {
124     sigset_t mask;
125 
126     sigemptyset(&mask);
127     sigaddset(&mask, sig);
128     sigprocmask(SIG_BLOCK, &mask, (sigset_t *)NULL);
129 }
130 
sys_sigrelease(int sig)131 void sys_sigrelease(int sig)
132 {
133     sigset_t mask;
134     sigemptyset(&mask);
135     sigaddset(&mask, sig);
136     sigprocmask(SIG_UNBLOCK, &mask, (sigset_t *)NULL);
137 }
138 
139 static void add_os_pid_to_port_id_mapping(Eterm, pid_t);
140 static Eterm get_port_id(pid_t);
141 static int forker_hash_init(void);
142 
143 static int max_files = -1;
144 static int sigchld_pipe[2];
145 
146 static int
start_new_child(int pipes[])147 start_new_child(int pipes[])
148 {
149     struct sigaction sa;
150     int errln = -1;
151     int size, res, i, pos = 0;
152     char *buff, *o_buff;
153 
154     char *cmd, *cwd, *wd, **new_environ, **args = NULL;
155 
156     Sint32 cnt, flags;
157 
158     /* only child executes here */
159 
160     /* Restore default handling of sigterm... */
161     sa.sa_handler = SIG_DFL;
162     sigemptyset(&sa.sa_mask);
163     sa.sa_flags = 0;
164 
165     if (sigaction(SIGTERM, &sa, 0) == -1) {
166         perror(NULL);
167         exit(1);
168     }
169 
170     do {
171         res = read(pipes[0], (char*)&size, sizeof(size));
172     } while(res < 0 && (errno == EINTR || errno == ERRNO_BLOCK));
173 
174     if (res <= 0) {
175         errln = __LINE__;
176         goto child_error;
177     }
178 
179     buff = malloc(size);
180 
181     DEBUG_PRINT("size = %d", size);
182 
183     do {
184         if ((res = read(pipes[0], buff + pos, size - pos)) < 0) {
185             if (errno == ERRNO_BLOCK || errno == EINTR)
186                 continue;
187             errln = __LINE__;
188             goto child_error;
189         }
190         if (res == 0) {
191             errno = EPIPE;
192             errln = __LINE__;
193             goto child_error;
194         }
195         pos += res;
196     } while(size - pos != 0);
197 
198     o_buff = buff;
199 
200     flags = get_int32(buff);
201     buff += sizeof(flags);
202 
203     DEBUG_PRINT("flags = %d", flags);
204 
205     cmd = buff;
206     buff += strlen(buff) + 1;
207 
208     cwd = buff;
209     buff += strlen(buff) + 1;
210 
211     if (*buff == '\0') {
212         wd = NULL;
213     } else {
214         wd = buff;
215         buff += strlen(buff) + 1;
216     }
217     buff++;
218 
219     DEBUG_PRINT("wd = %s", wd);
220 
221     cnt = get_int32(buff);
222     buff += sizeof(cnt);
223     new_environ = malloc(sizeof(char*)*(cnt + 1));
224 
225     DEBUG_PRINT("env_len = %d", cnt);
226     for (i = 0; i < cnt; i++, buff++) {
227         new_environ[i] = buff;
228         while(*buff != '\0') buff++;
229     }
230     new_environ[cnt] = NULL;
231 
232     if (o_buff + size != buff) {
233         /* This is a spawn executable call */
234         cnt = get_int32(buff);
235         buff += sizeof(cnt);
236         args = malloc(sizeof(char*)*(cnt + 1));
237         for (i = 0; i < cnt; i++, buff++) {
238             args[i] = buff;
239             while(*buff != '\0') buff++;
240         }
241         args[cnt] = NULL;
242     }
243 
244     if (o_buff + size != buff) {
245         errno = EINVAL;
246         errln = __LINE__;
247         fprintf(stderr,"erl_child_setup: failed with protocol "
248                 "error %d on line %d", errno, errln);
249         /* we abort here as it is most likely a symptom of an
250            emulator/erl_child_setup bug */
251         abort();
252     }
253 
254     DEBUG_PRINT("read ack");
255     do {
256         ErtsSysForkerProto proto;
257         res = read(pipes[0], &proto, sizeof(proto));
258         if (res > 0) {
259             ASSERT(proto.action == ErtsSysForkerProtoAction_Ack);
260             ASSERT(res == sizeof(proto));
261         }
262     } while(res < 0 && (errno == EINTR || errno == ERRNO_BLOCK));
263 
264     if (res < 1) {
265         errno = EPIPE;
266         errln = __LINE__;
267         goto child_error;
268     }
269 
270     DEBUG_PRINT("Set cwd to: '%s'",cwd);
271 
272     if (chdir(cwd) < 0) {
273         /* This is not good, it probably means that the cwd of
274            beam is invalid. We ignore it and try anyways as
275            the child might now need a cwd or the chdir below
276            could take us to a valid directory.
277         */
278     }
279 
280     DEBUG_PRINT("Set wd to: '%s'",wd);
281 
282     if (wd && chdir(wd) < 0) {
283         int err = errno;
284         fprintf(stderr,"spawn: Could not cd to %s\r\n", wd);
285         _exit(err);
286     }
287 
288     DEBUG_PRINT("Do that forking business: '%s'",cmd);
289 
290     /* When the dup2'ing below is done, only
291        fd's 0, 1, 2 and maybe 3, 4 should survive the
292        exec. All other fds (i.e. the unix domain sockets
293        and stray pipe ends) should have CLOEXEC set on them
294        so they will be closed when the exec happens */
295     if (flags & FORKER_FLAG_USE_STDIO) {
296         /* stdin for process */
297         if (flags & FORKER_FLAG_DO_WRITE &&
298             dup2(pipes[0], 0) < 0) {
299             errln = __LINE__;
300             goto child_error;
301         }
302         /* stdout for process */
303         if (flags & FORKER_FLAG_DO_READ &&
304             dup2(pipes[1], 1) < 0) {
305             errln = __LINE__;
306             goto child_error;
307         }
308     }
309     else {	/* XXX will fail if pipes[0] == 4 (unlikely..) */
310         if (flags & FORKER_FLAG_DO_READ && dup2(pipes[1], 4) < 0) {
311             errln = __LINE__;
312             goto child_error;
313         }
314         if (flags & FORKER_FLAG_DO_WRITE && dup2(pipes[0], 3) < 0) {
315             errln = __LINE__;
316             goto child_error;
317         }
318     }
319 
320     /* we do the dup2 of stderr last so that errors
321        in child_error will be printed to stderr */
322     if (dup2(pipes[2], 2) < 0) {
323         errln = __LINE__;
324         goto child_error;
325     }
326 
327 #if defined(USE_SETPGRP_NOARGS)		/* SysV */
328     (void) setpgrp();
329 #elif defined(USE_SETPGRP)		/* BSD */
330     (void) setpgrp(0, getpid());
331 #else					/* POSIX */
332     (void) setsid();
333 #endif
334 
335     close(pipes[0]);
336     close(pipes[1]);
337     close(pipes[2]);
338 
339     sys_sigrelease(SIGCHLD);
340 
341     if (args) {
342         /* spawn_executable */
343         execve(cmd, args, new_environ);
344     } else {
345         execle(SHELL, "sh", "-c", cmd, (char *) NULL, new_environ);
346     }
347 
348     DEBUG_PRINT("exec error: %d",errno);
349     _exit(errno);
350 
351 child_error:
352     fprintf(stderr,"erl_child_setup: failed with error %d on line %d\r\n",
353             errno, errln);
354     _exit(errno);
355 }
356 
357 
358 /*
359  * [OTP-3906]
 * Solaris signal management gets confused when threads are used and a
 * lot of child processes die. As a result of the confusion, SIGCHLD
 * signals aren't delivered to the emulator, which in turn results in
 * a lot of defunct processes in the system.
364  *
365  * The problem seems to appear when a signal is frequently
366  * blocked/unblocked at the same time as the signal is frequently
367  * propagated. The child waiter thread is a workaround for this problem.
368  * The SIGCHLD signal is always blocked (in all threads), and the child
369  * waiter thread fetches the signal by a call to sigwait(). See
370  * child_waiter().
371  *
372  * This should be a non-issue since the fork:ing was moved outside of
373  * the emulator into erl_child_setup. I'm leaving the comment here
374  * for posterity. */
375 
handle_sigchld(int sig)376 static void handle_sigchld(int sig) {
377     int buff[2], res, __preverrno = errno;
378 
379     sys_sigblock(SIGCHLD);
380 
381     while ((buff[0] = waitpid((pid_t)(-1), buff+1, WNOHANG)) > 0) {
382         do {
383             res = write(sigchld_pipe[1], buff, sizeof(buff));
384         } while (res < 0 && errno == EINTR);
385         if (res <= 0)
386             ABORT("Failed to write to sigchld_pipe (%d): %d (%d)", sigchld_pipe[1], res, errno);
387         DEBUG_PRINT("Reap child %d (%d)", buff[0], buff[1]);
388     }
389 
390     sys_sigrelease(SIGCHLD);
391 
392     /* We save and restore the original errno as otherwise
393        the thread we are running in may end up with an
394        unexpected errno. An example of when this happened
395        was when the select in main had gotten an EINTR but
396        before the errno was checked the signal handler
397        was called and set errno to ECHILD from waitpid
398        which caused erl_child_setup to abort as it does
399        not expect ECHILD to be set after select */
400     errno = __preverrno;
401 }
402 
403 #if defined(__ANDROID__)
/*
 * Return the integer found at the start of the ANDROID_PROPERTY_WORKSPACE
 * environment variable, or -1 when the variable is not set. The result
 * is remembered in a static variable and reused by later calls.
 */
static int system_properties_fd(void)
{
    static int cached_fd = -2; /* -2 means "not looked up yet" */

    if (cached_fd == -2) {
        const char *workspace = getenv("ANDROID_PROPERTY_WORKSPACE");

        cached_fd = workspace ? atoi(workspace) : -1;
    }
    return cached_fd;
}
418 #endif /* __ANDROID__ */
419 
420 int
main(int argc,char * argv[])421 main(int argc, char *argv[])
422 {
423     /* This fd should be open from beam */
424     int uds_fd = 3, max_fd = 3;
425 #ifndef HAVE_CLOSEFROM
426     int i;
427     DIR *dir;
428 #endif
429     struct sigaction sa;
430 
431     if (argc < 2 || sscanf(argv[1],"%d",&max_files) != 1) {
432         ABORT("Invalid arguments to child_setup");
433     }
434 
435 /* We close all fds except the uds from beam.
436    All other fds from now on will have the
437    CLOEXEC flags set on them. This means that we
438    only have to close a very limited number of fds
439    after we fork before the exec. */
440 #if defined(HAVE_CLOSEFROM)
441     closefrom(4);
442 #else
443     dir = opendir("/dev/fd");
444     if (dir == NULL) { /* /dev/fd not available */
445         for (i = 4; i < max_files; i++)
446 #if defined(__ANDROID__)
447             if (i != system_properties_fd())
448 #endif
449             (void) close(i);
450     } else {
451         /* Iterate over fds obtained from /dev/fd */
452         struct dirent *entry;
453         int dir_fd = dirfd(dir);
454 
455         while ((entry = readdir(dir)) != NULL) {
456             i = atoi(entry->d_name);
457 #if defined(__ANDROID__)
458             if (i != system_properties_fd())
459 #endif
460             if (i >= 4 && i != dir_fd)
461                 (void) close(i);
462         }
463 
464         closedir(dir);
465     }
466 #endif
467 
468     if (pipe(sigchld_pipe) < 0) {
469         ABORT("Failed to setup sigchld pipe (%d)", errno);
470     }
471 
472     SET_CLOEXEC(sigchld_pipe[0]);
473     SET_CLOEXEC(sigchld_pipe[1]);
474 
475     max_fd = max_fd < sigchld_pipe[0] ? sigchld_pipe[0] : max_fd;
476 
477     sa.sa_handler = &handle_sigchld;
478     sigemptyset(&sa.sa_mask);
479     sa.sa_flags = SA_RESTART | SA_NOCLDSTOP;
480     if (sigaction(SIGCHLD, &sa, 0) == -1) {
481         perror(NULL);
482         exit(1);
483     }
484 
485     /* Ignore SIGTERM.
486        Some container environments send SIGTERM to all processes
487        when terminating. We don't want erl_child_setup to terminate
488        in these cases as that will prevent beam from properly
489        cleaning up.
490     */
491     sa.sa_handler = SIG_IGN;
492     sigemptyset(&sa.sa_mask);
493     sa.sa_flags = 0;
494 
495     if (sigaction(SIGTERM, &sa, 0) == -1) {
496         perror(NULL);
497         exit(1);
498     }
499 
500     forker_hash_init();
501 
502     SET_CLOEXEC(uds_fd);
503 
504     DEBUG_PRINT("Starting forker %d", max_files);
505 
506     while (1) {
507         fd_set read_fds;
508         int res;
509         FD_ZERO(&read_fds);
510         FD_SET(uds_fd, &read_fds);
511         FD_SET(sigchld_pipe[0], &read_fds);
512         DEBUG_PRINT("child_setup selecting on %d, %d (%d)",
513                 uds_fd, sigchld_pipe[0], max_fd);
514         res = select(max_fd+1, &read_fds, NULL, NULL, NULL);
515 
516         if (res < 0) {
517             if (errno == EINTR) continue;
518             ABORT("Select failed: %d (%d)",res, errno);
519         }
520 
521         if (FD_ISSET(uds_fd, &read_fds)) {
522             int pipes[3], res, os_pid;
523             ErtsSysForkerProto proto;
524             errno = 0;
525             if ((res = sys_uds_read(uds_fd, (char*)&proto, sizeof(proto),
526                                     pipes, 3, MSG_DONTWAIT)) < 0) {
527                 if (errno == EINTR)
528                     continue;
529                 DEBUG_PRINT("erl_child_setup failed to read from uds: %d, %d", res, errno);
530                 _exit(0);
531             }
532 
533             if (res == 0) {
534                 DEBUG_PRINT("uds was closed!");
535                 _exit(0);
536             }
537             /* Since we use unix domain sockets and send the entire data in
538                one go we *should* get the entire payload at once. */
539             ASSERT(res == sizeof(proto));
540             ASSERT(proto.action == ErtsSysForkerProtoAction_Start);
541 
542             sys_sigblock(SIGCHLD);
543 
544             errno = 0;
545 
546             os_pid = fork();
547             if (os_pid == 0)
548                 start_new_child(pipes);
549 
550             add_os_pid_to_port_id_mapping(proto.u.start.port_id, os_pid);
551 
552             /* We write an ack here, but expect the reply on
553                the pipes[0] inside the fork */
554             proto.action = ErtsSysForkerProtoAction_Go;
555             proto.u.go.os_pid = os_pid;
556             proto.u.go.error_number = errno;
557             while (write(pipes[1], &proto, sizeof(proto)) < 0 && errno == EINTR)
558                 ; /* remove gcc warning */
559 
560 #ifdef FORKER_PROTO_START_ACK
561             proto.action = ErtsSysForkerProtoAction_StartAck;
562             while (write(uds_fd, &proto, sizeof(proto)) < 0 && errno == EINTR)
563                 ; /* remove gcc warning */
564 #endif
565 
566             sys_sigrelease(SIGCHLD);
567             close(pipes[0]);
568             close(pipes[1]);
569             close(pipes[2]);
570         }
571 
572         if (FD_ISSET(sigchld_pipe[0], &read_fds)) {
573             int ibuff[2];
574             ErtsSysForkerProto proto;
575             res = read(sigchld_pipe[0], ibuff, sizeof(ibuff));
576             if (res <= 0) {
577                 if (errno == EINTR)
578                     continue;
579                 ABORT("Failed to read from sigchld pipe: %d (%d)", res, errno);
580             }
581 
582             proto.u.sigchld.port_id = get_port_id((pid_t)(ibuff[0]));
583 
584             if (proto.u.sigchld.port_id == THE_NON_VALUE)
585                 continue; /* exit status report not requested */
586 
587             proto.action = ErtsSysForkerProtoAction_SigChld;
588             proto.u.sigchld.error_number = ibuff[1];
589             DEBUG_PRINT("send sigchld to %d (errno = %d)", uds_fd, ibuff[1]);
590             if (write(uds_fd, &proto, sizeof(proto)) < 0) {
591                 if (errno == EINTR)
592                     continue;
593                 /* The uds was close, which most likely means that the VM
594                    has exited. This will be detected when we try to read
595                    from the uds_fd. */
596                 DEBUG_PRINT("Failed to write to uds: %d (%d)", uds_fd, errno);
597             }
598         }
599     }
600     return 1;
601 }
602 
/*
 * One entry in forker_hash: associates the OS pid of a spawned child
 * with the port id that asked for its exit status.
 */
typedef struct exit_status {
    HashBucket hb;   /* bucket header; presumably required as the first
                        member by the hash implementation -- see hash.h */
    pid_t os_pid;    /* key: the only field used by fhash() and fcmp() */
    Eterm port_id;   /* value handed back by get_port_id() */
} ErtsSysExitStatus;

/* Table of ErtsSysExitStatus entries; created in forker_hash_init(). */
static Hash *forker_hash;
610 
/*
 * Remember that the exit status of child `os_pid` is to be reported to
 * `port_id`. Does nothing when port_id is THE_NON_VALUE, i.e. when no
 * exit status report was requested.
 */
static void add_os_pid_to_port_id_mapping(Eterm port_id, pid_t os_pid)
{
    ErtsSysExitStatus tmpl;

    if (port_id == THE_NON_VALUE)
        return; /* exit status report not requested */

    /* Only a template: the table is presumably populated with a copy
       made through its alloc callback (falloc) -- confirm in hash.c */
    tmpl.os_pid = os_pid;
    tmpl.port_id = port_id;
    hash_put(forker_hash, &tmpl);
}
621 
get_port_id(pid_t os_pid)622 static Eterm get_port_id(pid_t os_pid)
623 {
624     ErtsSysExitStatus est, *es;
625     Eterm port_id;
626     est.os_pid = os_pid;
627     es = hash_remove(forker_hash, &est);
628     if (!es) return THE_NON_VALUE;
629     port_id = es->port_id;
630     free(es);
631     return port_id;
632 }
633 
fcmp(void * a,void * b)634 static int fcmp(void *a, void *b)
635 {
636     ErtsSysExitStatus *sa = a;
637     ErtsSysExitStatus *sb = b;
638     return !(sa->os_pid == sb->os_pid);
639 }
640 
fhash(void * e)641 static HashValue fhash(void *e)
642 {
643     ErtsSysExitStatus *se = e;
644     Uint32 val = se->os_pid;
645     val = (val+0x7ed55d16) + (val<<12);
646     val = (val^0xc761c23c) ^ (val>>19);
647     val = (val+0x165667b1) + (val<<5);
648     val = (val+0xd3a2646c) ^ (val<<9);
649     val = (val+0xfd7046c5) + (val<<3);
650     val = (val^0xb55a4f09) ^ (val>>16);
651     return val;
652 }
653 
falloc(void * e)654 static void *falloc(void *e)
655 {
656     ErtsSysExitStatus *se = e;
657     ErtsSysExitStatus *ne = malloc(sizeof(ErtsSysExitStatus));
658     ne->os_pid = se->os_pid;
659     ne->port_id = se->port_id;
660     return ne;
661 }
662 
/*
 * Allocation callback for the hash table's own bookkeeping memory.
 * `type` is ignored; the request is served by malloc().
 */
static void *meta_alloc(int type, size_t size)
{
    (void) type;
    return malloc(size);
}

/*
 * Counterpart of meta_alloc(). `type` is ignored; `p` is released
 * with free().
 */
static void meta_free(int type, void *p)
{
    (void) type;
    free(p);
}
665 
forker_hash_init(void)666 static int forker_hash_init(void)
667 {
668     HashFunctions forker_hash_functions;
669     forker_hash_functions.hash = fhash;
670     forker_hash_functions.cmp = fcmp;
671     forker_hash_functions.alloc = falloc;
672     forker_hash_functions.free = free;
673     forker_hash_functions.meta_alloc = meta_alloc;
674     forker_hash_functions.meta_free  = meta_free;
675     forker_hash_functions.meta_print = NULL;
676 
677     forker_hash = hash_new(0, "forker_hash",
678                            16, forker_hash_functions);
679 
680     return 1;
681 }
682