1 /*
2 * %CopyrightBegin%
3 *
4 * Copyright Ericsson AB 2002-2018. All Rights Reserved.
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 * %CopyrightEnd%
19 */
20
21 /*
 * This program is started at erts startup and all forks that
23 * have to be done are done in here. This is done for a couple
24 * of reasons:
25 * - Allow usage of fork without a memory explosion.
26 * -- we do not want to use vfork, as it blocks the VM
27 * until the execv is done, and if the program that
28 * is to be executed is on an NFS that is unavailable,
29 * the execv can block for a very long time.
30 * -- we cannot do fork inside the VM as that would temporarily
31 * duplicate the memory usage of the VM per parallel exec.
32 *
33 * Some implementation notes:
 * - A single Unix Domain Socket is set up between the VM and
35 * this program. Over that UDS the file descriptors that should
36 * be used to talk to the child program are sent.
37 * The actual command to execute, together with options and the
38 * environment, is sent over the pipe represented by the
39 * file descriptors mentioned above. We don't send the
 * command over the UDS as that would increase the likelihood
 * that its buffer would be full.
42 *
43 * - Since it is this program that execv's, it has to take care of
44 * all the SIGCHLD signals that the child programs generate. The
45 * signals are received and the pid+exit reason is sent as data
46 * on the UDS to the VM. The VM is then able to map the pid to the
47 * port of the child program that just exited and deliver the status
48 * code if requested.
49 */
50
51 #ifdef HAVE_CONFIG_H
52 # include "config.h"
53 #endif
54
55 #include <stdlib.h>
56 #include <stdio.h>
57 #include <stdarg.h>
58 #include <sys/wait.h>
59 #include <sys/types.h>
60 #include <sys/socket.h>
61
62 #define WANT_NONBLOCKING
63
64 #include "erl_driver.h"
65 #include "sys_uds.h"
66 #include "erl_term.h"
67 #include "erl_child_setup.h"
68
69 #undef ERTS_GLB_INLINE_INCL_FUNC_DEF
70 #define ERTS_GLB_INLINE_INCL_FUNC_DEF 1
71 #include "hash.h"
72
/* Set FD_CLOEXEC on fd while keeping its other descriptor flags.
   Evaluates to the result of the F_SETFD call.
   Note: fd is evaluated twice, so pass an expression without side effects. */
#define SET_CLOEXEC(fd) (fcntl((fd), F_SETFD, fcntl((fd), F_GETFD) | FD_CLOEXEC))
74
/* Path of the shell that start_new_child() runs as "sh -c <cmd>" when the
   request carries no explicit argument vector. */
#if defined(__ANDROID__)
#define SHELL "/system/bin/sh"
#else
#define SHELL "/bin/sh"
#endif /* __ANDROID__ */
80
/* Fall back to MSG_NONBLOCK on systems that only provide that name;
   MSG_DONTWAIT is the flag passed to sys_uds_read() in main(). */
#if !defined(MSG_DONTWAIT) && defined(MSG_NONBLOCK)
#define MSG_DONTWAIT MSG_NONBLOCK
#endif
84
/* Define HARD_DEBUG to make DEBUG_PRINT write trace lines to stderr, each
   prefixed with the pid of the calling process and terminated by "\r\n". */
//#define HARD_DEBUG
#ifdef HARD_DEBUG
#define DEBUG_PRINT(fmt, ...) fprintf(stderr, "%d:" fmt "\r\n", getpid(), ##__VA_ARGS__)
#else
/* Expands to nothing, so the arguments are not evaluated. */
#define DEBUG_PRINT(fmt, ...)
#endif
91
#ifdef __clang_analyzer__
/* CodeChecker does not seem to understand inline asm in FD_ZERO,
   so substitute a plain memset when running under the analyzer. */
#  undef FD_ZERO
#  define FD_ZERO(FD_SET_PTR) memset(FD_SET_PTR, 0, sizeof(fd_set))
#endif
97
static char abort_reason[200]; /* for core dump inspection */

/*
 * Report a fatal error and terminate the process with abort().
 *
 * The printf-style message is formatted into the file-scope buffer
 * abort_reason (so it can be found when inspecting a core dump), echoed
 * on stderr prefixed with "erl_child_setup: ", and then abort() is called.
 * This function does not return.
 *
 * vsnprintf() bounds the write to sizeof(abort_reason); a message that
 * does not fit is truncated instead of overflowing the buffer.
 */
static void ABORT(const char* fmt, ...)
{
    va_list arglist;

    va_start(arglist, fmt);
    vsnprintf(abort_reason, sizeof(abort_reason), fmt, arglist);
    va_end(arglist);

    fprintf(stderr, "erl_child_setup: %s\r\n", abort_reason);
    abort();
}
109
#ifdef DEBUG
/*
 * Assertion failure reporter, only compiled in DEBUG builds.
 *
 * Prints "<file>:<line>:<func>() Assertion failed: <expr>" on stderr and
 * aborts the process. Does not return.
 */
void
erl_assert_error(const char *expr, const char *func, const char *file, int line)
{
    /* Flush stdout before the report is written to stderr. */
    fflush(stdout);

    fprintf(stderr,
            "%s:%d:%s() Assertion failed: %s\n",
            file,
            line,
            func,
            expr);
    fflush(stderr);

    abort();
}
#endif
121
sys_sigblock(int sig)122 void sys_sigblock(int sig)
123 {
124 sigset_t mask;
125
126 sigemptyset(&mask);
127 sigaddset(&mask, sig);
128 sigprocmask(SIG_BLOCK, &mask, (sigset_t *)NULL);
129 }
130
sys_sigrelease(int sig)131 void sys_sigrelease(int sig)
132 {
133 sigset_t mask;
134 sigemptyset(&mask);
135 sigaddset(&mask, sig);
136 sigprocmask(SIG_UNBLOCK, &mask, (sigset_t *)NULL);
137 }
138
/* Bookkeeping that maps the OS pid of a forked child to the port that
   asked for its exit status; the definitions are at the end of this file. */
static void add_os_pid_to_port_id_mapping(Eterm, pid_t);
static Eterm get_port_id(pid_t);
static int forker_hash_init(void);

/* Parsed from argv[1] in main(); used there as the upper bound when closing
   inherited fds one by one. */
static int max_files = -1;
/* Pipe written by handle_sigchld() ([1]) and read by the select loop in
   main() ([0]). */
static int sigchld_pipe[2];
145
146 static int
start_new_child(int pipes[])147 start_new_child(int pipes[])
148 {
149 struct sigaction sa;
150 int errln = -1;
151 int size, res, i, pos = 0;
152 char *buff, *o_buff;
153
154 char *cmd, *cwd, *wd, **new_environ, **args = NULL;
155
156 Sint32 cnt, flags;
157
158 /* only child executes here */
159
160 /* Restore default handling of sigterm... */
161 sa.sa_handler = SIG_DFL;
162 sigemptyset(&sa.sa_mask);
163 sa.sa_flags = 0;
164
165 if (sigaction(SIGTERM, &sa, 0) == -1) {
166 perror(NULL);
167 exit(1);
168 }
169
170 do {
171 res = read(pipes[0], (char*)&size, sizeof(size));
172 } while(res < 0 && (errno == EINTR || errno == ERRNO_BLOCK));
173
174 if (res <= 0) {
175 errln = __LINE__;
176 goto child_error;
177 }
178
179 buff = malloc(size);
180
181 DEBUG_PRINT("size = %d", size);
182
183 do {
184 if ((res = read(pipes[0], buff + pos, size - pos)) < 0) {
185 if (errno == ERRNO_BLOCK || errno == EINTR)
186 continue;
187 errln = __LINE__;
188 goto child_error;
189 }
190 if (res == 0) {
191 errno = EPIPE;
192 errln = __LINE__;
193 goto child_error;
194 }
195 pos += res;
196 } while(size - pos != 0);
197
198 o_buff = buff;
199
200 flags = get_int32(buff);
201 buff += sizeof(flags);
202
203 DEBUG_PRINT("flags = %d", flags);
204
205 cmd = buff;
206 buff += strlen(buff) + 1;
207
208 cwd = buff;
209 buff += strlen(buff) + 1;
210
211 if (*buff == '\0') {
212 wd = NULL;
213 } else {
214 wd = buff;
215 buff += strlen(buff) + 1;
216 }
217 buff++;
218
219 DEBUG_PRINT("wd = %s", wd);
220
221 cnt = get_int32(buff);
222 buff += sizeof(cnt);
223 new_environ = malloc(sizeof(char*)*(cnt + 1));
224
225 DEBUG_PRINT("env_len = %d", cnt);
226 for (i = 0; i < cnt; i++, buff++) {
227 new_environ[i] = buff;
228 while(*buff != '\0') buff++;
229 }
230 new_environ[cnt] = NULL;
231
232 if (o_buff + size != buff) {
233 /* This is a spawn executable call */
234 cnt = get_int32(buff);
235 buff += sizeof(cnt);
236 args = malloc(sizeof(char*)*(cnt + 1));
237 for (i = 0; i < cnt; i++, buff++) {
238 args[i] = buff;
239 while(*buff != '\0') buff++;
240 }
241 args[cnt] = NULL;
242 }
243
244 if (o_buff + size != buff) {
245 errno = EINVAL;
246 errln = __LINE__;
247 fprintf(stderr,"erl_child_setup: failed with protocol "
248 "error %d on line %d", errno, errln);
249 /* we abort here as it is most likely a symptom of an
250 emulator/erl_child_setup bug */
251 abort();
252 }
253
254 DEBUG_PRINT("read ack");
255 do {
256 ErtsSysForkerProto proto;
257 res = read(pipes[0], &proto, sizeof(proto));
258 if (res > 0) {
259 ASSERT(proto.action == ErtsSysForkerProtoAction_Ack);
260 ASSERT(res == sizeof(proto));
261 }
262 } while(res < 0 && (errno == EINTR || errno == ERRNO_BLOCK));
263
264 if (res < 1) {
265 errno = EPIPE;
266 errln = __LINE__;
267 goto child_error;
268 }
269
270 DEBUG_PRINT("Set cwd to: '%s'",cwd);
271
272 if (chdir(cwd) < 0) {
273 /* This is not good, it probably means that the cwd of
274 beam is invalid. We ignore it and try anyways as
275 the child might now need a cwd or the chdir below
276 could take us to a valid directory.
277 */
278 }
279
280 DEBUG_PRINT("Set wd to: '%s'",wd);
281
282 if (wd && chdir(wd) < 0) {
283 int err = errno;
284 fprintf(stderr,"spawn: Could not cd to %s\r\n", wd);
285 _exit(err);
286 }
287
288 DEBUG_PRINT("Do that forking business: '%s'",cmd);
289
290 /* When the dup2'ing below is done, only
291 fd's 0, 1, 2 and maybe 3, 4 should survive the
292 exec. All other fds (i.e. the unix domain sockets
293 and stray pipe ends) should have CLOEXEC set on them
294 so they will be closed when the exec happens */
295 if (flags & FORKER_FLAG_USE_STDIO) {
296 /* stdin for process */
297 if (flags & FORKER_FLAG_DO_WRITE &&
298 dup2(pipes[0], 0) < 0) {
299 errln = __LINE__;
300 goto child_error;
301 }
302 /* stdout for process */
303 if (flags & FORKER_FLAG_DO_READ &&
304 dup2(pipes[1], 1) < 0) {
305 errln = __LINE__;
306 goto child_error;
307 }
308 }
309 else { /* XXX will fail if pipes[0] == 4 (unlikely..) */
310 if (flags & FORKER_FLAG_DO_READ && dup2(pipes[1], 4) < 0) {
311 errln = __LINE__;
312 goto child_error;
313 }
314 if (flags & FORKER_FLAG_DO_WRITE && dup2(pipes[0], 3) < 0) {
315 errln = __LINE__;
316 goto child_error;
317 }
318 }
319
320 /* we do the dup2 of stderr last so that errors
321 in child_error will be printed to stderr */
322 if (dup2(pipes[2], 2) < 0) {
323 errln = __LINE__;
324 goto child_error;
325 }
326
327 #if defined(USE_SETPGRP_NOARGS) /* SysV */
328 (void) setpgrp();
329 #elif defined(USE_SETPGRP) /* BSD */
330 (void) setpgrp(0, getpid());
331 #else /* POSIX */
332 (void) setsid();
333 #endif
334
335 close(pipes[0]);
336 close(pipes[1]);
337 close(pipes[2]);
338
339 sys_sigrelease(SIGCHLD);
340
341 if (args) {
342 /* spawn_executable */
343 execve(cmd, args, new_environ);
344 } else {
345 execle(SHELL, "sh", "-c", cmd, (char *) NULL, new_environ);
346 }
347
348 DEBUG_PRINT("exec error: %d",errno);
349 _exit(errno);
350
351 child_error:
352 fprintf(stderr,"erl_child_setup: failed with error %d on line %d\r\n",
353 errno, errln);
354 _exit(errno);
355 }
356
357
358 /*
359 * [OTP-3906]
360 * Solaris signal management gets confused when threads are used and a
361 * lot of child processes dies. The confusion results in that SIGCHLD
362 * signals aren't delivered to the emulator which in turn results in
363 * a lot of defunct processes in the system.
364 *
365 * The problem seems to appear when a signal is frequently
366 * blocked/unblocked at the same time as the signal is frequently
367 * propagated. The child waiter thread is a workaround for this problem.
368 * The SIGCHLD signal is always blocked (in all threads), and the child
369 * waiter thread fetches the signal by a call to sigwait(). See
370 * child_waiter().
371 *
372 * This should be a non-issue since the fork:ing was moved outside of
373 * the emulator into erl_child_setup. I'm leaving the comment here
374 * for posterity. */
375
handle_sigchld(int sig)376 static void handle_sigchld(int sig) {
377 int buff[2], res, __preverrno = errno;
378
379 sys_sigblock(SIGCHLD);
380
381 while ((buff[0] = waitpid((pid_t)(-1), buff+1, WNOHANG)) > 0) {
382 do {
383 res = write(sigchld_pipe[1], buff, sizeof(buff));
384 } while (res < 0 && errno == EINTR);
385 if (res <= 0)
386 ABORT("Failed to write to sigchld_pipe (%d): %d (%d)", sigchld_pipe[1], res, errno);
387 DEBUG_PRINT("Reap child %d (%d)", buff[0], buff[1]);
388 }
389
390 sys_sigrelease(SIGCHLD);
391
392 /* We save and restore the original errno as otherwise
393 the thread we are running in may end up with an
394 unexpected errno. An example of when this happened
395 was when the select in main had gotten an EINTR but
396 before the errno was checked the signal handler
397 was called and set errno to ECHILD from waitpid
398 which caused erl_child_setup to abort as it does
399 not expect ECHILD to be set after select */
400 errno = __preverrno;
401 }
402
403 #if defined(__ANDROID__)
/*
 * Return the file descriptor number found in the environment variable
 * ANDROID_PROPERTY_WORKSPACE (parsed with atoi), or -1 when the variable
 * is not set. The answer is cached in a function-local static; -2 is the
 * "not looked up yet" marker. main() uses the result to avoid closing
 * that descriptor.
 */
static int system_properties_fd(void)
{
    static int cached_fd = -2;

    if (cached_fd == -2) {
        const char *value = getenv("ANDROID_PROPERTY_WORKSPACE");

        cached_fd = (value != NULL) ? atoi(value) : -1;
    }
    return cached_fd;
}
418 #endif /* __ANDROID__ */
419
420 int
main(int argc,char * argv[])421 main(int argc, char *argv[])
422 {
423 /* This fd should be open from beam */
424 int uds_fd = 3, max_fd = 3;
425 #ifndef HAVE_CLOSEFROM
426 int i;
427 DIR *dir;
428 #endif
429 struct sigaction sa;
430
431 if (argc < 2 || sscanf(argv[1],"%d",&max_files) != 1) {
432 ABORT("Invalid arguments to child_setup");
433 }
434
435 /* We close all fds except the uds from beam.
436 All other fds from now on will have the
437 CLOEXEC flags set on them. This means that we
438 only have to close a very limited number of fds
439 after we fork before the exec. */
440 #if defined(HAVE_CLOSEFROM)
441 closefrom(4);
442 #else
443 dir = opendir("/dev/fd");
444 if (dir == NULL) { /* /dev/fd not available */
445 for (i = 4; i < max_files; i++)
446 #if defined(__ANDROID__)
447 if (i != system_properties_fd())
448 #endif
449 (void) close(i);
450 } else {
451 /* Iterate over fds obtained from /dev/fd */
452 struct dirent *entry;
453 int dir_fd = dirfd(dir);
454
455 while ((entry = readdir(dir)) != NULL) {
456 i = atoi(entry->d_name);
457 #if defined(__ANDROID__)
458 if (i != system_properties_fd())
459 #endif
460 if (i >= 4 && i != dir_fd)
461 (void) close(i);
462 }
463
464 closedir(dir);
465 }
466 #endif
467
468 if (pipe(sigchld_pipe) < 0) {
469 ABORT("Failed to setup sigchld pipe (%d)", errno);
470 }
471
472 SET_CLOEXEC(sigchld_pipe[0]);
473 SET_CLOEXEC(sigchld_pipe[1]);
474
475 max_fd = max_fd < sigchld_pipe[0] ? sigchld_pipe[0] : max_fd;
476
477 sa.sa_handler = &handle_sigchld;
478 sigemptyset(&sa.sa_mask);
479 sa.sa_flags = SA_RESTART | SA_NOCLDSTOP;
480 if (sigaction(SIGCHLD, &sa, 0) == -1) {
481 perror(NULL);
482 exit(1);
483 }
484
485 /* Ignore SIGTERM.
486 Some container environments send SIGTERM to all processes
487 when terminating. We don't want erl_child_setup to terminate
488 in these cases as that will prevent beam from properly
489 cleaning up.
490 */
491 sa.sa_handler = SIG_IGN;
492 sigemptyset(&sa.sa_mask);
493 sa.sa_flags = 0;
494
495 if (sigaction(SIGTERM, &sa, 0) == -1) {
496 perror(NULL);
497 exit(1);
498 }
499
500 forker_hash_init();
501
502 SET_CLOEXEC(uds_fd);
503
504 DEBUG_PRINT("Starting forker %d", max_files);
505
506 while (1) {
507 fd_set read_fds;
508 int res;
509 FD_ZERO(&read_fds);
510 FD_SET(uds_fd, &read_fds);
511 FD_SET(sigchld_pipe[0], &read_fds);
512 DEBUG_PRINT("child_setup selecting on %d, %d (%d)",
513 uds_fd, sigchld_pipe[0], max_fd);
514 res = select(max_fd+1, &read_fds, NULL, NULL, NULL);
515
516 if (res < 0) {
517 if (errno == EINTR) continue;
518 ABORT("Select failed: %d (%d)",res, errno);
519 }
520
521 if (FD_ISSET(uds_fd, &read_fds)) {
522 int pipes[3], res, os_pid;
523 ErtsSysForkerProto proto;
524 errno = 0;
525 if ((res = sys_uds_read(uds_fd, (char*)&proto, sizeof(proto),
526 pipes, 3, MSG_DONTWAIT)) < 0) {
527 if (errno == EINTR)
528 continue;
529 DEBUG_PRINT("erl_child_setup failed to read from uds: %d, %d", res, errno);
530 _exit(0);
531 }
532
533 if (res == 0) {
534 DEBUG_PRINT("uds was closed!");
535 _exit(0);
536 }
537 /* Since we use unix domain sockets and send the entire data in
538 one go we *should* get the entire payload at once. */
539 ASSERT(res == sizeof(proto));
540 ASSERT(proto.action == ErtsSysForkerProtoAction_Start);
541
542 sys_sigblock(SIGCHLD);
543
544 errno = 0;
545
546 os_pid = fork();
547 if (os_pid == 0)
548 start_new_child(pipes);
549
550 add_os_pid_to_port_id_mapping(proto.u.start.port_id, os_pid);
551
552 /* We write an ack here, but expect the reply on
553 the pipes[0] inside the fork */
554 proto.action = ErtsSysForkerProtoAction_Go;
555 proto.u.go.os_pid = os_pid;
556 proto.u.go.error_number = errno;
557 while (write(pipes[1], &proto, sizeof(proto)) < 0 && errno == EINTR)
558 ; /* remove gcc warning */
559
560 #ifdef FORKER_PROTO_START_ACK
561 proto.action = ErtsSysForkerProtoAction_StartAck;
562 while (write(uds_fd, &proto, sizeof(proto)) < 0 && errno == EINTR)
563 ; /* remove gcc warning */
564 #endif
565
566 sys_sigrelease(SIGCHLD);
567 close(pipes[0]);
568 close(pipes[1]);
569 close(pipes[2]);
570 }
571
572 if (FD_ISSET(sigchld_pipe[0], &read_fds)) {
573 int ibuff[2];
574 ErtsSysForkerProto proto;
575 res = read(sigchld_pipe[0], ibuff, sizeof(ibuff));
576 if (res <= 0) {
577 if (errno == EINTR)
578 continue;
579 ABORT("Failed to read from sigchld pipe: %d (%d)", res, errno);
580 }
581
582 proto.u.sigchld.port_id = get_port_id((pid_t)(ibuff[0]));
583
584 if (proto.u.sigchld.port_id == THE_NON_VALUE)
585 continue; /* exit status report not requested */
586
587 proto.action = ErtsSysForkerProtoAction_SigChld;
588 proto.u.sigchld.error_number = ibuff[1];
589 DEBUG_PRINT("send sigchld to %d (errno = %d)", uds_fd, ibuff[1]);
590 if (write(uds_fd, &proto, sizeof(proto)) < 0) {
591 if (errno == EINTR)
592 continue;
593 /* The uds was close, which most likely means that the VM
594 has exited. This will be detected when we try to read
595 from the uds_fd. */
596 DEBUG_PRINT("Failed to write to uds: %d (%d)", uds_fd, errno);
597 }
598 }
599 }
600 return 1;
601 }
602
/* One entry in forker_hash: associates the OS pid of a forked child with
   the port that requested an exit status report for it. */
typedef struct exit_status {
    HashBucket hb;      /* bucket header for the hash table (see hash.h) */
    pid_t os_pid;       /* key: the only field used by fhash() and fcmp() */
    Eterm port_id;      /* value returned by get_port_id() */
} ErtsSysExitStatus;

/* pid -> port table; created by forker_hash_init(). */
static Hash *forker_hash;
610
/*
 * Remember that child `os_pid` belongs to `port_id` so that its exit
 * status can later be routed to that port. Nothing is stored when
 * port_id is THE_NON_VALUE.
 *
 * The entry passed to hash_put() lives on the stack; presumably the table
 * stores its own copy made by the alloc callback (falloc) - see hash.h.
 */
static void add_os_pid_to_port_id_mapping(Eterm port_id, pid_t os_pid)
{
    ErtsSysExitStatus tmpl;

    if (port_id == THE_NON_VALUE)
        return;     /* exit status report not requested */

    tmpl.port_id = port_id;
    tmpl.os_pid = os_pid;
    hash_put(forker_hash, &tmpl);
}
621
get_port_id(pid_t os_pid)622 static Eterm get_port_id(pid_t os_pid)
623 {
624 ErtsSysExitStatus est, *es;
625 Eterm port_id;
626 est.os_pid = os_pid;
627 es = hash_remove(forker_hash, &est);
628 if (!es) return THE_NON_VALUE;
629 port_id = es->port_id;
630 free(es);
631 return port_id;
632 }
633
fcmp(void * a,void * b)634 static int fcmp(void *a, void *b)
635 {
636 ErtsSysExitStatus *sa = a;
637 ErtsSysExitStatus *sb = b;
638 return !(sa->os_pid == sb->os_pid);
639 }
640
fhash(void * e)641 static HashValue fhash(void *e)
642 {
643 ErtsSysExitStatus *se = e;
644 Uint32 val = se->os_pid;
645 val = (val+0x7ed55d16) + (val<<12);
646 val = (val^0xc761c23c) ^ (val>>19);
647 val = (val+0x165667b1) + (val<<5);
648 val = (val+0xd3a2646c) ^ (val<<9);
649 val = (val+0xfd7046c5) + (val<<3);
650 val = (val^0xb55a4f09) ^ (val>>16);
651 return val;
652 }
653
falloc(void * e)654 static void *falloc(void *e)
655 {
656 ErtsSysExitStatus *se = e;
657 ErtsSysExitStatus *ne = malloc(sizeof(ErtsSysExitStatus));
658 ne->os_pid = se->os_pid;
659 ne->port_id = se->port_id;
660 return ne;
661 }
662
/* Allocator callback for the hash table's own bookkeeping memory: plain
   malloc(); the `type` argument is ignored. */
static void *meta_alloc(int type, size_t size)
{
    (void) type;
    return malloc(size);
}
/* Counterpart of meta_alloc(): plain free(); the `type` argument is
   ignored. */
static void meta_free(int type, void *p)
{
    (void) type;
    free(p);
}
665
forker_hash_init(void)666 static int forker_hash_init(void)
667 {
668 HashFunctions forker_hash_functions;
669 forker_hash_functions.hash = fhash;
670 forker_hash_functions.cmp = fcmp;
671 forker_hash_functions.alloc = falloc;
672 forker_hash_functions.free = free;
673 forker_hash_functions.meta_alloc = meta_alloc;
674 forker_hash_functions.meta_free = meta_free;
675 forker_hash_functions.meta_print = NULL;
676
677 forker_hash = hash_new(0, "forker_hash",
678 16, forker_hash_functions);
679
680 return 1;
681 }
682