1 /* Copyright (c) 2005-2008, Google Inc.
2  * All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are
6  * met:
7  *
8  *     * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  *     * Redistributions in binary form must reproduce the above
11  * copyright notice, this list of conditions and the following disclaimer
12  * in the documentation and/or other materials provided with the
13  * distribution.
14  *     * Neither the name of Google Inc. nor the names of its
15  * contributors may be used to endorse or promote products derived from
16  * this software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  *
30  * ---
31  * Author: Markus Gutschke, Carl Crous
32  */
33 
34 #include "elfcore.h"
35 #if defined DUMPER
36 #ifdef __cplusplus
37 extern "C" {
38 #endif
39 
40 #include <elf.h>
41 #include <fcntl.h>
42 #include <limits.h>
43 #include <linux/sched.h>
44 #include <pthread.h>
45 #include <signal.h>
46 #include <stdbool.h>
47 #include <stdint.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include <sys/poll.h>
51 #include <sys/prctl.h>
52 #include <sys/socket.h>
53 #include <sys/time.h>
54 #include <sys/uio.h>
55 #include <sys/wait.h>
56 
57 #include "coredumper/coredumper.h"
58 #include "linux_syscall_support.h"
59 #include "linuxthreads.h"
60 #include "thread_lister.h"
61 
62 #ifndef CLONE_UNTRACED
63 #define CLONE_UNTRACED 0x00800000
64 #endif
65 
66 #ifndef AT_SYSINFO_EHDR
67 #define AT_SYSINFO_EHDR 33
68 #endif
69 
70 #ifndef O_LARGEFILE
71 #if defined(__mips__)
72 #define O_LARGEFILE 0x2000
73 #elif defined(__ARM_ARCH_3__)
74 #define O_LARGEFILE 0400000
75 #elif defined(__PPC__) || defined(__ppc__)
76 #define O_LARGEFILE 0200000
77 #else
78 #define O_LARGEFILE 00100000 /* generic                                  */
79 #endif
80 #endif
81 
82 /* Data structures found in x86-32/64, ARM, and MIPS core dumps on Linux;
83  * similar data structures are defined in /usr/include/{linux,asm}/... but
84  * those headers conflict with the rest of the libc headers. So we cannot
85  * include them here.
86  */
87 
88 #if defined(__i386__) || defined(__x86_64__)
89 #if !defined(__x86_64__)
90 typedef struct fpregs { /* FPU registers                             */
91   uint32_t cwd;
92   uint32_t swd;
93   uint32_t twd;
94   uint32_t fip;
95   uint32_t fcs;
96   uint32_t foo;
97   uint32_t fos;
98   uint32_t st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes     */
99 } fpregs;
100 typedef struct fpxregs { /* SSE registers                             */
101 #define FPREGS fpxregs
102 #else
103 typedef struct fpxregs { /* x86-64 stores FPU registers in SSE struct */
104 } fpxregs;
105 typedef struct fpregs {      /* FPU registers                             */
106 #define FPREGS fpregs
107 #endif
108   uint16_t cwd;
109   uint16_t swd;
110   uint16_t twd;
111   uint16_t fop;
112   uint32_t fip;
113   uint32_t fcs;
114   uint32_t foo;
115   uint32_t fos;
116   uint32_t mxcsr;
117   uint32_t mxcsr_mask;
118   uint32_t st_space[32];  /*  8*16 bytes for each FP-reg  = 128 bytes  */
119   uint32_t xmm_space[64]; /* 16*16 bytes for each XMM-reg = 128 bytes  */
120   uint32_t padding[24];
121 } FPREGS;
122 #undef FPREGS
123 #define regs i386_regs /* General purpose registers                 */
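/* Note on the preprocessor trick above: the struct body that follows the
 * still-open "typedef struct ..." is shared between both builds.  On x86-32
 * it is closed as "fpxregs" (the FXSAVE/SSE register layout), while on
 * x86-64 the very same layout is closed as "fpregs", since x86-64 reports
 * all FPU state in the FXSAVE format.  FPREGS merely names whichever of the
 * two structs ends up receiving the shared body.
 */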
124 #elif defined(__ARM_ARCH_3__)
125 typedef struct fpxregs { /* No extended FPU registers on ARM          */
126 } fpxregs;
127 typedef struct fpregs { /* FPU registers                             */
128   struct fp_reg {
129     unsigned int sign1 : 1;
130     unsigned int unused : 15;
131     unsigned int sign2 : 1;
132     unsigned int exponent : 14;
133     unsigned int j : 1;
134     unsigned int mantissa1 : 31;
135     unsigned int mantissa0 : 32;
136   } fpregs[8];
137   unsigned int fpsr : 32;
138   unsigned int fpcr : 32;
139   unsigned char ftype[8];
140   unsigned int init_flag;
141 } fpregs;
142 #define regs arm_regs /* General purpose registers                 */
143 #elif defined(__mips__)
144 typedef struct fpxregs { /* No extended FPU registers on MIPS         */
145 } fpxregs;
146 typedef struct fpregs {
147   uint64_t fpuregs[32];
148   uint32_t fcr31;
149   uint32_t fir;
150 } fpregs;
151 #define regs mips_regs
152 #endif
153 
154 typedef struct elf_timeval { /* Time value with microsecond resolution    */
155   long tv_sec;               /* Seconds                                   */
156   long tv_usec;              /* Microseconds                              */
157 } elf_timeval;
158 
159 typedef struct elf_siginfo { /* Information about signal (unused)         */
160   int32_t si_signo;          /* Signal number                             */
161   int32_t si_code;           /* Extra code                                */
162   int32_t si_errno;          /* Errno                                     */
163 } elf_siginfo;
164 
165 typedef struct prstatus {   /* Information about thread; includes CPU reg*/
166   elf_siginfo pr_info;      /* Info associated with signal               */
167   uint16_t pr_cursig;       /* Current signal                            */
168   unsigned long pr_sigpend; /* Set of pending signals                    */
169   unsigned long pr_sighold; /* Set of held signals                       */
170   pid_t pr_pid;             /* Process ID                                */
171   pid_t pr_ppid;            /* Parent's process ID                       */
172   pid_t pr_pgrp;            /* Group ID                                  */
173   pid_t pr_sid;             /* Session ID                                */
174   elf_timeval pr_utime;     /* User time                                 */
175   elf_timeval pr_stime;     /* System time                               */
176   elf_timeval pr_cutime;    /* Cumulative user time                      */
177   elf_timeval pr_cstime;    /* Cumulative system time                    */
178   regs pr_reg;              /* CPU registers                             */
179   uint32_t pr_fpvalid;      /* True if math co-processor being used      */
180 } prstatus;
181 
182 typedef struct prpsinfo { /* Information about process                 */
183   unsigned char pr_state; /* Numeric process state                     */
184   char pr_sname;          /* Char for pr_state                         */
185   unsigned char pr_zomb;  /* Zombie                                    */
186   signed char pr_nice;    /* Nice val                                  */
187   unsigned long pr_flag;  /* Flags                                     */
188 #if defined(__x86_64__) || defined(__mips__)
189   uint32_t pr_uid; /* User ID                                   */
190   uint32_t pr_gid; /* Group ID                                  */
191 #else
192   uint16_t pr_uid; /* User ID                                   */
193   uint16_t pr_gid; /* Group ID                                  */
194 #endif
195   pid_t pr_pid;       /* Process ID                                */
196   pid_t pr_ppid;      /* Parent's process ID                       */
197   pid_t pr_pgrp;      /* Group ID                                  */
198   pid_t pr_sid;       /* Session ID                                */
199   char pr_fname[16];  /* Filename of executable                    */
200   char pr_psargs[80]; /* Initial part of arg list                  */
201 } prpsinfo;
202 
203 typedef struct core_user { /* Ptrace returns this data for thread state */
204 #ifndef __mips__
205   struct regs regs;      /* CPU registers                             */
206   unsigned long fpvalid; /* True if math co-processor being used      */
207 #if defined(__i386__) || defined(__x86_64__)
208   struct fpregs fpregs; /* FPU registers                             */
209 #endif
210   unsigned long tsize;       /* Text segment size in pages                */
211   unsigned long dsize;       /* Data segment size in pages                */
212   unsigned long ssize;       /* Stack segment size in pages               */
213   unsigned long start_code;  /* Starting virtual address of text          */
214   unsigned long start_stack; /* Starting virtual address of stack area    */
215   unsigned long signal;      /* Signal that caused the core dump          */
216   unsigned long reserved;    /* No longer used                            */
217   struct regs *regs_ptr;     /* Used by gdb to help find the CPU registers*/
218 #if defined(__i386__) || defined(__x86_64__)
219   struct fpregs *fpregs_ptr; /* Pointer to FPU registers                  */
220 #endif
221   unsigned long magic; /* Magic for old A.OUT core files            */
222   char comm[32];       /* User command that was responsible         */
223   unsigned long debugreg[8];
224 #if defined(__i386__) || defined(__x86_64__)
225   unsigned long error_code;    /* CPU error code or 0                       */
226   unsigned long fault_address; /* CR3 or 0                                  */
227 #elif defined(__ARM_ARCH_3__)
228   struct fpregs fpregs;      /* FPU registers                             */
229   struct fpregs *fpregs_ptr; /* Pointer to FPU registers                  */
230 #endif
231 #endif
232 } core_user;
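/* The struct above mirrors the traditional "struct user" layout that
 * ptrace() reports for a thread; CreateElfCore() below emits it verbatim
 * as an NT_PRXREG note whenever this data is available for the dumped
 * process.
 */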
233 
234 #if __WORDSIZE == 64
235 #define ELF_CLASS ELFCLASS64
236 #define Ehdr Elf64_Ehdr
237 #define Phdr Elf64_Phdr
238 #define Shdr Elf64_Shdr
239 #define Nhdr Elf64_Nhdr
240 #define auxv_t Elf64_auxv_t
241 #else
242 #define ELF_CLASS ELFCLASS32
243 #define Ehdr Elf32_Ehdr
244 #define Phdr Elf32_Phdr
245 #define Shdr Elf32_Shdr
246 #define Nhdr Elf32_Nhdr
247 #define auxv_t Elf32_auxv_t
248 #endif
249 
250 #if defined(__x86_64__)
251 #define ELF_ARCH EM_X86_64
252 #elif defined(__i386__)
253 #define ELF_ARCH EM_386
254 #elif defined(__ARM_ARCH_3__)
255 #define ELF_ARCH EM_ARM
256 #elif defined(__mips__)
257 #define ELF_ARCH EM_MIPS
258 #endif
259 
260 /* Wrap a class around system calls, in order to give us access to
261  * a private copy of errno. This only works in C++, but it has the
262  * advantage of not needing nested functions, which are a non-standard
263  * language extension.
264  */
265 #ifdef __cplusplus
266 namespace {
267 class SysCalls {
268  public:
269 #define SYS_CPLUSPLUS
270 #define SYS_ERRNO my_errno
271 #define SYS_INLINE inline
272 #define SYS_PREFIX -1
273 #undef SYS_LINUX_SYSCALL_SUPPORT_H
274 #include "linux_syscall_support.h"
275   SysCalls() : my_errno(0) {}
276   int my_errno;
277 };
278 }  // namespace
279 #define ERRNO sys.my_errno
280 #else
281 #define ERRNO my_errno
282 #endif
283 
284 /* Re-runs fn until it doesn't cause EINTR
285  */
286 #define NO_INTR(fn) \
287   do {              \
288   } while ((fn) < 0 && errno == EINTR)
289 #define MY_NO_INTR(fn) \
290   do {                 \
291   } while ((fn) < 0 && ERRNO == EINTR)
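/* Typical usage, e.g. as in CreateElfCore() below:
 *
 *   int fd;
 *   NO_INTR(fd = sys_open("/proc/self/maps", O_RDONLY, 0));
 *
 * The expression is retried for as long as it fails with EINTR; success,
 * or any other error, ends the loop with errno/ERRNO left intact.
 */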
292 
293 /* Replacement memcpy.  GCC's __builtin_memcpy appears to cause crashes
294  * in this context.  Note that, unlike memcpy(), this version returns void.
295  */
296 static void my_memcpy(void *dest, const void *src, size_t len) {
297   char *d = (char *)dest;
298   const char *s = (const char *)src;
299   size_t i;
300   for (i = 0; i < len; ++i) *(d++) = *(s++);
301 }
302 
303 /* Wrapper for read() which is guaranteed to never return EINTR.
304  */
305 static ssize_t c_read(int f, void *buf, size_t bytes, int *errno_) {
306   /* scope */ {
307 /* Define a private copy of syscall macros, which does not modify the
308  * global copy of errno.
309  */
310 #ifdef __cplusplus
311 #define sys0_read sys.read
312     SysCalls sys;
313 #else
314     int my_errno;
315 #define SYS_ERRNO my_errno
316 #define SYS_INLINE inline
317 #define SYS_PREFIX 0
318 #undef SYS_LINUX_SYSCALL_SUPPORT_H
319 #include "linux_syscall_support.h"
320 #endif
321 
322     if (bytes > 0) {
323       ssize_t rc;
324       MY_NO_INTR(rc = sys0_read(f, buf, bytes));
325       if (rc < 0) {
326         *errno_ = ERRNO;
327       }
328       return rc;
329     }
330     return 0;
331   }
332 }
333 
334 /* Wrapper for write() which is guaranteed to never return EINTR nor
335  * short writes.
336  */
337 static ssize_t c_write(int f, const void *void_buf, size_t bytes, int *errno_) {
338   /* scope */ {
339 /* Define a private copy of syscall macros, which does not modify the
340  * global copy of errno.
341  */
342 #ifdef __cplusplus
343 #define sys0_write sys.write
344     SysCalls sys;
345 #else
346     int my_errno;
347 #define SYS_ERRNO my_errno
348 #define SYS_INLINE inline
349 #undef SYS_LINUX_SYSCALL_SUPPORT_H
350 #define SYS_PREFIX 0
351 #include "linux_syscall_support.h"
352 #endif
353 
354     const unsigned char *buf = (const unsigned char *)void_buf;
355     size_t len = bytes;
356     while (len > 0) {
357       ssize_t rc;
358       MY_NO_INTR(rc = sys0_write(f, buf, len));
359       if (rc < 0) {
360         *errno_ = ERRNO;
361         return rc;
362       } else if (rc == 0) {
363         break;
364       }
365       buf += rc;
366       len -= rc;
367     }
368     return bytes - len;
369   }
370 }
371 
372 /* The simple synchronous writer is only used when outputting to a pipe
373  * instead of a file. In that case, we do not enforce a pre-determined
374  * maximum output size.
375  */
376 static int SimpleDone(void *f) { return 0; }
377 
378 /* Simple synchronous writer function used by CreateElfCore() when writing
379  * directly to a pipe.
380  */
381 static ssize_t SimpleWriter(void *f, const void *void_buf, size_t bytes) {
382   return c_write(*(int *)f, void_buf, bytes, &errno);
383 }
384 
385 struct WriterFds {
386   size_t max_length;
387   int write_fd;
388   int compressed_fd;
389   int out_fd;
390 };
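/* How the writers below use these fields: "write_fd" is the pipe end that
 * feeds uncompressed data to the compressor, "compressed_fd" is the
 * (non-blocking) pipe end carrying the compressor's output back to us,
 * "out_fd" is the destination file, and "max_length" is the number of
 * bytes we are still allowed to write to that file.
 */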
391 
392 /* Checks whether the maximum number of allowed bytes has been written
393  * to the output file already.
394  */
395 static int PipeDone(void *f) {
396   struct WriterFds *fds = (struct WriterFds *)f;
397   return fds->max_length == 0;
398 }
399 
400 /* Writer function that writes directly to a file and honors size limits.
401  */
402 static ssize_t LimitWriter(void *f, const void *void_buf, size_t bytes) {
403   struct WriterFds *fds = (struct WriterFds *)f;
404   ssize_t rc;
405   if (bytes > fds->max_length) {
406     bytes = fds->max_length;
407   }
408   rc = c_write(fds->out_fd, void_buf, bytes, &errno);
409   if (rc > 0) {
410     fds->max_length -= rc;
411   }
412   return rc;
413 }
414 
415 /* Writer function that can handle writing to one end of a compression
416  * pipeline, reading from the other end of the pipe as compressed data
417  * becomes available, and finally outputting it to a file.
418  */
419 static ssize_t PipeWriter(void *f, const void *void_buf, size_t bytes) {
420   const unsigned char *buf = (const unsigned char *)void_buf;
421   struct WriterFds *fds = (struct WriterFds *)f;
422   size_t len = bytes;
423   while (fds->max_length > 0 && len > 0) {
424     ssize_t rc;
425     struct kernel_pollfd pfd[2] = {{fds->compressed_fd, POLLIN, 0}, {fds->write_fd, POLLOUT, 0}};
426     int nfds = sys_poll(pfd, 2, -1);
427 
428     if (nfds < 0) {
429       /* Abort on fatal unexpected I/O errors.                               */
430       break;
431     }
432 
433     if (nfds > 0 && (pfd[0].revents & POLLIN)) {
434       /* Some compressed data has become available. Copy to output file.     */
435       char scratch[4096];
436       for (;;) {
437         size_t l = sizeof(scratch);
438         if (l > fds->max_length) {
439           l = fds->max_length;
440         }
441 
442         /* The following line is needed on MIPS. Not sure why. Compiler bug? */
443         errno = -1;
444 
445         NO_INTR(rc = sys_read(fds->compressed_fd, scratch, l));
446         if (rc < 0) {
447           /* The file handle is set to be non-blocking, so we loop until
448            * read() returns -1.
449            */
450           if (errno == EAGAIN) {
451             break;
452           }
453           return -1;
454         } else if (rc == 0) {
455           fds->max_length = 0;
456           break;
457         }
458         rc = c_write(fds->out_fd, scratch, rc, &errno);
459         if (rc <= 0) {
460           return -1;
461         }
462         fds->max_length -= rc;
463       }
464       nfds--;
465     }
466     if (nfds > 0 && (pfd[1].revents & POLLOUT)) {
467       /* The compressor has consumed all previous data and is ready to
468        * receive more.
469        */
470       NO_INTR(rc = sys_write(fds->write_fd, buf, len));
471       if (rc < 0 && errno != EAGAIN) return -1;
472       if (rc > 0) { /* only advance past bytes that were actually written */
473         buf += rc;
474         len -= rc;
475       }
476     }
477   }
478   return bytes - len;
479 }
480 
481 /* Flush the remaining data (if any) from the pipe.
482  */
483 static int FlushPipe(struct WriterFds *fds) {
484   long flags;
485   NO_INTR(flags = sys_fcntl(fds->compressed_fd, F_GETFL, 0));
486   NO_INTR(sys_fcntl(fds->compressed_fd, F_SETFL, flags & ~O_NONBLOCK));
487   while (fds->max_length > 0) {
488     char scratch[4096];
489     size_t l = sizeof(scratch);
490     ssize_t rc;
491     if (l > fds->max_length) {
492       l = fds->max_length;
493     }
494     if (l > 0) {
495       NO_INTR(rc = sys_read(fds->compressed_fd, scratch, l));
496       if (rc < 0) {
497         return -1;
498       } else if (rc == 0) {
499         break;
500       }
501       if (c_write(fds->out_fd, scratch, rc, &errno) != rc) {
502         return -1;
503       }
504       fds->max_length -= rc;
505     }
506   }
507   return 0;
508 }
509 
510 struct io {
511   int fd;
512   unsigned char *data, *end;
513   unsigned char buf[4096];
514 };
515 
516 /* Reads one character from the "io" file. This function has the same
517  * semantics as fgetc(), but we cannot call any library functions at this
518  * time.
519  */
520 static int GetChar(struct io *io) {
521   unsigned char *ptr = io->data;
522   if (ptr == io->end) {
523     /* Even though we are parsing one character at a time, read in larger
524      * chunks.
525      */
526     ssize_t n = c_read(io->fd, io->buf, sizeof(io->buf), &errno);
527     if (n <= 0) {
528       if (n == 0) errno = 0;
529       return -1;
530     }
531     ptr = &io->buf[0];
532     io->end = &io->buf[n];
533   }
534   io->data = ptr + 1;
535   return *ptr;
536 }
537 
538 /* Place the hex number read from "io" into "*hex".  The first non-hex
539  * character is returned (or -1 in the case of end-of-file). If "read_first"
540  * is set, we start by reading the next character; otherwise we start with "init_char".
541  */
542 static int GetHexHelper(struct io *io, size_t *hex, bool read_first, int init_char) {
543   int ch;
544   *hex = 0;
545   while (((ch = read_first ? GetChar(io) : init_char) >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') ||
546          (ch >= 'a' && ch <= 'f')) {
547     read_first = true;
548     *hex = (*hex << 4) | (ch < 'A' ? ch - '0' : (ch & 0xF) + 9);
549   }
550 
551   return ch;
552 }
553 
554 static int GetHex(struct io *io, size_t *hex) { return GetHexHelper(io, hex, true, 0); }
555 
556 static int GetHexWithInitChar(struct io *io, size_t *hex, int init_char) {
557   return GetHexHelper(io, hex, false, init_char);
558 }
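/* For example, with "io" positioned at "08048000-08056000 r-xp ...",
 * GetHex() stores 0x08048000 in *hex and returns '-'; a second call then
 * stores 0x08056000 and returns ' '.
 */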
559 
560 /* Computes the number of leading zero bytes in a memory region.
561  */
562 static size_t LeadingZeros(int *loopback, void *mem, size_t len, size_t pagesize) {
563   char buf[pagesize];
564   size_t count;
565 
566   char *ptr = buf;
567   for (count = 0; count < len;) {
568     /* Read a page by going through the pipe. Assume that we can write at
569      * least one page without blocking.
570      *
571      * "Normal" kernels do not require this hack. But some of the security
572      * patches (e.g. grsec) can be configured to disallow read access of
573      * executable pages. So, directly scanning the memory range would
574      * result in a segmentation fault.
575      *
576      * If we cannot access a page, we assume that it was all zeros.
577      */
578     if ((count % pagesize) == 0) {
579       if (c_write(loopback[1], (char *)mem + count, pagesize, &errno) < 0 ||
580           c_read(loopback[0], buf, pagesize, &errno) < 0) {
581         count += pagesize;
582         continue;
583       } else {
584         ptr = buf;
585       }
586     }
587     if (*ptr++) {
588       break;
589     }
590     count++;
591   }
592   return count & ~(pagesize - 1);
593 }
594 
595 /* Dynamically determines the byte order of the system. Returns non-zero
596  * for big-endian machines.
597  */
598 static inline int sex() {
599   int probe = 1;
600   return !*(char *)&probe;
601 }
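/* For example, with "int probe = 1" the first byte in memory is 1 on a
 * little-endian machine, so sex() returns 0 and ELFDATA2LSB is selected
 * in the ELF header below; on a big-endian machine that byte is 0 and
 * sex() returns 1, selecting ELFDATA2MSB.
 */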
602 
603 static int WriteThreadRegs(void *handle, ssize_t (*writer)(void *, const void *, size_t), prstatus *prstatus, pid_t pid,
604                            regs *regs, fpregs *fpregs, fpxregs *fpxregs) {
605   Nhdr nhdr;
606   memset(&nhdr, 0, sizeof(Nhdr));
607   /* Process status and integer registers                                    */
608   nhdr.n_namesz = 5;
609   nhdr.n_descsz = sizeof(struct prstatus);
610   nhdr.n_type = NT_PRSTATUS;
611   prstatus->pr_pid = pid;
612   prstatus->pr_reg = *regs;
613   if (writer(handle, &nhdr, sizeof(Nhdr)) != sizeof(Nhdr) || writer(handle, "CORE\0\0\0\0", 8) != 8 ||
614       writer(handle, prstatus, sizeof(struct prstatus)) != sizeof(struct prstatus)) {
615     return -1;
616   }
617 
618   /* FPU registers                                                           */
619   nhdr.n_descsz = sizeof(struct fpregs);
620   nhdr.n_type = NT_FPREGSET;
621   if (writer(handle, &nhdr, sizeof(Nhdr)) != sizeof(Nhdr) || writer(handle, "CORE\0\0\0\0", 8) != 8 ||
622       writer(handle, fpregs, sizeof(struct fpregs)) != sizeof(struct fpregs)) {
623     return -1;
624   }
625 
626 /* SSE registers                                                           */
627 #if defined(__i386__) && !defined(__x86_64__)
628   /* Linux on x86-64 stores all FPU registers in the SSE structure           */
629   if (fpxregs) {
630     nhdr.n_namesz = 8;
631     nhdr.n_descsz = sizeof(struct fpxregs);
632     nhdr.n_type = NT_PRXFPREG;
633     if (writer(handle, &nhdr, sizeof(Nhdr)) != sizeof(Nhdr) || writer(handle, "LINUX\000\000", 8) != 8 ||
634         writer(handle, fpxregs, sizeof(struct fpxregs)) != sizeof(struct fpxregs)) {
635       return -1;
636     }
637   }
638 #endif
639   return 0;
640 }
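/* Each record emitted above follows the standard ELF note layout:
 *
 *   Nhdr { n_namesz, n_descsz, n_type }
 *   owner name, zero-padded to a 4-byte boundary (written here as a fixed
 *   8-byte "CORE\0\0\0\0" or "LINUX\0\0\0" block)
 *   descriptor (prstatus, fpregs or fpxregs)
 *
 * Debuggers locate the per-thread registers by scanning these records in
 * the PT_NOTE segment that CreateElfCore() writes below.
 */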
641 
642 /* Read /proc/self/auxv (if it exists) and count the number of entries.
643  * Since we are already reading all entries, it is convenient
644  * to also return the address of the VDSO ELF header, if AT_SYSINFO_EHDR
645  * is present.
646  */
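/* Note that the count includes the terminating AT_NULL entry; the NT_AUXV
 * note written by CreateElfCore() re-reads and dumps exactly this many
 * auxv_t records.
 */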
647 static void CountAUXV(size_t *pnum_auxv, size_t *pvdso_ehdr) {
648   int fd;
649   auxv_t auxv;
650   size_t num_auxv = 0, vdso_ehdr = 0;
651   NO_INTR(fd = sys_open("/proc/self/auxv", O_RDONLY, 0));
652   if (fd >= 0) {
653     ssize_t nread;
654     do {
655       NO_INTR(nread = sys_read(fd, &auxv, sizeof(auxv_t)));
656       if (sizeof(auxv_t) != nread) break;
657       num_auxv++;
658       if (auxv.a_type == AT_SYSINFO_EHDR) {
659         vdso_ehdr = (size_t)auxv.a_un.a_val;
660       }
661     } while (auxv.a_type != AT_NULL);
662     NO_INTR(sys_close(fd));
663   }
664   *pnum_auxv = num_auxv;
665   *pvdso_ehdr = vdso_ehdr;
666   return;
667 }
668 
669 /* Verify that the alleged vdso and its internals are sane (properly
670  * aligned, within readable memory, etc.).  Returns NULL if not.
671  */
672 static Ehdr *SanitizeVDSO(Ehdr *ehdr, size_t start, size_t end) {
673   const size_t ehdr_address = (size_t)ehdr; /* ehdr alias to avoid casts     */
674   int i;
675   Phdr *phdr;
676   if (!ehdr_address || (ehdr_address & (sizeof(size_t) - 1))) {
677     /* Not properly aligned. Something goofy is going on.                    */
678     return NULL;
679   }
680   if (end <= ehdr_address + sizeof(Ehdr)) {
681     /* Entire Ehdr is not "covered" by expected region.                      */
682     return NULL;
683   }
684   if (ehdr->e_phoff & (sizeof(size_t) - 1)) {
685     /* Phdr not properly aligned                                             */
686     return NULL;
687   }
688   phdr = (Phdr *)(ehdr_address + ehdr->e_phoff);
689   if ((size_t)phdr <= start || end <= (size_t)(phdr + ehdr->e_phnum)) {
690     /* Phdr[] is not "covered" by expected region.                           */
691     return NULL;
692   }
693   if (phdr[0].p_type != PT_LOAD || phdr[0].p_vaddr != start || phdr[0].p_vaddr + phdr[0].p_memsz >= end) {
694     /* Something goofy.                                                      */
695     return NULL;
696   }
697   for (i = 1; i < ehdr->e_phnum; i++) {
698     if (phdr[i].p_type == PT_LOAD) {
699       /* Only a single PT_LOAD at index 0 is expected                        */
700       return NULL;
701     }
702     if (phdr[i].p_vaddr & (sizeof(size_t) - 1)) {
703       /* Phdr data not properly aligned                                      */
704       return NULL;
705     }
706     if (phdr[i].p_vaddr <= start || end <= phdr[i].p_vaddr + phdr[i].p_filesz) {
707       /* The data isn't in the expected range                                */
708       return NULL;
709     }
710   }
711   return ehdr;
712 }
713 
714 /* This function is invoked from a separate process. It has access to a
715  * copy-on-write copy of the parent's address space, and all crucial
716  * information about the parent has been computed by the caller.
717  */
718 static int CreateElfCore(void *handle, ssize_t (*writer)(void *, const void *, size_t), int (*is_done)(void *),
719                          prpsinfo *prpsinfo, core_user *user, prstatus *prstatus, int num_threads, pid_t *pids,
720                          regs *regs, fpregs *fpregs, fpxregs *fpxregs, size_t pagesize, size_t prioritize_max_length,
721                          pid_t main_pid, const struct CoredumperNote *extra_notes, int extra_notes_count) {
722   /* Count the number of mappings in "/proc/self/maps". We are guaranteed
723    * that this number is not going to change while this function executes.
724    */
725   int rc = -1, num_mappings = 0;
726   struct io io;
727   int loopback[2] = {-1, -1};
728   size_t num_auxv;
729   union {
730     Ehdr *ehdr;
731     size_t address;
732   } vdso;
733 
734   if (sys_pipe(loopback) < 0) goto done;
735 
736   io.data = io.end = 0;
737   NO_INTR(io.fd = sys_open("/proc/self/maps", O_RDONLY, 0));
738   if (io.fd >= 0) {
739     int i, ch;
740     while ((ch = GetChar(&io)) >= 0) {
741       num_mappings += (ch == '\n');
742     }
743     if (errno != 0) {
744     read_error:
745       NO_INTR(sys_close(io.fd));
746       goto done;
747     }
748     NO_INTR(sys_close(io.fd));
749 
750     CountAUXV(&num_auxv, &vdso.address);
751     /* Read all mappings. This requires re-opening "/proc/self/maps"         */
752     /* scope */ {
753       static const int PF_MASK = 0x00000007;
754       struct {
755         size_t start_address, end_address, offset, write_size;
756         int flags;
757       } mappings[num_mappings];
758       io.data = io.end = 0;
759       NO_INTR(io.fd = sys_open("/proc/self/smaps", O_RDONLY, 0));
760       if (io.fd >= 0) {
761         size_t note_align;
762         size_t num_extra_phdrs = 0;
763 
764         if ((ch = GetChar(&io)) < 0) {
765           goto read_error;
766         }
767 
768         /* Parse entries of the form:
769          * "^[0-9A-F]*-[0-9A-F]* [r-][w-][x-][p-] [0-9A-F]*.*$"
770          * At the start of each iteration, ch contains the first character.
771          */
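        /* A typical line, for reference, looks something like:
         *
         *   08048000-08056000 r-xp 00000000 03:0c 64593   /usr/sbin/gpm
         *
         * i.e. start/end address, permissions, file offset, device, inode,
         * and an optional path (or a "[stack]"-style pseudo-name).
         */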
772         for (i = 0; i < num_mappings;) {
773           static const char *const dev_zero = "/dev/zero";
774           const char *dev = dev_zero;
775           int j, is_device, is_anonymous;
776           int dontdump = 0;
777           int has_anonymous_pages = 0;
778           size_t zeros;
779 
780           memset(&mappings[i], 0, sizeof(mappings[i]));
781 
782           /* Read start and end addresses                                    */
783           if (GetHexWithInitChar(&io, &mappings[i].start_address, ch) != '-' ||
784               GetHex(&io, &mappings[i].end_address) != ' ')
785             goto read_error;
786 
787           /* Read flags                                                      */
788           while ((ch = GetChar(&io)) != ' ') {
789             if (ch < 0) goto read_error;
790             mappings[i].flags = (mappings[i].flags << 1) | (ch != '-');
791           }
792 
793           /* Read offset                                                     */
794           if ((ch = GetHex(&io, &mappings[i].offset)) != ' ') goto read_error;
795 
796           /* Skip over device numbers, and inode number                      */
797           for (j = 0; j < 2; j++) {
798             while (ch == ' ') {
799               ch = GetChar(&io);
800             }
801             while (ch != ' ' && ch != '\n') {
802               if (ch < 0) goto read_error;
803               ch = GetChar(&io);
804             }
805             while (ch == ' ') {
806               ch = GetChar(&io);
807             }
808             if (ch < 0) goto read_error;
809           }
810 
811           /* Check whether this is a mapping for a device                    */
812           is_anonymous = (ch == '\n' || ch == '[');
813           while (*dev && ch == *dev) {
814             ch = GetChar(&io);
815             dev++;
816           }
817           is_device = dev >= dev_zero + 5 && ((ch != '\n' && ch != ' ') || *dev != '\000');
818 
819           /* Skip until end of line                                          */
820           while (ch != '\n') {
821             if (ch < 0) goto read_error;
822             ch = GetChar(&io);
823           }
824 
825           /*
826            * Parse extra information from smaps.
827            * Each time through this loop we read one full line.
828            * Stop when we've parsed one memory segment's complete description.
829            * Afterwards ch will contain the first character of the next
830            * description, or EOF.
831            */
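          /* The smaps detail lines interpreted below look roughly like:
           *
           *   Anonymous:             4 kB
           *   VmFlags: rd ex mr mw me dd
           *
           * Only the "Anonymous:" page count and a "dd" (don't dump) entry
           * in "VmFlags:" are examined; all other detail lines are skipped.
           */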
832           while (1) {
833             ch = GetChar(&io);
834             if (ch < 1 || (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f'))
835               /* EOF, or new memory segment description start */
836               break;
837 
838             switch (ch) {
839               /* Anonymous: */
840               case 'A': {
841                 const char *str = "Anonymous:";
842                 while (*str && ch == *str) {
843                   ch = GetChar(&io);
844                   ++str;
845                 }
846 
847                 if (*str == '\0') {
848                   /* Check if there is at least one anonymous page */
849 
850                   /* Skip spaces until we reach the page number */
851                   while (ch == ' ') ch = GetChar(&io);
852 
853                   /* Make sure we reached a digit */
854                   if (ch < '0' || ch > '9') goto read_error;
855 
856                   has_anonymous_pages = ch != '0';
857                 }
858                 break;
859               }
860 
861               /* VmFlags: */
862               case 'V': {
863                 const char *str = "VmFlags:";
864                 while (*str && ch == *str) {
865                   ch = GetChar(&io);
866                   ++str;
867                 }
868 
869                 if (*str == '\0') {
870                   /* Check the flags for "don't dump" (dd) */
871                   while (ch == ' ') {
872                     /* skip space before the flag */
873                     while (ch == ' ') ch = GetChar(&io);
874 
875                     /* check if the flag is "dd" */
876                     if (ch == 'd') {
877                       ch = GetChar(&io);
878                       if (ch == 'd') {
879                         dontdump = true;
880                         break;
881                       }
882                     }
883 
884                     /* skip any remaining flag characters */
885                     while (ch >= 'a' && ch <= 'z') ch = GetChar(&io);
886                   }
887                 }
888                 break;
889               }
890 
891               default:
892                 break;
893             }
894 
895             /* Skip until end of line                                        */
896             while (ch != '\n') {
897               if (ch < 0) goto read_error;
898 
899               ch = GetChar(&io);
900             }
901           }
902 
903           /* Drop the private/shared bit. This makes the flags compatible with
904            * the ELF access bits
905            */
906           mappings[i].flags = (mappings[i].flags >> 1) & PF_MASK;
907 
908           /* Skip leading zeroed pages (as found in the stack segment)       */
909           if ((mappings[i].flags & PF_R) && !is_device) {
910             zeros = LeadingZeros(loopback, (void *)mappings[i].start_address,
911                                  mappings[i].end_address - mappings[i].start_address, pagesize);
912             mappings[i].start_address += zeros;
913           }
914 
915           /* Write segment content if the don't dump flag is not set, and one
916            * or more of the following is true:
917            *  - the segment is anonymous
918            *  - the segment is writable
919            *  - the segment has anonymous pages
920            */
921           if (!dontdump && (is_anonymous || has_anonymous_pages || (mappings[i].flags & PF_W) != 0)) {
922             mappings[i].write_size = mappings[i].end_address - mappings[i].start_address;
923           }
924 
925           /* Remove mapping, if it was not readable, or completely zero
926            * anyway. The former is usually the case of stack guard pages, and
927            * the latter occasionally happens for unused memory.
928            * Also, be careful not to touch mapped devices.
929            */
930           if ((mappings[i].flags & PF_R) == 0 || mappings[i].start_address == mappings[i].end_address || is_device) {
931             num_mappings--;
932           } else {
933             i++;
934           }
935         }
936         NO_INTR(sys_close(io.fd));
937 
938         if (vdso.address) {
939           /* Sanity checks.                                                  */
940           for (i = 0; i < num_mappings; i++) {
941             size_t start = mappings[i].start_address;
942             size_t end = mappings[i].end_address;
943             if ((mappings[i].flags & PF_R) && start <= vdso.address && vdso.address < end) {
944               vdso.ehdr = SanitizeVDSO(vdso.ehdr, start, end);
945               break;
946             }
947           }
948           if (i == num_mappings) {
949             /* Did not find a mapping "covering" vdso.
950              * Something goofy is going on; will not dump it.
951              */
952             vdso.address = 0;
953           }
954         }
955 
956         /* Write out the ELF header                                          */
957         /* scope */ {
958           Ehdr ehdr;
959           if (vdso.address) {
960             /* We are going to add Phdrs that "belong" to vdso.
961              * This isn't strictly necessary, but matches what kernel code
962              * in fs/binfmt_elf.c does on platforms that have vdso.
963              */
964             Phdr *vdso_phdr = (Phdr *)(vdso.address + vdso.ehdr->e_phoff);
965             for (i = 0; i < vdso.ehdr->e_phnum; i++) {
966               if (vdso_phdr[i].p_type == PT_LOAD) {
967                 /* This will be written as "normal" mapping                  */
968               } else {
969                 num_extra_phdrs++;
970               }
971             }
972           }
973           memset(&ehdr, 0, sizeof(Ehdr));
974           ehdr.e_ident[0] = ELFMAG0;
975           ehdr.e_ident[1] = ELFMAG1;
976           ehdr.e_ident[2] = ELFMAG2;
977           ehdr.e_ident[3] = ELFMAG3;
978           ehdr.e_ident[4] = ELF_CLASS;
979           ehdr.e_ident[5] = sex() ? ELFDATA2MSB : ELFDATA2LSB;
980           ehdr.e_ident[6] = EV_CURRENT;
981           ehdr.e_type = ET_CORE;
982           ehdr.e_machine = ELF_ARCH;
983           ehdr.e_version = EV_CURRENT;
984           ehdr.e_phoff = sizeof(Ehdr);
985           ehdr.e_ehsize = sizeof(Ehdr);
986           ehdr.e_phentsize = sizeof(Phdr);
987           ehdr.e_phnum = num_mappings + num_extra_phdrs + 1;
988           ehdr.e_shentsize = sizeof(Shdr);
989           if (writer(handle, &ehdr, sizeof(Ehdr)) != sizeof(Ehdr)) {
990             goto done;
991           }
992         }
993 
994         /* Write program headers, starting with the PT_NOTE entry            */
995         /* scope */ {
996           Phdr phdr;
997           size_t offset = sizeof(Ehdr) + (num_mappings + num_extra_phdrs + 1) * sizeof(Phdr);
998           size_t filesz =
999               sizeof(Nhdr) + 8 + sizeof(struct prpsinfo) + (user ? sizeof(Nhdr) + 8 + sizeof(struct core_user) : 0) +
1000               num_threads * (+sizeof(Nhdr) + 8 + sizeof(struct prstatus) + sizeof(Nhdr) + 8 + sizeof(struct fpregs));
1001 #if defined(__i386__) && !defined(__x86_64__)
1002           if (fpxregs) {
1003             filesz += num_threads * (sizeof(Nhdr) + 8 + sizeof(struct fpxregs));
1004           }
1005 #endif
1006           /* Calculate how much space the extra notes will take.             */
1007           for (i = 0; i < extra_notes_count; i++) {
1008             size_t name_size;
1009             name_size = strlen(extra_notes[i].name) + 1;
1010             filesz += sizeof(Nhdr) + name_size + extra_notes[i].description_size;
1011             /* Note names and descriptions are 4 byte aligned.               */
1012             if (name_size % 4 != 0) {
1013               filesz += 4 - name_size % 4;
1014             }
1015             if (extra_notes[i].description_size % 4 != 0) {
1016               filesz += 4 - extra_notes[i].description_size % 4;
1017             }
1018           }
1019           /* Space for auxv note                                             */
1020           if (num_auxv) {
1021             filesz += 8 + sizeof(Nhdr) + num_auxv * sizeof(auxv_t);
1022           }
1023 
1024           memset(&phdr, 0, sizeof(Phdr));
1025           phdr.p_type = PT_NOTE;
1026           phdr.p_offset = offset;
1027           phdr.p_filesz = filesz;
1028           if (writer(handle, &phdr, sizeof(Phdr)) != sizeof(Phdr)) {
1029             goto done;
1030           }
1031 
1032           /* Now follow with program headers for each of the memory segments */
1033           phdr.p_type = PT_LOAD;
1034           phdr.p_align = pagesize;
1035           phdr.p_paddr = 0;
1036           note_align = phdr.p_align - ((offset + filesz) % phdr.p_align);
1037           if (note_align == phdr.p_align) note_align = 0;
1038           offset += note_align;
1039 
1040           /* If the option is set, remove the largest memory sections first
1041            * when limiting the size of the core dump.
1042            * If prioritize_max_length is zero, the prioritization option wasn't
1043            * set. If max_length was set to zero, we wouldn't have gotten this
1044            * far.
1045            */
1046           if (prioritize_max_length > 0) {
1047             /* Calculates the size of the vdso sections which are added to the
1048              * end of the file. These need to be preserved in order for the
1049              * core file to be useful.
1050              */
1051             size_t vdso_size = 0;
1052             if (vdso.address) {
1053               Phdr *vdso_phdr = (Phdr *)(vdso.address + vdso.ehdr->e_phoff);
1054               for (i = 0; i < vdso.ehdr->e_phnum; i++) {
1055                 Phdr *p = vdso_phdr + i;
1056                 if (p->p_type != PT_LOAD) {
1057                   vdso_size += p->p_filesz;
1058                 }
1059               }
1060             }
1061 
1062             /* Loops while there isn't enough space for all the mappings. Each
1063              * iteration, the largest mapping will be reduced in size.
1064              */
1065             for (;;) {
1066               int largest = -1;
1067               size_t total_core_size = offset + filesz + vdso_size;
1068               /* Get the largest and total size of the core dump.            */
1069               for (i = 0; i < num_mappings; i++) {
1070                 total_core_size += mappings[i].write_size;
1071                 if (largest < 0 || mappings[largest].write_size < mappings[i].write_size) {
1072                   largest = i;
1073                 }
1074               }
1075               /* If the total size of all the maps is more than our file size,
1076                * we must reduce the size of the largest map.
1077                */
1078               if (largest >= 0 && total_core_size > prioritize_max_length) {
1079                 size_t space_needed = total_core_size - prioritize_max_length;
1080                 /* If there is no more space to free in the mappings, we must
1081                  * stop. The size limit will be preserved since if the
1082                  * prioritized limiting is enabled, the limited writer will be
1083                  * used.
1084                  */
1085                 if (mappings[largest].write_size > 0) {
1086                   if (space_needed > mappings[largest].write_size) {
1087                     mappings[largest].write_size = 0;
1088                     continue;
1089                   } else {
1090                     mappings[largest].write_size -= space_needed;
1091                   }
1092                 }
1093               }
1094               break;
1095             }
1096           }
1097 
1098           for (i = 0; i < num_mappings; i++) {
1099             offset += filesz;
1100             filesz = mappings[i].end_address - mappings[i].start_address;
1101             phdr.p_offset = offset;
1102             phdr.p_vaddr = mappings[i].start_address;
1103             phdr.p_memsz = filesz;
1104 
1105             filesz = mappings[i].write_size;
1106             phdr.p_filesz = filesz;
1107             phdr.p_flags = mappings[i].flags & PF_MASK;
1108             if (writer(handle, &phdr, sizeof(Phdr)) != sizeof(Phdr)) {
1109               goto done;
1110             }
1111           }
1112           if (vdso.ehdr) {
1113             Phdr *vdso_phdr = (Phdr *)(vdso.address + vdso.ehdr->e_phoff);
1114             for (i = 0; i < vdso.ehdr->e_phnum; i++) {
1115               if (vdso_phdr[i].p_type != PT_LOAD) {
1116                 memcpy(&phdr, vdso_phdr + i, sizeof(Phdr));
1117                 offset += filesz;
1118                 filesz = phdr.p_filesz;
1119                 phdr.p_offset = offset;
1120                 phdr.p_paddr = 0; /* match other core phdrs                 */
1121                 if (writer(handle, &phdr, sizeof(Phdr)) != sizeof(Phdr)) {
1122                   goto done;
1123                 }
1124               }
1125             }
1126           }
1127         }
1128         /* Write note section                                                */
1129         /* scope */ {
1130           Nhdr nhdr;
1131           memset(&nhdr, 0, sizeof(Nhdr));
1132           nhdr.n_namesz = 5;
1133           nhdr.n_descsz = sizeof(struct prpsinfo);
1134           nhdr.n_type = NT_PRPSINFO;
1135           if (writer(handle, &nhdr, sizeof(Nhdr)) != sizeof(Nhdr) || writer(handle, "CORE\0\0\0\0", 8) != 8 ||
1136               writer(handle, prpsinfo, sizeof(struct prpsinfo)) != sizeof(struct prpsinfo)) {
1137             goto done;
1138           }
1139           if (user) {
1140             nhdr.n_descsz = sizeof(struct core_user);
1141             nhdr.n_type = NT_PRXREG;
1142             if (writer(handle, &nhdr, sizeof(Nhdr)) != sizeof(Nhdr) || writer(handle, "CORE\0\0\0\0", 8) != 8 ||
1143                 writer(handle, user, sizeof(struct core_user)) != sizeof(struct core_user)) {
1144               goto done;
1145             }
1146           }
1147           if (num_auxv) {
1148             /* Dump entire auxv[] array as NT_AUXV note, to match what
1149              * kernel code in fs/binfmt_elf.c does.
1150              * Without this, gdb can't unwind through vdso on i686.
1151              */
1152             int fd, i;
1153             NO_INTR(fd = sys_open("/proc/self/auxv", O_RDONLY, 0));
1154             if (fd == -1) {
1155               goto done;
1156             }
1157             nhdr.n_descsz = num_auxv * sizeof(auxv_t);
1158             nhdr.n_type = NT_AUXV;
1159             if (writer(handle, &nhdr, sizeof(Nhdr)) != sizeof(Nhdr) || writer(handle, "CORE\0\0\0\0", 8) != 8) {
1160               NO_INTR(sys_close(fd));
1161               goto done;
1162             }
1163             for (i = 0; i < num_auxv; ++i) {
1164               ssize_t nread;
1165               auxv_t auxv;
1166               NO_INTR(nread = sys_read(fd, &auxv, sizeof(auxv_t)));
1167               if (nread != sizeof(auxv_t)) {
1168                 NO_INTR(sys_close(fd));
1169                 goto done;
1170               }
1171               if (writer(handle, &auxv, sizeof(auxv_t)) != sizeof(auxv_t)) {
1172                 NO_INTR(sys_close(fd));
1173                 goto done;
1174               }
1175             }
1176           }
1177           /* The order of threads in the output matters to gdb:
1178            * it assumes that the first one is the one that crashed.
1179            * Make it easier for the end-user to find the crashing thread
1180            * by dumping it first.
1181            */
1182           for (i = num_threads; i-- > 0;) {
1183             if (pids[i] == main_pid) {
1184               if (WriteThreadRegs(handle, writer, prstatus, pids[i], regs + i, fpregs + i, fpxregs + i)) {
1185                 goto done;
1186               }
1187               break;
1188             }
1189           }
1190           for (i = num_threads; i-- > 0;) {
1191             if (pids[i] != main_pid) {
1192               if (WriteThreadRegs(handle, writer, prstatus, pids[i], regs + i, fpregs + i, fpxregs + i)) {
1193                 goto done;
1194               }
1195             }
1196           }
1197 
1198           /* Write user provided notes                                       */
1199           for (i = 0; i < extra_notes_count; i++) {
1200             size_t name_align = 0, description_align = 0;
1201             const char scratch[3] = {0, 0, 0};
1202             nhdr.n_namesz = strlen(extra_notes[i].name) + 1;
1203             nhdr.n_descsz = extra_notes[i].description_size;
1204             nhdr.n_type = extra_notes[i].type;
1205             /* Get the alignment for the data                                */
1206             if (nhdr.n_namesz % 4 != 0) {
1207               name_align = 4 - nhdr.n_namesz % 4;
1208             }
1209             if (nhdr.n_descsz % 4 != 0) {
1210               description_align = 4 - nhdr.n_descsz % 4;
1211             }
1212             /* Write the note header                                         */
1213             if (writer(handle, &nhdr, sizeof(Nhdr)) != sizeof(Nhdr)) {
1214               goto done;
1215             }
1216             /* Write the note name and padding                               */
1217             if (writer(handle, extra_notes[i].name, nhdr.n_namesz) != nhdr.n_namesz) {
1218               goto done;
1219             }
1220             if (writer(handle, scratch, name_align) != name_align) {
1221               goto done;
1222             }
1223             /* Write the note description and padding                        */
1224             if (writer(handle, extra_notes[i].description, nhdr.n_descsz) != nhdr.n_descsz) {
1225               goto done;
1226             }
1227             if (writer(handle, scratch, description_align) != description_align) {
1228               goto done;
1229             }
1230           }
1231         }
1232 
1233         /* Align all following segments to multiples of page size            */
1234         if (note_align) {
1235           char scratch[note_align];
1236           memset(scratch, 0, note_align * sizeof(char));
1237           if (writer(handle, scratch, note_align * sizeof(char)) != note_align * sizeof(char)) {
1238             goto done;
1239           }
1240         }
1241 
1242         /* Write all memory segments                                         */
1243         for (i = 0; i < num_mappings; i++) {
1244           if (mappings[i].write_size > 0 &&
1245               writer(handle, (void *)mappings[i].start_address, mappings[i].write_size) != mappings[i].write_size) {
1246             goto done;
1247           }
1248         }
1249         if (vdso.address) {
1250           /* Finally write the contents of Phdrs that "belong" to vdso.      */
1251           Phdr *vdso_phdr = (Phdr *)(vdso.address + vdso.ehdr->e_phoff);
1252           for (i = 0; i < vdso.ehdr->e_phnum; i++) {
1253             Phdr *p = vdso_phdr + i;
1254             if (p->p_type == PT_LOAD) {
1255               /* This segment has already been dumped, because it is one of
1256                * the mappings[].
1257                */
1258             } else if (writer(handle, (void *)p->p_vaddr, p->p_filesz) != p->p_filesz) {
1259               goto done;
1260             }
1261           }
1262         }
1263         rc = 0;
1264       }
1265     }
1266   }
1267 
1268 done:
1269   if (is_done(handle)) {
1270     rc = 0;
1271   }
1272 
1273   if (loopback[0] >= 0) NO_INTR(sys_close(loopback[0]));
1274   if (loopback[1] >= 0) NO_INTR(sys_close(loopback[1]));
1275   return rc;
1276 }
1277 
1278 struct CreateArgs {
1279   int *fds;
1280   int openmax;
1281   const char *PATH;
1282   const struct CoredumperCompressor *compressors;
1283   int zip_in[2];
1284   int zip_out[2];
1285 };
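/* How CreatePipelineChild() below uses these descriptors: "fds" is a status
 * pipe; before each execve() attempt the child writes the address of the
 * compressor entry into fds[1], which carries FD_CLOEXEC so that a
 * successful exec() closes it and the parent sees EOF.  "zip_in" becomes
 * the compressor's stdin and "zip_out" its stdout, letting the parent feed
 * uncompressed data in and read compressed data back out.
 */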
1286 
1287 static int CreatePipelineChild(void *void_arg) {
1288   /* scope */ {
1289 /* Define a private copy of syscall macros, which does not modify the
1290  * global copy of errno.
1291  */
1292 #ifdef __cplusplus
1293 #define sys0_close sys.close
1294 #define sys0_dup sys.dup
1295 #define sys0_dup2 sys.dup2
1296 #define sys0_execve sys.execve
1297 #define sys0_open sys.open
1298 #define sys0_fcntl sys.fcntl
1299     SysCalls sys;
1300 #else
1301     int my_errno;
1302 #define SYS_ERRNO my_errno
1303 #define SYS_INLINE inline
1304 #define SYS_PREFIX 0
1305 #undef SYS_LINUX_SYSCALL_SUPPORT_H
1306 #include "linux_syscall_support.h"
1307 #endif
1308 
1309     struct CreateArgs *args = (struct CreateArgs *)void_arg;
1310     int i;
1311 
1312     /* Use pipe to tell parent about the compressor that we chose.
1313      * Make sure the file handle for the write-end of the pipe is
1314      * bigger than 2, so that it does not interfere with the
1315      * stdin/stdout/stderr file handles which must be 0-2.
1316      */
1317     MY_NO_INTR(sys0_close(args->fds[0]));
1318     while (args->fds[1] <= 2) {
1319       MY_NO_INTR(args->fds[1] = sys0_dup(args->fds[1]));
1320     }
1321     sys0_fcntl(args->fds[1], F_SETFD, FD_CLOEXEC);
1322 
1323     /* Move the filehandles for stdin/stdout/stderr, so that they
1324      * map to handles 0-2. stdin/stdout are connected to pipes, and
1325      * stderr points to "/dev/null".
1326      */
1327     while (args->zip_in[0] <= 2) {
1328       MY_NO_INTR(args->zip_in[0] = sys0_dup(args->zip_in[0]));
1329     }
1330     while (args->zip_out[1] <= 2) {
1331       MY_NO_INTR(args->zip_out[1] = sys0_dup(args->zip_out[1]));
1332     }
1333     MY_NO_INTR(sys0_dup2(args->zip_in[0], 0));
1334     MY_NO_INTR(sys0_dup2(args->zip_out[1], 1));
1335     MY_NO_INTR(sys0_close(2));
1336     MY_NO_INTR(sys0_dup2(sys0_open("/dev/null", O_WRONLY, 0), 2));
1337 
1338     /* Close all handles other than stdin/stdout/stderr and the
1339      * pipe to the parent. This also takes care of all the filehandles
1340      * that we temporarily created by calling sys_dup().
1341      */
1342     for (i = 3; i < args->openmax; i++)
1343       if (i != args->fds[1]) MY_NO_INTR(sys0_close(i));
1344 
1345     while (args->compressors->compressor != NULL && *args->compressors->compressor) {
1346       extern char **environ;
1347 
1348       const char *compressor = args->compressors->compressor;
1349       const char *const *cmd_args = args->compressors->args;
1350 
1351       /* Try next compressor description. If the compressor exists,
1352        * the fds[1] file handle will get closed on exec(). The
1353        * parent detects this, and eventually updates
1354        * selected_compressor with the compressor that is now running.
1355        *
1356        * Please note, the caller does not need to call wait() for any
1357        * compressor that gets launched, because our parent process is
1358        * going to die soon; thus, the compressor will be reaped by "init".
1359        */
1360       c_write(args->fds[1], &args->compressors, sizeof(&args->compressors), &ERRNO);
1361       if (strchr(compressor, '/')) {
1362         /* Absolute or relative path precedes name of executable             */
1363         sys0_execve(compressor, cmd_args, (const char *const *)environ);
1364       } else {
1365         /* Search for executable along PATH variable                         */
1366         const char *ptr = args->PATH;
1367         if (ptr != NULL) {
1368           for (;;) {
1369             const char *end = ptr;
1370             while (*end && *end != ':') end++;
1371             if (ptr == end) {
1372               /* Found current directory in PATH                             */
1373               sys0_execve(compressor, cmd_args, (const char *const *)environ);
1374             } else {
1375               /* Compute new file name                                       */
1376               char executable[strlen(compressor) + (end - ptr) + 2];
1377               memcpy(executable, ptr, end - ptr);
1378               executable[end - ptr] = '/';
1379               strcpy(executable + (end - ptr + 1), compressor);
1380               sys0_execve(executable, cmd_args, (const char *const *)environ);
1381             }
1382             if (!*end) break;
1383             ptr = end + 1;
1384           }
1385         }
1386       }
1387       ++args->compressors;
1388     }
1389 
1390     /* No suitable compressor found. Tell parent about it.                   */
1391     c_write(args->fds[1], &args->compressors, sizeof(&args->compressors), &ERRNO);
1392     MY_NO_INTR(sys0_close(args->fds[1]));
1393     sys__exit(0);
1394     return 0;
1395   }
1396 }
1397 
1398 /* Create a pipeline for sending the core file from the child process back to
1399  * the caller. Optionally include a compressor program in the pipeline. The
1400  * "compressors" variable will be updated to point to the compressor that was
1401  * actually used.
1402  */
1403 static int CreatePipeline(int *fds, int openmax, const char *PATH, const struct CoredumperCompressor **compressors) {
1404   int saved_errno1 = 0;
1405 
1406   /* Create a pipe for communicating between processes                       */
1407   if (sys_pipe(fds) < 0) return -1;
1408 
1409   /* Find a suitable compressor program, if necessary                        */
1410   if (*compressors != NULL && (*compressors)->compressor != NULL) {
1411     char stack[4096];
1412     struct CreateArgs args;
1413     pid_t comp_pid;
1414 
1415     args.fds = fds;
1416     args.openmax = openmax;
1417     args.PATH = PATH;
1418     args.compressors = *compressors;
1419 
1420     if (sys_pipe(args.zip_in) < 0) {
1421     fail0 : {
1422       int saved_errno = errno;
1423       NO_INTR(sys_close(fds[0]));
1424       NO_INTR(sys_close(fds[1]));
1425       errno = saved_errno;
1426       return -1;
1427     }
1428     } else if (sys_pipe(args.zip_out) < 0) {
1429     fail1 : {
1430       int saved_errno = errno;
1431       NO_INTR(sys_close(args.zip_in[0]));
1432       NO_INTR(sys_close(args.zip_in[1]));
1433       errno = saved_errno;
1434       goto fail0;
1435     }
1436     }
1437 
1438     /* We use clone() here, instead of the more common fork(). This ensures
1439      * that the WriteCoreDump() code path never results in making a COW
1440      * instance of the process's address space. This increases the likelihood
1441      * that we can dump core files even if we are using a lot of memory and
1442      * the kernel disallows overcommitting of memory.
1443      * After cloning, both the parent and the child share the same instance
1444      * of errno. We must make sure that at least one of these processes
1445      * (in our case, the child) uses modified syscall macros that update
1446      * a local copy of errno, instead.
1447      */
1448     comp_pid =
1449         sys_clone(CreatePipelineChild, stack + sizeof(stack) - 16, CLONE_VM | CLONE_UNTRACED | SIGCHLD, &args, 0, 0, 0);
1450     if (comp_pid < 0) {
1451       int clone_errno = errno;
1452       NO_INTR(sys_close(args.zip_out[0]));
1453       NO_INTR(sys_close(args.zip_out[1]));
1454       errno = clone_errno;
1455       goto fail1;
1456     }
1457 
1458     /* Close write-end of pipe, and read from read-end until child closes
1459      * its reference to the pipe.
1460      */
1461     NO_INTR(sys_close(fds[1]));
1462     *compressors = NULL;
1463     while (c_read(fds[0], compressors, sizeof(*compressors), &errno)) {
1464     }
1465     NO_INTR(sys_close(fds[0]));
1466 
1467     /* Fail if either the child never even executed (unlikely), or
1468      * did not find any compressor that could be executed.
1469      */
1470     if (*compressors == NULL || (*compressors)->compressor == NULL) {
1471       saved_errno1 = errno;
1472       NO_INTR(sys_close(args.zip_out[0]));
1473       NO_INTR(sys_close(args.zip_out[1]));
1474       errno = saved_errno1;
1475     fail2 : {
1476       int saved_errno2 = errno;
1477       NO_INTR(sys_close(args.zip_in[0]));
1478       NO_INTR(sys_close(args.zip_in[1]));
1479       errno = saved_errno2;
1480       return -1;
1481     }
1482     }
1483 
1484     if (*(*compressors)->compressor) {
1485       /* Found a good compressor program, which is now connected to
1486        * zip_in/zip_out.
1487        */
1488       fds[0] = args.zip_out[0];
1489       fds[1] = args.zip_in[1];
1490       NO_INTR(sys_close(args.zip_in[0]));
1491       NO_INTR(sys_close(args.zip_out[1]));
1492     } else {
1493       /* No suitable compressor found, but the caller allowed
1494        * uncompressed core files. So, just close unneeded file handles,
1495        * and reap the child's exit code.
1496        */
1497       int status;
1498       fds[0] = -1;
1499       fds[1] = -1;
1500       NO_INTR(sys_close(args.zip_in[0]));
1501       NO_INTR(sys_close(args.zip_out[0]));
1502       NO_INTR(sys_close(args.zip_in[1]));
1503       NO_INTR(sys_close(args.zip_out[1]));
1504       while (sys_waitpid(comp_pid, &status, 0) < 0) {
1505         if (errno != EINTR) {
1506           goto fail2;
1507         }
1508       }
1509     }
1510   }
1511   return 0;
1512 }
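
/* For reference: the comment inside CreatePipeline() explains why it uses
 * clone(CLONE_VM) rather than fork(). A minimal glibc-level version of the
 * same pattern is sketched below (not compiled); child_fn(), child_stack and
 * start_child() are made-up names for the illustration.
 */
#if 0 /* illustrative sketch only */
#define _GNU_SOURCE /* must precede the first system header in a real program */
#include <sched.h>  /* clone(), CLONE_VM                                      */
#include <signal.h> /* SIGCHLD                                                */

static char child_stack[4096];

static int child_fn(void *arg) {
  /* Runs in the caller's address space: globals, the heap and errno are
   * shared with the parent, so only carefully chosen work (ideally raw
   * system calls keeping a private errno) should happen here.
   */
  return 0;
}

static int start_child(void) {
  /* Stacks grow downwards on the architectures this file targets, so pass
   * the top of the buffer, leaving a little headroom.
   */
  return clone(child_fn, child_stack + sizeof(child_stack) - 16,
               CLONE_VM | SIGCHLD, /*arg=*/0);
}
#endif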
1513 
1514 /* If this code is being built without support for multi-threaded core files,
1515  * some of our basic assumptions are not quite right. Most noticeably, the
1516  * fake thread lister ends up calling InternalGetCoreDump() from the main
1517  * (i.e. only) thread in the application, which cannot be ptrace()'d at this
1518  * time. This prevents us from retrieving CPU registers.
1519  *
1520  * We work around this problem by delaying the call to ptrace() until we
1521  * have forked. We also need to double-fork here, in order to make sure that
1522  * the core writer process can get reaped by "init" after it reaches EOF.
1523  */
1524 static inline int GetParentRegs(void *frame, regs *cpu, fpregs *fp, fpxregs *fpx, int *hasSSE) {
1525 #ifdef THREADS
1526   return 1;
1527 #else
1528   int rc = 0;
1529   char scratch[4096];
1530   pid_t pid = getppid();
1531   if (sys_ptrace(PTRACE_ATTACH, pid, (void *)0, (void *)0) == 0 && waitpid(pid, (void *)0, __WALL) >= 0) {
1532     memset(scratch, 0xFF, sizeof(scratch));
1533     if (sys_ptrace(PTRACE_GETREGS, pid, scratch, scratch) == 0) {
1534       memcpy(cpu, scratch, sizeof(struct regs));
1535       SET_FRAME(*(Frame *)frame, *cpu);
1536       memset(scratch, 0xFF, sizeof(scratch));
1537       if (sys_ptrace(PTRACE_GETFPREGS, pid, scratch, scratch) == 0) {
1538         memcpy(fp, scratch, sizeof(struct fpregs));
1539         memset(scratch, 0xFF, sizeof(scratch));
1540 #if defined(__i386__) && !defined(__x86_64__)
1541         /* Linux on x86-64 stores all FPU registers in the SSE structure     */
1542         if (sys_ptrace(PTRACE_GETFPXREGS, pid, scratch, scratch) == 0) {
1543           memcpy(fpx, scratch, sizeof(struct fpxregs));
1544         } else {
1545           *hasSSE = 0;
1546         }
1547 #else
1548         *hasSSE = 0;
1549 #endif
1550         rc = 1;
1551       }
1552     }
1553   }
1554   sys_ptrace_detach(pid);
1555 
1556   /* Need to double-fork, so that "init" can reap the core writer upon EOF.  */
1557   switch (sys_fork()) {
1558     case -1:
1559       return 0;
1560     case 0:
1561       return rc;
1562     default:
1563       sys__exit(0);
1564   }
1565 #endif
1566 }
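
/* For reference: as the comments above describe, GetParentRegs() already runs
 * in a process that was forked off the application, so the fork() above is
 * the second half of a double fork: the intermediate process exits so that
 * the core writer is adopted, and eventually reaped, by "init". The classic
 * pattern, in isolation, looks roughly like the sketch below (not compiled);
 * double_fork() and do_work() are invented for the illustration.
 */
#if 0 /* illustrative sketch only */
#include <sys/wait.h>
#include <unistd.h>

static int double_fork(void (*do_work)(void)) {
  pid_t pid = fork();
  if (pid < 0) return -1;
  if (pid == 0) {        /* intermediate child                               */
    if (fork() == 0) {   /* grandchild: its parent exits immediately, so it  */
      do_work();         /* is re-parented to "init", which will reap it     */
      _exit(0);
    }
    _exit(0);
  }
  waitpid(pid, NULL, 0); /* caller only ever reaps the intermediate child    */
  return 0;
}
#endif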
1567 
1568 /* Internal function for generating a core file. This function works for
1569  * both single- and multi-threaded core files. It assumes that all threads
1570  * are already suspended, and will resume them before returning.
1571  *
1572  * The caller must make sure that prctl(PR_SET_DUMPABLE, 1) has been called,
1573  * or this function might fail.
1574  */
1575 int InternalGetCoreDump(void *frame, int num_threads, pid_t *pids,
1576                         va_list ap
1577                      /* const struct CoreDumpParameters *params,
1578                         const char *file_name,
1579                         const char *PATH
1580                       */) {
1581   long i;
1582   int rc = -1, fd = -1, threads = num_threads, hasSSE = 1;
1583   struct core_user user, *puser = &user;
1584   prpsinfo prpsinfo;
1585   prstatus prstatus;
1586   regs thread_regs[threads];
1587   fpregs thread_fpregs[threads];
1588   fpxregs thread_fpxregs[threads];
1589   int pair[2];
1590   int main_pid = ((Frame *)frame)->tid;
1591 
1592   const struct CoreDumpParameters *params = va_arg(ap, const struct CoreDumpParameters *);
1593 
1594   int (*callback_fn)(void *) = GetCoreDumpParameter(params, callback_fn);
1595   if (callback_fn) {
1596     void *arg = GetCoreDumpParameter(params, callback_arg);
1597     if (callback_fn(arg) != 0) {
1598       goto error;
1599     }
1600   }
1601 
1602   /* Get thread status                                                       */
1603   memset(puser, 0, sizeof(struct core_user));
1604   memset(thread_regs, 0, threads * sizeof(struct regs));
1605   memset(thread_fpregs, 0, threads * sizeof(struct fpregs));
1606   memset(thread_fpxregs, 0, threads * sizeof(struct fpxregs));
1607 
1608   /* Threads are already attached, read their registers now                  */
1609 #ifdef THREADS
1610   for (i = 0; i < threads; i++) {
1611     char scratch[4096];
1612 #ifdef __mips__
1613     /* MIPS kernels do not support PTRACE_GETREGS; instead, we have to call
1614      * PTRACE_PEEKUSER to retrieve individual CPU registers. The indices
1615      * for these registers do not exactly match the order in the
1616      * structures that get written to the core file, either. We use a lookup
1617      * table to do the mapping.
1618      * Incidentally, this also means that on MIPS we cannot use
1619      * PTRACE_PEEKUSER to fill "struct core_user". There simply is no such
1620      * thing as an NT_PRXREG note in our MIPS core files.
1621      */
1622     static const int map[sizeof(struct regs) / sizeof(long)] = {
1623         -1, -1, -1, -1, -1, -1, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
1624         17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 67, 68, 64, 66, 69, 65, -1};
1625     int j;
1626     for (j = 0; j < sizeof(struct regs) / sizeof(long); j++) {
1627       if (map[j] >= 0 && sys_ptrace(PTRACE_PEEKUSER, pids[i], (void *)map[j], (unsigned long *)(thread_regs + i) + j)) {
1628         ResumeAllProcessThreads(threads, pids);
1629         goto error;
1630       }
1631     }
1632 
1633     /* Older kernels do not support PTRACE_GETFPREGS, and require calling
1634      * PTRACE_PEEKUSER. This is a little awkward because of the layout of
1635      * "struct fpregs", which expands all 32-bit variables to 64 bits.
1636      */
1637     memset(thread_fpregs + i, 0xFF, sizeof(struct fpregs));
1638     for (j = 0; j < 32; j++) {
1639       if (sys_ptrace(PTRACE_PEEKUSER, pids[i], (void *)(32 + j), (uint64_t *)(thread_fpregs + i) + j)) {
1640         ResumeAllProcessThreads(threads, pids);
1641         goto error;
1642       }
1643     }
1644     if (sys_ptrace(PTRACE_PEEKUSER, pids[i], (void *)69, scratch) == 0) {
1645       memcpy(&thread_fpregs[i].fcr31, scratch, sizeof(thread_fpregs[i].fcr31));
1646     }
1647     if (sys_ptrace(PTRACE_PEEKUSER, pids[i], (void *)70, scratch) == 0) {
1648       memcpy(&thread_fpregs[i].fir, scratch, sizeof(thread_fpregs[i].fir));
1649     }
1650 
1651     /* If the kernel supports it, PTRACE_GETFPREGS is a better way to
1652      * retrieve the FP registers.
1653      */
1654     if (sys_ptrace(PTRACE_GETFPREGS, pids[i], scratch, scratch) == 0) {
1655       memcpy(thread_fpregs + i, scratch, sizeof(struct fpregs));
1656     }
1657 
1658     /* Set the saved integer registers, if we are looking at the thread that
1659      * called us.
1660      */
1661     if (main_pid == pids[i]) {
1662       SET_FRAME(*(Frame *)frame, thread_regs[i]);
1663     }
1664     hasSSE = 0;
1665 #else
1666     memset(scratch, 0xFF, sizeof(scratch));
1667     if (sys_ptrace(PTRACE_GETREGS, pids[i], scratch, scratch) == 0) {
1668       memcpy(thread_regs + i, scratch, sizeof(struct regs));
1669       if (main_pid == pids[i]) {
1670         SET_FRAME(*(Frame *)frame, thread_regs[i]);
1671       }
1672       memset(scratch, 0xFF, sizeof(scratch));
1673       if (sys_ptrace(PTRACE_GETFPREGS, pids[i], scratch, scratch) == 0) {
1674         memcpy(thread_fpregs + i, scratch, sizeof(struct fpregs));
1675         memset(scratch, 0xFF, sizeof(scratch));
1676 #if defined(__i386__) && !defined(__x86_64__)
1677         /* Linux on x86-64 stores all FPU registers in the SSE structure     */
1678         if (sys_ptrace(PTRACE_GETFPXREGS, pids[i], scratch, scratch) == 0) {
1679           memcpy(thread_fpxregs + i, scratch, sizeof(struct fpxregs));
1680         } else {
1681           hasSSE = 0;
1682         }
1683 #else
1684         hasSSE = 0;
1685 #endif
1686       } else {
1687         goto ptrace;
1688       }
1689     } else {
1690     ptrace: /* Oh, well, undo everything and get out of here                  */
1691       ResumeAllProcessThreads(threads, pids);
1692       goto error;
1693     }
1694 #endif
1695   }
1696 
1697   /* Get parent's CPU registers, and user data structure                     */
1698   {
1699 #ifndef __mips__
1700     for (i = 0; i < sizeof(struct core_user); i += sizeof(int)) {
1701       sys_ptrace(PTRACE_PEEKUSER, pids[0], (void *)i, ((char *)&user) + i);
1702     }
1703     /* Avoid GCC's builtin memcpy: it has been observed to cause crashes when
1704      * compiled with GCC 8.x at -O1, for reasons we were unable to determine.
1705      * We are only copying between two stack buffers, so it is hard to see
1706      * what could go wrong; using our own my_memcpy() sidesteps the issue.  */
1707 
1708     /* Overwrite the regs from ptrace with the ones previously computed.  */
1709     my_memcpy(&user.regs, thread_regs, sizeof(struct regs));
1710 #else
1711     puser = NULL;
1712 #endif
1713   }
1714 #endif
1715 
1716   /* Build the PRPSINFO data structure                                       */
1717   memset(&prpsinfo, 0, sizeof(struct prpsinfo));
1718   prpsinfo.pr_sname = 'R';
1719   prpsinfo.pr_nice = sys_getpriority(PRIO_PROCESS, 0);
1720   prpsinfo.pr_uid = sys_geteuid();
1721   prpsinfo.pr_gid = sys_getegid();
1722   prpsinfo.pr_pid = main_pid;
1723   prpsinfo.pr_ppid = sys_getppid();
1724   prpsinfo.pr_pgrp = sys_getpgrp();
1725   prpsinfo.pr_sid = sys_getsid(0);
1726   /* scope */ {
1727     char scratch[4096], *cmd = scratch, *ptr;
1728     ssize_t size, len;
1729     int cmd_fd;
1730     memset(&scratch, 0, sizeof(scratch));
1731     size = sys_readlink("/proc/self/exe", scratch, sizeof(scratch));
1732     len = 0;
1733     for (ptr = cmd; *ptr != '\000' && size-- > 0; ptr++) {
1734       if (*ptr == '/') {
1735         cmd = ptr + 1;
1736         len = 0;
1737       } else
1738         len++;
1739     }
1740     memcpy(prpsinfo.pr_fname, cmd, len > sizeof(prpsinfo.pr_fname) ? sizeof(prpsinfo.pr_fname) : len);
1741     NO_INTR(cmd_fd = sys_open("/proc/self/cmdline", O_RDONLY, 0));
1742     if (cmd_fd >= 0) {
1743       char *ptr;
1744       ssize_t size = c_read(cmd_fd, &prpsinfo.pr_psargs, sizeof(prpsinfo.pr_psargs), &errno);
1745       for (ptr = prpsinfo.pr_psargs; size-- > 0; ptr++)
1746         if (*ptr == '\000') *ptr = ' ';
1747       NO_INTR(sys_close(cmd_fd));
1748     }
1749   }
1750 
1751   /* Build the PRSTATUS data structure                                       */
1752   /* scope */ {
1753     int stat_fd;
1754     memset(&prstatus, 0, sizeof(struct prstatus));
1755     prstatus.pr_pid = prpsinfo.pr_pid;
1756     prstatus.pr_ppid = prpsinfo.pr_ppid;
1757     prstatus.pr_pgrp = prpsinfo.pr_pgrp;
1758     prstatus.pr_sid = prpsinfo.pr_sid;
1759     prstatus.pr_fpvalid = 1;
1760     NO_INTR(stat_fd = sys_open("/proc/self/stat", O_RDONLY, 0));
1761     if (stat_fd >= 0) {
1762       char scratch[4096];
1763       ssize_t size = c_read(stat_fd, scratch, sizeof(scratch) - 1, &errno);
1764       if (size >= 0) {
1765         unsigned long tms;
1766         char *ptr = scratch;
1767         scratch[size] = '\000';
1768 
1769         /* User time                                                         */
1770         for (i = 13; i && *ptr; ptr++)
1771           if (*ptr == ' ') i--;
1772         tms = 0;
1773         while (*ptr && *ptr != ' ') tms = 10 * tms + *ptr++ - '0';
1774         prstatus.pr_utime.tv_sec = tms / 1000;
1775         prstatus.pr_utime.tv_usec = (tms % 1000) * 1000;
1776 
1777         /* System time                                                       */
1778         if (*ptr) ptr++;
1779         tms = 0;
1780         while (*ptr && *ptr != ' ') tms = 10 * tms + *ptr++ - '0';
1781         prstatus.pr_stime.tv_sec = tms / 1000;
1782         prstatus.pr_stime.tv_usec = (tms % 1000) * 1000;
1783 
1784         /* Cumulative user time                                              */
1785         if (*ptr) ptr++;
1786         tms = 0;
1787         while (*ptr && *ptr != ' ') tms = 10 * tms + *ptr++ - '0';
1788         prstatus.pr_cutime.tv_sec = tms / 1000;
1789         prstatus.pr_cutime.tv_usec = (tms % 1000) * 1000;
1790 
1791         /* Cumulative system time                                            */
1792         if (*ptr) ptr++;
1793         tms = 0;
1794         while (*ptr && *ptr != ' ') tms = 10 * tms + *ptr++ - '0';
1795         prstatus.pr_cstime.tv_sec = tms / 1000;
1796         prstatus.pr_cstime.tv_usec = (tms % 1000) * 1000;
1797 
1798         /* Pending signals                                                   */
1799         for (i = 14; i && *ptr; ptr++)
1800           if (*ptr == ' ') i--;
1801         while (*ptr && *ptr != ' ') prstatus.pr_sigpend = 10 * prstatus.pr_sigpend + *ptr++ - '0';
1802 
1803         /* Held signals                                                      */
1804         if (*ptr) ptr++;
1805         while (*ptr && *ptr != ' ') prstatus.pr_sighold = 10 * prstatus.pr_sighold + *ptr++ - '0';
1806       }
1807       NO_INTR(sys_close(stat_fd));
1808     }
1809   }
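
  /* For reference: the loops above walk /proc/self/stat by counting spaces,
   * presumably to avoid heavier libc routines while all other threads are
   * suspended. In ordinary user code the same utime/stime values (fields 14
   * and 15, reported in clock ticks; see proc(5)) could be pulled out as in
   * the sketch below (not compiled); parse_cpu_times() is invented for the
   * illustration.
   */
#if 0 /* illustrative sketch only */
#include <stdio.h>
#include <string.h>

static int parse_cpu_times(const char *stat_buf, unsigned long *utime, unsigned long *stime) {
  /* Skip past "pid (comm) " by locating the last ')'; this stays correct
   * even if the command name itself contains spaces or parentheses.
   */
  const char *p = strrchr(stat_buf, ')');
  if (p == NULL) return -1;
  return sscanf(p + 2, "%*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %lu %lu",
                utime, stime) == 2 ? 0 : -1;
}
#endif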
1810 
1811   /* scope */ {
1812     int openmax = sys_sysconf(_SC_OPEN_MAX);
1813     int pagesize = sys_sysconf(_SC_PAGESIZE);
1814     struct kernel_sigset_t old_signals, blocked_signals;
1815 
1816     const char *file_name = va_arg(ap, const char *);
1817     size_t max_length = GetCoreDumpParameter(params, max_length);
1818     const char *PATH = va_arg(ap, const char *);
1819     const struct CoredumperCompressor *compressors = GetCoreDumpParameter(params, compressors);
1820     const struct CoredumperCompressor **selected_compressor =
1821         (const struct CoredumperCompressor **)GetCoreDumpParameter(params, selected_compressor);
1822     int prioritize = GetCoreDumpParameter(params, flags) & COREDUMPER_FLAG_LIMITED_BY_PRIORITY;
1823     const struct CoredumperNote *notes = GetCoreDumpParameter(params, notes);
1824     int note_count = GetCoreDumpParameter(params, note_count);
1825 
1826     if (selected_compressor != NULL) {
1827       /* For now, assume that the core dump is uncompressed; we will later
1828        * override this setting, if we can find a suitable compressor program.
1829        */
1830       *selected_compressor = compressors;
1831       while (*selected_compressor && (*selected_compressor)->compressor != NULL) {
1832         ++*selected_compressor;
1833       }
1834     }
1835 
1836     if (file_name == NULL) {
1837       /* Create a file descriptor that can be used for reading data from
1838        * our child process. This is a little complicated because we need
1839        * to make sure there is no race condition with other threads
1840        * calling fork() at the same time (this is somewhat mitigated,
1841        * because our threads are supposedly suspended at this time). We
1842        * have to avoid other processes holding our file handles open. We
1843        * can do this by creating the pipe in the child and passing the
1844        * file handle back to the parent.
1845        */
1846       if (sys_socketpair(AF_UNIX, SOCK_STREAM, 0, pair) >= 0) {
1847         /* Block signals prior to forking. Technically, POSIX requires
1848          * us to call pthread_sigmask() if this is a threaded
1849          * application. When using glibc, we are OK calling
1850          * sigprocmask(), though. We will end up blocking additional
1851          * signals that libpthread uses internally, but that
1852          * is actually exactly what we want.
1853          *
1854          * Also, POSIX claims that this should not actually be
1855          * necessary, but reality says otherwise.
1856          */
1857         sys_sigfillset(&blocked_signals);
1858         sys_sigprocmask(SIG_BLOCK, &blocked_signals, &old_signals);
1859 
1860         /* Create a new core dump in a child process; call sys_fork() in order to
1861          * avoid complications with pthread_atfork() handlers. In the child
1862          * process, we must restrict ourselves to direct system calls.
1863          */
1864         if ((rc = sys_fork()) == 0) {
1865           int fds[2];
1866 
1867           /* Create a pipe for communicating between processes. If
1868            * necessary, add a compressor to the pipeline.
1869            */
1870           if (CreatePipeline(fds, openmax, PATH, &compressors) < 0 || (fds[0] < 0 && sys_pipe(fds) < 0)) {
1871             sys__exit(1);
1872           }
1873 
1874           /* Pass file handle to parent                                      */
1875           /* scope */ {
1876             char cmsg_buf[CMSG_SPACE(sizeof(int))];
1877             struct kernel_iovec iov;
1878             struct kernel_msghdr msg;
1879             struct cmsghdr *cmsg;
1880             memset(&iov, 0, sizeof(iov));
1881             memset(&msg, 0, sizeof(msg));
1882             iov.iov_base = (void *)&compressors;
1883             iov.iov_len = sizeof(compressors);
1884             msg.msg_iov = &iov;
1885             msg.msg_iovlen = 1;
1886             msg.msg_control = &cmsg_buf;
1887             msg.msg_controllen = sizeof(cmsg_buf);
1888             cmsg = CMSG_FIRSTHDR(&msg);
1889             if (!cmsg) {
1890               /* This can't happen, but static analyzers still complain...   */
1891               sys__exit(1);
1892             }
1893             cmsg->cmsg_level = SOL_SOCKET;
1894             cmsg->cmsg_type = SCM_RIGHTS;
1895             cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1896             *(int *)CMSG_DATA(cmsg) = fds[0];
1897             while (sys_sendmsg(pair[1], &msg, 0) < 0) {
1898               if (errno != EINTR) sys__exit(1);
1899             }
1900             while (sys_shutdown(pair[1], SHUT_RDWR) < 0) {
1901               if (errno != EINTR) sys__exit(1);
1902             }
1903           }
1904 
1905           /* Close all file handles other than the write end of our pipe     */
1906           for (i = 0; i < openmax; i++) {
1907             if (i != fds[1]) {
1908               NO_INTR(sys_close(i));
1909             }
1910           }
1911 
1912           /* If compiled without threading support, this is the only
1913            * place where we can request the parent's CPU
1914            * registers. This function is a no-op when threading
1915            * support is available.
1916            */
1917           if (!GetParentRegs(frame, thread_regs, thread_fpregs, thread_fpxregs, &hasSSE)) {
1918             sys__exit(1);
1919           }
1920 
1921           CreateElfCore(&fds[1], SimpleWriter, SimpleDone, &prpsinfo, puser, &prstatus, threads, pids, thread_regs,
1922                         thread_fpregs, hasSSE ? thread_fpxregs : NULL, pagesize, 0, main_pid, notes, note_count);
1923           NO_INTR(sys_close(fds[1]));
1924           sys__exit(0);
1925 
1926           /* Make the compiler happy. We never actually get here.            */
1927           return 0;
1928         } else if (rc > 0) {
1929 #ifndef THREADS
1930           /* Child will double-fork, so reap the process, now.               */
1931           sys_waitpid(rc, (void *)0, __WALL);
1932 #endif
1933         }
1934 
1935         /* In the parent                                                     */
1936         sys_sigprocmask(SIG_SETMASK, &old_signals, (struct kernel_sigset_t *)0);
1937         NO_INTR(sys_close(pair[1]));
1938 
1939         /* Get pipe file handle from child                                   */
1940         /* scope */ {
1941           const struct CoredumperCompressor *buffer[1];
1942           char cmsg_buf[CMSG_SPACE(sizeof(int))];
1943           struct kernel_iovec iov;
1944           struct kernel_msghdr msg;
1945           for (;;) {
1946             int nbytes;
1947             memset(&iov, 0, sizeof(iov));
1948             memset(&msg, 0, sizeof(msg));
1949             iov.iov_base = buffer;
1950             iov.iov_len = sizeof(void *);
1951             msg.msg_iov = &iov;
1952             msg.msg_iovlen = 1;
1953             msg.msg_control = &cmsg_buf;
1954             msg.msg_controllen = sizeof(cmsg_buf);
1955             if ((nbytes = sys_recvmsg(pair[0], &msg, 0)) > 0) {
1956               struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
1957               if (cmsg != NULL && cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS)
1958                 fd = *(int *)CMSG_DATA(cmsg);
1959               if (nbytes == sizeof(void *) && *buffer != NULL && selected_compressor != NULL)
1960                 *selected_compressor = *buffer;
1961               break;
1962             } else if (nbytes == 0 || errno != EINTR) {
1963               break;
1964             }
1965           }
1966         }
1967         sys_shutdown(pair[0], SHUT_RDWR);
1968         NO_INTR(sys_close(pair[0]));
1969       }
1970     } else {
1971       /* Synchronously write the core to a file. If necessary, compress the
1972        * data on the fly. All other threads are suspended during this time.
1973        * In principle, we could use the same code that we used earlier for
1974        * building a core file on the fly. But that would create a COW
1975        * copy of the address space (due to the call to fork()), and
1976        * some accounting applications are sensitive to the sudden spike in
1977        * memory usage.
1978        * So, instead, we run a single thread and make use of callback
1979        * functions that internally invoke poll() for managing the I/O.
1980        */
1981       int fds[2] = {-1, -1};
1982       int saved_errno, rc;
1983       const char *suffix = "";
1984       struct WriterFds writer_fds;
1985       ssize_t (*writer)(void *, const void *, size_t);
1986 
1987       /* If compiled without threading support, this is the only
1988        * place where we can request the parent's CPU
1989        * registers. This function is a no-op when threading
1990        * support is available.
1991        */
1992       if (!GetParentRegs(frame, thread_regs, thread_fpregs, thread_fpxregs, &hasSSE)) {
1993         goto error;
1994       }
1995 
1996       /* Create a pipe for communicating between processes. If
1997        * necessary, add a compressor to the pipeline.
1998        */
1999       if (compressors != NULL && compressors->compressor != NULL) {
2000         if (CreatePipeline(fds, openmax, PATH, &compressors) < 0) {
2001           goto error;
2002         }
2003       }
2004       if (selected_compressor) {
2005         *selected_compressor = compressors;
2006       }
2007 
2008       writer_fds.out_fd = -1;
2009       if (max_length > 0) {
2010         /* Open the output file. If necessary, pick a filename suffix that
2011          * matches the selected compression type.
2012          */
2013         if (compressors != NULL && compressors->compressor != NULL && compressors->suffix != NULL) {
2014           suffix = compressors->suffix;
2015         }
2016         /* scope */ {
2017           const int kOpenFlags = O_WRONLY | O_CREAT | O_TRUNC;
2018           char extended_file_name[strlen(file_name) + strlen(suffix) + 1];
2019           strcat(strcpy(extended_file_name, file_name), suffix);
2020           NO_INTR(writer_fds.out_fd = sys_open(extended_file_name, kOpenFlags | O_LARGEFILE, 0600));
2021           if (writer_fds.out_fd < 0 && EINVAL == errno && O_LARGEFILE) {
2022              /* This kernel appears not to have large file support.
2023              * Try again without O_LARGEFILE.
2024              */
2025             NO_INTR(writer_fds.out_fd = sys_open(extended_file_name, kOpenFlags, 0600));
2026           }
2027           if (writer_fds.out_fd < 0) {
2028             saved_errno = errno;
2029             if (fds[0] >= 0) NO_INTR(sys_close(fds[0]));
2030             if (fds[1] >= 0) NO_INTR(sys_close(fds[1]));
2031             errno = saved_errno;
2032             goto error;
2033           }
2034         }
2035 
2036         /* Set up a suitable writer function.                                */
2037         writer_fds.max_length = max_length;
2038         if (fds[0] >= 0) {
2039           /* The PipeWriter() can deal with multiple interleaved I/O requests on the
2040            * compression pipeline.
2041            */
2042           long flags;
2043           NO_INTR(flags = sys_fcntl(fds[0], F_GETFL, 0));
2044           NO_INTR(sys_fcntl(fds[0], F_SETFL, flags | O_NONBLOCK));
2045           NO_INTR(flags = sys_fcntl(fds[1], F_GETFL, 0));
2046           NO_INTR(sys_fcntl(fds[1], F_SETFL, flags | O_NONBLOCK));
2047           writer_fds.write_fd = fds[1];
2048           writer_fds.compressed_fd = fds[0];
2049           writer = PipeWriter;
2050         } else {
2051           /* If no compression is needed, then we can directly write to the
2052            * file. This avoids quite a bit of unnecessary overhead.
2053            */
2054           writer = LimitWriter;
2055         }
2056 
2057         rc = CreateElfCore(&writer_fds, writer, PipeDone, &prpsinfo, puser, &prstatus, threads, pids, thread_regs,
2058                            thread_fpregs, hasSSE ? thread_fpxregs : NULL, pagesize, prioritize ? max_length : 0,
2059                            main_pid, notes, note_count);
2060         if (fds[0] >= 0) {
2061           saved_errno = errno;
2062           /* Close the input side of the compression pipeline, and flush
2063            * the remaining compressed data bytes out to the file.
2064            */
2065           if (fds[1] >= 0) {
2066             NO_INTR(sys_close(fds[1]));
2067             fds[1] = -1;
2068           }
2069           if (FlushPipe(&writer_fds) < 0) {
2070             rc = -1;
2071           } else {
2072             errno = saved_errno;
2073           }
2074         }
2075       } else {
2076         rc = 0;
2077       }
2078 
2079       /* Close all remaining open file handles.                              */
2080       saved_errno = errno;
2081       if (writer_fds.out_fd >= 0) NO_INTR(sys_close(writer_fds.out_fd));
2082       if (fds[0] >= 0) NO_INTR(sys_close(fds[0]));
2083       if (fds[1] >= 0) NO_INTR(sys_close(fds[1]));
2084       errno = saved_errno;
2085 
2086       if (rc < 0) {
2087         goto error;
2088       }
2089 
2090       /* If called with a filename, we do not actually return a file handle,
2091        * but instead just signal whether the core file has been written
2092        * successfully.
2093        */
2094       fd = 0;
2095     }
2096   }
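
  /* For reference: PipeWriter() and FlushPipe(), defined elsewhere in this
   * file, feed the core image into the compressor and drain its output from a
   * single thread, as described in the comment at the start of the
   * synchronous-write branch above. The general shape of one poll()-driven
   * copy step is sketched below (not compiled); pump_once() and its
   * parameters are invented for the illustration and do not match the real
   * WriterFds interface.
   */
#if 0 /* illustrative sketch only */
#include <poll.h>
#include <unistd.h>

static int pump_once(int write_fd, int compressed_fd, int out_fd,
                     const char *buf, size_t len, size_t *written) {
  struct pollfd pfd[2];
  char tmp[4096];
  ssize_t n;

  pfd[0].fd = write_fd;      pfd[0].events = POLLOUT; /* compressor stdin    */
  pfd[1].fd = compressed_fd; pfd[1].events = POLLIN;  /* compressor stdout   */
  if (poll(pfd, 2, -1) < 0) return -1;

  if (pfd[1].revents & POLLIN) {   /* move compressed bytes to the out file  */
    n = read(compressed_fd, tmp, sizeof(tmp));
    /* A real implementation must also handle partial writes and EOF.        */
    if (n > 0 && write(out_fd, tmp, (size_t)n) != n) return -1;
  }
  if ((pfd[0].revents & POLLOUT) && *written < len) { /* feed more core data */
    n = write(write_fd, buf + *written, len - *written);
    if (n > 0) *written += (size_t)n;
  }
  return 0;
}
#endif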
2097 
2098   ResumeAllProcessThreads(threads, pids);
2099   return fd;
2100 
2101 error:
2102   /* scope */ {
2103     int saved_errno = errno;
2104     if (fd > 0) NO_INTR(sys_close(fd));
2105     errno = saved_errno;
2106   }
2107   ResumeAllProcessThreads(threads, pids);
2108   return -1;
2109 }
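
/* For reference: the sendmsg()/recvmsg() exchange inside InternalGetCoreDump()
 * passes the read end of the core pipe from the child back to the parent over
 * the AF_UNIX socketpair, piggy-backing the selected compressor pointer in the
 * data bytes. Stripped of most error handling, the underlying SCM_RIGHTS
 * mechanism looks like the sketch below (not compiled); send_fd() and
 * recv_fd() are illustrative helpers, not part of this file.
 */
#if 0 /* illustrative sketch only */
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* The kernel duplicates the descriptor into the receiving process, so the
 * number received may differ from the number sent.
 */
static int send_fd(int sock, int fd) {
  char dummy = 0, cmsg_buf[CMSG_SPACE(sizeof(int))];
  struct iovec iov = {&dummy, 1}; /* must carry at least one data byte       */
  struct msghdr msg;
  struct cmsghdr *cmsg;
  memset(&msg, 0, sizeof(msg));
  msg.msg_iov = &iov;
  msg.msg_iovlen = 1;
  msg.msg_control = cmsg_buf;
  msg.msg_controllen = sizeof(cmsg_buf);
  cmsg = CMSG_FIRSTHDR(&msg);
  if (cmsg == NULL) return -1;
  cmsg->cmsg_level = SOL_SOCKET;
  cmsg->cmsg_type = SCM_RIGHTS;
  cmsg->cmsg_len = CMSG_LEN(sizeof(int));
  memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
  return sendmsg(sock, &msg, 0) == 1 ? 0 : -1;
}

static int recv_fd(int sock) {
  char dummy, cmsg_buf[CMSG_SPACE(sizeof(int))];
  struct iovec iov = {&dummy, 1};
  struct msghdr msg;
  struct cmsghdr *cmsg;
  int fd = -1;
  memset(&msg, 0, sizeof(msg));
  msg.msg_iov = &iov;
  msg.msg_iovlen = 1;
  msg.msg_control = cmsg_buf;
  msg.msg_controllen = sizeof(cmsg_buf);
  if (recvmsg(sock, &msg, 0) <= 0) return -1;
  cmsg = CMSG_FIRSTHDR(&msg);
  if (cmsg && cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS)
    memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
  return fd;
}
#endif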
2110 
2111 #ifdef __cplusplus
2112 }
2113 #endif
2114 #endif
2115