1 #include "cado.h" // IWYU pragma: keep
2 // IWYU pragma: no_include <bits/types/struct_rusage.h>
3 #include <cstdlib>
4 #include <climits>
5 #include <cstdio> // FILE // IWYU pragma: keep
6 #include <cstring>
7 #include <sys/types.h>  // pid_t
8 #include <sys/wait.h>  // WIFEXITED WEXITSTATUS (on freebsd at least)
9 #include <unistd.h>     // close getpid
10 #include <sys/stat.h> // stat // IWYU pragma: keep
11 #ifdef HAVE_GETRUSAGE
12 #include <sys/time.h> // IWYU pragma: keep
13 #include <sys/resource.h> // IWYU pragma: keep
14 #endif
15 #include <cerrno>
16 
17 #include "fmt/format.h"
18 
19 
20 #include "macros.h"
21 #include "gzip.h"
22 #include "misc.h"
23 #include "cado_popen.h"
24 #include "cado_pipe_streambuf.hpp"
25 
/* Associates one file-name suffix with the shell command templates used
 * to read from, and write to, files carrying that suffix. Both templates
 * are printf-style formats containing a single %s that receives the file
 * name; a NULL template means that no external command is involved. */
struct suffix_handler {
    char const * suffix;    /* file-name suffix, e.g. ".gz" */
    char const * pfmt_in;   /* template of the command used for reading */
    char const * pfmt_out;  /* template of the command used for writing */
};
31 
/* Path of the antebuffer helper program. An empty string means that no
 * antebuffer is configured. */
static char antebuffer[PATH_MAX] = "";
/* Size argument passed to antebuffer on its command line, as a power of
 * two: the default 24 means 2^24 bytes = 16 MiB. */
static int antebuffer_buffer_size = 24;
34 
#if 0
/* Everything down to the matching #endif is compiled out. */
const char * suffix = NULL;

/* Returns a copy of the part of name that starts at its last '.', or a
 * copy of the empty string when name contains no '.'.
 * NOTE(review): despite the "noalloc" in the name, the result comes from
 * strdup() -- confirm the intent before re-enabling this code. */
const char * copy_suffix_noalloc(const char * name)
{
    const char * p = strrchr(name, '.');
    if (p == NULL)
        p = name + strlen(name);
    return strdup(p);
}

/* Returns a strdup()-ed copy of the suffix of name.
 * NOTE(review): the intermediate string returned by
 * copy_suffix_noalloc() is itself strdup()-ed and is not freed here. */
const char * copy_suffix_alloc(const char * name)
{
    return strdup(copy_suffix_noalloc(name));
}
/* Truncates name in place at its last '.', if there is one, and returns
 * name. */
const char * path_remove_suffix(char * name)
{
    char * p = strrchr(name, '.');
    if (p) *p=0;
    return name;
}

#endif
58 
59 struct suffix_handler supported_compression_formats[] = {
60     { ".gz", "gzip -dc %s", "gzip -c --fast > %s", },
61     { ".bz2", "bzip2 -dc %s", "bzip2 -c --fast > %s", },
62     /* zstd seems to be uniformly better than any other alternative */
63     { ".zstd", "zstdcat %s", "zstd --fast > %s", },
64     /* xz is really slow */
65     { ".xz", "xzcat %s", "xz --fast > %s", },
66     { ".lzma", "lzma -dc %s", "lzma -c -0 > %s", },
67     /* These two have to be present */
68     { "", NULL, NULL },
69     { NULL, NULL, NULL },
70 };
71 
/* Return the last component of path: a pointer, into path itself, to the
 * character that follows the last '/'. When path contains no '/', path
 * is returned unchanged. */
const char * path_basename(const char * path)
{
    const char * last_slash = strrchr(path, '/');
    return last_slash != NULL ? last_slash + 1 : path;
}
82 
is_supported_compression_format(const char * s)83 int is_supported_compression_format(const char * s)
84 {
85     struct suffix_handler * r = supported_compression_formats;
86     for( ; r->suffix ; r++) {
87         if (strcmp(r->suffix, s) == 0)
88             return 1;
89     }
90     return 0;
91 }
92 
filename_matches_one_compression_format(const char * path)93 int filename_matches_one_compression_format(const char * path)
94 {
95     const struct suffix_handler * r = supported_compression_formats;
96 
97     for( ; r->suffix ; r++) {
98         if (!*r->suffix) continue;
99         if (has_suffix(path, r->suffix)) return 1;
100     }
101     return 0;
102 }
103 
get_suffix_from_filename(char * s,char const ** sfx)104 void get_suffix_from_filename (char *s, char const **sfx)
105 {
106   const struct suffix_handler * r = supported_compression_formats;
107   for( ; r->suffix ; r++)
108   {
109     if (has_suffix(s, r->suffix))
110     {
111       *sfx = r->suffix;
112       return;
113     }
114   }
115 
116   /* If we arrive here, it's because "" is not among the suffixes */
117   abort();
118   return;
119 }
120 
try_antebuffer_path()121 static int try_antebuffer_path()
122 {
123     int rc = access(antebuffer, X_OK);
124     if (rc >= 0) {
125         fprintf(stderr, "antebuffer set to %s\n", antebuffer);
126         return 1;
127     }
128     fprintf(stderr, "access to %s: %s\n", antebuffer, strerror(errno));
129     *antebuffer = 0;
130     return 0;
131 }
132 
set_antebuffer_path(const char * executable_filename,const char * path_antebuffer)133 int set_antebuffer_path (const char *executable_filename, const char *path_antebuffer)
134 {
135   *antebuffer = 0;
136   antebuffer[PATH_MAX-1]='\0';
137   /* First, if we have path_antebuffer, we must have antebuffer or error */
138   if (path_antebuffer) {
139       struct stat sbuf[1];
140       int rc = stat(path_antebuffer, sbuf);
141       if (rc < 0) {
142           fprintf(stderr, "%s: path_antebuffer=\"%s\" access error: %s\n",
143                   __func__, path_antebuffer, strerror(errno));
144       } else {
145           /* Older versions had path_antebuffer be a directory. We still
146            * support this, but only as a compatibility measure. */
147           if (S_ISDIR(sbuf->st_mode)) {
148 #ifdef EXECUTABLE_SUFFIX
149               snprintf(antebuffer, PATH_MAX-1, "%s/antebuffer" EXECUTABLE_SUFFIX, path_antebuffer);
150 #else
151               snprintf(antebuffer, PATH_MAX-1, "%s/antebuffer", path_antebuffer);
152 #endif
153           } else {
154               strncpy(antebuffer, path_antebuffer, PATH_MAX-1);
155           }
156           if (try_antebuffer_path()) return 1;
157       }
158   }
159   /* Second option: if we failed for any reason, and if $0 was given to
160    * us, use that as a potential fallback */
161   if (executable_filename) {
162       char dummy[PATH_MAX];
163       char dummy2[PATH_MAX + 64];
164       const char * slash = strrchr(executable_filename, '/');
165       if (slash) {
166           int len = MIN(PATH_MAX - 1, slash - executable_filename);
167           strncpy(dummy, executable_filename, len);
168           dummy[len]='\0';
169       } else {
170           dummy[0]='.';
171           dummy[1]='\0';
172       }
173 #ifdef EXECUTABLE_SUFFIX
174       snprintf(dummy2, sizeof(dummy2), "%s/../utils/antebuffer" EXECUTABLE_SUFFIX, dummy);
175 #else
176       snprintf(dummy2, sizeof(dummy2), "%s/../utils/antebuffer", dummy);
177 #endif
178       if (realpath(dummy2, antebuffer) && try_antebuffer_path())
179           return 1;
180   }
181   /* Third option: walk $PATH */
182   if ((path_resolve("antebuffer", antebuffer)) != NULL && try_antebuffer_path()) {
183       return 1;
184   }
185   *antebuffer = 0;
186   fprintf(stderr, "No antebuffer configured\n");
187   *antebuffer='\0';
188   return 0;
189 }
190 
191 /* Return a list of unix commands to _read_ a set of files. Consecutive
192  * files sharing the same decompression mechanism are grouped into a
193  * single command line.
194  *
195  * antebuffer file1.gz file2.gz file3.gz | gzip -dc
196  * antebuffer file4.bz2 | gzip -dc
197  * antebuffer file5.gz | gzip -dc
198  * antebuffer file6.gz | cat    // useless use of cat, should be fixed.
199  *
200  * Note that antebuffer may also not be defined. In that case, the
201  * simpler command formats like "gzip -dc file1.gz file2.gz file3.gz" are
202  * used.
203  *
204  * All strings returned are meant to be passed to popen(). The return
205  * value is a malloc()-ed array of malloc()-ed strings, and the caller is
206  * in charge of freeing it (with filelist_clear, for instance).
207  */
prepare_grouped_command_lines(char ** list_of_files)208 char **prepare_grouped_command_lines(char **list_of_files)
209 {
210     const struct suffix_handler *r = supported_compression_formats;
211     char ** new_commands = NULL;
212     size_t n_new_commands = 0;
213 
214     /* Allow a few bytes extra for popen's "/bin/sh" "-c" prefix */
215     ASSERT_ALWAYS(get_arg_max() >= 20);
216     size_t arg_max = get_arg_max() - 20;
217 
218     for(char ** grouphead = list_of_files ; *grouphead ; ) {
219         char *cmd_prefix = NULL, *cmd_postfix = NULL;
220         size_t prefix_len, postfix_len;
221         const struct suffix_handler * this_suffix = r;
222         for (; this_suffix && this_suffix->suffix; this_suffix++)
223             if (has_suffix(*grouphead, this_suffix->suffix))
224                 break;
225         ASSERT_ALWAYS(this_suffix);
226         size_t filenames_total_size = 0;
227         char ** grouptail;
228 
229         if (*antebuffer) {
230             if (this_suffix->pfmt_in) {
231                 /* antebuffer 24 file1.gz file2.gz file3.gz | gzip -dc - */
232                 int rc = asprintf(&cmd_prefix, "%s %d ", antebuffer, antebuffer_buffer_size);
233                 ASSERT_ALWAYS(rc >= 0);
234                 char *tmp;
235                 rc = asprintf(&tmp, this_suffix->pfmt_in, "-");
236                 ASSERT_ALWAYS(rc >= 0);
237                 rc = asprintf(&cmd_postfix, " | %s", tmp);
238                 ASSERT_ALWAYS(rc >= 0);
239                 free(tmp);
240             } else {
241                 /* antebuffer 24 file1.txt file2.txt file3.txt */
242                 /* avoid piping through cat */
243                 int rc = asprintf(&cmd_prefix, "%s %d ", antebuffer, antebuffer_buffer_size);
244                 ASSERT_ALWAYS(rc >= 0);
245             }
246         } else {
247             if (this_suffix->pfmt_in) {
248                 /* gzip -dc file1.gz file2.gz file3.gz */
249                 int rc = asprintf(&cmd_prefix, this_suffix->pfmt_in, "");
250                 ASSERT_ALWAYS(rc >= 0);
251             } else {
252                 /* cat file1.txt file2.txt file3.txt */
253                 /* There's potential for this to qualify as a useless use
254                  * of cat, but anyway we don't expect to meet this case
255                  * often.
256                  */
257                 int rc = asprintf(&cmd_prefix, "cat ");
258                 ASSERT_ALWAYS(rc >= 0);
259             }
260         }
261         prefix_len = cmd_prefix ? strlen(cmd_prefix) : 0;
262         postfix_len = cmd_postfix ? strlen(cmd_postfix) : 0;
263 
264         for(grouptail = grouphead ; *grouptail ; grouptail++) {
265             const struct suffix_handler * other_suffix = r;
266             for (; other_suffix && other_suffix->suffix; other_suffix++)
267                 if (has_suffix(*grouptail, other_suffix->suffix))
268                     break;
269             if (other_suffix != this_suffix)
270                 break;
271             /* Add 1 for a space */
272             size_t ds = strlen(*grouptail) + 1;
273             if (filenames_total_size + prefix_len + postfix_len + ds > arg_max)
274                 break;
275             filenames_total_size += ds;
276         }
277         /* Now all file names referenced by pointers in the interval
278          * [grouphead..grouptail[ have the same suffix. Create a new
279          * command for unpacking them.
280          */
281         new_commands = (char**) realloc(new_commands, ++n_new_commands * sizeof(char*));
282 
283         /* intermediary string for the list of file names */
284         char * tmp = (char*)  malloc(filenames_total_size + 1);
285         size_t k = 0;
286         for(char ** g = grouphead ; g != grouptail ; g++) {
287             k += snprintf(tmp + k, filenames_total_size + 1 - k, "%s ", *g);
288         }
289         tmp[k-1]='\0';  /* turn final space to a null byte */
290         filenames_total_size--; /* and adjust filenames_total_size for deleted space */
291 
292         char * cmd;
293         int rc;
294 
295         rc = asprintf(&cmd, "%s%s%s",
296                 cmd_prefix ? cmd_prefix : "",
297                 tmp,
298                 cmd_postfix ? cmd_postfix : "");
299         ASSERT_ALWAYS(rc >= 0);
300         ASSERT_ALWAYS(strlen(cmd) <= arg_max);
301         ASSERT_ALWAYS(strlen(cmd) == filenames_total_size + prefix_len + postfix_len);
302         new_commands[n_new_commands-1] = cmd;
303         free(tmp);
304         if (cmd_prefix) free(cmd_prefix);
305         if (cmd_postfix) free(cmd_postfix);
306         grouphead = grouptail;
307     }
308     new_commands = (char**) realloc(new_commands, ++n_new_commands * sizeof(char*));
309     new_commands[n_new_commands-1] = NULL;
310     return new_commands;
311 }
312 
313 FILE*
fopen_maybe_compressed2(const char * name,const char * mode,int * p_pipeflag,char const ** suf)314 fopen_maybe_compressed2 (const char * name, const char * mode, int* p_pipeflag, char const ** suf)
315 {
316     const struct suffix_handler * r = supported_compression_formats;
317     FILE * f;
318 
319     // coverity[fs_check_call]
320     if (strchr(mode, 'r') && access(name, R_OK) != 0)
321         return NULL;
322 
323     for( ; r->suffix ; r++) {
324         if (!has_suffix(name, r->suffix)) continue;
325         if (suf) *suf = r->suffix;
326         char * command = NULL;
327         char * tempname = NULL;
328         int ret;
329 
330         /* Just *any* file that we write to will get a .tmp.$PID suffix
331          */
332         if (strchr(mode, 'w')) {
333             ret = asprintf(&tempname, "%s.tmp.%d", name, getpid());
334             ASSERT_ALWAYS(ret >= 0);
335             name = tempname;
336         }
337 
338         if (strchr(mode, 'r') && r->pfmt_in) {
339             int ret = asprintf(&command, r->pfmt_in, name);
340             ASSERT_ALWAYS(ret >= 0);
341         } else if (strchr(mode, 'w') && r->pfmt_out) {
342             ret = asprintf(&command, r->pfmt_out, name);
343             ASSERT_ALWAYS(ret >= 0);
344         }
345 
346         if (command) {
347           /* apparently popen() under Linux does not accept the 'b' modifier */
348             char pmode[2] = "x";
349             pmode[0] = mode[0];
350             f = cado_popen(command, pmode);
351             if (p_pipeflag) *p_pipeflag = 1;
352 #ifdef F_SETPIPE_SZxxx
353             /* The pipe capacity is 2^16 by default; we can increase it,
354              * but it does not seem to make a difference, thus we don't
355              * change it by default (patch from Alain Filbois). */
356             fcntl (fileno (f), F_SETPIPE_SZ, 1UL << 20);
357 #endif
358             free(command);
359         } else {
360             f = fopen(name, mode);
361             if (p_pipeflag) *p_pipeflag = 0;
362         }
363         if (tempname)
364             free(tempname);
365         return f;
366     }
367     /* If we arrive here, it's because "" is not among the suffixes */
368     abort();
369     return NULL;
370 }
371 
372 
373 FILE*
fopen_maybe_compressed(const char * name,const char * mode)374 fopen_maybe_compressed (const char * name, const char * mode)
375 {
376     return fopen_maybe_compressed2(name, mode, NULL, NULL);
377 }
378 
379 #ifdef  HAVE_GETRUSAGE
380 int
fclose_maybe_compressed2(FILE * f,const char * name,struct rusage * rr)381 fclose_maybe_compressed2 (FILE * f, const char * name, struct rusage * rr)
382 #else
383 /* if we don't even have getrusage, then no fclose_maybe_compressed2 is
384  * exposed. Yet, we use one as a code shortcut
385  */
386 static int
387 fclose_maybe_compressed2 (FILE * f, const char * name, void * rr MAYBE_UNUSED)
388 #endif
389 {
390     const struct suffix_handler * r = supported_compression_formats;
391 
392     for( ; r->suffix ; r++) {
393         if (!has_suffix(name, r->suffix)) continue;
394         /* It doesn't really make sense to imagine that one of these two
395          * may exist and not the other */
396         ASSERT_ALWAYS((r->pfmt_out == NULL) == (r->pfmt_in == NULL));
397 
398         char * tempname;
399         int ret = asprintf(&tempname, "%s.tmp.%d", name, getpid());
400         struct stat sbuf[1];
401         ASSERT_ALWAYS(ret >= 0);
402 
403         if (r->pfmt_in || r->pfmt_out) {
404 #ifdef  HAVE_GETRUSAGE
405             if (rr)
406                 ret = cado_pclose2(f, rr);
407             else
408 #endif
409                 ret = cado_pclose(f);
410 
411 #if defined(WIFEXITED) && defined(WEXITSTATUS)
412             /* Unless child process finished normally and with exit ret 0,
413                we return an error */
414             if (ret == -1 || !WIFEXITED(ret) || WEXITSTATUS(ret) != 0)
415                 return EOF;
416 #else
417             /* What do under MinGW? -1 definitely means an error, but how do
418                we parse the other possible ret codes? */
419             if (ret == -1)
420                 return EOF;
421 #endif
422 
423         } else {
424 #ifdef  HAVE_GETRUSAGE
425             if (rr) memset(rr, 0, sizeof(*rr));
426 #endif
427             ret = fclose(f);
428             if (ret != 0)
429                 return ret;
430         }
431 
432         /* do the rename only if the child completed successfully */
433 
434         // coverity[fs_check_call]
435         if (stat(tempname, sbuf) == 0) {
436             ret = rename(tempname, name);
437             if (ret != 0) return EOF;
438         }
439 
440         return 0;
441     }
442     /* If we arrive here, it's because "" is not among the suffixes */
443     abort();
444     return EOF;
445 }
446 
447 int
fclose_maybe_compressed(FILE * f,const char * name)448 fclose_maybe_compressed (FILE * f, const char * name)
449 {
450     return fclose_maybe_compressed2(f, name, NULL);
451 }
452 
453 #include <stdexcept>
454 #include <ios>  // std::ios_base::openmode // IWYU pragma: keep
455 #include <fstream>  // filebuf
456 #include "portability.h" // strdup // IWYU pragma: keep
457 
/* Build the stream base for file name: open() selects and opens the
 * underlying stream buffer, which is then passed to init().
 * NOTE(review): init() is presumably std::basic_ios::init; the class
 * declaration is not visible here -- confirm. */
streambase_maybe_compressed::streambase_maybe_compressed(const char * name, std::ios_base::openmode mode)
{
    open(name, mode);
    init(buf);
}
463 
open(const char * name,std::ios_base::openmode mode)464 void streambase_maybe_compressed::open(const char * name, std::ios_base::openmode mode)
465 {
466     orig_name = name;
467     const struct suffix_handler * r = supported_compression_formats;
468     if (mode & std::ios_base::out && r->pfmt_out) {
469         // fmtlib's fmt::format oddly mentions that it can throw a format
470         // error, while its constexpr nature should be able to mark it as
471         // impossible.
472         // coverity[exception_thrown]
473         tempname = fmt::format(FMT_STRING("{}.tmp.{}"), name, getpid());
474         name = tempname.c_str();
475     }
476 
477     if (mode & std::ios_base::in && access(name, R_OK) != 0)
478         throw std::runtime_error("cannot open file for reading");
479     /* creating is ok, of course
480     if (mode & std::ios_base::out && access(name, W_OK) != 0)
481         throw std::runtime_error("cannot open file for writing");
482      */
483     for( ; r->suffix ; r++) {
484         if (!has_suffix(orig_name.c_str(), r->suffix)) continue;
485         char * command = NULL;
486         if (mode & std::ios_base::in && r->pfmt_in) {
487             int ret = asprintf(&command, r->pfmt_in, name);
488             ASSERT_ALWAYS(ret >= 0);
489         }
490         if (mode & std::ios_base::out && r->pfmt_out) {
491             int ret = asprintf(&command, r->pfmt_out, name);
492             ASSERT_ALWAYS(ret >= 0);
493         }
494 
495         if (command) {
496             /* apparently popen() under Linux does not accept the 'b' modifier */
497             pbuf.reset(new cado_pipe_streambuf(command, mode));
498             buf = pbuf.get();
499             pipe = true;
500             free(command);
501         } else {
502             fbuf.reset(new std::filebuf());
503             fbuf->open(name, mode);
504             buf = fbuf.get();
505             pipe = false;
506         }
507         /* hmmm */
508         return;
509     }
510 };
511 
close()512 void streambase_maybe_compressed::close()
513 {
514     if (pipe) pbuf->close();
515     else fbuf->close();
516     if (!tempname.empty()) {
517         int rc = rename(tempname.c_str(), orig_name.c_str());
518         ASSERT_ALWAYS(rc == 0);
519         tempname.clear();
520     }
521 }
522 
// Destructor: calls sync(), then close(), which closes the underlying
// buffer and renames the temporary output file, if any.
//
// we're in a dtor, exceptions can turn your computer into a coconut.
// yet we have an ASSERT_ALWAYS in close()
//
// NOTE(review): sync() is an unqualified call. If the class declares no
// sync() member, this resolves to the global sync() from <unistd.h>.
// The class declaration is not visible here -- confirm which one is
// meant.
// coverity[exn_spec_violation]
streambase_maybe_compressed::~streambase_maybe_compressed()
{
    sync();
    close();
}
531