1 /** Uncompress input files using pipes.
2 * Hook the standard file opening functions, open, fopen and fopen64.
3 * If the extension of the file being opened indicates the file is
4 * compressed (.gz, .bz2, .xz), open a pipe to a program that
5 * decompresses that file (gunzip, bunzip2 or xzdec) and return a
6 * handle to the open pipe.
7 * @author Shaun Jackman <sjackman@bcgsc.ca>
8 */
9
10 #include "config.h"
11 #if HAVE_LIBDL
12
13 #include "Fcontrol.h"
14 #include "SignalHandler.h"
15 #include "StringUtil.h"
16 #include <cassert>
17 #include <cstdio> // for perror
18 #include <cstdlib>
19 #include <dlfcn.h>
20 #include <string>
21 #include <unistd.h>
22
23 using namespace std;
24
wgetExec(const string & path)25 static const char* wgetExec(const string& path)
26 {
27 return
28 startsWith(path, "http://") ? "wget -O-" :
29 startsWith(path, "https://") ? "wget -O-" :
30 startsWith(path, "ftp://") ? "wget -O-" :
31 NULL;
32 }
33
zcatExec(const string & path)34 static const char* zcatExec(const string& path)
35 {
36 return
37 endsWith(path, ".ar") ? "ar -p" :
38 endsWith(path, ".tar") ? "tar -xOf" :
39 endsWith(path, ".tar.Z") ? "tar -zxOf" :
40 endsWith(path, ".tar.gz") ? "tar -zxOf" :
41 endsWith(path, ".tar.bz2") ? "tar -jxOf" :
42 endsWith(path, ".tar.xz") ?
43 "tar --use-compress-program=xzdec -xOf" :
44 endsWith(path, ".Z") ? "gunzip -c" :
45 endsWith(path, ".gz") ? "gunzip -c" :
46 endsWith(path, ".bz2") ? "bunzip2 -c" :
47 endsWith(path, ".xz") ? "xzdec -c" :
48 endsWith(path, ".zip") ? "unzip -p" :
49 endsWith(path, ".bam") ? "samtools view -h" :
50 endsWith(path, ".cram") ? "samtools view -h" :
51 endsWith(path, ".jf") ? "jellyfish dump" :
52 endsWith(path, ".jfq") ? "jellyfish qdump" :
53 endsWith(path, ".sra") ? "fastq-dump -Z --split-spot" :
54 endsWith(path, ".url") ? "wget -O- -i" :
55 endsWith(path, ".fqz") ? "fqz_comp -d" :
56 NULL;
57 }
58
59 extern "C" {
60
61 /** Open a pipe to uncompress the specified file.
62 * Not thread safe.
63 * @return a file descriptor
64 */
uncompress(const char * path)65 static int uncompress(const char *path)
66 {
67 const char *wget = wgetExec(path);
68 const char *zcat = wget != NULL ? wget : zcatExec(path);
69 assert(zcat != NULL);
70
71 int fd[2];
72 if (pipe(fd) == -1)
73 return -1;
74 int err = setCloexec(fd[0]);
75 assert(err == 0);
76 (void)err;
77
78 char arg0[16], arg1[16], arg2[16];
79 int n = sscanf(zcat, "%s %s %s", arg0, arg1, arg2);
80 assert(n == 2 || n == 3);
81
82 /* It would be more portable to use fork than vfork, but fork can
83 * fail with ENOMEM when the process calling fork is using a lot
84 * of memory. A workaround for this problem is to set
85 * sysctl vm.overcommit_memory=1
86 */
87 #if HAVE_WORKING_VFORK
88 pid_t pid = vfork();
89 #else
90 pid_t pid = fork();
91 #endif
92 if (pid == -1)
93 return -1;
94
95 if (pid == 0) {
96 dup2(fd[1], STDOUT_FILENO);
97 close(fd[1]);
98 if (n == 2)
99 execlp(arg0, arg0, arg1, path, NULL);
100 else
101 execlp(arg0, arg0, arg1, arg2, path, NULL);
102 // Calling perror after vfork is not allowed, but we're about
103 // to exit and an error message would be really helpful.
104 perror(arg0);
105 _exit(EXIT_FAILURE);
106 } else {
107 close(fd[1]);
108 return fd[0];
109 }
110 }
111
112 /** Open a pipe to uncompress the specified file.
113 * @return a FILE pointer
114 */
funcompress(const char * path)115 static FILE* funcompress(const char* path)
116 {
117 int fd = uncompress(path);
118 if (fd == -1) {
119 perror(path);
120 exit(EXIT_FAILURE);
121 }
122 return fdopen(fd, "r");
123 }
124
125 typedef FILE* (*fopen_t)(const char *path, const char *mode);
126
127 /** If the specified file is compressed, return a pipe that
128 * uncompresses it.
129 */
fopen(const char * path,const char * mode)130 FILE *fopen(const char *path, const char *mode)
131 {
132 static fopen_t real_fopen;
133 if (real_fopen == NULL)
134 real_fopen = (fopen_t)dlsym(RTLD_NEXT, "fopen");
135 if (real_fopen == NULL) {
136 fprintf(stderr, "error: dlsym fopen: %s\n", dlerror());
137 exit(EXIT_FAILURE);
138 }
139
140 // open a web address
141 if (wgetExec(path) != NULL)
142 return funcompress(path);
143
144 // to check if the file exists, we need to attempt to open it
145 FILE* stream = real_fopen(path, mode);
146 if (string(mode) != "r" || !stream || zcatExec(path) == NULL)
147 return stream;
148 else {
149 fclose(stream);
150 return funcompress(path);
151 }
152 }
153
154 /** If the specified file is compressed, return a pipe that
155 * uncompresses it.
156 */
fopen64(const char * path,const char * mode)157 FILE *fopen64(const char *path, const char *mode)
158 {
159 static fopen_t real_fopen64;
160 if (real_fopen64 == NULL)
161 real_fopen64 = (fopen_t)dlsym(RTLD_NEXT, "fopen64");
162 if (real_fopen64 == NULL) {
163 fprintf(stderr, "error: dlsym fopen64: %s\n", dlerror());
164 exit(EXIT_FAILURE);
165 }
166
167 // open a web address
168 if (wgetExec(path) != NULL)
169 return funcompress(path);
170
171 // to check if the file exists, we need to attempt to open it
172 FILE* stream = real_fopen64(path, mode);
173 if (string(mode) != "r" || !stream || zcatExec(path) == NULL)
174 return stream;
175 else {
176 fclose(stream);
177 return funcompress(path);
178 }
179 }
180
181 typedef int (*open_t)(const char *path, int flags, mode_t mode);
182
183 /** If the specified file is compressed, return a pipe that
184 * uncompresses it.
185 */
open(const char * path,int flags,mode_t mode)186 int open(const char *path, int flags, mode_t mode)
187 {
188 static open_t real_open;
189 if (real_open == NULL)
190 real_open = (open_t)dlsym(RTLD_NEXT, "open");
191 if (real_open == NULL) {
192 fprintf(stderr, "error: dlsym open: %s\n", dlerror());
193 exit(EXIT_FAILURE);
194 }
195
196 // open a web address
197 if (wgetExec(path) != NULL)
198 return uncompress(path);
199
200 // to check if the file exists, we need to attempt to open it
201 int filedesc = real_open(path, flags, mode);
202 if (mode != ios_base::in || filedesc < 0
203 || zcatExec(path) == NULL)
204 return filedesc;
205 else {
206 close(filedesc);
207 return uncompress(path);
208 }
209 }
210
211 } // extern "C"
212
213 #endif // HAVE_LIBDL
214
/** Initialize the uncompress module.
 * Calls signalInit() when the file-opening hooks are compiled in.
 * @return true if the hooks are compiled in (HAVE_LIBDL is defined
 * and non-zero), false otherwise
 */
bool uncompress_init()
{
	// HAVE_LIBDL may be undefined rather than 0, so it can be tested
	// by the preprocessor but must not appear in an expression.
#if HAVE_LIBDL
	signalInit();
	return true;
#else
	return false;
#endif
}
223