1 /** Uncompress input files using pipes.
2  * Hook the standard file opening functions, open, fopen and fopen64.
3  * If the extension of the file being opened indicates the file is
4  * compressed (.gz, .bz2, .xz), open a pipe to a program that
5  * decompresses that file (gunzip, bunzip2 or xzdec) and return a
6  * handle to the open pipe.
7  * @author Shaun Jackman <sjackman@bcgsc.ca>
8  */
9 
10 #include "config.h"
11 #if HAVE_LIBDL
12 
#include "Fcontrol.h"
#include "SignalHandler.h"
#include "StringUtil.h"
#include <cassert>
#include <cerrno>
#include <cstdio> // for perror
#include <cstdlib>
#include <dlfcn.h>
#include <fcntl.h> // for O_ACCMODE, O_RDONLY
#include <string>
#include <unistd.h>
22 
23 using namespace std;
24 
wgetExec(const string & path)25 static const char* wgetExec(const string& path)
26 {
27 	return
28 		startsWith(path, "http://") ? "wget -O-" :
29 		startsWith(path, "https://") ? "wget -O-" :
30 		startsWith(path, "ftp://") ? "wget -O-" :
31 		NULL;
32 }
33 
zcatExec(const string & path)34 static const char* zcatExec(const string& path)
35 {
36 	return
37 		endsWith(path, ".ar") ? "ar -p" :
38 		endsWith(path, ".tar") ? "tar -xOf" :
39 		endsWith(path, ".tar.Z") ? "tar -zxOf" :
40 		endsWith(path, ".tar.gz") ? "tar -zxOf" :
41 		endsWith(path, ".tar.bz2") ? "tar -jxOf" :
42 		endsWith(path, ".tar.xz") ?
43 			"tar --use-compress-program=xzdec -xOf" :
44 		endsWith(path, ".Z") ? "gunzip -c" :
45 		endsWith(path, ".gz") ? "gunzip -c" :
46 		endsWith(path, ".bz2") ? "bunzip2 -c" :
47 		endsWith(path, ".xz") ? "xzdec -c" :
48 		endsWith(path, ".zip") ? "unzip -p" :
49 		endsWith(path, ".bam") ? "samtools view -h" :
50 		endsWith(path, ".cram") ? "samtools view -h" :
51 		endsWith(path, ".jf") ? "jellyfish dump" :
52 		endsWith(path, ".jfq") ? "jellyfish qdump" :
53 		endsWith(path, ".sra") ? "fastq-dump -Z --split-spot" :
54 		endsWith(path, ".url") ? "wget -O- -i" :
55 	        endsWith(path, ".fqz") ? "fqz_comp -d" :
56 		NULL;
57 }
58 
59 extern "C" {
60 
61 /** Open a pipe to uncompress the specified file.
62  * Not thread safe.
63  * @return a file descriptor
64  */
uncompress(const char * path)65 static int uncompress(const char *path)
66 {
67 	const char *wget = wgetExec(path);
68 	const char *zcat = wget != NULL ? wget : zcatExec(path);
69 	assert(zcat != NULL);
70 
71 	int fd[2];
72 	if (pipe(fd) == -1)
73 		return -1;
74 	int err = setCloexec(fd[0]);
75 	assert(err == 0);
76 	(void)err;
77 
78 	char arg0[16], arg1[16], arg2[16];
79 	int n = sscanf(zcat, "%s %s %s", arg0, arg1, arg2);
80 	assert(n == 2 || n == 3);
81 
82 	/* It would be more portable to use fork than vfork, but fork can
83 	 * fail with ENOMEM when the process calling fork is using a lot
84 	 * of memory. A workaround for this problem is to set
85 	 * sysctl vm.overcommit_memory=1
86 	 */
87 #if HAVE_WORKING_VFORK
88 	pid_t pid = vfork();
89 #else
90 	pid_t pid = fork();
91 #endif
92 	if (pid == -1)
93 		return -1;
94 
95 	if (pid == 0) {
96 		dup2(fd[1], STDOUT_FILENO);
97 		close(fd[1]);
98 		if (n == 2)
99 			execlp(arg0, arg0, arg1, path, NULL);
100 		else
101 			execlp(arg0, arg0, arg1, arg2, path, NULL);
102 		// Calling perror after vfork is not allowed, but we're about
103 		// to exit and an error message would be really helpful.
104 		perror(arg0);
105 		_exit(EXIT_FAILURE);
106 	} else {
107 		close(fd[1]);
108 		return fd[0];
109 	}
110 }
111 
112 /** Open a pipe to uncompress the specified file.
113  * @return a FILE pointer
114  */
funcompress(const char * path)115 static FILE* funcompress(const char* path)
116 {
117 	int fd = uncompress(path);
118 	if (fd == -1) {
119 		perror(path);
120 		exit(EXIT_FAILURE);
121 	}
122 	return fdopen(fd, "r");
123 }
124 
125 typedef FILE* (*fopen_t)(const char *path, const char *mode);
126 
127 /** If the specified file is compressed, return a pipe that
128  * uncompresses it.
129  */
fopen(const char * path,const char * mode)130 FILE *fopen(const char *path, const char *mode)
131 {
132 	static fopen_t real_fopen;
133 	if (real_fopen == NULL)
134 		real_fopen = (fopen_t)dlsym(RTLD_NEXT, "fopen");
135 	if (real_fopen == NULL) {
136 		fprintf(stderr, "error: dlsym fopen: %s\n", dlerror());
137 		exit(EXIT_FAILURE);
138 	}
139 
140 	// open a web address
141 	if (wgetExec(path) != NULL)
142 		return funcompress(path);
143 
144 	// to check if the file exists, we need to attempt to open it
145 	FILE* stream = real_fopen(path, mode);
146 	if (string(mode) != "r" || !stream || zcatExec(path) == NULL)
147 		return stream;
148 	else {
149 		fclose(stream);
150 		return funcompress(path);
151 	}
152 }
153 
154 /** If the specified file is compressed, return a pipe that
155  * uncompresses it.
156  */
fopen64(const char * path,const char * mode)157 FILE *fopen64(const char *path, const char *mode)
158 {
159 	static fopen_t real_fopen64;
160 	if (real_fopen64 == NULL)
161 		real_fopen64 = (fopen_t)dlsym(RTLD_NEXT, "fopen64");
162 	if (real_fopen64 == NULL) {
163 		fprintf(stderr, "error: dlsym fopen64: %s\n", dlerror());
164 		exit(EXIT_FAILURE);
165 	}
166 
167 	// open a web address
168 	if (wgetExec(path) != NULL)
169 		return funcompress(path);
170 
171 	// to check if the file exists, we need to attempt to open it
172 	FILE* stream = real_fopen64(path, mode);
173 	if (string(mode) != "r" || !stream || zcatExec(path) == NULL)
174 		return stream;
175 	else {
176 		fclose(stream);
177 		return funcompress(path);
178 	}
179 }
180 
181 typedef int (*open_t)(const char *path, int flags, mode_t mode);
182 
183 /** If the specified file is compressed, return a pipe that
184  * uncompresses it.
185  */
open(const char * path,int flags,mode_t mode)186 int open(const char *path, int flags, mode_t mode)
187 {
188 	static open_t real_open;
189 	if (real_open == NULL)
190 		real_open = (open_t)dlsym(RTLD_NEXT, "open");
191 	if (real_open == NULL) {
192 		fprintf(stderr, "error: dlsym open: %s\n", dlerror());
193 		exit(EXIT_FAILURE);
194 	}
195 
196 	// open a web address
197 	if (wgetExec(path) != NULL)
198 		return uncompress(path);
199 
200 	// to check if the file exists, we need to attempt to open it
201 	int filedesc = real_open(path, flags, mode);
202 	if (mode != ios_base::in || filedesc < 0
203 			|| zcatExec(path) == NULL)
204 		return filedesc;
205 	else {
206 		close(filedesc);
207 		return uncompress(path);
208 	}
209 }
210 
211 } // extern "C"
212 
213 #endif // HAVE_LIBDL
214 
/** Initialize the uncompress module.
 * When built with HAVE_LIBDL, calls signalInit().
 * @return true if the file-opening hooks are compiled in
 * (HAVE_LIBDL is defined and non-zero), false otherwise
 */
bool uncompress_init()
{
	// Return from inside the preprocessor branches: HAVE_LIBDL may
	// be undefined rather than 0, in which case it can be tested by
	// #if but cannot appear in an expression.
#if HAVE_LIBDL
	signalInit();
	return true;
#else
	return false;
#endif
}
223