1 /*
2 Author: James Bonfield
3 
4 Copyright (c) 2000-2001 MEDICAL RESEARCH COUNCIL
5 All rights reserved
6 
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
9 
10    1. Redistributions of source code must retain the above copyright notice,
11 this list of conditions and the following disclaimer.
12 
13    2. Redistributions in binary form must reproduce the above copyright notice,
14 this list of conditions and the following disclaimer in the documentation
15 and/or other materials provided with the distribution.
16 
17    3. Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF
18 MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or
19 promote products derived from this software without specific prior written
20 permission.
21 
22 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
23 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
24 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
26 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
27 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
29 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33 
34 /*
35 Copyright (c) 2008, 2009, 2013, 2014 Genome Research Ltd.
36 Author: James Bonfield <jkb@sanger.ac.uk>
37 
38 Redistribution and use in source and binary forms, with or without
39 modification, are permitted provided that the following conditions are met:
40 
41    1. Redistributions of source code must retain the above copyright notice,
42 this list of conditions and the following disclaimer.
43 
44    2. Redistributions in binary form must reproduce the above copyright notice,
45 this list of conditions and the following disclaimer in the documentation
46 and/or other materials provided with the distribution.
47 
48    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
49 Institute nor the names of its contributors may be used to endorse or promote
50 products derived from this software without specific prior written permission.
51 
52 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
53 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
54 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
55 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
56 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
58 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
59 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
60 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62 */
63 
64 #include <config.h>
65 
66 #include <stdlib.h>
67 #include <stdio.h>
68 #include <string.h>
69 #include <unistd.h>
70 #include <limits.h>
71 #include <sys/types.h>
72 #include <sys/stat.h>
73 #include "cram/os.h"
74 #ifndef PATH_MAX
75 #  define PATH_MAX 1024
76 #endif
77 
78 #include "cram/open_trace_file.h"
79 #include "cram/misc.h"
80 #include "htslib/hfile.h"
81 
/*
 * Tokenises the search path, splitting on colons (unix) or semicolons
 * (windows). A "./" element is always appended to the end of the
 * search path.
 *
 * A doubled colon "::" is an escape for one literal ':'. When ':' is the
 * separator, elements starting with http:, ftp:, |http:, |ftp:, URL=http:
 * or URL=ftp: are recognised so that the colons in "scheme://host:port"
 * are not treated as separators. Blank path components are skipped.
 *
 * searchpath may be NULL, which is treated as an empty search path.
 *
 * Returns: A new search path with items separated by nul chars. Two nul
 *          chars in a row represent the end of the tokenised path.
 * Returns NULL for a failure.
 *
 * The returned data has been malloced. It is up to the caller to free this
 * memory.
 */
char *tokenise_search_path(char *searchpath) {
    char *newsearch;
    size_t i, j;
    size_t len;
#ifdef _WIN32
    char path_sep = ';';
#else
    char path_sep = ':';
#endif

    if (!searchpath)
        searchpath = "";

    /*
     * Every byte written inside the loop is paired with at least one
     * consumed input byte, so the elements need at most len bytes. The
     * extra 5 cover a terminating nul, "./" and the final two nuls.
     */
    len = strlen(searchpath);
    newsearch = malloc(len + 5);
    if (!newsearch)
        return NULL;

    for (i = 0, j = 0; i < len; i++) {
        /* "::" => ":". Used for escaping colons in http://foo */
        if (i < len-1 && searchpath[i] == ':' && searchpath[i+1] == ':') {
            newsearch[j++] = ':';
            i++;
            continue;
        }

        /* Handle http:// and ftp:// too without :: */
        if (path_sep == ':') {
            if ((i == 0 || searchpath[i-1] == ':') &&
                (!strncmp(&searchpath[i], "http:",     5) ||
                 !strncmp(&searchpath[i], "ftp:",      4) ||
                 !strncmp(&searchpath[i], "|http:",    6) ||
                 !strncmp(&searchpath[i], "|ftp:",     5) ||
                 !strncmp(&searchpath[i], "URL=http:", 9) ||
                 !strncmp(&searchpath[i], "URL=ftp:",  8))) {
                /* Copy the prefix up to and including its ':' */
                while (i < len) {
                    char c = searchpath[i++];
                    newsearch[j++] = c;
                    if (c == ':')
                        break;
                }
                /* Tolerate the escaped form "http:://" */
                if (i < len && searchpath[i] == ':')
                    i++;
                if (i < len && searchpath[i] == '/')
                    newsearch[j++] = searchpath[i++];
                if (i < len && searchpath[i] == '/')
                    newsearch[j++] = searchpath[i++];
                // Look for host:port
                if (i < len) {
                    do {
                        newsearch[j++] = searchpath[i++];
                    } while (i < len &&
                             searchpath[i] != ':' && searchpath[i] != '/');
                }
                /* Keep the ':' or '/' that ended the host name */
                if (i < len)
                    newsearch[j++] = searchpath[i++];
                if (i < len && searchpath[i] == ':')
                    i++;

                /*
                 * The URL consumed the rest of the input. Stop here so
                 * that nothing beyond the terminating nul is read and no
                 * spurious empty element is emitted.
                 */
                if (i >= len)
                    break;
            }
        }

        if (searchpath[i] == path_sep) {
            /* Skip blank path components */
            if (j && newsearch[j-1] != 0)
                newsearch[j++] = 0;
        } else {
            newsearch[j++] = searchpath[i];
        }
    }

    /* Terminate the last element (if any), then append "./" and the end. */
    if (j)
        newsearch[j++] = 0;
    newsearch[j++] = '.';
    newsearch[j++] = '/';
    newsearch[j++] = 0;
    newsearch[j++] = 0;

    return newsearch;
}
165 
find_file_url(char * file,char * url)166 mFILE *find_file_url(char *file, char *url) {
167     char buf[8192], *cp;
168     mFILE *mf = NULL;
169     int maxlen = 8190 - strlen(file), len;
170     hFILE *hf;
171 
172     /* Expand %s for the trace name */
173     for (cp = buf; *url && cp - buf < maxlen; url++) {
174 	if (*url == '%' && *(url+1) == 's') {
175 	    url++;
176 	    cp += strlen(strcpy(cp, file));
177 	} else {
178 	    *cp++ = *url;
179 	}
180     }
181     *cp++ = 0;
182 
183     if (!(hf = hopen(buf, "r")))
184 	return NULL;
185 
186     if (NULL == (mf = mfcreate(NULL, 0)))
187 	return NULL;
188     while ((len = hread(hf, buf, 8192)) > 0) {
189 	if (mfwrite(buf, len, 1, mf) <= 0) {
190 	    hclose_abruptly(hf);
191 	    mfdestroy(mf);
192 	    return NULL;
193 	}
194     }
195     if (hclose(hf) < 0 || len < 0) {
196 	mfdestroy(mf);
197 	return NULL;
198     }
199 
200     mrewind(mf);
201     return mf;
202 }
203 
/*
 * Takes a dirname possibly including % rules and appends the filename
 * to it.
 *
 * A single trailing '/' on dirname is ignored. If file starts with '/',
 * or dirname is "." (or "./"), the result is simply a copy of file.
 *
 * Otherwise each "%s" in dirname is replaced by whatever is left of file,
 * and each "%<N>s" by at most the next N characters of file; the
 * characters used are removed from file. Any other '%' sequence, and a
 * negative width, is copied through unchanged. If any of file is left
 * once dirname has been processed it is appended after a '/'.
 * An empty dirname therefore yields "/" followed by file.
 *
 * Returns expanded pathname or NULL for malloc failure.
 * The result is malloced; the caller must free it.
 */
static char *expand_path(char *file, char *dirname) {
    size_t len = strlen(dirname);
    size_t lenf = strlen(file);
    char *path, *path_end, *dir_end, *cp;
    size_t n;

    /*
     * Expansion only ever drops the '%', digits and 's' characters and
     * never uses a character of file twice, so DIR/FILE is the worst case.
     */
    path = malloc(len+lenf+2);
    if (!path)
        return NULL;

    /* len may be zero, so check before looking at the last character */
    if (len > 0 && dirname[len-1] == '/')
        len--;

    /* Special case for "./" or absolute filenames */
    if (*file == '/' || (len == 1 && *dirname == '.')) {
        memcpy(path, file, lenf + 1);
        return path;
    }

    /* Handle %[0-9]*s expansions, if required */
    dir_end = dirname + len;   /* excludes any trailing '/' */
    path_end = path;
    while ((cp = memchr(dirname, '%', (size_t)(dir_end - dirname)))) {
        char *endp;
        long l = strtol(cp+1, &endp, 10);

        if (*endp != 's' || l < 0) {
            /*
             * Not an expansion: copy it verbatim, including the character
             * that stopped the parse, but never step beyond dir_end.
             */
            char *stop = endp < dir_end ? endp+1 : dir_end;
            n = (size_t)(stop - dirname);
            memcpy(path_end, dirname, n);
            path_end += n;
            dirname = stop;
            continue;
        }

        /* Literal text preceding the '%' */
        n = (size_t)(cp - dirname);
        memcpy(path_end, dirname, n);
        path_end += n;

        /* A width of 0 (plain "%s") means all of the remaining file */
        n = strlen(file);
        if (l > 0 && (size_t)l < n)
            n = (size_t)l;
        memcpy(path_end, file, n);
        path_end += n;
        file     += n;

        dirname = endp+1;
    }

    /* Remainder of dirname after the last '%' rule */
    n = (size_t)(dir_end - dirname);
    memcpy(path_end, dirname, n);
    path_end += n;
    *path_end = 0;

    if (*file) {
        *path_end++ = '/';
        strcpy(path_end, file);
    }

    //fprintf(stderr, "*PATH=\"%s\"\n", path);
    return path;
}
265 
266 /*
267  * Searches for file in the directory 'dirname'. If it finds it, it opens
268  * it. This also searches for compressed versions of the file in dirname
269  * too.
270  *
271  * Returns mFILE pointer if found
272  *         NULL if not
273  */
find_file_dir(char * file,char * dirname)274 static mFILE *find_file_dir(char *file, char *dirname) {
275     char *path;
276     mFILE *mf = NULL;
277 
278     path = expand_path(file, dirname);
279 
280     if (is_file(path))
281 	mf = mfopen(path, "rbm");
282 
283     free(path);
284     return mf;
285 }
286 
287 /*
288  * ------------------------------------------------------------------------
289  * Public functions below.
290  */
291 
292 /*
293  * Opens a trace file named 'file'. This is initially looked for as a
294  * pathname relative to a file named "relative_to". This may (for
295  * example) be the name of an experiment file referencing the trace
296  * file. In this case by passing relative_to as the experiment file
297  * filename the trace file will be picked up in the same directory as
298  * the experiment file. Relative_to may be supplied as NULL.
299  *
300  * 'file' is looked for at relative_to, then the current directory, and then
301  * all of the locations listed in 'path' (which is a colon separated list).
302  * If 'path' is NULL it uses the RAWDATA environment variable instead.
303  *
304  * Returns a mFILE pointer when found.
305  *           NULL otherwise.
306  */
open_path_mfile(char * file,char * path,char * relative_to)307 mFILE *open_path_mfile(char *file, char *path, char *relative_to) {
308     char *newsearch;
309     char *ele;
310     mFILE *fp;
311 
312     /* Use path first */
313     if (!path)
314 	path = getenv("RAWDATA");
315     if (NULL == (newsearch = tokenise_search_path(path)))
316 	return NULL;
317 
318     /*
319      * Step through the search path testing out each component.
320      * We now look through each path element treating some prefixes as
321      * special, otherwise we treat the element as a directory.
322      */
323     for (ele = newsearch; *ele; ele += strlen(ele)+1) {
324 	char *ele2;
325 
326 	/*
327 	 * '|' prefixing a path component indicates that we do not
328 	 * wish to perform the compression extension searching in that
329 	 * location.
330 	 *
331 	 * NB: this has been removed from the htslib implementation.
332 	 */
333 	if (*ele == '|') {
334 	    ele2 = ele+1;
335 	} else {
336 	    ele2 = ele;
337 	}
338 
339 	if (0 == strncmp(ele2, "URL=", 4)) {
340 	    if ((fp = find_file_url(file, ele2+4))) {
341 		free(newsearch);
342 		return fp;
343 	    }
344 	} else if (!strncmp(ele2, "http:", 5) ||
345 		   !strncmp(ele2, "ftp:", 4)) {
346 	    if ((fp = find_file_url(file, ele2))) {
347 		free(newsearch);
348 		return fp;
349 	    }
350 	} else if ((fp = find_file_dir(file, ele2))) {
351 	    free(newsearch);
352 	    return fp;
353 	}
354     }
355 
356     free(newsearch);
357 
358     /* Look in the same location as the incoming 'relative_to' filename */
359     if (relative_to) {
360 	char *cp;
361 	char relative_path[PATH_MAX+1];
362 	strcpy(relative_path, relative_to);
363 	if ((cp = strrchr(relative_path, '/')))
364 	    *cp = 0;
365 	if ((fp = find_file_dir(file, relative_path)))
366 	    return fp;
367     }
368 
369     return NULL;
370 }
371 
372 
/*
 * As per open_path_mfile, but searching only for local filenames.
 * This is useful as we may avoid doing a full mfopen and loading
 * the entire file into memory.
 *
 * Elements of 'path' starting with "URL=", "http:" or "ftp:" are ignored
 * and a leading '|' on an element is skipped. If 'path' is NULL the
 * RAWDATA environment variable is used instead.
 *
 * Returns the expanded pathname if found; it is malloced and must be
 *         freed by the caller.
 *         NULL if not found, or if memory could not be allocated.
 */
char *find_path(char *file, char *path) {
    char *newsearch;
    char *ele;

    /* Use path first */
    if (!path)
        path = getenv("RAWDATA");
    if (NULL == (newsearch = tokenise_search_path(path)))
        return NULL;

    for (ele = newsearch; *ele; ele += strlen(ele)+1) {
        char *ele2 = (*ele == '|') ? ele+1 : ele;
        char *outpath;

        /* Remote locations cannot be returned as a local pathname */
        if (!strncmp(ele2, "URL=", 4) ||
            !strncmp(ele2, "http:", 5) ||
            !strncmp(ele2, "ftp:", 4))
            continue;

        outpath = expand_path(file, ele2);
        if (!outpath)
            break;  /* out of memory; further elements would fail too */

        if (is_file(outpath)) {
            free(newsearch);
            return outpath;
        }
        free(outpath);
    }

    free(newsearch);

    return NULL;
}
414