1 /*
2 Author: James Bonfield
3 
4 Copyright (c) 2000-2001 MEDICAL RESEARCH COUNCIL
5 All rights reserved
6 
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
9 
10    1. Redistributions of source code must retain the above copyright notice,
11 this list of conditions and the following disclaimer.
12 
13    2. Redistributions in binary form must reproduce the above copyright notice,
14 this list of conditions and the following disclaimer in the documentation
15 and/or other materials provided with the distribution.
16 
17    3. Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF
18 MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or
19 promote products derived from this software without specific prior written
20 permission.
21 
22 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
23 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
24 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
26 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
27 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
29 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33 
34 /*
35 Copyright (c) 2008, 2009, 2013, 2014-2015, 2018-2020 Genome Research Ltd.
36 Author: James Bonfield <jkb@sanger.ac.uk>
37 
38 Redistribution and use in source and binary forms, with or without
39 modification, are permitted provided that the following conditions are met:
40 
41    1. Redistributions of source code must retain the above copyright notice,
42 this list of conditions and the following disclaimer.
43 
44    2. Redistributions in binary form must reproduce the above copyright notice,
45 this list of conditions and the following disclaimer in the documentation
46 and/or other materials provided with the distribution.
47 
48    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
49 Institute nor the names of its contributors may be used to endorse or promote
50 products derived from this software without specific prior written permission.
51 
52 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
53 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
54 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
55 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
56 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
58 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
59 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
60 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62 */
63 
64 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
65 #include <config.h>
66 
67 #include <stdlib.h>
68 #include <stdio.h>
69 #include <string.h>
70 #include <unistd.h>
71 #include <limits.h>
72 #include <errno.h>
73 #include <sys/types.h>
74 #include <sys/stat.h>
75 
76 #include "os.h"
77 #ifndef PATH_MAX
78 #  define PATH_MAX 1024
79 #endif
80 
81 #include "open_trace_file.h"
82 #include "misc.h"
83 #include "../htslib/hfile.h"
84 #include "../htslib/hts_log.h"
85 #include "../htslib/hts.h"
86 
/*
 * Tests whether 'fn' names a regular file.
 *
 * Returns non-zero if stat() succeeds on 'fn' and reports a regular file;
 *         0 otherwise (including when stat() itself fails).
 */
static int is_file(char *fn) {
    struct stat st;

    return stat(fn, &st) == 0 ? S_ISREG(st.st_mode) : 0;
}
95 
96 /*
97  * Tokenises the search path splitting on colons (unix) or semicolons
98  * (windows).
99  * We also  explicitly add a "./" to the end of the search path
100  *
101  * Returns: A new search path with items separated by nul chars. Two nul
102  *          chars in a row represent the end of the tokenised path.
103  * Returns NULL for a failure.
104  *
105  * The returned data has been malloced. It is up to the caller to free this
106  * memory.
107  */
tokenise_search_path(const char * searchpath)108 char *tokenise_search_path(const char *searchpath) {
109     char *newsearch;
110     unsigned int i, j;
111     size_t len;
112     char path_sep = HTS_PATH_SEPARATOR_CHAR;
113 
114     if (!searchpath)
115         searchpath="";
116 
117     newsearch = (char *)malloc((len = strlen(searchpath))+5);
118     if (!newsearch)
119         return NULL;
120 
121     for (i = 0, j = 0; i < len; i++) {
122         /* "::" => ":". Used for escaping colons in http://foo */
123         if (i < len-1 && searchpath[i] == ':' && searchpath[i+1] == ':') {
124             newsearch[j++] = ':';
125             i++;
126             continue;
127         }
128 
129         /* Handle http:// and ftp:// too without :: */
130         if (path_sep == ':') {
131             if ((i == 0 || (i > 0 && searchpath[i-1] == ':')) &&
132                 (!strncmp(&searchpath[i], "http:",     5) ||
133                  !strncmp(&searchpath[i], "https:",    6) ||
134                  !strncmp(&searchpath[i], "ftp:",      4) ||
135                  !strncmp(&searchpath[i], "|http:",    6) ||
136                  !strncmp(&searchpath[i], "|https:",   7) ||
137                  !strncmp(&searchpath[i], "|ftp:",     5) ||
138                  !strncmp(&searchpath[i], "URL=http:", 9) ||
139                  !strncmp(&searchpath[i], "URL=https:",10)||
140                  !strncmp(&searchpath[i], "URL=ftp:",  8))) {
141                 do {
142                     newsearch[j++] = searchpath[i];
143                 } while (i<len && searchpath[i++] != ':');
144                 if (searchpath[i] == ':')
145                     i++;
146                 if (searchpath[i]=='/')
147                     newsearch[j++] = searchpath[i++];
148                 if (searchpath[i]=='/')
149                     newsearch[j++] = searchpath[i++];
150                 // Look for host:port
151                 do {
152                     newsearch[j++] = searchpath[i++];
153                 } while (i<len && searchpath[i] != ':' && searchpath[i] != '/');
154                 newsearch[j++] = searchpath[i++];
155                 if (searchpath[i] == ':')
156                     i++;
157             }
158         }
159 
160         if (searchpath[i] == path_sep) {
161             /* Skip blank path components */
162             if (j && newsearch[j-1] != 0)
163                 newsearch[j++] = 0;
164         } else {
165             newsearch[j++] = searchpath[i];
166         }
167     }
168 
169     if (j)
170         newsearch[j++] = 0;
171     newsearch[j++] = '.';
172     newsearch[j++] = '/';
173     newsearch[j++] = 0;
174     newsearch[j++] = 0;
175 
176     return newsearch;
177 }
178 
179 static char *expand_path(const char *file, char *dirname, int max_s_digits);
180 
find_file_url(const char * file,char * url)181 mFILE *find_file_url(const char *file, char *url) {
182     char *path = NULL, buf[8192];
183     mFILE *mf = NULL;
184     ssize_t len;
185     hFILE *hf = NULL;
186 
187     /* Expand %s for the trace name.  Only one digit is allowed between
188        The % and s to avoid ambiguity with percent-encoded URLs */
189 
190     path = expand_path(file, url, 1);
191     if (!path)
192         return NULL;
193 
194     if (!(hf = hopen(path, "r"))) {
195         if (errno != ENOENT)
196             hts_log_warning("Failed to open reference \"%s\": %s", path, strerror(errno));
197         goto fail;
198     }
199 
200     if (NULL == (mf = mfcreate(NULL, 0)))
201         goto fail;
202     while ((len = hread(hf, buf, sizeof(buf))) > 0) {
203         if (mfwrite(buf, len, 1, mf) <= 0) {
204             hclose_abruptly(hf);
205             goto fail;
206         }
207     }
208     if (hclose(hf) < 0 || len < 0) {
209         hts_log_warning("Failed to read reference \"%s\": %s", path, strerror(errno));
210         goto fail;
211     }
212 
213     free(path);
214     mrewind(mf);
215     return mf;
216 
217  fail:
218     mfdestroy(mf);
219     free(path);
220     return NULL;
221 }
222 
/*
 * Takes a dirname possibly including % rules and appends the filename
 * to it.
 *
 * A single trailing '/' on dirname is ignored.  If file starts with '/',
 * or dirname is "." (or "./"), the result is simply a copy of file.
 *
 * Otherwise each "%s" in dirname is replaced by the remainder of file,
 * and each "%Ns" (N > 0) by the next N characters of file, or fewer if
 * file runs out.  The number is parsed with strtol(); a rule is only
 * honoured when the text between '%' and 's' is at most max_s_digits
 * characters long and the number is not negative.  Any other '%'
 * sequence is copied through unchanged, together with the character
 * that ended the number.  Whatever is left of file afterwards is
 * appended after a '/'.
 *
 * Returns expanded pathname or NULL for malloc failure.
 * The result is malloced; the caller must free it.
 */
static char *expand_path(const char *file, char *dirname, int max_s_digits) {
    size_t len = strlen(dirname);
    size_t lenf = strlen(file);
    const char *dir = dirname, *dir_end, *cp;
    char *path, *path_end;
    size_t n;

    /*
     * Each byte of dirname and of file is copied at most once, plus one
     * '/' and the terminating nul.
     */
    path = malloc(len+lenf+2); // worst expansion DIR/FILE
    if (!path) {
        hts_log_error("Out of memory");
        return NULL;
    }

    /* Drop one trailing slash; len may be zero so check before indexing */
    if (len > 0 && dirname[len-1] == '/')
        len--;
    dir_end = dirname + len;

    /* Special case for "./" or absolute filenames */
    if (*file == '/' || (len==1 && *dirname == '.')) {
        memcpy(path, file, lenf+1);
        return path;
    }

    /* Handle %[0-9]*s expansions, if required */
    path_end = path;
    while (dir < dir_end && (cp = memchr(dir, '%', dir_end - dir))) {
        char *endp;
        long l = strtol(cp+1, &endp, 10);

        if (*endp != 's' || l < 0 || endp - cp - 1 > max_s_digits) {
            /*
             * Not an expansion rule: copy it verbatim, including the
             * character that stopped the number, but never step beyond
             * the end of the (slash-trimmed) dirname.
             */
            const char *next = (endp < dir_end) ? endp+1 : dir_end;
            n = (size_t)(next - dir);
            memcpy(path_end, dir, n);
            path_end += n;
            dir = next;
            continue;
        }

        /* Literal text preceding the rule */
        n = (size_t)(cp - dir);
        memcpy(path_end, dir, n);
        path_end += n;

        /* %s takes all that remains of file; %Ns takes at most N chars */
        n = strlen(file);
        if (l > 0 && (unsigned long)l < n)
            n = (size_t)l;
        memcpy(path_end, file, n);
        path_end += n;
        file     += n;

        /* *endp == 's' here, so endp < dir_end and endp+1 is in range */
        dir = endp+1;
    }

    /* Remainder of dirname after the last '%' sequence */
    n = (size_t)(dir_end - dir);
    memcpy(path_end, dir, n);
    path_end += n;
    *path_end = 0;

    /* Any part of file not consumed by a rule goes on the end */
    if (*file) {
        *path_end++ = '/';
        strcpy(path_end, file);
    }

    //fprintf(stderr, "*PATH=\"%s\"\n", path);
    return path;
}
286 
287 /*
288  * Searches for file in the directory 'dirname'. If it finds it, it opens
289  * it. This also searches for compressed versions of the file in dirname
290  * too.
291  *
292  * Returns mFILE pointer if found
293  *         NULL if not
294  */
find_file_dir(const char * file,char * dirname)295 static mFILE *find_file_dir(const char *file, char *dirname) {
296     char *path;
297     mFILE *mf = NULL;
298 
299     path = expand_path(file, dirname, INT_MAX);
300     if (!path)
301         return NULL;
302 
303     if (is_file(path))
304         mf = mfopen(path, "rbm");
305 
306     free(path);
307     return mf;
308 }
309 
310 /*
311  * ------------------------------------------------------------------------
312  * Public functions below.
313  */
314 
315 /*
316  * Opens a trace file named 'file'. This is initially looked for as a
317  * pathname relative to a file named "relative_to". This may (for
318  * example) be the name of an experiment file referencing the trace
319  * file. In this case by passing relative_to as the experiment file
320  * filename the trace file will be picked up in the same directory as
321  * the experiment file. Relative_to may be supplied as NULL.
322  *
323  * 'file' is looked for at relative_to, then the current directory, and then
324  * all of the locations listed in 'path' (which is a colon separated list).
325  * If 'path' is NULL it uses the RAWDATA environment variable instead.
326  *
327  * Returns a mFILE pointer when found.
328  *           NULL otherwise.
329  */
open_path_mfile(const char * file,char * path,char * relative_to)330 mFILE *open_path_mfile(const char *file, char *path, char *relative_to) {
331     char *newsearch;
332     char *ele;
333     mFILE *fp;
334 
335     /* Use path first */
336     if (!path)
337         path = getenv("RAWDATA");
338     if (NULL == (newsearch = tokenise_search_path(path)))
339         return NULL;
340 
341     /*
342      * Step through the search path testing out each component.
343      * We now look through each path element treating some prefixes as
344      * special, otherwise we treat the element as a directory.
345      */
346     for (ele = newsearch; *ele; ele += strlen(ele)+1) {
347         char *ele2;
348 
349         /*
350          * '|' prefixing a path component indicates that we do not
351          * wish to perform the compression extension searching in that
352          * location.
353          *
354          * NB: this has been removed from the htslib implementation.
355          */
356         if (*ele == '|') {
357             ele2 = ele+1;
358         } else {
359             ele2 = ele;
360         }
361 
362         if (0 == strncmp(ele2, "URL=", 4)) {
363             if ((fp = find_file_url(file, ele2+4))) {
364                 free(newsearch);
365                 return fp;
366             }
367         } else if (!strncmp(ele2, "http:", 5) ||
368                    !strncmp(ele2, "https:", 6) ||
369                    !strncmp(ele2, "ftp:", 4)) {
370             if ((fp = find_file_url(file, ele2))) {
371                 free(newsearch);
372                 return fp;
373             }
374         } else if ((fp = find_file_dir(file, ele2))) {
375             free(newsearch);
376             return fp;
377         }
378     }
379 
380     free(newsearch);
381 
382     /* Look in the same location as the incoming 'relative_to' filename */
383     if (relative_to) {
384         char *cp;
385         char relative_path[PATH_MAX+1];
386         strcpy(relative_path, relative_to);
387         if ((cp = strrchr(relative_path, '/')))
388             *cp = 0;
389         if ((fp = find_file_dir(file, relative_path)))
390             return fp;
391     }
392 
393     return NULL;
394 }
395 
396 
/*
 * As per open_path_mfile, but searching only for local filenames.
 * This is useful as we may avoid doing a full mfopen and loading
 * the entire file into memory.
 *
 * Elements of 'path' starting with "URL=", "http:", "https:" or "ftp:"
 * (after an optional leading '|') are skipped.  If 'path' is NULL the
 * RAWDATA environment variable is used instead.
 *
 * Returns the expanded pathname if found.  It has been malloced and
 *         must be freed by the caller.
 *         NULL if not found or on memory allocation failure.
 */
char *find_path(const char *file, const char *path) {
    char *newsearch;
    char *ele;
    char *outpath = NULL;

    /* Use path first */
    if (!path)
        path = getenv("RAWDATA");
    if (NULL == (newsearch = tokenise_search_path(path)))
        return NULL;

    for (ele = newsearch; *ele; ele += strlen(ele)+1) {
        char *ele2 = (*ele == '|') ? ele+1 : ele;

        if (!strncmp(ele2, "URL=", 4) ||
            !strncmp(ele2, "http:", 5) ||
            !strncmp(ele2, "https:", 6) ||
            !strncmp(ele2, "ftp:", 4)) {
            continue;
        }

        outpath = expand_path(file, ele2, INT_MAX);
        if (!outpath) {
            /* Allocation failed; do not hand a NULL path to is_file() */
            break;
        }

        if (is_file(outpath)) {
            free(newsearch);
            return outpath;
        }
        free(outpath);
    }

    free(newsearch);

    return NULL;
}
439