1 /*
2 Author: James Bonfield
3
4 Copyright (c) 2000-2001 MEDICAL RESEARCH COUNCIL
5 All rights reserved
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
9
10 1. Redistributions of source code must retain the above copyright notice,
11 this list of conditions and the following disclaimer.
12
13 2. Redistributions in binary form must reproduce the above copyright notice,
14 this list of conditions and the following disclaimer in the documentation
15 and/or other materials provided with the distribution.
16
17 3. Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF
18 MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or
19 promote products derived from this software without specific prior written
20 permission.
21
22 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
23 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
24 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
26 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
27 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
29 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 /*
35 Copyright (c) 2008, 2009, 2013, 2014-2015, 2018-2020 Genome Research Ltd.
36 Author: James Bonfield <jkb@sanger.ac.uk>
37
38 Redistribution and use in source and binary forms, with or without
39 modification, are permitted provided that the following conditions are met:
40
41 1. Redistributions of source code must retain the above copyright notice,
42 this list of conditions and the following disclaimer.
43
44 2. Redistributions in binary form must reproduce the above copyright notice,
45 this list of conditions and the following disclaimer in the documentation
46 and/or other materials provided with the distribution.
47
48 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
49 Institute nor the names of its contributors may be used to endorse or promote
50 products derived from this software without specific prior written permission.
51
52 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
53 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
54 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
55 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
56 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
58 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
59 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
60 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62 */
63
64 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
65 #include <config.h>
66
67 #include <stdlib.h>
68 #include <stdio.h>
69 #include <string.h>
70 #include <unistd.h>
71 #include <limits.h>
72 #include <errno.h>
73 #include <sys/types.h>
74 #include <sys/stat.h>
75
76 #include "os.h"
77 #ifndef PATH_MAX
78 # define PATH_MAX 1024
79 #endif
80
81 #include "open_trace_file.h"
82 #include "misc.h"
83 #include "../htslib/hfile.h"
84 #include "../htslib/hts_log.h"
85 #include "../htslib/hts.h"
86
87 /*
88 * Returns whether the path refers to a regular file.
89 */
static int is_file(char *fn) {
    struct stat st;

    /*
     * A path that cannot be stat()ed (missing, unreadable, ...) is
     * reported as "not a regular file"; otherwise the answer comes
     * straight from the file mode.
     */
    return (stat(fn, &st) == 0) ? S_ISREG(st.st_mode) : 0;
}
95
96 /*
97 * Tokenises the search path splitting on colons (unix) or semicolons
98 * (windows).
99 * We also explicitly add a "./" to the end of the search path
100 *
101 * Returns: A new search path with items separated by nul chars. Two nul
102 * chars in a row represent the end of the tokenised path.
103 * Returns NULL for a failure.
104 *
105 * The returned data has been malloced. It is up to the caller to free this
106 * memory.
107 */
char *tokenise_search_path(const char *searchpath) {
    char *newsearch;
    size_t i, j;
    size_t len;
    char path_sep = HTS_PATH_SEPARATOR_CHAR;

    if (!searchpath)
        searchpath = "";

    /*
     * Every byte written inside the loop below consumes at least one
     * input byte, so the tokenised text never exceeds len bytes.  The 5
     * extra bytes hold: the nul ending the last element, "./", its nul,
     * and the final nul that terminates the list.
     */
    len = strlen(searchpath);
    newsearch = malloc(len + 5);
    if (!newsearch)
        return NULL;

    for (i = 0, j = 0; i < len; i++) {
        /* "::" => ":". Used for escaping colons in http://foo */
        if (i + 1 < len && searchpath[i] == ':' && searchpath[i+1] == ':') {
            newsearch[j++] = ':';
            i++;
            continue;
        }

        /*
         * Handle http:// and ftp:// too without ::
         * Only recognised at the start of an element, and only when ':'
         * is the separator (otherwise the URL's colons are not special).
         */
        if (path_sep == ':' &&
            (i == 0 || searchpath[i-1] == ':') &&
            (!strncmp(&searchpath[i], "http:",      5) ||
             !strncmp(&searchpath[i], "https:",     6) ||
             !strncmp(&searchpath[i], "ftp:",       4) ||
             !strncmp(&searchpath[i], "|http:",     6) ||
             !strncmp(&searchpath[i], "|https:",    7) ||
             !strncmp(&searchpath[i], "|ftp:",      5) ||
             !strncmp(&searchpath[i], "URL=http:",  9) ||
             !strncmp(&searchpath[i], "URL=https:", 10) ||
             !strncmp(&searchpath[i], "URL=ftp:",   8))) {
            /*
             * Copy the scheme up to and including its ':'.  The prefix
             * match above guarantees that ':' exists before len.
             */
            do {
                newsearch[j++] = searchpath[i];
            } while (i < len && searchpath[i++] != ':');

            /* Swallow the second colon of an escaped "::" */
            if (i < len && searchpath[i] == ':')
                i++;

            /* The "//" following the scheme */
            if (i < len && searchpath[i] == '/')
                newsearch[j++] = searchpath[i++];
            if (i < len && searchpath[i] == '/')
                newsearch[j++] = searchpath[i++];

            /*
             * Look for host:port.  The first byte is taken as-is, then
             * everything up to the next ':' or '/'.  All steps are
             * bounded by len so a URL that ends the search path cannot
             * run past the input or overfill newsearch.
             */
            if (i < len) {
                do {
                    newsearch[j++] = searchpath[i++];
                } while (i < len &&
                         searchpath[i] != ':' && searchpath[i] != '/');
            }

            /* Keep the ':' of host:port (or the '/' starting the path) */
            if (i < len)
                newsearch[j++] = searchpath[i++];

            /* Swallow the second colon of an escaped "host::port" */
            if (i < len && searchpath[i] == ':')
                i++;

            /* The URL consumed the rest of the input; nothing left. */
            if (i >= len)
                break;
        }

        if (searchpath[i] == path_sep) {
            /* Skip blank path components */
            if (j && newsearch[j-1] != 0)
                newsearch[j++] = 0;
        } else {
            newsearch[j++] = searchpath[i];
        }
    }

    /*
     * Terminate the last element and append "./".
     * NB: if searchpath ended in a separator the text already ends in a
     * nul, so the nul added here forms a double nul and consumers stop
     * before reaching "./".  This is existing behaviour, kept as-is.
     */
    if (j)
        newsearch[j++] = 0;
    newsearch[j++] = '.';
    newsearch[j++] = '/';
    newsearch[j++] = 0;
    newsearch[j++] = 0;

    return newsearch;
}
178
179 static char *expand_path(const char *file, char *dirname, int max_s_digits);
180
find_file_url(const char * file,char * url)181 mFILE *find_file_url(const char *file, char *url) {
182 char *path = NULL, buf[8192];
183 mFILE *mf = NULL;
184 ssize_t len;
185 hFILE *hf = NULL;
186
187 /* Expand %s for the trace name. Only one digit is allowed between
188 The % and s to avoid ambiguity with percent-encoded URLs */
189
190 path = expand_path(file, url, 1);
191 if (!path)
192 return NULL;
193
194 if (!(hf = hopen(path, "r"))) {
195 if (errno != ENOENT)
196 hts_log_warning("Failed to open reference \"%s\": %s", path, strerror(errno));
197 goto fail;
198 }
199
200 if (NULL == (mf = mfcreate(NULL, 0)))
201 goto fail;
202 while ((len = hread(hf, buf, sizeof(buf))) > 0) {
203 if (mfwrite(buf, len, 1, mf) <= 0) {
204 hclose_abruptly(hf);
205 goto fail;
206 }
207 }
208 if (hclose(hf) < 0 || len < 0) {
209 hts_log_warning("Failed to read reference \"%s\": %s", path, strerror(errno));
210 goto fail;
211 }
212
213 free(path);
214 mrewind(mf);
215 return mf;
216
217 fail:
218 mfdestroy(mf);
219 free(path);
220 return NULL;
221 }
222
223 /*
224 * Takes a dirname possibly including % rules and appends the filename
225 * to it.
226 *
227 * Returns expanded pathname or NULL for malloc failure.
228 */
static char *expand_path(const char *file, char *dirname, int max_s_digits) {
    size_t len = strlen(dirname);
    size_t lenf = strlen(file);
    char *path, *out, *dir_end;
    size_t n;

    /*
     * Worst expansion is DIR/FILE: every byte written below is taken
     * from dirname (at most len in total) or from file (at most lenf in
     * total), plus one '/' and the terminating nul.
     */
    path = malloc(len + lenf + 2);
    if (!path) {
        hts_log_error("Out of memory");
        return NULL;
    }

    /* Ignore one trailing '/'.  The len test protects an empty dirname. */
    if (len > 0 && dirname[len-1] == '/')
        len--;

    /* Special case for "./" or absolute filenames */
    if (*file == '/' || (len == 1 && *dirname == '.')) {
        memcpy(path, file, lenf + 1);
        return path;
    }

    /* dir_end points at the ignored trailing '/' or at the nul */
    dir_end = dirname + len;
    out = path;

    /* Handle %[0-9]*s expansions, if required */
    while (dirname < dir_end) {
        char *pct = memchr(dirname, '%', (size_t)(dir_end - dirname));
        char *endp;
        long width;

        if (!pct)
            break;

        /* Literal directory text in front of the '%' */
        n = (size_t)(pct - dirname);
        memcpy(out, dirname, n);
        out += n;

        /*
         * strtol() cannot scan beyond dir_end: the byte there is either
         * '/' or nul, neither of which it accepts.
         */
        width = strtol(pct + 1, &endp, 10);

        if (*endp == 's' && endp - pct - 1 <= max_s_digits && width >= 0) {
            /*
             * %s (or %0s) takes all that is left of file, %Ns takes the
             * next N bytes, clamped to what is left.  Copying exactly n
             * bytes avoids strncpy()'s zero padding, which previously
             * overran the buffer when N exceeded the remaining name.
             */
            size_t avail = strlen(file);
            n = (width == 0 || (unsigned long)width > avail)
                ? avail : (size_t)width;
            memcpy(out, file, n);
            out  += n;
            file += n;
            dirname = endp + 1;
        } else {
            /*
             * Not a rule we expand (e.g. a percent-encoded URL byte, too
             * many digits, or a negative width): keep the '%', whatever
             * strtol() consumed, and the byte after it, but never step
             * beyond dir_end.
             */
            char *stop = (endp < dir_end) ? endp + 1 : dir_end;
            n = (size_t)(stop - pct);
            memcpy(out, pct, n);
            out += n;
            dirname = stop;
        }
    }

    /* Whatever literal directory text remains */
    n = (size_t)(dir_end - dirname);
    memcpy(out, dirname, n);
    out += n;

    /* Any part of file not used by a % rule is appended as "/FILE" */
    if (*file) {
        *out++ = '/';
        strcpy(out, file);
    } else {
        *out = '\0';
    }

    //fprintf(stderr, "*PATH=\"%s\"\n", path);
    return path;
}
286
287 /*
288 * Searches for file in the directory 'dirname'. If it finds it, it opens
289 * it. This also searches for compressed versions of the file in dirname
290 * too.
291 *
292 * Returns mFILE pointer if found
293 * NULL if not
294 */
find_file_dir(const char * file,char * dirname)295 static mFILE *find_file_dir(const char *file, char *dirname) {
296 char *path;
297 mFILE *mf = NULL;
298
299 path = expand_path(file, dirname, INT_MAX);
300 if (!path)
301 return NULL;
302
303 if (is_file(path))
304 mf = mfopen(path, "rbm");
305
306 free(path);
307 return mf;
308 }
309
310 /*
311 * ------------------------------------------------------------------------
312 * Public functions below.
313 */
314
315 /*
316 * Opens a trace file named 'file'. This is initially looked for as a
317 * pathname relative to a file named "relative_to". This may (for
318 * example) be the name of an experiment file referencing the trace
319 * file. In this case by passing relative_to as the experiment file
320 * filename the trace file will be picked up in the same directory as
321 * the experiment file. Relative_to may be supplied as NULL.
322 *
 * 'file' is looked for first in each of the locations listed in 'path'
 * (which is a colon separated list), then in the current directory, and
 * finally in the directory containing relative_to.
 * If 'path' is NULL it uses the RAWDATA environment variable instead.
326 *
327 * Returns a mFILE pointer when found.
328 * NULL otherwise.
329 */
open_path_mfile(const char * file,char * path,char * relative_to)330 mFILE *open_path_mfile(const char *file, char *path, char *relative_to) {
331 char *newsearch;
332 char *ele;
333 mFILE *fp;
334
335 /* Use path first */
336 if (!path)
337 path = getenv("RAWDATA");
338 if (NULL == (newsearch = tokenise_search_path(path)))
339 return NULL;
340
341 /*
342 * Step through the search path testing out each component.
343 * We now look through each path element treating some prefixes as
344 * special, otherwise we treat the element as a directory.
345 */
346 for (ele = newsearch; *ele; ele += strlen(ele)+1) {
347 char *ele2;
348
349 /*
350 * '|' prefixing a path component indicates that we do not
351 * wish to perform the compression extension searching in that
352 * location.
353 *
354 * NB: this has been removed from the htslib implementation.
355 */
356 if (*ele == '|') {
357 ele2 = ele+1;
358 } else {
359 ele2 = ele;
360 }
361
362 if (0 == strncmp(ele2, "URL=", 4)) {
363 if ((fp = find_file_url(file, ele2+4))) {
364 free(newsearch);
365 return fp;
366 }
367 } else if (!strncmp(ele2, "http:", 5) ||
368 !strncmp(ele2, "https:", 6) ||
369 !strncmp(ele2, "ftp:", 4)) {
370 if ((fp = find_file_url(file, ele2))) {
371 free(newsearch);
372 return fp;
373 }
374 } else if ((fp = find_file_dir(file, ele2))) {
375 free(newsearch);
376 return fp;
377 }
378 }
379
380 free(newsearch);
381
382 /* Look in the same location as the incoming 'relative_to' filename */
383 if (relative_to) {
384 char *cp;
385 char relative_path[PATH_MAX+1];
386 strcpy(relative_path, relative_to);
387 if ((cp = strrchr(relative_path, '/')))
388 *cp = 0;
389 if ((fp = find_file_dir(file, relative_path)))
390 return fp;
391 }
392
393 return NULL;
394 }
395
396
397 /*
398 * As per open_path_mfile, but searching only for local filenames.
399 * This is useful as we may avoid doing a full mfopen and loading
400 * the entire file into memory.
401 *
402 * Returns the expanded pathname if found.
403 * NULL if not
404 */
char *find_path(const char *file, const char *path) {
    char *newsearch;
    char *ele;
    char *outpath = NULL;

    /* Use path first */
    if (!path)
        path = getenv("RAWDATA");
    if (NULL == (newsearch = tokenise_search_path(path)))
        return NULL;

    for (ele = newsearch; *ele; ele += strlen(ele)+1) {
        /* A leading '|' on an element is skipped before matching */
        char *ele2 = (*ele == '|') ? ele+1 : ele;

        /* Remote locations are not local filenames; skip them */
        if (!strncmp(ele2, "URL=", 4) ||
            !strncmp(ele2, "http:", 5) ||
            !strncmp(ele2, "https:", 6) ||
            !strncmp(ele2, "ftp:", 4))
            continue;

        outpath = expand_path(file, ele2, INT_MAX);
        if (!outpath) {
            /* expand_path() failed to allocate; give up rather than
               hand a NULL pathname to is_file() */
            free(newsearch);
            return NULL;
        }

        if (is_file(outpath)) {
            /* Ownership of outpath passes to the caller, who frees it */
            free(newsearch);
            return outpath;
        }

        free(outpath);
    }

    free(newsearch);

    return NULL;
}
439