1 /*  hfile.c -- buffered low-level input/output streams.
2 
3     Copyright (C) 2013-2020 Genome Research Ltd.
4 
5     Author: John Marshall <jm18@sanger.ac.uk>
6 
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13 
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16 
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE.  */
24 
25 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
26 #include <config.h>
27 
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <stddef.h>
31 #include <string.h>
32 #include <errno.h>
33 #include <limits.h>
34 
35 #include <pthread.h>
36 
37 #ifdef ENABLE_PLUGINS
38 #if defined(_WIN32) || defined(__CYGWIN__) || defined(__MSYS__)
39 #define USING_WINDOWS_PLUGIN_DLLS
40 #include <dlfcn.h>
41 #endif
42 #endif
43 
44 #include "htslib/hfile.h"
45 #include "hfile_internal.h"
46 #include "htslib/kstring.h"
47 
48 #ifndef ENOTSUP
49 #define ENOTSUP EINVAL
50 #endif
51 #ifndef EOVERFLOW
52 #define EOVERFLOW ERANGE
53 #endif
54 #ifndef EPROTONOSUPPORT
55 #define EPROTONOSUPPORT ENOSYS
56 #endif
57 
58 #ifndef SSIZE_MAX /* SSIZE_MAX is POSIX 1 */
59 #define SSIZE_MAX LONG_MAX
60 #endif
61 
62 /* hFILE fields are used as follows:
63 
64    char *buffer;     // Pointer to the start of the I/O buffer
65    char *begin;      // First not-yet-read character / unused position
66    char *end;        // First unfilled/unfillable position
67    char *limit;      // Pointer to the first position past the buffer
68 
69    const hFILE_backend *backend;  // Methods to refill/flush I/O buffer
70 
71    off_t offset;     // Offset within the stream of buffer position 0
72    unsigned at_eof:1;// For reading, whether EOF has been seen
73    unsigned mobile:1;// Buffer is a mobile window or fixed full contents
74    unsigned readonly:1;// Whether opened as "r" rather than "r+"/"w"/"a"
75    int has_errno;    // Error number from the last failure on this stream
76 
77 For reading, begin is the first unread character in the buffer and end is the
78 first unfilled position:
79 
80    -----------ABCDEFGHIJKLMNO---------------
81    ^buffer    ^begin         ^end           ^limit
82 
83 For writing, begin is the first unused position and end is unused so remains
84 equal to buffer:
85 
86    ABCDEFGHIJKLMNOPQRSTUVWXYZ---------------
87    ^buffer                   ^begin         ^limit
88    ^end
89 
90 Thus if begin > end then there is a non-empty write buffer, if begin < end
91 then there is a non-empty read buffer, and if begin == end then both buffers
92 are empty.  In all cases, the stream's file position indicator corresponds
93 to the position pointed to by begin.
94 
95 The above is the normal scenario of a mobile window.  For in-memory
96 streams (eg via hfile_init_fixed) the buffer can be used as the full
97 contents without any separate backend behind it.  These always have at_eof
98 set, offset set to 0, need no read() method, and should just return EINVAL
99 for seek():
100 
101    abcdefghijkLMNOPQRSTUVWXYZ------
102    ^buffer    ^begin         ^end  ^limit
103 */
104 HTSLIB_EXPORT
hfile_init(size_t struct_size,const char * mode,size_t capacity)105 hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity)
106 {
107     hFILE *fp = (hFILE *) malloc(struct_size);
108     if (fp == NULL) goto error;
109 
110     if (capacity == 0) capacity = 32768;
111     // FIXME For now, clamp input buffer sizes so mpileup doesn't eat memory
112     if (strchr(mode, 'r') && capacity > 32768) capacity = 32768;
113 
114     fp->buffer = (char *) malloc(capacity);
115     if (fp->buffer == NULL) goto error;
116 
117     fp->begin = fp->end = fp->buffer;
118     fp->limit = &fp->buffer[capacity];
119 
120     fp->offset = 0;
121     fp->at_eof = 0;
122     fp->mobile = 1;
123     fp->readonly = (strchr(mode, 'r') && ! strchr(mode, '+'));
124     fp->has_errno = 0;
125     return fp;
126 
127 error:
128     hfile_destroy(fp);
129     return NULL;
130 }
131 
hfile_init_fixed(size_t struct_size,const char * mode,char * buffer,size_t buf_filled,size_t buf_size)132 hFILE *hfile_init_fixed(size_t struct_size, const char *mode,
133                         char *buffer, size_t buf_filled, size_t buf_size)
134 {
135     hFILE *fp = (hFILE *) malloc(struct_size);
136     if (fp == NULL) return NULL;
137 
138     fp->buffer = fp->begin = buffer;
139     fp->end = &fp->buffer[buf_filled];
140     fp->limit = &fp->buffer[buf_size];
141 
142     fp->offset = 0;
143     fp->at_eof = 1;
144     fp->mobile = 0;
145     fp->readonly = (strchr(mode, 'r') && ! strchr(mode, '+'));
146     fp->has_errno = 0;
147     return fp;
148 }
149 
150 static const struct hFILE_backend mem_backend;
151 
152 HTSLIB_EXPORT
hfile_destroy(hFILE * fp)153 void hfile_destroy(hFILE *fp)
154 {
155     int save = errno;
156     if (fp) free(fp->buffer);
157     free(fp);
158     errno = save;
159 }
160 
/* Returns non-zero when begin has advanced beyond end, which is how a
   stream holding buffered, not yet flushed, written data is recognised.  */
static inline int writebuffer_is_nonempty(hFILE *fp)
{
    return fp->end < fp->begin;
}
165 
166 /* Refills the read buffer from the backend (once, so may only partially
167    fill the buffer), returning the number of additional characters read
168    (which might be 0), or negative when an error occurred.  */
refill_buffer(hFILE * fp)169 static ssize_t refill_buffer(hFILE *fp)
170 {
171     ssize_t n;
172 
173     // Move any unread characters to the start of the buffer
174     if (fp->mobile && fp->begin > fp->buffer) {
175         fp->offset += fp->begin - fp->buffer;
176         memmove(fp->buffer, fp->begin, fp->end - fp->begin);
177         fp->end = &fp->buffer[fp->end - fp->begin];
178         fp->begin = fp->buffer;
179     }
180 
181     // Read into the available buffer space at fp->[end,limit)
182     if (fp->at_eof || fp->end == fp->limit) n = 0;
183     else {
184         n = fp->backend->read(fp, fp->end, fp->limit - fp->end);
185         if (n < 0) { fp->has_errno = errno; return n; }
186         else if (n == 0) fp->at_eof = 1;
187     }
188 
189     fp->end += n;
190     return n;
191 }
192 
193 /*
194  * Changes the buffer size for an hFILE.  Ideally this is done
195  * immediately after opening.  If performed later, this function may
196  * fail if we are reducing the buffer size and the current offset into
197  * the buffer is beyond the new capacity.
198  *
199  * Returns 0 on success;
200  *        -1 on failure.
201  */
202 HTSLIB_EXPORT
hfile_set_blksize(hFILE * fp,size_t bufsiz)203 int hfile_set_blksize(hFILE *fp, size_t bufsiz) {
204     char *buffer;
205     ptrdiff_t curr_used;
206     if (!fp) return -1;
207     curr_used = (fp->begin > fp->end ? fp->begin : fp->end) - fp->buffer;
208     if (bufsiz == 0) bufsiz = 32768;
209 
210     // Ensure buffer resize will not erase live data
211     if (bufsiz < curr_used)
212         return -1;
213 
214     if (!(buffer = (char *) realloc(fp->buffer, bufsiz))) return -1;
215 
216     fp->begin  = buffer + (fp->begin - fp->buffer);
217     fp->end    = buffer + (fp->end   - fp->buffer);
218     fp->buffer = buffer;
219     fp->limit  = &fp->buffer[bufsiz];
220 
221     return 0;
222 }
223 
224 /* Called only from hgetc(), when our buffer is empty.  */
225 HTSLIB_EXPORT
hgetc2(hFILE * fp)226 int hgetc2(hFILE *fp)
227 {
228     return (refill_buffer(fp) > 0)? (unsigned char) *(fp->begin++) : EOF;
229 }
230 
/* Reads characters from fp into buffer until delim has been copied, size-1
   characters have been stored, or end of file is reached, and then
   NUL-terminates the result.  Returns the number of characters stored (not
   counting the terminator; 0 at EOF), or -1 on error: EINVAL for a size
   outside [1, SSIZE_MAX], EBADF if the stream has unflushed written data,
   or a failure while refilling the buffer.  */
ssize_t hgetdelim(char *buffer, size_t size, int delim, hFILE *fp)
{
    size_t copied = 0;

    if (size < 1 || size > SSIZE_MAX) {
        fp->has_errno = errno = EINVAL;
        return -1;
    }
    if (writebuffer_is_nonempty(fp)) {
        fp->has_errno = errno = EBADF;
        return -1;
    }

    size--; /* to allow space for the NUL terminator */

    for (;;) {
        /* Consider only as much of the hFILE buffer as still fits */
        size_t chunk = fp->end - fp->begin;
        char *hit;
        ssize_t got;

        if (chunk > size - copied) chunk = size - copied;

        /* If the delimiter is present, copy up to and including it */
        hit = memchr(fp->begin, delim, chunk);
        if (hit != NULL) chunk = hit - fp->begin + 1;

        memcpy(buffer + copied, fp->begin, chunk);
        fp->begin += chunk;
        copied += chunk;

        /* Done on finding the delimiter or filling the output buffer */
        if (hit != NULL || copied == size) break;

        got = refill_buffer(fp);
        if (got < 0) return -1; /* Error on refill. */
        if (got == 0) break;    /* EOF, return anything that was copied. */
    }

    buffer[copied] = '\0';
    return copied;
}
280 
/* fgets()-style wrapper around hgetln(): returns buffer when at least one
   character was read, or NULL otherwise.  A size below 1 is rejected with
   EINVAL (stored in both errno and fp->has_errno).  */
char *hgets(char *buffer, int size, hFILE *fp)
{
    if (size >= 1) {
        ssize_t len = hgetln(buffer, size, fp);
        return (len > 0) ? buffer : NULL;
    }

    fp->has_errno = errno = EINVAL;
    return NULL;
}
289 
/* Copies up to nbytes of upcoming data into buffer without consuming it
   (fp->begin is not advanced).  The read buffer is refilled as needed until
   nbytes are available or a refill adds nothing more.  Returns the number of
   bytes copied, or a negative value if a refill failed.  */
ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes)
{
    size_t avail = fp->end - fp->begin;

    while (avail < nbytes) {
        ssize_t got = refill_buffer(fp);

        if (got < 0) return got;
        if (got == 0) break;    // EOF or buffer full: use what we have
        avail += got;
    }

    if (avail > nbytes) avail = nbytes;
    memcpy(buffer, fp->begin, avail);
    return avail;
}
304 
305 /* Called only from hread(); when called, our buffer is empty and nread bytes
306    have already been placed in the destination buffer.  */
307 HTSLIB_EXPORT
hread2(hFILE * fp,void * destv,size_t nbytes,size_t nread)308 ssize_t hread2(hFILE *fp, void *destv, size_t nbytes, size_t nread)
309 {
310     const size_t capacity = fp->limit - fp->buffer;
311     int buffer_invalidated = 0;
312     char *dest = (char *) destv;
313     dest += nread, nbytes -= nread;
314 
315     // Read large requests directly into the destination buffer
316     while (nbytes * 2 >= capacity && !fp->at_eof) {
317         ssize_t n = fp->backend->read(fp, dest, nbytes);
318         if (n < 0) { fp->has_errno = errno; return n; }
319         else if (n == 0) fp->at_eof = 1;
320         else buffer_invalidated = 1;
321         fp->offset += n;
322         dest += n, nbytes -= n;
323         nread += n;
324     }
325 
326     if (buffer_invalidated) {
327         // Our unread buffer is empty, so begin == end, but our already-read
328         // buffer [buffer,begin) is likely non-empty and is no longer valid as
329         // its contents are no longer adjacent to the file position indicator.
330         // Discard it so that hseek() can't try to take advantage of it.
331         fp->offset += fp->begin - fp->buffer;
332         fp->begin = fp->end = fp->buffer;
333     }
334 
335     while (nbytes > 0 && !fp->at_eof) {
336         size_t n;
337         ssize_t ret = refill_buffer(fp);
338         if (ret < 0) return ret;
339 
340         n = fp->end - fp->begin;
341         if (n > nbytes) n = nbytes;
342         memcpy(dest, fp->begin, n);
343         fp->begin += n;
344         dest += n, nbytes -= n;
345         nread += n;
346     }
347 
348     return nread;
349 }
350 
351 /* Flushes the write buffer, fp->[buffer,begin), out through the backend
352    returning 0 on success or negative if an error occurred.  */
flush_buffer(hFILE * fp)353 static ssize_t flush_buffer(hFILE *fp)
354 {
355     const char *buffer = fp->buffer;
356     while (buffer < fp->begin) {
357         ssize_t n = fp->backend->write(fp, buffer, fp->begin - buffer);
358         if (n < 0) { fp->has_errno = errno; return n; }
359         buffer += n;
360         fp->offset += n;
361     }
362 
363     fp->begin = fp->buffer;  // Leave the buffer empty
364     return 0;
365 }
366 
/* Flushes buffered written data to the backend and then, if the backend
   provides a flush method, invokes that too.  Returns 0 on success or EOF
   on failure.  */
int hflush(hFILE *fp)
{
    if (flush_buffer(fp) < 0) return EOF;

    // The backend's flush method is optional
    if (fp->backend->flush != NULL && fp->backend->flush(fp) < 0) {
        fp->has_errno = errno;
        return EOF;
    }

    return 0;
}
375 
376 /* Called only from hputc(), when our buffer is already full.  */
377 HTSLIB_EXPORT
hputc2(int c,hFILE * fp)378 int hputc2(int c, hFILE *fp)
379 {
380     if (flush_buffer(fp) < 0) return EOF;
381     *(fp->begin++) = c;
382     return c;
383 }
384 
385 /* Called only from hwrite() and hputs2(); when called, our buffer is either
386    full and ncopied bytes from the source have already been copied to our
387    buffer; or completely empty, ncopied is zero and totalbytes is greater than
388    the buffer size.  */
389 HTSLIB_EXPORT
hwrite2(hFILE * fp,const void * srcv,size_t totalbytes,size_t ncopied)390 ssize_t hwrite2(hFILE *fp, const void *srcv, size_t totalbytes, size_t ncopied)
391 {
392     const char *src = (const char *) srcv;
393     ssize_t ret;
394     const size_t capacity = fp->limit - fp->buffer;
395     size_t remaining = totalbytes - ncopied;
396     src += ncopied;
397 
398     ret = flush_buffer(fp);
399     if (ret < 0) return ret;
400 
401     // Write large blocks out directly from the source buffer
402     while (remaining * 2 >= capacity) {
403         ssize_t n = fp->backend->write(fp, src, remaining);
404         if (n < 0) { fp->has_errno = errno; return n; }
405         fp->offset += n;
406         src += n, remaining -= n;
407     }
408 
409     // Just buffer any remaining characters
410     memcpy(fp->begin, src, remaining);
411     fp->begin += remaining;
412 
413     return totalbytes;
414 }
415 
416 /* Called only from hputs(), when our buffer is already full.  */
417 HTSLIB_EXPORT
hputs2(const char * text,size_t totalbytes,size_t ncopied,hFILE * fp)418 int hputs2(const char *text, size_t totalbytes, size_t ncopied, hFILE *fp)
419 {
420     return (hwrite2(fp, text, totalbytes, ncopied) >= 0)? 0 : EOF;
421 }
422 
/* Repositions the stream as lseek() would.  Buffered written data on a
   mobile stream is flushed first.  SEEK_CUR requests (and SEEK_END requests
   on fixed in-memory buffers) are converted to absolute positions, and when
   the target lies within the data already buffered on a read-only or fixed
   stream the backend is not consulted at all.  Returns the new position, or
   a negative value on error.  */
off_t hseek(hFILE *fp, off_t offset, int whence)
{
    off_t here, landed;

    if (writebuffer_is_nonempty(fp) && fp->mobile) {
        int status = flush_buffer(fp);
        if (status < 0) return status;
    }

    here = htell(fp);

    if (whence == SEEK_CUR) {
        // Relative offsets are given relative to the hFILE's stream position,
        // which may differ from the backend's physical position due to
        // buffering read-ahead.  Correct for this by converting to an
        // absolute position.
        if (here + offset < 0) {
            // Either a negative offset resulted in a position before the
            // start of the file, or we overflowed when given a positive offset
            fp->has_errno = errno = (offset < 0)? EINVAL : EOVERFLOW;
            return -1;
        }

        offset = here + offset;
        whence = SEEK_SET;
    }
    else if (! fp->mobile && whence == SEEK_END) {
        // For fixed immobile buffers, convert everything else to SEEK_SET too
        // so that seeking can be avoided for all (within range) requests.
        size_t length = fp->end - fp->buffer;

        if (offset > 0 || -offset > length) {
            fp->has_errno = errno = EINVAL;
            return -1;
        }

        offset = length + offset;
        whence = SEEK_SET;
    }

    // Avoid seeking if the desired position is within our read buffer.
    // (But not when the next operation may be a write on a mobile buffer.)
    if (whence == SEEK_SET && (! fp->mobile || fp->readonly)) {
        if (offset >= fp->offset &&
            offset - fp->offset <= fp->end - fp->buffer) {
            fp->begin = &fp->buffer[offset - fp->offset];
            return offset;
        }
    }

    landed = fp->backend->seek(fp, offset, whence);
    if (landed < 0) {
        fp->has_errno = errno;
        return landed;
    }

    // Seeking succeeded, so discard any non-empty read buffer
    fp->begin = fp->end = fp->buffer;
    fp->at_eof = 0;
    fp->offset = landed;

    return landed;
}
479 
/* Flushes any buffered written data, closes the backend and destroys the
   stream; fp is invalid afterwards whatever the outcome.  Returns 0 on
   success.  If the stream had already recorded an error, or the flush or
   close fails, returns EOF with errno set to the most recent such error.  */
int hclose(hFILE *fp)
{
    int failure = fp->has_errno;

    if (writebuffer_is_nonempty(fp)) {
        if (hflush(fp) < 0) failure = fp->has_errno;
    }
    if (fp->backend->close(fp) < 0) failure = errno;
    hfile_destroy(fp);

    if (failure == 0) return 0;

    errno = failure;
    return EOF;
}
494 
/* Closes the backend and destroys the stream without flushing anything and
   without reporting failure; errno is left as it was on entry.  */
void hclose_abruptly(hFILE *fp)
{
    const int saved_errno = errno;

    (void) fp->backend->close(fp);  /* Ignore subsequent errors */
    hfile_destroy(fp);

    errno = saved_errno;
}
502 
503 
504 /***************************
505  * File descriptor backend *
506  ***************************/
507 
508 #ifndef _WIN32
509 #include <sys/socket.h>
510 #include <sys/stat.h>
511 #define HAVE_STRUCT_STAT_ST_BLKSIZE
512 #else
513 #include <winsock2.h>
514 #define HAVE_CLOSESOCKET
515 #define HAVE_SETMODE
516 #endif
517 #include <fcntl.h>
518 #include <unistd.h>
519 
/* For Unix, it doesn't matter whether a file descriptor is a socket.
   However Windows insists on send()/recv() and its own closesocket()
   being used when fd happens to be a socket.  */

/* Stream state for the file descriptor backend.  */
typedef struct {
    hFILE base;            // Generic stream state; first member, as the fd_*() methods cast hFILE * to hFILE_fd *
    int fd;                // Descriptor that the fd_*() methods operate on
    unsigned is_socket:1;  // Non-zero selects recv()/send() (and closesocket() where available)
} hFILE_fd;
529 
/* Backend read method: reads up to nbytes from the descriptor, using recv()
   for sockets and read() otherwise, and retrying whenever the call is
   interrupted (EINTR).  Returns the underlying call's result.  */
static ssize_t fd_read(hFILE *fpv, void *buffer, size_t nbytes)
{
    hFILE_fd *fp = (hFILE_fd *) fpv;
    ssize_t got;

    for (;;) {
        if (fp->is_socket) got = recv(fp->fd, buffer, nbytes, 0);
        else got = read(fp->fd, buffer, nbytes);

        if (got >= 0 || errno != EINTR) return got;
    }
}
540 
/* Backend write method: writes up to nbytes to the descriptor, using send()
   for sockets and write() otherwise, and retrying whenever the call is
   interrupted (EINTR).  Returns the underlying call's result.  */
static ssize_t fd_write(hFILE *fpv, const void *buffer, size_t nbytes)
{
    hFILE_fd *fp = (hFILE_fd *) fpv;
    ssize_t wrote;

    for (;;) {
        if (fp->is_socket) wrote = send(fp->fd, buffer, nbytes, 0);
        else wrote = write(fp->fd, buffer, nbytes);

        if (wrote >= 0 || errno != EINTR) break;
    }

#ifdef _WIN32
    // On windows we have no SIGPIPE.  Instead write returns
    // EINVAL.  We check for this and our fd being a pipe.
    // If so, we raise SIGTERM instead of SIGPIPE.  It's not
    // ideal, but I think the only alternative is extra checking
    // in every single piece of code.
    if (wrote < 0 && errno == EINVAL &&
        GetLastError() == ERROR_NO_DATA &&
        GetFileType((HANDLE)_get_osfhandle(fp->fd)) == FILE_TYPE_PIPE) {
        raise(SIGTERM);
    }
#endif

    return wrote;
}
563 
/* Backend seek method: repositions the descriptor with lseek() and returns
   its result unchanged.  */
static off_t fd_seek(hFILE *fpv, off_t offset, int whence)
{
    const int fd = ((hFILE_fd *) fpv)->fd;

    return lseek(fd, offset, whence);
}
569 
/* Backend flush method: synchronises the descriptor using fdatasync() or
   fsync(), whichever the build provides (neither means nothing is done).
   Interrupted calls are retried.  Returns 0 on success or the failing
   call's negative result.  */
static int fd_flush(hFILE *fpv)
{
    int status;

    for (;;) {
        status = 0;
#ifdef HAVE_FDATASYNC
        status = fdatasync(((hFILE_fd *) fpv)->fd);
#elif defined(HAVE_FSYNC)
        status = fsync(((hFILE_fd *) fpv)->fd);
#endif
        if (status >= 0) break;

        // Ignore invalid-for-fsync(2) errors due to being, e.g., a pipe,
        // and operation-not-supported errors (Mac OS X)
        if (errno == EINVAL || errno == ENOTSUP) {
            status = 0;
            break;
        }

        if (errno != EINTR) break;
    }

    return status;
}
587 
/* Backend close method: closes the descriptor, using closesocket() for
   sockets on builds that have it, and retrying while the call reports
   EINTR.  Returns the closing call's result.  */
static int fd_close(hFILE *fpv)
{
    hFILE_fd *fp = (hFILE_fd *) fpv;
    int status;

    for (;;) {
#ifdef HAVE_CLOSESOCKET
        if (fp->is_socket) status = closesocket(fp->fd);
        else status = close(fp->fd);
#else
        status = close(fp->fd);
#endif
        if (status >= 0 || errno != EINTR) return status;
    }
}
601 
602 static const struct hFILE_backend fd_backend =
603 {
604     fd_read, fd_write, fd_seek, fd_flush, fd_close
605 };
606 
/* Returns the st_blksize that fstat() reports for fd, or 0 when fstat()
   fails or the build has no st_blksize field to consult.  */
static size_t blksize(int fd)
{
    size_t preferred = 0;

#ifdef HAVE_STRUCT_STAT_ST_BLKSIZE
    struct stat info;
    if (fstat(fd, &info) == 0) preferred = info.st_blksize;
#else
    (void) fd;
#endif

    return preferred;
}
617 
hopen_fd(const char * filename,const char * mode)618 static hFILE *hopen_fd(const char *filename, const char *mode)
619 {
620     hFILE_fd *fp = NULL;
621     int fd = open(filename, hfile_oflags(mode), 0666);
622     if (fd < 0) goto error;
623 
624     fp = (hFILE_fd *) hfile_init(sizeof (hFILE_fd), mode, blksize(fd));
625     if (fp == NULL) goto error;
626 
627     fp->fd = fd;
628     fp->is_socket = 0;
629     fp->base.backend = &fd_backend;
630     return &fp->base;
631 
632 error:
633     if (fd >= 0) { int save = errno; (void) close(fd); errno = save; }
634     hfile_destroy((hFILE *) fp);
635     return NULL;
636 }
637 
638 // Loads the contents of filename to produced a read-only, in memory,
639 // immobile hfile.  fp is the already opened file.  We always close this
640 // input fp, irrespective of whether we error or whether we return a new
641 // immobile hfile.
hpreload(hFILE * fp)642 static hFILE *hpreload(hFILE *fp) {
643     hFILE *mem_fp;
644     char *buf = NULL;
645     off_t buf_sz = 0, buf_a = 0, buf_inc = 8192, len;
646 
647     for (;;) {
648         if (buf_a - buf_sz < 5000) {
649             buf_a += buf_inc;
650             char *t = realloc(buf, buf_a);
651             if (!t) goto err;
652             buf = t;
653             if (buf_inc < 1000000) buf_inc *= 1.3;
654         }
655         len = hread(fp, buf+buf_sz, buf_a-buf_sz);
656         if (len > 0)
657             buf_sz += len;
658         else
659             break;
660     }
661 
662     if (len < 0) goto err;
663     mem_fp = hfile_init_fixed(sizeof(hFILE), "r", buf, buf_sz, buf_a);
664     if (!mem_fp) goto err;
665     mem_fp->backend = &mem_backend;
666 
667     if (hclose(fp) < 0) {
668         hclose_abruptly(mem_fp);
669         goto err;
670     }
671     return mem_fp;
672 
673  err:
674     free(buf);
675     hclose_abruptly(fp);
676     return NULL;
677 }
678 
// Reports whether the URL wrapped by a "preload:" URL is remote, by asking
// hisremote() about the text following the scheme prefix.
static int is_preload_url_remote(const char *url){
    const char *wrapped = &url[8]; // len("preload:") = 8

    return hisremote(wrapped);
}
682 
hopen_preload(const char * url,const char * mode)683 static hFILE *hopen_preload(const char *url, const char *mode){
684     hFILE* fp = hopen(url + 8, mode);
685     return hpreload(fp);
686 }
687 
hdopen(int fd,const char * mode)688 hFILE *hdopen(int fd, const char *mode)
689 {
690     hFILE_fd *fp = (hFILE_fd*) hfile_init(sizeof (hFILE_fd), mode, blksize(fd));
691     if (fp == NULL) return NULL;
692 
693     fp->fd = fd;
694     fp->is_socket = (strchr(mode, 's') != NULL);
695     fp->base.backend = &fd_backend;
696     return &fp->base;
697 }
698 
hopen_fd_fileuri(const char * url,const char * mode)699 static hFILE *hopen_fd_fileuri(const char *url, const char *mode)
700 {
701     if (strncmp(url, "file://localhost/", 17) == 0) url += 16;
702     else if (strncmp(url, "file:///", 8) == 0) url += 7;
703     else { errno = EPROTONOSUPPORT; return NULL; }
704 
705 #if defined(_WIN32) || defined(__MSYS__)
706     // For cases like C:/foo
707     if (url[0] == '/' && url[1] && url[2] == ':' && url[3] == '/') url++;
708 #endif
709 
710     return hopen_fd(url, mode);
711 }
712 
hopen_fd_stdinout(const char * mode)713 static hFILE *hopen_fd_stdinout(const char *mode)
714 {
715     int fd = (strchr(mode, 'r') != NULL)? STDIN_FILENO : STDOUT_FILENO;
716 #if defined HAVE_SETMODE && defined O_BINARY
717     if (setmode(fd, O_BINARY) < 0) return NULL;
718 #endif
719     return hdopen(fd, mode);
720 }
721 
722 HTSLIB_EXPORT
/* Translates an fopen()-style mode string into flags for open(2).
   Characters are processed left to right: 'r', 'w' and 'a' choose the
   access mode (with 'w' and 'a' also adding creation and truncate/append
   flags), '+' switches to read/write, and 'e' and 'x' add O_CLOEXEC and
   O_EXCL where those are defined.  Unrecognised characters are ignored,
   and O_BINARY is always added on platforms that define it.  */
int hfile_oflags(const char *mode)
{
    int access = 0, extra = 0;
    const char *p = mode;

    while (*p != '\0') {
        const char c = *p++;

        if (c == 'r') access = O_RDONLY;
        else if (c == 'w') { access = O_WRONLY; extra |= O_CREAT | O_TRUNC; }
        else if (c == 'a') { access = O_WRONLY; extra |= O_CREAT | O_APPEND; }
        else if (c == '+') access = O_RDWR;
#ifdef O_CLOEXEC
        else if (c == 'e') extra |= O_CLOEXEC;
#endif
#ifdef O_EXCL
        else if (c == 'x') extra |= O_EXCL;
#endif
    }

#ifdef O_BINARY
    extra |= O_BINARY;
#endif

    return access | extra;
}
748 
749 
750 /*********************
751  * In-memory backend *
752  *********************/
753 
754 #include "hts_internal.h"
755 
/* Stream state for the in-memory backend: nothing beyond the generic hFILE
   fields is stored.  */
typedef struct {
    hFILE base;    // Generic stream state; its buffer holds the full contents
} hFILE_mem;
759 
/* Backend seek method for in-memory streams: always fails with EINVAL.
   hseek() itself services requests that fall within the buffer, so this
   is reached only for positions it could not satisfy from the buffer.  */
static off_t mem_seek(hFILE *fpv, off_t offset, int whence)
{
    (void) fpv;
    (void) offset;
    (void) whence;

    errno = EINVAL;
    return -1;
}
765 
/* Backend close method for in-memory streams.  There is nothing to release
   here (the buffer is freed by hfile_destroy()), so this always succeeds.  */
static int mem_close(hFILE *fpv)
{
    (void) fpv;

    return 0;
}
770 
771 static const struct hFILE_backend mem_backend =
772 {
773     NULL, NULL, mem_seek, NULL, mem_close
774 };
775 
/* Tests whether s starts with key, lower-casing each character of s with
   tolower_c() before comparing (key itself is compared as given).
   Returns 0 if every character of key matches, or +1 otherwise.  */
static int cmp_prefix(const char *key, const char *s)
{
    for (; *key != '\0'; key++, s++) {
        if (tolower_c(*s) != *key) return +1;
    }

    return 0;
}
784 
create_hfile_mem(char * buffer,const char * mode,size_t buf_filled,size_t buf_size)785 static hFILE *create_hfile_mem(char* buffer, const char* mode, size_t buf_filled, size_t buf_size)
786 {
787     hFILE_mem *fp = (hFILE_mem *) hfile_init_fixed(sizeof(hFILE_mem), mode, buffer, buf_filled, buf_size);
788     if (fp == NULL)
789         return NULL;
790 
791     fp->base.backend = &mem_backend;
792     return &fp->base;
793 }
794 
hopen_mem(const char * url,const char * mode)795 static hFILE *hopen_mem(const char *url, const char *mode)
796 {
797     size_t length, size;
798     char *buffer;
799     const char *data, *comma = strchr(url, ',');
800     if (comma == NULL) { errno = EINVAL; return NULL; }
801     data = comma+1;
802 
803     // TODO Implement write modes
804     if (strchr(mode, 'r') == NULL) { errno = EROFS; return NULL; }
805 
806     if (comma - url >= 7 && cmp_prefix(";base64", &comma[-7]) == 0) {
807         size = hts_base64_decoded_length(strlen(data));
808         buffer = malloc(size);
809         if (buffer == NULL) return NULL;
810         hts_decode_base64(buffer, &length, data);
811     }
812     else {
813         size = strlen(data) + 1;
814         buffer = malloc(size);
815         if (buffer == NULL) return NULL;
816         hts_decode_percent(buffer, &length, data);
817     }
818     hFILE* hf;
819 
820     if(!(hf = create_hfile_mem(buffer, mode, length, size))){
821         free(buffer);
822         return NULL;
823     }
824 
825     return hf;
826 }
827 
hopenv_mem(const char * filename,const char * mode,va_list args)828 static hFILE *hopenv_mem(const char *filename, const char *mode, va_list args)
829 {
830     char* buffer = va_arg(args, char*);
831     size_t sz = va_arg(args, size_t);
832     va_end(args);
833 
834     hFILE* hf;
835 
836     if(!(hf = create_hfile_mem(buffer, mode, sz, sz))){
837         free(buffer);
838         return NULL;
839     }
840 
841     return hf;
842 }
843 
/* Returns the buffer underlying an in-memory stream and, if length is
   non-NULL, stores the buffer's size (the distance from buffer to limit)
   in *length.  The stream keeps ownership of the buffer.  Returns NULL
   with errno set to EINVAL if file does not use the in-memory backend.  */
char *hfile_mem_get_buffer(hFILE *file, size_t *length) {
    if (file->backend != &mem_backend) {
        errno = EINVAL;
        return NULL;
    }

    // limit is past buffer, so this is the order that gives a valid size;
    // the reverse would wrap around to a huge size_t value
    if (length)
        *length = file->limit - file->buffer;

    return file->buffer;
}
855 
/* As hfile_mem_get_buffer(), but also detaches the buffer from the stream
   by setting file->buffer to NULL, so that destroying the stream no longer
   frees it.  The stream's other buffer pointers are not updated.  Returns
   NULL, leaving the stream untouched, if the buffer could not be
   obtained.  */
char *hfile_mem_steal_buffer(hFILE *file, size_t *length) {
    char *stolen = hfile_mem_get_buffer(file, length);

    if (stolen == NULL) return NULL;

    file->buffer = NULL;
    return stolen;
}
862 
hfile_plugin_init_mem(struct hFILE_plugin * self)863 int hfile_plugin_init_mem(struct hFILE_plugin *self)
864 {
865     // mem files are declared remote so they work with a tabix index
866     static const struct hFILE_scheme_handler handler =
867             {NULL, hfile_always_remote, "mem", 2000 + 50, hopenv_mem};
868     self->name = "mem";
869     hfile_add_scheme_handler("mem", &handler);
870     return 0;
871 }
872 
873 /**********************************************************************
874  * Dummy crypt4gh plug-in.  Does nothing apart from advise how to get *
875  * the real one.  It will be overridden by the actual plug-in.        *
876  **********************************************************************/
877 
crypt4gh_needed(const char * url,const char * mode)878 static hFILE *crypt4gh_needed(const char *url, const char *mode)
879 {
880     const char *u = strncmp(url, "crypt4gh:", 9) == 0 ? url + 9 : url;
881 #if defined(ENABLE_PLUGINS)
882     const char *enable_plugins = "";
883 #else
884     const char *enable_plugins = "You also need to rebuild HTSlib with plug-ins enabled.\n";
885 #endif
886 
887     hts_log_error("Accessing \"%s\" needs the crypt4gh plug-in.\n"
888                   "It can be found at "
889                   "https://github.com/samtools/htslib-crypt4gh\n"
890                   "%s"
891                   "If you have the plug-in, please ensure it can be "
892                   "found on your HTS_PATH.",
893                   u, enable_plugins);
894 
895     errno = EPROTONOSUPPORT;
896     return NULL;
897 }
898 
hfile_plugin_init_crypt4gh_needed(struct hFILE_plugin * self)899 int hfile_plugin_init_crypt4gh_needed(struct hFILE_plugin *self)
900 {
901     static const struct hFILE_scheme_handler handler =
902         { crypt4gh_needed, NULL, "crypt4gh-needed", 0, NULL };
903     self->name = "crypt4gh-needed";
904     hfile_add_scheme_handler("crypt4gh", &handler);
905     return 0;
906 }
907 
908 
909 /*****************************************
910  * Plugin and hopen() backend dispatcher *
911  *****************************************/
912 
913 #include "htslib/khash.h"
914 
// Hash map type keyed by string with scheme handler pointers as values
KHASH_MAP_INIT_STR(scheme_string, const struct hFILE_scheme_handler *)
// The scheme map; NULL until created, and reset to NULL by hfile_shutdown()
static khash_t(scheme_string) *schemes = NULL;

// Node of the singly linked list of plug-ins
struct hFILE_plugin_list {
    struct hFILE_plugin plugin;      // The plug-in's own record
    struct hFILE_plugin_list *next;  // Following node, or NULL at the end
};

// Head of the plug-in list
static struct hFILE_plugin_list *plugins = NULL;
// Held by hfile_shutdown() while it tears down schemes and plugins
static pthread_mutex_t plugins_lock = PTHREAD_MUTEX_INITIALIZER;
925 
hfile_shutdown(int do_close_plugin)926 void hfile_shutdown(int do_close_plugin)
927 {
928     pthread_mutex_lock(&plugins_lock);
929 
930     if (schemes) {
931         kh_destroy(scheme_string, schemes);
932         schemes = NULL;
933     }
934 
935     while (plugins != NULL) {
936         struct hFILE_plugin_list *p = plugins;
937         if (p->plugin.destroy) p->plugin.destroy();
938 #ifdef ENABLE_PLUGINS
939         if (p->plugin.obj && do_close_plugin) close_plugin(p->plugin.obj);
940 #endif
941         plugins = p->next;
942         free(p);
943     }
944 
945     pthread_mutex_unlock(&plugins_lock);
946 }
947 
hfile_exit()948 static void hfile_exit()
949 {
950     hfile_shutdown(0);
951     pthread_mutex_destroy(&plugins_lock);
952 }
953 
priority(const struct hFILE_scheme_handler * handler)954 static inline int priority(const struct hFILE_scheme_handler *handler)
955 {
956     return handler->priority % 1000;
957 }
958 
#ifdef USING_WINDOWS_PLUGIN_DLLS
/*
 * Work-around for Windows plug-in DLLs.
 *
 * A plug-in may be linked to a different copy of HTSlib from the one used
 * by the executable (for example because the executable was built against
 * a static libhts.a).  In that case the plug-in ends up calling the wrong
 * copy of hfile_add_scheme_handler().  When hfile_add_scheme_handler()
 * detects this it calls this function, which tries to forward the
 * registration to the hfile_add_scheme_handler() found in the main
 * executable.
 *
 * Returns 0 if the registration was forwarded,
 *        -1 if no usable function could be found in the executable.
 */
static int try_exe_add_scheme_handler(const char *scheme,
                                      const struct hFILE_scheme_handler *handler)
{
    // Looked up on first use and remembered for later calls.
    static void (*exe_add)(const char *,
                           const struct hFILE_scheme_handler *);

    if (exe_add == NULL) {
        // dlopen(NULL) gives a handle on the main executable.
        void *exe = dlopen(NULL, RTLD_LAZY);
        if (exe == NULL)
            return -1;
        *(void **) (&exe_add) = dlsym(exe, "hfile_add_scheme_handler");
        dlclose(exe);
    }

    // Only forward when a symbol was found and it is not this library's own
    // copy of the function (which would recurse forever).
    if (exe_add != NULL && exe_add != hfile_add_scheme_handler) {
        exe_add(scheme, handler);
        return 0;
    }

    return -1;
}
#else
/*
 * On builds without Windows plug-in DLLs there is no other copy of the
 * library to forward to, so this always fails with -1.
 */
static int try_exe_add_scheme_handler(const char *scheme,
                                      const struct hFILE_scheme_handler *handler)
{
    (void) scheme;
    (void) handler;
    return -1;
}
#endif
995 
996 HTSLIB_EXPORT
hfile_add_scheme_handler(const char * scheme,const struct hFILE_scheme_handler * handler)997 void hfile_add_scheme_handler(const char *scheme,
998                               const struct hFILE_scheme_handler *handler)
999 {
1000     int absent;
1001     if (!schemes) {
1002         if (try_exe_add_scheme_handler(scheme, handler) != 0) {
1003             hts_log_warning("Couldn't register scheme handler for %s", scheme);
1004         }
1005         return;
1006     }
1007     khint_t k = kh_put(scheme_string, schemes, scheme, &absent);
1008     if (absent < 0) {
1009         hts_log_warning("Couldn't register scheme handler for %s : %s",
1010                         scheme, strerror(errno));
1011         return;
1012     }
1013     if (absent || priority(handler) > priority(kh_value(schemes, k))) {
1014         kh_value(schemes, k) = handler;
1015     }
1016 }
1017 
init_add_plugin(void * obj,int (* init)(struct hFILE_plugin *),const char * pluginname)1018 static int init_add_plugin(void *obj, int (*init)(struct hFILE_plugin *),
1019                            const char *pluginname)
1020 {
1021     struct hFILE_plugin_list *p = malloc (sizeof (struct hFILE_plugin_list));
1022     if (p == NULL) {
1023         hts_log_debug("Failed to allocate memory for plugin \"%s\"", pluginname);
1024         return -1;
1025     }
1026 
1027     p->plugin.api_version = 1;
1028     p->plugin.obj = obj;
1029     p->plugin.name = NULL;
1030     p->plugin.destroy = NULL;
1031 
1032     int ret = (*init)(&p->plugin);
1033 
1034     if (ret != 0) {
1035         hts_log_debug("Initialisation failed for plugin \"%s\": %d", pluginname, ret);
1036         free(p);
1037         return ret;
1038     }
1039 
1040     hts_log_debug("Loaded \"%s\"", pluginname);
1041 
1042     p->next = plugins, plugins = p;
1043     return 0;
1044 }
1045 
1046 /*
1047  * Returns 0 on success,
1048  *        <0 on failure
1049  */
load_hfile_plugins()1050 static int load_hfile_plugins()
1051 {
1052     static const struct hFILE_scheme_handler
1053         data = { hopen_mem, hfile_always_local, "built-in", 80 },
1054         file = { hopen_fd_fileuri, hfile_always_local, "built-in", 80 },
1055         preload = { hopen_preload, is_preload_url_remote, "built-in", 80 };
1056 
1057     schemes = kh_init(scheme_string);
1058     if (schemes == NULL)
1059         return -1;
1060 
1061     hfile_add_scheme_handler("data", &data);
1062     hfile_add_scheme_handler("file", &file);
1063     hfile_add_scheme_handler("preload", &preload);
1064     init_add_plugin(NULL, hfile_plugin_init_net, "knetfile");
1065     init_add_plugin(NULL, hfile_plugin_init_mem, "mem");
1066     init_add_plugin(NULL, hfile_plugin_init_crypt4gh_needed, "crypt4gh-needed");
1067 
1068 #ifdef ENABLE_PLUGINS
1069     struct hts_path_itr path;
1070     const char *pluginname;
1071     hts_path_itr_setup(&path, NULL, NULL, "hfile_", 6, NULL, 0);
1072     while ((pluginname = hts_path_itr_next(&path)) != NULL) {
1073         void *obj;
1074         int (*init)(struct hFILE_plugin *) = (int (*)(struct hFILE_plugin *))
1075             load_plugin(&obj, pluginname, "hfile_plugin_init");
1076 
1077         if (init) {
1078             if (init_add_plugin(obj, init, pluginname) != 0)
1079                 close_plugin(obj);
1080         }
1081     }
1082 #else
1083 
1084 #ifdef HAVE_LIBCURL
1085     init_add_plugin(NULL, hfile_plugin_init_libcurl, "libcurl");
1086 #endif
1087 #ifdef ENABLE_GCS
1088     init_add_plugin(NULL, hfile_plugin_init_gcs, "gcs");
1089 #endif
1090 #ifdef ENABLE_S3
1091     init_add_plugin(NULL, hfile_plugin_init_s3, "s3");
1092     init_add_plugin(NULL, hfile_plugin_init_s3_write, "s3w");
1093 #endif
1094 
1095 #endif
1096 
1097     // In the unlikely event atexit() fails, it's better to succeed here and
1098     // carry on; then eventually when the program exits, we'll merely close
1099     // down the plugins uncleanly, as if we had aborted.
1100     (void) atexit(hfile_exit);
1101 
1102     return 0;
1103 }
1104 
1105 /* A filename like "foo:bar" in which we don't recognise the scheme is
1106    either an ordinary file or an indication of a missing or broken plugin.
1107    Try to open it as an ordinary file; but if there's no such file, set
1108    errno distinctively to make the plugin issue apparent.  */
hopen_unknown_scheme(const char * fname,const char * mode)1109 static hFILE *hopen_unknown_scheme(const char *fname, const char *mode)
1110 {
1111     hFILE *fp = hopen_fd(fname, mode);
1112     if (fp == NULL && errno == ENOENT) errno = EPROTONOSUPPORT;
1113     return fp;
1114 }
1115 
1116 /* Returns the appropriate handler, or NULL if the string isn't an URL.  */
find_scheme_handler(const char * s)1117 static const struct hFILE_scheme_handler *find_scheme_handler(const char *s)
1118 {
1119     static const struct hFILE_scheme_handler unknown_scheme =
1120         { hopen_unknown_scheme, hfile_always_local, "built-in", 0 };
1121 
1122     char scheme[12];
1123     int i;
1124 
1125     for (i = 0; i < sizeof scheme; i++)
1126         if (isalnum_c(s[i]) || s[i] == '+' || s[i] == '-' || s[i] == '.')
1127             scheme[i] = tolower_c(s[i]);
1128         else if (s[i] == ':') break;
1129         else return NULL;
1130 
1131     // 1 byte schemes are likely windows C:/foo pathnames
1132     if (i <= 1 || i >= sizeof scheme) return NULL;
1133     scheme[i] = '\0';
1134 
1135     pthread_mutex_lock(&plugins_lock);
1136     if (!schemes && load_hfile_plugins() < 0) {
1137         pthread_mutex_unlock(&plugins_lock);
1138         return NULL;
1139     }
1140     pthread_mutex_unlock(&plugins_lock);
1141 
1142     khint_t k = kh_get(scheme_string, schemes, scheme);
1143     return (k != kh_end(schemes))? kh_value(schemes, k) : &unknown_scheme;
1144 }
1145 
hopen(const char * fname,const char * mode,...)1146 hFILE *hopen(const char *fname, const char *mode, ...)
1147 {
1148     const struct hFILE_scheme_handler *handler = find_scheme_handler(fname);
1149     if (handler) {
1150         if (strchr(mode, ':') == NULL
1151             || handler->priority < 2000
1152             || handler->vopen == NULL) {
1153             return handler->open(fname, mode);
1154         }
1155         else {
1156             hFILE *fp;
1157             va_list arg;
1158             va_start(arg, mode);
1159             fp = handler->vopen(fname, mode, arg);
1160             va_end(arg);
1161             return fp;
1162         }
1163     }
1164     else if (strcmp(fname, "-") == 0) return hopen_fd_stdinout(mode);
1165     else return hopen_fd(fname, mode);
1166 }
1167 
1168 HTSLIB_EXPORT
int hfile_always_local(const char *fname)
{
    // Remoteness test for schemes whose files are never remote; used by the
    // built-in handlers in this file.  The name is not inspected.
    (void) fname;
    return 0;
}
1170 
1171 HTSLIB_EXPORT
int hfile_always_remote(const char *fname)
{
    // Counterpart of hfile_always_local(): reports every name as remote.
    // The name is not inspected.
    (void) fname;
    return 1;
}
1173 
hisremote(const char * fname)1174 int hisremote(const char *fname)
1175 {
1176     const struct hFILE_scheme_handler *handler = find_scheme_handler(fname);
1177     return handler? handler->isremote(fname) : 0;
1178 }
1179 
// Find where the extension, if any, of the basename part of [start,limit)
// begins.  Scans backwards from limit: returns a pointer to the last '.'
// that comes after the last '/', or limit itself when there is no such '.'.
// Note: Doesn't notice percent-encoded '.' and '/' characters. Don't do that.
static const char *strip_extension(const char *start, const char *limit)
{
    const char *p;

    for (p = limit; p > start; ) {
        char c = *--p;
        if (c == '.')
            return p;
        if (c == '/')
            break;          // reached the directory part: no extension
    }
    return limit;
}
1192 
haddextension(struct kstring_t * buffer,const char * filename,int replace,const char * new_extension)1193 char *haddextension(struct kstring_t *buffer, const char *filename,
1194                     int replace, const char *new_extension)
1195 {
1196     const char *trailing, *end;
1197 
1198     if (find_scheme_handler(filename)) {
1199         // URL, so alter extensions before any trailing query or fragment parts
1200         // Allow # symbols in s3 URLs
1201         trailing = filename + ((strncmp(filename, "s3://", 5) && strncmp(filename, "s3+http://", 10) && strncmp(filename, "s3+https://", 11))  ? strcspn(filename, "?#") : strcspn(filename, "?"));
1202     }
1203     else {
1204         // Local path, so alter extensions at the end of the filename
1205         trailing = strchr(filename, '\0');
1206     }
1207 
1208     end = replace? strip_extension(filename, trailing) : trailing;
1209 
1210     buffer->l = 0;
1211     if (kputsn(filename, end - filename, buffer) >= 0 &&
1212         kputs(new_extension, buffer) >= 0 &&
1213         kputs(trailing, buffer) >= 0) return buffer->s;
1214     else return NULL;
1215 }
1216