1 /* hfile.c -- buffered low-level input/output streams.
2
3 Copyright (C) 2013-2020 Genome Research Ltd.
4
5 Author: John Marshall <jm18@sanger.ac.uk>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE. */
24
25 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
26 #include <config.h>
27
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <stddef.h>
31 #include <string.h>
32 #include <errno.h>
33 #include <limits.h>
34
35 #include <pthread.h>
36
37 #ifdef ENABLE_PLUGINS
38 #if defined(_WIN32) || defined(__CYGWIN__) || defined(__MSYS__)
39 #define USING_WINDOWS_PLUGIN_DLLS
40 #include <dlfcn.h>
41 #endif
42 #endif
43
44 #include "htslib/hfile.h"
45 #include "hfile_internal.h"
46 #include "htslib/kstring.h"
47
48 #ifndef ENOTSUP
49 #define ENOTSUP EINVAL
50 #endif
51 #ifndef EOVERFLOW
52 #define EOVERFLOW ERANGE
53 #endif
54 #ifndef EPROTONOSUPPORT
55 #define EPROTONOSUPPORT ENOSYS
56 #endif
57
58 #ifndef SSIZE_MAX /* SSIZE_MAX is POSIX 1 */
59 #define SSIZE_MAX LONG_MAX
60 #endif
61
62 /* hFILE fields are used as follows:
63
64 char *buffer; // Pointer to the start of the I/O buffer
65 char *begin; // First not-yet-read character / unused position
66 char *end; // First unfilled/unfillable position
67 char *limit; // Pointer to the first position past the buffer
68
69 const hFILE_backend *backend; // Methods to refill/flush I/O buffer
70
71 off_t offset; // Offset within the stream of buffer position 0
72 unsigned at_eof:1;// For reading, whether EOF has been seen
73 unsigned mobile:1;// Buffer is a mobile window or fixed full contents
74 unsigned readonly:1;// Whether opened as "r" rather than "r+"/"w"/"a"
75 int has_errno; // Error number from the last failure on this stream
76
77 For reading, begin is the first unread character in the buffer and end is the
78 first unfilled position:
79
80 -----------ABCDEFGHIJKLMNO---------------
81 ^buffer ^begin ^end ^limit
82
83 For writing, begin is the first unused position and end is unused so remains
84 equal to buffer:
85
86 ABCDEFGHIJKLMNOPQRSTUVWXYZ---------------
87 ^buffer ^begin ^limit
88 ^end
89
90 Thus if begin > end then there is a non-empty write buffer, if begin < end
91 then there is a non-empty read buffer, and if begin == end then both buffers
92 are empty. In all cases, the stream's file position indicator corresponds
93 to the position pointed to by begin.
94
95 The above is the normal scenario of a mobile window. For in-memory
96 streams (eg via hfile_init_fixed) the buffer can be used as the full
97 contents without any separate backend behind it. These always have at_eof
98 set, offset set to 0, need no read() method, and should just return EINVAL
99 for seek():
100
101 abcdefghijkLMNOPQRSTUVWXYZ------
102 ^buffer ^begin ^end ^limit
103 */
104 HTSLIB_EXPORT
hfile_init(size_t struct_size,const char * mode,size_t capacity)105 hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity)
106 {
107 hFILE *fp = (hFILE *) malloc(struct_size);
108 if (fp == NULL) goto error;
109
110 if (capacity == 0) capacity = 32768;
111 // FIXME For now, clamp input buffer sizes so mpileup doesn't eat memory
112 if (strchr(mode, 'r') && capacity > 32768) capacity = 32768;
113
114 fp->buffer = (char *) malloc(capacity);
115 if (fp->buffer == NULL) goto error;
116
117 fp->begin = fp->end = fp->buffer;
118 fp->limit = &fp->buffer[capacity];
119
120 fp->offset = 0;
121 fp->at_eof = 0;
122 fp->mobile = 1;
123 fp->readonly = (strchr(mode, 'r') && ! strchr(mode, '+'));
124 fp->has_errno = 0;
125 return fp;
126
127 error:
128 hfile_destroy(fp);
129 return NULL;
130 }
131
hfile_init_fixed(size_t struct_size,const char * mode,char * buffer,size_t buf_filled,size_t buf_size)132 hFILE *hfile_init_fixed(size_t struct_size, const char *mode,
133 char *buffer, size_t buf_filled, size_t buf_size)
134 {
135 hFILE *fp = (hFILE *) malloc(struct_size);
136 if (fp == NULL) return NULL;
137
138 fp->buffer = fp->begin = buffer;
139 fp->end = &fp->buffer[buf_filled];
140 fp->limit = &fp->buffer[buf_size];
141
142 fp->offset = 0;
143 fp->at_eof = 1;
144 fp->mobile = 0;
145 fp->readonly = (strchr(mode, 'r') && ! strchr(mode, '+'));
146 fp->has_errno = 0;
147 return fp;
148 }
149
/* Forward declaration of the in-memory backend method table, which is
   defined in the "In-memory backend" section further down this file.  It is
   declared early so that code appearing before that section (hpreload())
   can refer to it.  */
static const struct hFILE_backend mem_backend;
151
152 HTSLIB_EXPORT
hfile_destroy(hFILE * fp)153 void hfile_destroy(hFILE *fp)
154 {
155 int save = errno;
156 if (fp) free(fp->buffer);
157 free(fp);
158 errno = save;
159 }
160
/* Returns non-zero when fp holds buffered output that has not been flushed,
   i.e. when begin has advanced beyond end.  */
static inline int writebuffer_is_nonempty(hFILE *fp)
{
    return fp->end < fp->begin;
}
165
166 /* Refills the read buffer from the backend (once, so may only partially
167 fill the buffer), returning the number of additional characters read
168 (which might be 0), or negative when an error occurred. */
refill_buffer(hFILE * fp)169 static ssize_t refill_buffer(hFILE *fp)
170 {
171 ssize_t n;
172
173 // Move any unread characters to the start of the buffer
174 if (fp->mobile && fp->begin > fp->buffer) {
175 fp->offset += fp->begin - fp->buffer;
176 memmove(fp->buffer, fp->begin, fp->end - fp->begin);
177 fp->end = &fp->buffer[fp->end - fp->begin];
178 fp->begin = fp->buffer;
179 }
180
181 // Read into the available buffer space at fp->[end,limit)
182 if (fp->at_eof || fp->end == fp->limit) n = 0;
183 else {
184 n = fp->backend->read(fp, fp->end, fp->limit - fp->end);
185 if (n < 0) { fp->has_errno = errno; return n; }
186 else if (n == 0) fp->at_eof = 1;
187 }
188
189 fp->end += n;
190 return n;
191 }
192
193 /*
194 * Changes the buffer size for an hFILE. Ideally this is done
195 * immediately after opening. If performed later, this function may
196 * fail if we are reducing the buffer size and the current offset into
197 * the buffer is beyond the new capacity.
198 *
199 * Returns 0 on success;
200 * -1 on failure.
201 */
202 HTSLIB_EXPORT
hfile_set_blksize(hFILE * fp,size_t bufsiz)203 int hfile_set_blksize(hFILE *fp, size_t bufsiz) {
204 char *buffer;
205 ptrdiff_t curr_used;
206 if (!fp) return -1;
207 curr_used = (fp->begin > fp->end ? fp->begin : fp->end) - fp->buffer;
208 if (bufsiz == 0) bufsiz = 32768;
209
210 // Ensure buffer resize will not erase live data
211 if (bufsiz < curr_used)
212 return -1;
213
214 if (!(buffer = (char *) realloc(fp->buffer, bufsiz))) return -1;
215
216 fp->begin = buffer + (fp->begin - fp->buffer);
217 fp->end = buffer + (fp->end - fp->buffer);
218 fp->buffer = buffer;
219 fp->limit = &fp->buffer[bufsiz];
220
221 return 0;
222 }
223
224 /* Called only from hgetc(), when our buffer is empty. */
225 HTSLIB_EXPORT
hgetc2(hFILE * fp)226 int hgetc2(hFILE *fp)
227 {
228 return (refill_buffer(fp) > 0)? (unsigned char) *(fp->begin++) : EOF;
229 }
230
/* Reads characters from fp into buffer up to and including the first
   occurrence of delim, stopping early once size-1 characters have been
   stored or EOF is reached.  The result is always NUL-terminated on success.

   Returns the number of characters stored (excluding the terminating NUL),
   which is 0 when EOF is hit before anything was read, or -1 on error:
   EINVAL if size is 0 or larger than SSIZE_MAX, EBADF if the stream has
   unflushed written data, or whatever error the refill reported.  */
ssize_t hgetdelim(char *buffer, size_t size, int delim, hFILE *fp)
{
    char *found;
    size_t n, copied = 0;
    ssize_t got;

    if (size < 1 || size > SSIZE_MAX) {
        fp->has_errno = errno = EINVAL;
        return -1;
    }
    /* Reading is not possible while the buffer holds pending output */
    if (writebuffer_is_nonempty(fp)) {
        fp->has_errno = errno = EBADF;
        return -1;
    }

    --size; /* to allow space for the NUL terminator */

    do {
        /* Consider only as much buffered data as still fits in the output */
        n = fp->end - fp->begin;
        if (n > size - copied) n = size - copied;

        /* Look in the hFILE buffer for the delimiter */
        found = memchr(fp->begin, delim, n);
        if (found != NULL) {
            /* Copy up to and including the delimiter, then finish */
            n = found - fp->begin + 1;
            memcpy(buffer + copied, fp->begin, n);
            buffer[n + copied] = '\0';
            fp->begin += n;
            return n + copied;
        }

        /* No delimiter yet, copy as much as we can and refill if necessary */
        memcpy(buffer + copied, fp->begin, n);
        fp->begin += n;
        copied += n;

        if (copied == size) { /* Output buffer full */
            buffer[copied] = '\0';
            return copied;
        }

        got = refill_buffer(fp);
    } while (got > 0);

    if (got < 0) return -1; /* Error on refill. */

    buffer[copied] = '\0'; /* EOF, return anything that was copied. */
    return copied;
}
280
/* fgets()-like line reader built on hgetln().  Returns buffer when at least
   one character was read, or NULL at EOF or on error.  A size below 1 is
   rejected with EINVAL, recorded both in errno and on the stream.  */
char *hgets(char *buffer, int size, hFILE *fp)
{
    if (size >= 1)
        return (hgetln(buffer, size, fp) > 0) ? buffer : NULL;

    fp->has_errno = errno = EINVAL;
    return NULL;
}
289
/* Copies up to nbytes of upcoming data into buffer without consuming it:
   fp->begin is left where it was.  The read buffer is refilled repeatedly
   until nbytes are available or a refill yields nothing more (EOF or a full
   buffer).  Returns the number of bytes copied, possibly fewer than nbytes,
   or a negative value if a refill failed.  */
ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes)
{
    size_t avail = fp->end - fp->begin;

    while (avail < nbytes) {
        ssize_t got = refill_buffer(fp);
        if (got < 0) return got;
        if (got == 0) break;
        avail += got;
    }

    const size_t ncopy = (avail > nbytes) ? nbytes : avail;
    memcpy(buffer, fp->begin, ncopy);
    return ncopy;
}
304
/* Called only from hread(); when called, our buffer is empty and nread bytes
   have already been placed in the destination buffer.

   Reads the remaining nbytes - nread bytes into destv + nread.  Requests of
   at least half the buffer capacity bypass the buffer and are read straight
   from the backend; whatever remains is satisfied via the buffer.

   Returns the total number of bytes placed in destv (including the initial
   nread), which is less than nbytes only if EOF was reached, or a negative
   value if the backend or a refill reported an error.  */
HTSLIB_EXPORT
ssize_t hread2(hFILE *fp, void *destv, size_t nbytes, size_t nread)
{
    const size_t capacity = fp->limit - fp->buffer;
    int buffer_invalidated = 0;
    char *dest = (char *) destv;
    // From here on, nbytes counts only the bytes still to be read
    dest += nread, nbytes -= nread;

    // Read large requests directly into the destination buffer
    while (nbytes * 2 >= capacity && !fp->at_eof) {
        ssize_t n = fp->backend->read(fp, dest, nbytes);
        if (n < 0) { fp->has_errno = errno; return n; }
        else if (n == 0) fp->at_eof = 1;
        else buffer_invalidated = 1;
        // Direct reads move the stream position without touching the buffer
        fp->offset += n;
        dest += n, nbytes -= n;
        nread += n;
    }

    if (buffer_invalidated) {
        // Our unread buffer is empty, so begin == end, but our already-read
        // buffer [buffer,begin) is likely non-empty and is no longer valid as
        // its contents are no longer adjacent to the file position indicator.
        // Discard it so that hseek() can't try to take advantage of it.
        fp->offset += fp->begin - fp->buffer;
        fp->begin = fp->end = fp->buffer;
    }

    // Serve whatever is left (less than half a buffer, or nothing) by
    // refilling the buffer and copying out of it
    while (nbytes > 0 && !fp->at_eof) {
        size_t n;
        ssize_t ret = refill_buffer(fp);
        if (ret < 0) return ret;

        n = fp->end - fp->begin;
        if (n > nbytes) n = nbytes;
        memcpy(dest, fp->begin, n);
        fp->begin += n;
        dest += n, nbytes -= n;
        nread += n;
    }

    return nread;
}
350
351 /* Flushes the write buffer, fp->[buffer,begin), out through the backend
352 returning 0 on success or negative if an error occurred. */
flush_buffer(hFILE * fp)353 static ssize_t flush_buffer(hFILE *fp)
354 {
355 const char *buffer = fp->buffer;
356 while (buffer < fp->begin) {
357 ssize_t n = fp->backend->write(fp, buffer, fp->begin - buffer);
358 if (n < 0) { fp->has_errno = errno; return n; }
359 buffer += n;
360 fp->offset += n;
361 }
362
363 fp->begin = fp->buffer; // Leave the buffer empty
364 return 0;
365 }
366
/* Flushes buffered output to the backend and then, if the backend provides
   a flush method, invokes that as well.  Returns 0 on success or EOF on
   failure, in which case fp->has_errno records the error.  */
int hflush(hFILE *fp)
{
    if (flush_buffer(fp) < 0) return EOF;

    if (fp->backend->flush != NULL && fp->backend->flush(fp) < 0) {
        fp->has_errno = errno;
        return EOF;
    }

    return 0;
}
375
376 /* Called only from hputc(), when our buffer is already full. */
377 HTSLIB_EXPORT
hputc2(int c,hFILE * fp)378 int hputc2(int c, hFILE *fp)
379 {
380 if (flush_buffer(fp) < 0) return EOF;
381 *(fp->begin++) = c;
382 return c;
383 }
384
/* Called only from hwrite() and hputs2(); when called, our buffer is either
   full and ncopied bytes from the source have already been copied to our
   buffer; or completely empty, ncopied is zero and totalbytes is greater than
   the buffer size.

   Flushes the buffer, writes the rest of srcv directly to the backend while
   at least half a buffer's worth remains, and buffers the final remainder.
   Returns totalbytes on success, or a negative value if flushing or a
   backend write failed (fp->has_errno is set).  */
HTSLIB_EXPORT
ssize_t hwrite2(hFILE *fp, const void *srcv, size_t totalbytes, size_t ncopied)
{
    const char *src = (const char *) srcv;
    ssize_t ret;
    const size_t capacity = fp->limit - fp->buffer;
    size_t remaining = totalbytes - ncopied;
    // Skip the part of the source that the caller has already buffered
    src += ncopied;

    ret = flush_buffer(fp);
    if (ret < 0) return ret;

    // Write large blocks out directly from the source buffer
    while (remaining * 2 >= capacity) {
        ssize_t n = fp->backend->write(fp, src, remaining);
        if (n < 0) { fp->has_errno = errno; return n; }
        fp->offset += n;
        src += n, remaining -= n;
    }

    // Just buffer any remaining characters
    // (fewer than half the capacity remain, and the buffer was just emptied)
    memcpy(fp->begin, src, remaining);
    fp->begin += remaining;

    return totalbytes;
}
415
416 /* Called only from hputs(), when our buffer is already full. */
417 HTSLIB_EXPORT
hputs2(const char * text,size_t totalbytes,size_t ncopied,hFILE * fp)418 int hputs2(const char *text, size_t totalbytes, size_t ncopied, hFILE *fp)
419 {
420 return (hwrite2(fp, text, totalbytes, ncopied) >= 0)? 0 : EOF;
421 }
422
/* Repositions the stream, with offset and whence interpreted as for
   lseek()/fseek() (SEEK_SET, SEEK_CUR, SEEK_END).

   Pending output on a mobile buffer is flushed first.  Where the target
   lies within data already held in the buffer, and the stream is either an
   immobile buffer or read-only, only fp->begin is moved and the backend is
   not consulted; otherwise the backend's seek method is called and the read
   buffer is discarded.

   Returns the resulting absolute stream position, or a negative value on
   failure with fp->has_errno set.  */
off_t hseek(hFILE *fp, off_t offset, int whence)
{
    off_t curpos, pos;

    if (writebuffer_is_nonempty(fp) && fp->mobile) {
        int ret = flush_buffer(fp);
        if (ret < 0) return ret;
    }

    curpos = htell(fp);

    // Relative offsets are given relative to the hFILE's stream position,
    // which may differ from the backend's physical position due to buffering
    // read-ahead. Correct for this by converting to an absolute position.
    if (whence == SEEK_CUR) {
        if (curpos + offset < 0) {
            // Either a negative offset resulted in a position before the
            // start of the file, or we overflowed when given a positive offset
            fp->has_errno = errno = (offset < 0)? EINVAL : EOVERFLOW;
            return -1;
        }

        whence = SEEK_SET;
        offset = curpos + offset;
    }
    // For fixed immobile buffers, convert everything else to SEEK_SET too
    // so that seeking can be avoided for all (within range) requests.
    else if (! fp->mobile && whence == SEEK_END) {
        // For an immobile buffer the data length is the filled extent
        size_t length = fp->end - fp->buffer;
        if (offset > 0 || -offset > length) {
            fp->has_errno = errno = EINVAL;
            return -1;
        }

        whence = SEEK_SET;
        offset = length + offset;
    }

    // Avoid seeking if the desired position is within our read buffer.
    // (But not when the next operation may be a write on a mobile buffer.)
    if (whence == SEEK_SET && (! fp->mobile || fp->readonly) &&
        offset >= fp->offset && offset - fp->offset <= fp->end - fp->buffer) {
        fp->begin = &fp->buffer[offset - fp->offset];
        return offset;
    }

    pos = fp->backend->seek(fp, offset, whence);
    if (pos < 0) { fp->has_errno = errno; return pos; }

    // Seeking succeeded, so discard any non-empty read buffer
    fp->begin = fp->end = fp->buffer;
    fp->at_eof = 0;

    // The (now empty) buffer starts at the position the backend reported
    fp->offset = pos;
    return pos;
}
479
/* Flushes any pending output, closes the backend and frees the stream.
   fp is always destroyed, whatever the outcome.  Returns 0 if no error has
   ever been recorded on the stream and flushing and closing both succeeded;
   otherwise returns EOF with errno set to the most recent of those errors.  */
int hclose(hFILE *fp)
{
    int err = fp->has_errno;

    if (writebuffer_is_nonempty(fp)) {
        if (hflush(fp) < 0) err = fp->has_errno;
    }
    if (fp->backend->close(fp) < 0) err = errno;
    hfile_destroy(fp);

    if (err == 0) return 0;

    errno = err;
    return EOF;
}
494
/* Closes the backend and frees the stream without flushing buffered output
   and without reporting failures.  errno is left exactly as it was on entry,
   so this is suitable for cleaning up on error paths.  */
void hclose_abruptly(hFILE *fp)
{
    const int saved_errno = errno;

    /* Any error from closing is deliberately ignored */
    (void) fp->backend->close(fp);
    hfile_destroy(fp);

    errno = saved_errno;
}
502
503
504 /***************************
505 * File descriptor backend *
506 ***************************/
507
508 #ifndef _WIN32
509 #include <sys/socket.h>
510 #include <sys/stat.h>
511 #define HAVE_STRUCT_STAT_ST_BLKSIZE
512 #else
513 #include <winsock2.h>
514 #define HAVE_CLOSESOCKET
515 #define HAVE_SETMODE
516 #endif
517 #include <fcntl.h>
518 #include <unistd.h>
519
/* For Unix, it doesn't matter whether a file descriptor is a socket.
   However Windows insists on send()/recv() and its own closesocket()
   being used when fd happens to be a socket. */

/* hFILE specialisation used by the file descriptor backend. */
typedef struct {
    hFILE base;           // Generic stream state; first, so hFILE * casts work
    int fd;               // Descriptor passed to read()/write()/lseek()/close()
    unsigned is_socket:1; // Selects recv()/send() (and closesocket()) instead
} hFILE_fd;
529
/* Backend read method: reads up to nbytes from the descriptor into buffer,
   using recv() for sockets and read() otherwise, transparently retrying when
   interrupted by a signal (EINTR).  Returns the underlying call's result.  */
static ssize_t fd_read(hFILE *fpv, void *buffer, size_t nbytes)
{
    hFILE_fd *fp = (hFILE_fd *) fpv;
    ssize_t n;

    for (;;) {
        if (fp->is_socket) n = recv(fp->fd, buffer, nbytes, 0);
        else n = read(fp->fd, buffer, nbytes);

        if (n >= 0 || errno != EINTR) break;
    }

    return n;
}
540
/* Backend write method: writes up to nbytes from buffer to the descriptor,
   using send() for sockets and write() otherwise, transparently retrying
   when interrupted by a signal (EINTR).  Returns the underlying call's
   result.  */
static ssize_t fd_write(hFILE *fpv, const void *buffer, size_t nbytes)
{
    hFILE_fd *fp = (hFILE_fd *) fpv;
    ssize_t n;

    for (;;) {
        if (fp->is_socket) n = send(fp->fd, buffer, nbytes, 0);
        else n = write(fp->fd, buffer, nbytes);

        if (n >= 0 || errno != EINTR) break;
    }

#ifdef _WIN32
    // On windows we have no SIGPIPE.  Instead write returns
    // EINVAL.  We check for this and our fd being a pipe.
    // If so, we raise SIGTERM instead of SIGPIPE.  It's not
    // ideal, but I think the only alternative is extra checking
    // in every single piece of code.
    if (n < 0 && errno == EINVAL &&
        GetLastError() == ERROR_NO_DATA &&
        GetFileType((HANDLE)_get_osfhandle(fp->fd)) == FILE_TYPE_PIPE) {
        raise(SIGTERM);
    }
#endif

    return n;
}
563
/* Backend seek method: forwards the request to lseek() on the underlying
   descriptor and returns its result.  */
static off_t fd_seek(hFILE *fpv, off_t offset, int whence)
{
    const hFILE_fd *fdfile = (hFILE_fd *) fpv;
    const int fd = fdfile->fd;

    return lseek(fd, offset, whence);
}
569
/* Backend flush method: asks the operating system to commit the
   descriptor's data using fdatasync() or, failing that, fsync(), depending
   on which of HAVE_FDATASYNC / HAVE_FSYNC the build defines.  When neither
   is defined no system call is made and 0 is returned.  Interrupted calls
   (EINTR) are retried.  Returns 0 on success or a negative value on error. */
static int fd_flush(hFILE *fpv)
{
    int ret = 0;
    do {
#ifdef HAVE_FDATASYNC
        hFILE_fd *fp = (hFILE_fd *) fpv;
        ret = fdatasync(fp->fd);
#elif defined(HAVE_FSYNC)
        hFILE_fd *fp = (hFILE_fd *) fpv;
        ret = fsync(fp->fd);
#endif
        // Ignore invalid-for-fsync(2) errors due to being, e.g., a pipe,
        // and operation-not-supported errors (Mac OS X)
        if (ret < 0 && (errno == EINVAL || errno == ENOTSUP)) ret = 0;
    } while (ret < 0 && errno == EINTR);
    return ret;
}
587
/* Backend close method: closes the underlying descriptor, retrying when
   interrupted by a signal (EINTR).  Where HAVE_CLOSESOCKET is defined,
   sockets are closed with closesocket() rather than close().  Returns the
   result of the final close call.  */
static int fd_close(hFILE *fpv)
{
    hFILE_fd *fp = (hFILE_fd *) fpv;
    int ret;

    for (;;) {
#ifdef HAVE_CLOSESOCKET
        if (fp->is_socket) ret = closesocket(fp->fd);
        else ret = close(fp->fd);
#else
        ret = close(fp->fd);
#endif
        if (ret >= 0 || errno != EINTR) break;
    }

    return ret;
}
601
/* Method table for streams backed by a file descriptor.  The initialisers
   are positional; presumably they follow the member order of struct
   hFILE_backend as declared in hfile_internal.h (read, write, seek, flush,
   close) -- confirm against that header before reordering.  */
static const struct hFILE_backend fd_backend =
{
    fd_read, fd_write, fd_seek, fd_flush, fd_close
};
606
/* Returns the st_blksize that fstat() reports for fd, for use as a buffer
   capacity.  Returns 0 when fstat() fails or when the build does not define
   HAVE_STRUCT_STAT_ST_BLKSIZE.  */
static size_t blksize(int fd)
{
#ifdef HAVE_STRUCT_STAT_ST_BLKSIZE
    struct stat st;
    return (fstat(fd, &st) == 0) ? (size_t) st.st_blksize : 0;
#else
    return 0;
#endif
}
617
hopen_fd(const char * filename,const char * mode)618 static hFILE *hopen_fd(const char *filename, const char *mode)
619 {
620 hFILE_fd *fp = NULL;
621 int fd = open(filename, hfile_oflags(mode), 0666);
622 if (fd < 0) goto error;
623
624 fp = (hFILE_fd *) hfile_init(sizeof (hFILE_fd), mode, blksize(fd));
625 if (fp == NULL) goto error;
626
627 fp->fd = fd;
628 fp->is_socket = 0;
629 fp->base.backend = &fd_backend;
630 return &fp->base;
631
632 error:
633 if (fd >= 0) { int save = errno; (void) close(fd); errno = save; }
634 hfile_destroy((hFILE *) fp);
635 return NULL;
636 }
637
638 // Loads the contents of filename to produced a read-only, in memory,
639 // immobile hfile. fp is the already opened file. We always close this
640 // input fp, irrespective of whether we error or whether we return a new
641 // immobile hfile.
hpreload(hFILE * fp)642 static hFILE *hpreload(hFILE *fp) {
643 hFILE *mem_fp;
644 char *buf = NULL;
645 off_t buf_sz = 0, buf_a = 0, buf_inc = 8192, len;
646
647 for (;;) {
648 if (buf_a - buf_sz < 5000) {
649 buf_a += buf_inc;
650 char *t = realloc(buf, buf_a);
651 if (!t) goto err;
652 buf = t;
653 if (buf_inc < 1000000) buf_inc *= 1.3;
654 }
655 len = hread(fp, buf+buf_sz, buf_a-buf_sz);
656 if (len > 0)
657 buf_sz += len;
658 else
659 break;
660 }
661
662 if (len < 0) goto err;
663 mem_fp = hfile_init_fixed(sizeof(hFILE), "r", buf, buf_sz, buf_a);
664 if (!mem_fp) goto err;
665 mem_fp->backend = &mem_backend;
666
667 if (hclose(fp) < 0) {
668 hclose_abruptly(mem_fp);
669 goto err;
670 }
671 return mem_fp;
672
673 err:
674 free(buf);
675 hclose_abruptly(fp);
676 return NULL;
677 }
678
/* isremote callback for the "preload:" scheme: reports whether the URL that
   follows the "preload:" prefix is itself remote, as judged by hisremote().  */
static int is_preload_url_remote(const char *url){
    const char *wrapped_url = url + 8; // len("preload:") = 8
    return hisremote(wrapped_url);
}
682
hopen_preload(const char * url,const char * mode)683 static hFILE *hopen_preload(const char *url, const char *mode){
684 hFILE* fp = hopen(url + 8, mode);
685 return hpreload(fp);
686 }
687
hdopen(int fd,const char * mode)688 hFILE *hdopen(int fd, const char *mode)
689 {
690 hFILE_fd *fp = (hFILE_fd*) hfile_init(sizeof (hFILE_fd), mode, blksize(fd));
691 if (fp == NULL) return NULL;
692
693 fp->fd = fd;
694 fp->is_socket = (strchr(mode, 's') != NULL);
695 fp->base.backend = &fd_backend;
696 return &fp->base;
697 }
698
hopen_fd_fileuri(const char * url,const char * mode)699 static hFILE *hopen_fd_fileuri(const char *url, const char *mode)
700 {
701 if (strncmp(url, "file://localhost/", 17) == 0) url += 16;
702 else if (strncmp(url, "file:///", 8) == 0) url += 7;
703 else { errno = EPROTONOSUPPORT; return NULL; }
704
705 #if defined(_WIN32) || defined(__MSYS__)
706 // For cases like C:/foo
707 if (url[0] == '/' && url[1] && url[2] == ':' && url[3] == '/') url++;
708 #endif
709
710 return hopen_fd(url, mode);
711 }
712
hopen_fd_stdinout(const char * mode)713 static hFILE *hopen_fd_stdinout(const char *mode)
714 {
715 int fd = (strchr(mode, 'r') != NULL)? STDIN_FILENO : STDOUT_FILENO;
716 #if defined HAVE_SETMODE && defined O_BINARY
717 if (setmode(fd, O_BINARY) < 0) return NULL;
718 #endif
719 return hdopen(fd, mode);
720 }
721
722 HTSLIB_EXPORT
/* Translates an fopen()-style mode string into open(2) flags.

   Characters are processed left to right: 'r', 'w' and 'a' select read-only
   or write-only access (with 'w' adding O_CREAT|O_TRUNC and 'a' adding
   O_CREAT|O_APPEND), '+' upgrades the access mode to read/write, and, where
   the platform defines them, 'e' adds O_CLOEXEC and 'x' adds O_EXCL.  Any
   other character is ignored.  O_BINARY is always added where it exists.  */
int hfile_oflags(const char *mode)
{
    int access = 0, extra = 0;
    const char *p = mode;

    while (*p != '\0') {
        const char c = *p++;

        if (c == 'r') {
            access = O_RDONLY;
        }
        else if (c == 'w') {
            access = O_WRONLY;
            extra |= O_CREAT | O_TRUNC;
        }
        else if (c == 'a') {
            access = O_WRONLY;
            extra |= O_CREAT | O_APPEND;
        }
        else if (c == '+') {
            access = O_RDWR;
        }
#ifdef O_CLOEXEC
        else if (c == 'e') {
            extra |= O_CLOEXEC;
        }
#endif
#ifdef O_EXCL
        else if (c == 'x') {
            extra |= O_EXCL;
        }
#endif
    }

#ifdef O_BINARY
    extra |= O_BINARY;
#endif

    return access | extra;
}
748
749
750 /*********************
751 * In-memory backend *
752 *********************/
753
754 #include "hts_internal.h"
755
/* hFILE specialisation used by the in-memory backend.  It adds no fields of
   its own: the data lives entirely in the base stream's buffer.  */
typedef struct {
    hFILE base;
} hFILE_mem;
759
/* Backend seek method for in-memory streams.  It always fails with EINVAL:
   hseek() repositions within an immobile buffer itself, so only requests it
   could not satisfy from the buffer ever get this far.  */
static off_t mem_seek(hFILE *fpv, off_t offset, int whence)
{
    (void) fpv;
    (void) offset;
    (void) whence;

    errno = EINVAL;
    return (off_t) -1;
}
765
/* Backend close method for in-memory streams.  There is nothing to release
   at the backend level, so this always succeeds.  */
static int mem_close(hFILE *fpv)
{
    (void) fpv;
    return 0;
}
770
/* Method table for in-memory streams.  Only seek and close are provided;
   the other three slots are NULL.  The initialisers are positional and
   presumably follow the member order of struct hFILE_backend as declared in
   hfile_internal.h (read, write, seek, flush, close) -- confirm against that
   header before reordering.  */
static const struct hFILE_backend mem_backend =
{
    NULL, NULL, mem_seek, NULL, mem_close
};
775
/* Tests whether s begins with key, lower-casing each character of s (with
   tolower_c()) before comparing it; key itself is compared as given.
   Returns 0 if every character of key matches, or +1 at the first
   mismatch.  */
static int cmp_prefix(const char *key, const char *s)
{
    for (; *key != '\0'; key++, s++) {
        if (tolower_c(*s) != *key) return +1;
    }

    return 0;
}
784
create_hfile_mem(char * buffer,const char * mode,size_t buf_filled,size_t buf_size)785 static hFILE *create_hfile_mem(char* buffer, const char* mode, size_t buf_filled, size_t buf_size)
786 {
787 hFILE_mem *fp = (hFILE_mem *) hfile_init_fixed(sizeof(hFILE_mem), mode, buffer, buf_filled, buf_size);
788 if (fp == NULL)
789 return NULL;
790
791 fp->base.backend = &mem_backend;
792 return &fp->base;
793 }
794
hopen_mem(const char * url,const char * mode)795 static hFILE *hopen_mem(const char *url, const char *mode)
796 {
797 size_t length, size;
798 char *buffer;
799 const char *data, *comma = strchr(url, ',');
800 if (comma == NULL) { errno = EINVAL; return NULL; }
801 data = comma+1;
802
803 // TODO Implement write modes
804 if (strchr(mode, 'r') == NULL) { errno = EROFS; return NULL; }
805
806 if (comma - url >= 7 && cmp_prefix(";base64", &comma[-7]) == 0) {
807 size = hts_base64_decoded_length(strlen(data));
808 buffer = malloc(size);
809 if (buffer == NULL) return NULL;
810 hts_decode_base64(buffer, &length, data);
811 }
812 else {
813 size = strlen(data) + 1;
814 buffer = malloc(size);
815 if (buffer == NULL) return NULL;
816 hts_decode_percent(buffer, &length, data);
817 }
818 hFILE* hf;
819
820 if(!(hf = create_hfile_mem(buffer, mode, length, size))){
821 free(buffer);
822 return NULL;
823 }
824
825 return hf;
826 }
827
hopenv_mem(const char * filename,const char * mode,va_list args)828 static hFILE *hopenv_mem(const char *filename, const char *mode, va_list args)
829 {
830 char* buffer = va_arg(args, char*);
831 size_t sz = va_arg(args, size_t);
832 va_end(args);
833
834 hFILE* hf;
835
836 if(!(hf = create_hfile_mem(buffer, mode, sz, sz))){
837 free(buffer);
838 return NULL;
839 }
840
841 return hf;
842 }
843
/* Returns the buffer of an in-memory hFILE without transferring ownership.
   If length is non-NULL it receives the buffer's extent in bytes, i.e. the
   distance from the start of the buffer to its limit.  Fails with EINVAL,
   returning NULL and leaving *length untouched, if file does not use the
   in-memory backend.  */
char *hfile_mem_get_buffer(hFILE *file, size_t *length) {
    if (file->backend != &mem_backend) {
        errno = EINVAL;
        return NULL;
    }

    // limit points one past the end of the buffer, so it is the minuend
    if (length)
        *length = file->limit - file->buffer;

    return file->buffer;
}
855
/* Like hfile_mem_get_buffer(), but additionally detaches the buffer from
   the hFILE by clearing file->buffer, so that destroying the stream no
   longer frees it.  Returns NULL, with the stream left unchanged, if
   hfile_mem_get_buffer() fails.  */
char *hfile_mem_steal_buffer(hFILE *file, size_t *length) {
    char *stolen = hfile_mem_get_buffer(file, length);
    if (stolen == NULL) return NULL;

    file->buffer = NULL;
    return stolen;
}
862
hfile_plugin_init_mem(struct hFILE_plugin * self)863 int hfile_plugin_init_mem(struct hFILE_plugin *self)
864 {
865 // mem files are declared remote so they work with a tabix index
866 static const struct hFILE_scheme_handler handler =
867 {NULL, hfile_always_remote, "mem", 2000 + 50, hopenv_mem};
868 self->name = "mem";
869 hfile_add_scheme_handler("mem", &handler);
870 return 0;
871 }
872
873 /**********************************************************************
874 * Dummy crypt4gh plug-in. Does nothing apart from advise how to get *
875 * the real one. It will be overridden by the actual plug-in. *
876 **********************************************************************/
877
crypt4gh_needed(const char * url,const char * mode)878 static hFILE *crypt4gh_needed(const char *url, const char *mode)
879 {
880 const char *u = strncmp(url, "crypt4gh:", 9) == 0 ? url + 9 : url;
881 #if defined(ENABLE_PLUGINS)
882 const char *enable_plugins = "";
883 #else
884 const char *enable_plugins = "You also need to rebuild HTSlib with plug-ins enabled.\n";
885 #endif
886
887 hts_log_error("Accessing \"%s\" needs the crypt4gh plug-in.\n"
888 "It can be found at "
889 "https://github.com/samtools/htslib-crypt4gh\n"
890 "%s"
891 "If you have the plug-in, please ensure it can be "
892 "found on your HTS_PATH.",
893 u, enable_plugins);
894
895 errno = EPROTONOSUPPORT;
896 return NULL;
897 }
898
hfile_plugin_init_crypt4gh_needed(struct hFILE_plugin * self)899 int hfile_plugin_init_crypt4gh_needed(struct hFILE_plugin *self)
900 {
901 static const struct hFILE_scheme_handler handler =
902 { crypt4gh_needed, NULL, "crypt4gh-needed", 0, NULL };
903 self->name = "crypt4gh-needed";
904 hfile_add_scheme_handler("crypt4gh", &handler);
905 return 0;
906 }
907
908
909 /*****************************************
910 * Plugin and hopen() backend dispatcher *
911 *****************************************/
912
913 #include "htslib/khash.h"
914
// Hash map type keyed by URL scheme string, holding the handler registered
// for each scheme
KHASH_MAP_INIT_STR(scheme_string, const struct hFILE_scheme_handler *)
// The scheme registry.  NULL until load_hfile_plugins() creates it, and
// NULL again after hfile_shutdown() has destroyed it.
static khash_t(scheme_string) *schemes = NULL;

// Node of the singly-linked list of successfully initialised plugins
struct hFILE_plugin_list {
    struct hFILE_plugin plugin;
    struct hFILE_plugin_list *next;
};

// Head of the plugin list; init_add_plugin() pushes new entries at the front
static struct hFILE_plugin_list *plugins = NULL;
// Held by hfile_shutdown() while it tears down schemes and plugins
static pthread_mutex_t plugins_lock = PTHREAD_MUTEX_INITIALIZER;
925
hfile_shutdown(int do_close_plugin)926 void hfile_shutdown(int do_close_plugin)
927 {
928 pthread_mutex_lock(&plugins_lock);
929
930 if (schemes) {
931 kh_destroy(scheme_string, schemes);
932 schemes = NULL;
933 }
934
935 while (plugins != NULL) {
936 struct hFILE_plugin_list *p = plugins;
937 if (p->plugin.destroy) p->plugin.destroy();
938 #ifdef ENABLE_PLUGINS
939 if (p->plugin.obj && do_close_plugin) close_plugin(p->plugin.obj);
940 #endif
941 plugins = p->next;
942 free(p);
943 }
944
945 pthread_mutex_unlock(&plugins_lock);
946 }
947
/* Final clean-up routine: releases the scheme and plugin registries via
   hfile_shutdown(0), i.e. without closing plugin shared objects, and then
   destroys plugins_lock.  */
static void hfile_exit()
{
    hfile_shutdown(0);
    pthread_mutex_destroy(&plugins_lock);
}
953
/* Returns the part of a handler's priority used to rank competing handlers
   for the same scheme: the stored value modulo 1000.  (Handlers may store
   larger values, e.g. the "mem" handler registers with 2000 + 50.)  */
static inline int priority(const struct hFILE_scheme_handler *handler)
{
    return handler->priority % 1000;
}
958
#ifdef USING_WINDOWS_PLUGIN_DLLS
/*
 * Work-around for Windows plug-in dlls where the plug-in could be
 * using a different HTSlib library to the executable (for example
 * because the latter was built against a static libhts.a).  When this
 * happens, the plug-in can call the wrong copy of hfile_add_scheme_handler().
 * If this is detected, it calls this function which attempts to fix the
 * problem by redirecting to the hfile_add_scheme_handler() in the main
 * executable.
 *
 * Returns 0 if the registration was forwarded to the executable's copy,
 * or -1 if that copy could not be found or is this very copy.
 */
static int try_exe_add_scheme_handler(const char *scheme,
                                      const struct hFILE_scheme_handler *handler)
{
    // Resolved once and then cached for subsequent calls
    static void (*add_scheme_handler)(const char *scheme,
                                      const struct hFILE_scheme_handler *handler);
    if (!add_scheme_handler) {
        // dlopen the main executable and resolve hfile_add_scheme_handler
        void *exe_handle = dlopen(NULL, RTLD_LAZY);
        if (!exe_handle) return -1;
        *(void **) (&add_scheme_handler) = dlsym(exe_handle, "hfile_add_scheme_handler");
        dlclose(exe_handle);
    }
    // Check that the symbol was obtained and isn't the one in this copy
    // of the library (to avoid infinite recursion)
    if (!add_scheme_handler || add_scheme_handler == hfile_add_scheme_handler)
        return -1;
    add_scheme_handler(scheme, handler);
    return 0;
}
#else
/* Without Windows plug-in dlls there is no other copy of the library to
   forward to, so this always reports failure (-1).  */
static int try_exe_add_scheme_handler(const char *scheme,
                                      const struct hFILE_scheme_handler *handler)
{
    return -1;
}
#endif
995
996 HTSLIB_EXPORT
hfile_add_scheme_handler(const char * scheme,const struct hFILE_scheme_handler * handler)997 void hfile_add_scheme_handler(const char *scheme,
998 const struct hFILE_scheme_handler *handler)
999 {
1000 int absent;
1001 if (!schemes) {
1002 if (try_exe_add_scheme_handler(scheme, handler) != 0) {
1003 hts_log_warning("Couldn't register scheme handler for %s", scheme);
1004 }
1005 return;
1006 }
1007 khint_t k = kh_put(scheme_string, schemes, scheme, &absent);
1008 if (absent < 0) {
1009 hts_log_warning("Couldn't register scheme handler for %s : %s",
1010 scheme, strerror(errno));
1011 return;
1012 }
1013 if (absent || priority(handler) > priority(kh_value(schemes, k))) {
1014 kh_value(schemes, k) = handler;
1015 }
1016 }
1017
init_add_plugin(void * obj,int (* init)(struct hFILE_plugin *),const char * pluginname)1018 static int init_add_plugin(void *obj, int (*init)(struct hFILE_plugin *),
1019 const char *pluginname)
1020 {
1021 struct hFILE_plugin_list *p = malloc (sizeof (struct hFILE_plugin_list));
1022 if (p == NULL) {
1023 hts_log_debug("Failed to allocate memory for plugin \"%s\"", pluginname);
1024 return -1;
1025 }
1026
1027 p->plugin.api_version = 1;
1028 p->plugin.obj = obj;
1029 p->plugin.name = NULL;
1030 p->plugin.destroy = NULL;
1031
1032 int ret = (*init)(&p->plugin);
1033
1034 if (ret != 0) {
1035 hts_log_debug("Initialisation failed for plugin \"%s\": %d", pluginname, ret);
1036 free(p);
1037 return ret;
1038 }
1039
1040 hts_log_debug("Loaded \"%s\"", pluginname);
1041
1042 p->next = plugins, plugins = p;
1043 return 0;
1044 }
1045
1046 /*
1047 * Returns 0 on success,
1048 * <0 on failure
1049 */
load_hfile_plugins()1050 static int load_hfile_plugins()
1051 {
1052 static const struct hFILE_scheme_handler
1053 data = { hopen_mem, hfile_always_local, "built-in", 80 },
1054 file = { hopen_fd_fileuri, hfile_always_local, "built-in", 80 },
1055 preload = { hopen_preload, is_preload_url_remote, "built-in", 80 };
1056
1057 schemes = kh_init(scheme_string);
1058 if (schemes == NULL)
1059 return -1;
1060
1061 hfile_add_scheme_handler("data", &data);
1062 hfile_add_scheme_handler("file", &file);
1063 hfile_add_scheme_handler("preload", &preload);
1064 init_add_plugin(NULL, hfile_plugin_init_net, "knetfile");
1065 init_add_plugin(NULL, hfile_plugin_init_mem, "mem");
1066 init_add_plugin(NULL, hfile_plugin_init_crypt4gh_needed, "crypt4gh-needed");
1067
1068 #ifdef ENABLE_PLUGINS
1069 struct hts_path_itr path;
1070 const char *pluginname;
1071 hts_path_itr_setup(&path, NULL, NULL, "hfile_", 6, NULL, 0);
1072 while ((pluginname = hts_path_itr_next(&path)) != NULL) {
1073 void *obj;
1074 int (*init)(struct hFILE_plugin *) = (int (*)(struct hFILE_plugin *))
1075 load_plugin(&obj, pluginname, "hfile_plugin_init");
1076
1077 if (init) {
1078 if (init_add_plugin(obj, init, pluginname) != 0)
1079 close_plugin(obj);
1080 }
1081 }
1082 #else
1083
1084 #ifdef HAVE_LIBCURL
1085 init_add_plugin(NULL, hfile_plugin_init_libcurl, "libcurl");
1086 #endif
1087 #ifdef ENABLE_GCS
1088 init_add_plugin(NULL, hfile_plugin_init_gcs, "gcs");
1089 #endif
1090 #ifdef ENABLE_S3
1091 init_add_plugin(NULL, hfile_plugin_init_s3, "s3");
1092 init_add_plugin(NULL, hfile_plugin_init_s3_write, "s3w");
1093 #endif
1094
1095 #endif
1096
1097 // In the unlikely event atexit() fails, it's better to succeed here and
1098 // carry on; then eventually when the program exits, we'll merely close
1099 // down the plugins uncleanly, as if we had aborted.
1100 (void) atexit(hfile_exit);
1101
1102 return 0;
1103 }
1104
1105 /* A filename like "foo:bar" in which we don't recognise the scheme is
1106 either an ordinary file or an indication of a missing or broken plugin.
1107 Try to open it as an ordinary file; but if there's no such file, set
1108 errno distinctively to make the plugin issue apparent. */
hopen_unknown_scheme(const char * fname,const char * mode)1109 static hFILE *hopen_unknown_scheme(const char *fname, const char *mode)
1110 {
1111 hFILE *fp = hopen_fd(fname, mode);
1112 if (fp == NULL && errno == ENOENT) errno = EPROTONOSUPPORT;
1113 return fp;
1114 }
1115
1116 /* Returns the appropriate handler, or NULL if the string isn't an URL. */
find_scheme_handler(const char * s)1117 static const struct hFILE_scheme_handler *find_scheme_handler(const char *s)
1118 {
1119 static const struct hFILE_scheme_handler unknown_scheme =
1120 { hopen_unknown_scheme, hfile_always_local, "built-in", 0 };
1121
1122 char scheme[12];
1123 int i;
1124
1125 for (i = 0; i < sizeof scheme; i++)
1126 if (isalnum_c(s[i]) || s[i] == '+' || s[i] == '-' || s[i] == '.')
1127 scheme[i] = tolower_c(s[i]);
1128 else if (s[i] == ':') break;
1129 else return NULL;
1130
1131 // 1 byte schemes are likely windows C:/foo pathnames
1132 if (i <= 1 || i >= sizeof scheme) return NULL;
1133 scheme[i] = '\0';
1134
1135 pthread_mutex_lock(&plugins_lock);
1136 if (!schemes && load_hfile_plugins() < 0) {
1137 pthread_mutex_unlock(&plugins_lock);
1138 return NULL;
1139 }
1140 pthread_mutex_unlock(&plugins_lock);
1141
1142 khint_t k = kh_get(scheme_string, schemes, scheme);
1143 return (k != kh_end(schemes))? kh_value(schemes, k) : &unknown_scheme;
1144 }
1145
hopen(const char * fname,const char * mode,...)1146 hFILE *hopen(const char *fname, const char *mode, ...)
1147 {
1148 const struct hFILE_scheme_handler *handler = find_scheme_handler(fname);
1149 if (handler) {
1150 if (strchr(mode, ':') == NULL
1151 || handler->priority < 2000
1152 || handler->vopen == NULL) {
1153 return handler->open(fname, mode);
1154 }
1155 else {
1156 hFILE *fp;
1157 va_list arg;
1158 va_start(arg, mode);
1159 fp = handler->vopen(fname, mode, arg);
1160 va_end(arg);
1161 return fp;
1162 }
1163 }
1164 else if (strcmp(fname, "-") == 0) return hopen_fd_stdinout(mode);
1165 else return hopen_fd(fname, mode);
1166 }
1167
1168 HTSLIB_EXPORT
int hfile_always_local(const char *fname)
{
    // Unconditionally answers 0 ("local"); fname is not examined.
    (void) fname;
    return 0;
}
1170
1171 HTSLIB_EXPORT
int hfile_always_remote(const char *fname)
{
    // Unconditionally answers 1 ("remote"); fname is not examined.
    (void) fname;
    return 1;
}
1173
hisremote(const char * fname)1174 int hisremote(const char *fname)
1175 {
1176 const struct hFILE_scheme_handler *handler = find_scheme_handler(fname);
1177 return handler? handler->isremote(fname) : 0;
1178 }
1179
// Locate the extension, if any, in the basename part of [start,limit).
// Scans backwards from limit: returns a pointer to the last '.' found before
// reaching a '/' or start, or limit itself when there is no such '.'.
// Note: Doesn't notice percent-encoded '.' and '/' characters. Don't do that.
static const char *strip_extension(const char *start, const char *limit)
{
    const char *p;
    for (p = limit; p > start; ) {
        char c = *--p;
        if (c == '.') return p;
        if (c == '/') return limit;
    }
    return limit;
}
1192
haddextension(struct kstring_t * buffer,const char * filename,int replace,const char * new_extension)1193 char *haddextension(struct kstring_t *buffer, const char *filename,
1194 int replace, const char *new_extension)
1195 {
1196 const char *trailing, *end;
1197
1198 if (find_scheme_handler(filename)) {
1199 // URL, so alter extensions before any trailing query or fragment parts
1200 // Allow # symbols in s3 URLs
1201 trailing = filename + ((strncmp(filename, "s3://", 5) && strncmp(filename, "s3+http://", 10) && strncmp(filename, "s3+https://", 11)) ? strcspn(filename, "?#") : strcspn(filename, "?"));
1202 }
1203 else {
1204 // Local path, so alter extensions at the end of the filename
1205 trailing = strchr(filename, '\0');
1206 }
1207
1208 end = replace? strip_extension(filename, trailing) : trailing;
1209
1210 buffer->l = 0;
1211 if (kputsn(filename, end - filename, buffer) >= 0 &&
1212 kputs(new_extension, buffer) >= 0 &&
1213 kputs(trailing, buffer) >= 0) return buffer->s;
1214 else return NULL;
1215 }
1216