1 /*
2 Copyright (c) 2005-2006, 2008-2009, 2013, 2015, 2017-2019 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8    1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11    2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
32 #include <config.h>
33 
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <errno.h>
37 #include <string.h>
38 #include <sys/types.h>
39 #include <sys/stat.h>
40 #include <fcntl.h>
41 #include <unistd.h>
42 #include <stdarg.h>
43 
44 #include "../htslib/hts_log.h"
45 #include "os.h"
46 #include "mFILE.h"
47 
48 #ifdef HAVE_MMAP
49 #include <sys/mman.h>
50 #endif
51 
52 /*
53  * This file contains memory-based versions of the most commonly used
54  * (by io_lib) stdio functions.
55  *
56  * Actual file IO takes place either on opening or closing an mFILE.
57  *
58  * Coupled to this are a bunch of rather scary macros which can be obtained
59  * by including stdio_hack.h. It is recommended though that you use mFILE.h
60  * instead and replace fopen with mfopen (etc). This is more or less
61  * mandatory if you wish to use both FILE and mFILE structs in a single file.
62  */
63 
/*
 * mFILE stand-ins for the three standard streams, indexed
 * 0 = stdin, 1 = stdout, 2 = stderr. Each slot starts out NULL and is
 * filled in by the first call to mstdin(), mstdout() or mstderr().
 */
static mFILE *m_channel[3];  /* stdin, stdout and stderr fakes */
65 
/*
 * Reads the entirety of fp into memory.
 *
 * If 'fn' is non-NULL it is the filename associated with fp; it is
 * stat()ed so the buffer can be sized up front and filled with a single
 * read. When no usable size is available (no filename, stat() failure, or
 * a reported size of zero as happens for empty and some special files) we
 * fall back to successive 8192 byte reads until EOF.
 *
 * 'binary' selects binary (non-zero) or text (zero) translation mode on
 * Windows builds; it is not used elsewhere.
 *
 * Returns a malloced buffer on success, with the number of bytes read
 *         stored in *size. The buffer is non-NULL even when the input is
 *         empty, and must be freed by the caller.
 *         NULL on allocation failure or read error (*size is not written).
 */
static char *mfload(FILE *fp, const char *fn, size_t *size, int binary) {
    struct stat sb;
    char *data = NULL;
    size_t allocated = 0, used = 0;
    size_t bufsize = 8192;  /* growth increment */
    size_t expected = 0;    /* size hint from stat; 0 => unknown */

#ifdef _WIN32
    if (binary)
        _setmode(_fileno(fp), _O_BINARY);
    else
        _setmode(_fileno(fp), _O_TEXT);
#endif

    /*
     * Only trust the stat size when it is positive and representable as a
     * size_t. A zero size would otherwise mean malloc(0), which is allowed
     * to return NULL and would be misreported as a failure.
     */
    if (fn && -1 != stat(fn, &sb) && sb.st_size > 0
        && (off_t)(size_t)sb.st_size == sb.st_size) {
        expected = (size_t)sb.st_size;
        data = malloc(expected);
        if (!data)
            return NULL;
        allocated = bufsize = expected;
    }

    do {
        size_t len;

        if (used + bufsize > allocated) {
            char *datan = realloc(data, allocated + bufsize);
            if (!datan) {
                free(data);
                return NULL;
            }
            data = datan;
            allocated += bufsize;
        }

        len = fread(data + used, 1, allocated - used, fp);
        used += len;

        /*
         * A read error never sets the EOF indicator, so without this check
         * the loop below would spin forever on a failing stream.
         */
        if (ferror(fp)) {
            free(data);
            return NULL;
        }
    } while (!feof(fp) && (expected == 0 || used < expected));

    *size = used;

    return data;
}
118 
119 
120 #ifdef HAVE_MMAP
121 /*
122  * mmaps in the file, but only for reading currently.
123  *
124  * Returns 0 on success
125  *        -1 on failure
126  */
mfmmap(mFILE * mf,FILE * fp,const char * fn)127 int mfmmap(mFILE *mf, FILE *fp, const char *fn) {
128     struct stat sb;
129 
130     if (stat(fn, &sb) != 0)
131         return -1;
132 
133     mf->size = sb.st_size;
134     mf->data = mmap(NULL, mf->size, PROT_READ, MAP_SHARED,
135                     fileno(fp), 0);
136 
137     if (!mf->data || mf->data == (void *)-1)
138         return -1;
139 
140     mf->alloced = 0;
141     return 0;
142 }
143 #endif
144 
145 
146 /*
147  * Creates and returns m_channel[0].
148  * We initialise this on the first attempted read, which then slurps in
149  * all of stdin until EOF is met.
150  */
mstdin(void)151 mFILE *mstdin(void) {
152     if (m_channel[0])
153         return m_channel[0];
154 
155     m_channel[0] = mfcreate(NULL, 0);
156     if (NULL == m_channel[0]) return NULL;
157     m_channel[0]->fp = stdin;
158     return m_channel[0];
159 }
160 
init_mstdin(void)161 static void init_mstdin(void) {
162     static int done_stdin = 0;
163     if (done_stdin)
164         return;
165 
166     m_channel[0]->data = mfload(stdin, NULL, &m_channel[0]->size, 1);
167     m_channel[0]->mode = MF_READ;
168     done_stdin = 1;
169 }
170 
171 /*
172  * Creates and returns m_channel[1]. This is the fake for stdout. It starts as
173  * an empty buffer which is physically written out only when mfflush or
174  * mfclose are called.
175  */
mstdout(void)176 mFILE *mstdout(void) {
177     if (m_channel[1])
178         return m_channel[1];
179 
180     m_channel[1] = mfcreate(NULL, 0);
181     if (NULL == m_channel[1]) return NULL;
182     m_channel[1]->fp = stdout;
183     m_channel[1]->mode = MF_WRITE;
184     return m_channel[1];
185 }
186 
187 /*
188  * Stderr as an mFILE.
189  * The code handles stderr by returning m_channel[2], but also checking
190  * for stderr in fprintf (the common usage of it) to auto-flush.
191  */
mstderr(void)192 mFILE *mstderr(void) {
193     if (m_channel[2])
194         return m_channel[2];
195 
196     m_channel[2] = mfcreate(NULL, 0);
197     if (NULL == m_channel[2]) return NULL;
198     m_channel[2]->fp = stderr;
199     m_channel[2]->mode = MF_WRITE;
200     return m_channel[2];
201 }
202 
203 
204 /*
205  * For creating existing mFILE pointers directly from memory buffers.
206  */
mfcreate(char * data,int size)207 mFILE *mfcreate(char *data, int size) {
208     mFILE *mf = (mFILE *)malloc(sizeof(*mf));
209     if (NULL == mf) return NULL;
210     mf->fp = NULL;
211     mf->data = data;
212     mf->alloced = size;
213     mf->size = size;
214     mf->eof = 0;
215     mf->offset = 0;
216     mf->flush_pos = 0;
217     mf->mode = MF_READ | MF_WRITE;
218     return mf;
219 }
220 
221 /*
222  * Recreate an existing mFILE to house new data/size.
223  * It also rewinds the file.
224  */
mfrecreate(mFILE * mf,char * data,int size)225 void mfrecreate(mFILE *mf, char *data, int size) {
226     if (mf->data)
227         free(mf->data);
228     mf->data = data;
229     mf->size = size;
230     mf->alloced = size;
231     mf->eof = 0;
232     mf->offset = 0;
233     mf->flush_pos = 0;
234 }
235 
236 
237 /*
238  * Creates a new mFILE to contain the contents of the FILE pointer.
239  * This mFILE is purely for in-memory operations and has no links to the
240  * original FILE* it came from. It also doesn't close the FILE pointer.
241  * Consider using mfreopen() is you need different behaviour.
242  *
243  * Returns mFILE * on success
244  *         NULL on failure.
245  */
mfcreate_from(const char * path,const char * mode_str,FILE * fp)246 mFILE *mfcreate_from(const char *path, const char *mode_str, FILE *fp) {
247     mFILE *mf;
248 
249     /* Open using mfreopen() */
250     if (NULL == (mf = mfreopen(path, mode_str, fp)))
251         return NULL;
252 
253     /* Disassociate from the input stream */
254     mf->fp = NULL;
255 
256     return mf;
257 }
258 
259 /*
260  * Converts a FILE * to an mFILE *.
261  * Use this for wrapper functions to turn external prototypes requiring
262  * FILE * as an argument into internal code using mFILE *.
263  */
mfreopen(const char * path,const char * mode_str,FILE * fp)264 mFILE *mfreopen(const char *path, const char *mode_str, FILE *fp) {
265     mFILE *mf;
266     int r = 0, w = 0, a = 0, b = 0, x = 0, mode = 0;
267 
268     /* Parse mode:
269      * r = read file contents (if truncated => don't read)
270      * w = write on close
271      * a = position at end of buffer
272      * x = position at same location as the original fp, don't seek on flush
273      * + = for update (read and write)
274      * m = mmap (read only)
275      */
276     if (strchr(mode_str, 'r'))
277         r = 1, mode |= MF_READ;
278     if (strchr(mode_str, 'w'))
279         w = 1, mode |= MF_WRITE | MF_TRUNC;
280     if (strchr(mode_str, 'a'))
281         w = a = 1, mode |= MF_WRITE | MF_APPEND;
282     if (strchr(mode_str, 'b'))
283         b = 1, mode |= MF_BINARY;
284     if (strchr(mode_str, 'x'))
285         x = 1;
286     if (strchr(mode_str, '+')) {
287         w = 1, mode |= MF_READ | MF_WRITE;
288         if (a)
289             r = 1;
290     }
291 #ifdef HAVE_MMAP
292     if (strchr(mode_str, 'm'))
293         if (!w) mode |= MF_MMAP;
294 #endif
295 
296     if (r) {
297         mf = mfcreate(NULL, 0);
298         if (NULL == mf) return NULL;
299         if (!(mode & MF_TRUNC)) {
300 #ifdef HAVE_MMAP
301             if (mode & MF_MMAP) {
302                 if (mfmmap(mf, fp, path) == -1) {
303                     mf->data = NULL;
304                     mode &= ~MF_MMAP;
305                 }
306             }
307 #endif
308             if (!mf->data) {
309                 mf->data = mfload(fp, path, &mf->size, b);
310                 if (!mf->data) {
311                     free(mf);
312                     return NULL;
313                 }
314                 mf->alloced = mf->size;
315                 if (!a)
316                     fseek(fp, 0, SEEK_SET);
317             }
318         }
319     } else if (w) {
320         /* Write - initialise the data structures */
321         mf = mfcreate(NULL, 0);
322         if (NULL == mf) return NULL;
323     } else {
324         hts_log_error("Must specify either r, w or a for mode");
325         return NULL;
326     }
327     mf->fp = fp;
328     mf->mode = mode;
329 
330     if (x) {
331         mf->mode |= MF_MODEX;
332     }
333 
334     if (a) {
335         mf->flush_pos = mf->size;
336         fseek(fp, 0, SEEK_END);
337     }
338 
339     return mf;
340 }
341 
342 /*
343  * Opens a file. If we have read access (r or a+) then it loads the entire
344  * file into memory. If We have write access then the pathname is stored.
345  * We do not actually write until an mfclose, which then checks this pathname.
346  */
mfopen(const char * path,const char * mode)347 mFILE *mfopen(const char *path, const char *mode) {
348     FILE *fp;
349 
350     if (NULL == (fp = fopen(path, mode)))
351         return NULL;
352     return mfreopen(path, mode, fp);
353 }
354 
355 /*
356  * Closes an mFILE. If the filename is known (implying write access) then this
357  * also writes the data to disk.
358  *
359  * Stdout is handled by calling mfflush which writes to stdout if appropriate.
360  */
mfclose(mFILE * mf)361 int mfclose(mFILE *mf) {
362     if (!mf)
363         return -1;
364 
365     mfflush(mf);
366 
367 #ifdef HAVE_MMAP
368     if ((mf->mode & MF_MMAP) && mf->data) {
369         /* Mmaped */
370         munmap(mf->data, mf->size);
371         mf->data = NULL;
372     }
373 #endif
374 
375     if (mf->fp)
376         fclose(mf->fp);
377 
378     mfdestroy(mf);
379 
380     return 0;
381 }
382 
383 /*
384  * Closes the file pointer contained within the mFILE without destroying
385  * the in-memory data.
386  *
387  * Attempting to do this on an mmaped buffer is an error.
388  */
mfdetach(mFILE * mf)389 int mfdetach(mFILE *mf) {
390     if (!mf)
391         return -1;
392 
393     mfflush(mf);
394     if (mf->mode & MF_MMAP)
395         return -1;
396 
397     if (mf->fp) {
398         fclose(mf->fp);
399         mf->fp = NULL;
400     }
401 
402     return 0;
403 }
404 
405 /*
406  * Destroys an mFILE structure but does not flush or close it
407  */
mfdestroy(mFILE * mf)408 int mfdestroy(mFILE *mf) {
409     if (!mf)
410         return -1;
411 
412     if (mf->data)
413         free(mf->data);
414     free(mf);
415 
416     return 0;
417 }
418 
419 /*
420  * Steals that data out of an mFILE.  The mFILE itself will be closed.
421  * It is up to the caller to free the stolen buffer.  If size_out is
422  * not NULL, mf->size will be stored in it.
423  * This is more-or-less the opposite of mfcreate().
424  *
425  * Note, we cannot steal the allocated buffer from an mmaped mFILE.
426  */
427 
mfsteal(mFILE * mf,size_t * size_out)428 void *mfsteal(mFILE *mf, size_t *size_out) {
429     void *data;
430 
431     if (!mf) return NULL;
432 
433     data = mf->data;
434 
435     if (NULL != size_out) *size_out = mf->size;
436 
437     if (mfdetach(mf) != 0)
438         return NULL;
439 
440     mf->data = NULL;
441     mfdestroy(mf);
442 
443     return data;
444 }
445 
446 /*
447  * Seek/tell functions. Nothing more than updating and reporting an
448  * in-memory index. NB we can seek on stdin or stdout even provided we
449  * haven't been flushing.
450  */
/*
 * Moves the in-memory position of mf, with 'whence' interpreted as for
 * fseek() (SEEK_SET, SEEK_CUR or SEEK_END). No I/O is performed.
 *
 * Seeking beyond the end of the data is permitted. A seek whose target
 * would lie before the start of the buffer, or which would overflow, is
 * rejected and leaves mf untouched rather than wrapping round to a huge
 * offset.
 *
 * Returns 0 on success (the eof flag is cleared)
 *        -1 on failure, with errno set to EINVAL
 */
int mfseek(mFILE *mf, long offset, int whence) {
    size_t base;

    switch (whence) {
    case SEEK_SET:
        base = 0;
        break;
    case SEEK_CUR:
        base = mf->offset;
        break;
    case SEEK_END:
        base = mf->size;
        break;
    default:
        errno = EINVAL;
        return -1;
    }

    if (offset < 0) {
        /* Magnitude taken in unsigned arithmetic so LONG_MIN is safe too */
        unsigned long back = 0UL - (unsigned long)offset;

        if (back > base) {
            errno = EINVAL;
            return -1;
        }
        mf->offset = base - (size_t)back;
    } else {
        if ((unsigned long)offset > (size_t)-1 - base) {
            errno = EINVAL;
            return -1;
        }
        mf->offset = base + (size_t)offset;
    }

    mf->eof = 0;
    return 0;
}
470 
/*
 * Reports the current in-memory position of mf.
 */
long mftell(mFILE *mf) {
    long pos = mf->offset;

    return pos;
}
474 
/*
 * Returns mf to the start of its data and clears the eof flag.
 */
void mrewind(mFILE *mf) {
    mf->eof = 0;
    mf->offset = 0;
}
479 
480 /*
481  * mftruncate is not directly a translation of ftruncate as the latter
482  * takes a file descriptor instead of a FILE *. It performs the analogous
483  * role though.
484  *
485  * If offset is -1 then the file is truncated to be the current file
486  * offset.
487  */
mftruncate(mFILE * mf,long offset)488 void mftruncate(mFILE *mf, long offset) {
489     mf->size = offset != -1 ? offset : mf->offset;
490     if (mf->offset > mf->size)
491         mf->offset = mf->size;
492 }
493 
/*
 * Reports the eof flag of mf: non-zero once a read has run out of data,
 * until a seek or rewind clears it again.
 */
int mfeof(mFILE *mf) {
    int at_eof = mf->eof;

    return at_eof;
}
497 
498 /*
499  * mFILE read/write functions. Basically these turn fread/fwrite syntax
500  * into memcpy statements, with appropriate memory handling for writing.
501  */
/*
 * fread() for mFILEs: copies up to nmemb items of 'size' bytes each from
 * the current position of mf into ptr and advances the position.
 *
 * Reading from the stdin mFILE triggers the one-off load of stdin.
 *
 * The eof flag is set whenever fewer bytes than requested are available.
 * That includes a read attempted when already at (or beyond) the end of
 * the data, so loops of the form "while (!mfeof(mf)) mfread(...)"
 * terminate just as they would with stdio. A request for zero bytes
 * returns 0 and changes nothing.
 *
 * Returns the number of complete items read, which may be fewer than
 * nmemb (or zero) at end of data. Bytes of a trailing partial item are
 * still copied and consumed.
 */
size_t mfread(void *ptr, size_t size, size_t nmemb, mFILE *mf) {
    size_t want, avail, len;

    if (mf == m_channel[0]) init_mstdin();

    if (size == 0 || nmemb == 0)
        return 0;

    if (mf->size <= mf->offset) {
        /* Attempted read with nothing left: flag it, as fread() would */
        mf->eof = 1;
        return 0;
    }

    avail = mf->size - mf->offset;

    /* If size * nmemb would overflow it certainly exceeds avail; clamp */
    want = nmemb > (size_t)-1 / size ? (size_t)-1 : size * nmemb;
    len = want <= avail ? want : avail;

    memcpy(ptr, &mf->data[mf->offset], len);
    mf->offset += len;

    if (len != want)
        mf->eof = 1;

    return len / size;
}
526 
/*
 * fwrite() for mFILEs: copies nmemb items of 'size' bytes each from ptr
 * into the buffer of mf at the current position (or at the end, in append
 * mode), growing the buffer as required, and advances the position.
 *
 * Nothing reaches the underlying file here; the flush position is merely
 * pulled back so that a later flush rewrites everything from the earliest
 * modified byte.
 *
 * If the position lies beyond the current end of data (after a seek past
 * the end) the gap is filled with zero bytes, as it would be in a real
 * file, instead of being left as uninitialised memory.
 *
 * Returns nmemb on success
 *         0 if mf is not open for writing, the request size overflows, or
 *         memory could not be allocated
 */
size_t mfwrite(void *ptr, size_t size, size_t nmemb, mFILE *mf) {
    size_t bytes, needed;

    if (!(mf->mode & MF_WRITE))
        return 0;

    /* Refuse a request whose byte count cannot be represented */
    if (size != 0 && nmemb > (size_t)-1 / size)
        return 0;
    bytes = size * nmemb;

    /* Append mode => forced all writes to end of file */
    if (mf->mode & MF_APPEND)
        mf->offset = mf->size;

    if (bytes > (size_t)-1 - mf->offset)
        return 0;
    needed = mf->offset + bytes;

    /*
     * Make sure we have enough room. Capacity still grows by doubling
     * (from a 1024 byte minimum), but the target is worked out first so
     * that a single realloc suffices.
     */
    if (needed > mf->alloced) {
        size_t new_alloced = mf->alloced ? mf->alloced : 1024;
        void *new_data;

        while (new_alloced < needed) {
            if (new_alloced > (size_t)-1 / 2) {
                new_alloced = needed;
                break;
            }
            new_alloced *= 2;
        }

        new_data = realloc(mf->data, new_alloced);
        if (NULL == new_data) return 0;
        mf->alloced = new_alloced;
        mf->data    = new_data;
    }

    /* Zero any hole between the old end of data and the write position */
    if (mf->offset > mf->size) {
        memset(&mf->data[mf->size], 0, mf->offset - mf->size);
        if (mf->size < mf->flush_pos)
            mf->flush_pos = mf->size;
    }

    /* Record where we need to reflush from */
    if (mf->offset < mf->flush_pos)
        mf->flush_pos = mf->offset;

    /* Copy the data over; skipped when empty as the buffer may be NULL */
    if (bytes)
        memcpy(&mf->data[mf->offset], ptr, bytes);
    mf->offset = needed;
    if (mf->size < mf->offset)
        mf->size = mf->offset;

    return nmemb;
}
556 
/*
 * fgetc() for mFILEs. Reading from the stdin mFILE triggers the one-off
 * load of stdin.
 *
 * Returns the next byte as an unsigned char converted to int
 *         -1 when no data remains, in which case the eof flag is set
 */
int mfgetc(mFILE *mf) {
    if (mf == m_channel[0]) init_mstdin();

    if (mf->offset >= mf->size) {
        mf->eof = 1;
        return -1;
    }

    return (unsigned char)mf->data[mf->offset++];
}
566 
/*
 * ungetc() for mFILEs: steps the position back by one byte and makes that
 * byte read back as 'c'.
 *
 * As with ungetc(), pushing back EOF fails and changes nothing, and a
 * successful push-back clears the eof flag. The buffer is only written to
 * when the byte actually differs from what is already stored, which is
 * what lets the usual "unread the byte just read" work on mmaped
 * (read-only) data. Pushing back a different byte onto an mmaped buffer
 * fails.
 *
 * Returns c on success
 *         -1 on failure. When the failure is because there is no valid
 *         byte before the current position, the eof flag is also set (as
 *         this function has always done at offset 0).
 */
int mungetc(int c, mFILE *mf) {
    unsigned char byte = (unsigned char)c;

    if (c == EOF)
        return -1;

    if (mf->offset == 0 || mf->offset > mf->size) {
        mf->eof = 1;
        return -1;
    }

    if ((unsigned char)mf->data[mf->offset - 1] != byte) {
        /* mmaped data is mapped read-only, so it cannot be altered */
        if (mf->mode & MF_MMAP)
            return -1;
        mf->data[mf->offset - 1] = c;
    }

    mf->offset--;
    mf->eof = 0;
    return c;
}
576 
/*
 * fgets() for mFILEs: copies bytes from mf into s until a newline has been
 * copied, size-1 bytes have been stored, or the data runs out, and then
 * NUL-terminates s. The newline, if reached, is kept.
 *
 * Reading from the stdin mFILE triggers the one-off load of stdin.
 * The eof flag is set only when the data runs out before either of the
 * other two stopping conditions is met.
 *
 * Returns s on success
 *         NULL if nothing was copied (s is then an empty string)
 */
char *mfgets(char *s, int size, mFILE *mf) {
    int n = 0;

    if (mf == m_channel[0]) init_mstdin();

    while (n < size - 1) {
        char ch;

        if (mf->offset >= mf->size) {
            mf->eof = 1;
            break;
        }

        ch = mf->data[mf->offset++];
        s[n++] = ch;
        if (ch == '\n')
            break;
    }

    s[n] = 0;

    if (n == 0)
        return NULL;
    return s;
}
596 
/*
 * Flushes an mFILE. If this is a real open of a file in write mode then
 * mFILE->fp will be set. We then write out any new data in mFILE since the
 * last flush. We cannot tell what may have been modified as we don't keep
 * track of that, so we typically rewrite out the entire file contents between
 * the last flush_pos and the end of file.
 *
 * For stderr/stdout we also reset the offsets so we cannot modify things
 * we've already output.
 *
 * An mFILE with no FILE pointer attached is left untouched.
 *
 * Returns 0 on success (including when there was nothing to do)
 *        -1 if fwrite(), fflush() or ftruncate() failed
 */
int mfflush(mFILE *mf) {
    /* Purely in-memory mFILE: there is nowhere to write to */
    if (!mf->fp)
        return 0;

    /* FIXME: only do this when opened in write mode */
    if (mf == m_channel[1] || mf == m_channel[2]) {
        /* Emit whatever has accumulated since the previous flush */
        if (mf->flush_pos < mf->size) {
            size_t bytes = mf->size - mf->flush_pos;
            if (fwrite(mf->data + mf->flush_pos, 1, bytes, mf->fp) < bytes)
                return -1;
            if (0 != fflush(mf->fp))
                return -1;
        }

        /* Stdout & stderr are non-seekable streams so throw away the data */
        mf->offset = mf->size = mf->flush_pos = 0;
    }

    /*
     * only flush when opened in write mode
     *
     * The stdout/stderr fakes get here too when they have MF_WRITE set;
     * with size and flush_pos both reset to 0 just above, only the
     * truncation step below then applies to them.
     */
    if (mf->mode & MF_WRITE) {
        if (mf->flush_pos < mf->size) {
            size_t bytes = mf->size - mf->flush_pos;
            /*
             * Position the real file at the first unflushed byte, unless
             * the 'x' mode asked for fp to be left where it is. The result
             * of the seek is not checked.
             */
            if (!(mf->mode & MF_MODEX)) {
                fseek(mf->fp, mf->flush_pos, SEEK_SET);
            }
            if (fwrite(mf->data + mf->flush_pos, 1, bytes, mf->fp) < bytes)
                return -1;
            if (0 != fflush(mf->fp))
                return -1;
        }
        /*
         * Cut the file off at the current stream position, so data left
         * over from a previously longer file does not survive. Streams on
         * which ftell() fails are skipped.
         * NOTE(review): ftruncate() may also fail on descriptors that can
         * report a position but are not regular files - confirm whether
         * that ought to be reported as a flush failure.
         */
        if (ftell(mf->fp) != -1 &&
            ftruncate(fileno(mf->fp), ftell(mf->fp)) == -1)
            return -1;
        /* Everything up to the current size is now on disk */
        mf->flush_pos = mf->size;
    }

    return 0;
}
645 
646 /*
647  * Converts an mFILE from binary to ascii mode by replacing all
648  * cr-nl with nl.
649  *
650  * Primarily used on windows when we've uncompressed a binary file which
651  * happens to be a text file (eg Experiment File). Previously we would have
652  * seeked back to the start and used _setmode(fileno(fp), _O_TEXT).
653  *
654  * Side effect: resets offset and flush_pos back to the start.
655  */
mfascii(mFILE * mf)656 void mfascii(mFILE *mf) {
657     size_t p1, p2;
658 
659     for (p1 = p2 = 1; p1 < mf->size; p1++, p2++) {
660         if (mf->data[p1] == '\n' && mf->data[p1-1] == '\r') {
661             p2--; /* delete the \r */
662         }
663         mf->data[p2] = mf->data[p1];
664     }
665     mf->size = p2;
666 
667     mf->offset = mf->flush_pos = 0;
668 }
669