1 /*
2 Copyright (c) 2005-2006, 2008-2009, 2013, 2015, 2017-2019 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
32 #include <config.h>
33
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <errno.h>
37 #include <string.h>
38 #include <sys/types.h>
39 #include <sys/stat.h>
40 #include <fcntl.h>
41 #include <unistd.h>
42 #include <stdarg.h>
43
44 #include "../htslib/hts_log.h"
45 #include "os.h"
46 #include "mFILE.h"
47
48 #ifdef HAVE_MMAP
49 #include <sys/mman.h>
50 #endif
51
52 /*
53 * This file contains memory-based versions of the most commonly used
54 * (by io_lib) stdio functions.
55 *
56 * Actual file IO takes place either on opening or closing an mFILE.
57 *
58 * Coupled to this are a bunch of rather scary macros which can be obtained
59 * by including stdio_hack.h. It is recommended though that you use mFILE.h
60 * instead and replace fopen with mfopen (etc). This is more or less
61 * mandatory if you wish to use both FILE and mFILE structs in a single file.
62 */
63
64 static mFILE *m_channel[3]; /* stdin, stdout and stderr fakes */
65
/*
 * Reads the entirety of fp into memory.
 *
 * fp     - stream to read from, starting at its current position.
 * fn     - optional filename associated with fp (may be NULL). When given
 *          and stat() reports a positive size, that size is used to
 *          allocate the buffer once and read it in a single call.
 *          Otherwise (no name, stat failure, or a reported size of zero,
 *          as seen on some special files) the stream is read in 8192 byte
 *          increments until EOF.
 * size   - receives the number of bytes loaded (only written on success).
 * binary - only consulted on Windows builds, where it selects binary or
 *          text translation mode for the stream.
 *
 * Returns a malloced buffer on success of length *size (the caller frees
 *         it; the buffer is non-NULL even when *size is 0)
 *         NULL on failure (allocation failure or read error)
 */
static char *mfload(FILE *fp, const char *fn, size_t *size, int binary) {
    struct stat sb;
    char *data = NULL;
    size_t allocated = 0, used = 0;
    size_t chunk = 8192;   /* amount by which the buffer grows */
    size_t expected = 0;   /* file size from stat(), 0 when unknown */

#ifdef _WIN32
    if (binary)
        _setmode(_fileno(fp), _O_BINARY);
    else
        _setmode(_fileno(fp), _O_TEXT);
#else
    (void) binary;
#endif

    /*
     * Only trust a positive size that fits in a size_t. A size of zero is
     * treated as unknown so that we never depend on malloc(0) returning a
     * non-NULL pointer.
     */
    if (fn && -1 != stat(fn, &sb) && sb.st_size > 0 &&
        (off_t)(size_t) sb.st_size == sb.st_size) {
        expected = (size_t) sb.st_size;
        chunk = expected;
    }

    for (;;) {
        size_t len;

        if (used + chunk > allocated) {
            size_t wanted = allocated + chunk;
            char *grown;

            if (wanted < allocated) {   /* size_t wrap-around */
                free(data);
                return NULL;
            }
            grown = realloc(data, wanted);
            if (!grown) {
                free(data);
                return NULL;
            }
            data = grown;
            allocated = wanted;
        }

        len = fread(data + used, 1, allocated - used, fp);
        used += len;

        /*
         * A read error never sets the EOF indicator, so without this
         * check a failing stream would keep us looping forever.
         */
        if (ferror(fp)) {
            free(data);
            return NULL;
        }

        if (feof(fp))
            break;
        if (expected && used >= expected)
            break;
    }

    *size = used;

    return data;
}
118
119
120 #ifdef HAVE_MMAP
121 /*
122 * mmaps in the file, but only for reading currently.
123 *
124 * Returns 0 on success
125 * -1 on failure
126 */
mfmmap(mFILE * mf,FILE * fp,const char * fn)127 int mfmmap(mFILE *mf, FILE *fp, const char *fn) {
128 struct stat sb;
129
130 if (stat(fn, &sb) != 0)
131 return -1;
132
133 mf->size = sb.st_size;
134 mf->data = mmap(NULL, mf->size, PROT_READ, MAP_SHARED,
135 fileno(fp), 0);
136
137 if (!mf->data || mf->data == (void *)-1)
138 return -1;
139
140 mf->alloced = 0;
141 return 0;
142 }
143 #endif
144
145
146 /*
147 * Creates and returns m_channel[0].
148 * We initialise this on the first attempted read, which then slurps in
149 * all of stdin until EOF is met.
150 */
mstdin(void)151 mFILE *mstdin(void) {
152 if (m_channel[0])
153 return m_channel[0];
154
155 m_channel[0] = mfcreate(NULL, 0);
156 if (NULL == m_channel[0]) return NULL;
157 m_channel[0]->fp = stdin;
158 return m_channel[0];
159 }
160
init_mstdin(void)161 static void init_mstdin(void) {
162 static int done_stdin = 0;
163 if (done_stdin)
164 return;
165
166 m_channel[0]->data = mfload(stdin, NULL, &m_channel[0]->size, 1);
167 m_channel[0]->mode = MF_READ;
168 done_stdin = 1;
169 }
170
171 /*
172 * Creates and returns m_channel[1]. This is the fake for stdout. It starts as
173 * an empty buffer which is physically written out only when mfflush or
174 * mfclose are called.
175 */
mstdout(void)176 mFILE *mstdout(void) {
177 if (m_channel[1])
178 return m_channel[1];
179
180 m_channel[1] = mfcreate(NULL, 0);
181 if (NULL == m_channel[1]) return NULL;
182 m_channel[1]->fp = stdout;
183 m_channel[1]->mode = MF_WRITE;
184 return m_channel[1];
185 }
186
187 /*
188 * Stderr as an mFILE.
189 * The code handles stderr by returning m_channel[2], but also checking
190 * for stderr in fprintf (the common usage of it) to auto-flush.
191 */
mstderr(void)192 mFILE *mstderr(void) {
193 if (m_channel[2])
194 return m_channel[2];
195
196 m_channel[2] = mfcreate(NULL, 0);
197 if (NULL == m_channel[2]) return NULL;
198 m_channel[2]->fp = stderr;
199 m_channel[2]->mode = MF_WRITE;
200 return m_channel[2];
201 }
202
203
204 /*
205 * For creating existing mFILE pointers directly from memory buffers.
206 */
mfcreate(char * data,int size)207 mFILE *mfcreate(char *data, int size) {
208 mFILE *mf = (mFILE *)malloc(sizeof(*mf));
209 if (NULL == mf) return NULL;
210 mf->fp = NULL;
211 mf->data = data;
212 mf->alloced = size;
213 mf->size = size;
214 mf->eof = 0;
215 mf->offset = 0;
216 mf->flush_pos = 0;
217 mf->mode = MF_READ | MF_WRITE;
218 return mf;
219 }
220
221 /*
222 * Recreate an existing mFILE to house new data/size.
223 * It also rewinds the file.
224 */
mfrecreate(mFILE * mf,char * data,int size)225 void mfrecreate(mFILE *mf, char *data, int size) {
226 if (mf->data)
227 free(mf->data);
228 mf->data = data;
229 mf->size = size;
230 mf->alloced = size;
231 mf->eof = 0;
232 mf->offset = 0;
233 mf->flush_pos = 0;
234 }
235
236
237 /*
238 * Creates a new mFILE to contain the contents of the FILE pointer.
239 * This mFILE is purely for in-memory operations and has no links to the
240 * original FILE* it came from. It also doesn't close the FILE pointer.
241 * Consider using mfreopen() is you need different behaviour.
242 *
243 * Returns mFILE * on success
244 * NULL on failure.
245 */
mfcreate_from(const char * path,const char * mode_str,FILE * fp)246 mFILE *mfcreate_from(const char *path, const char *mode_str, FILE *fp) {
247 mFILE *mf;
248
249 /* Open using mfreopen() */
250 if (NULL == (mf = mfreopen(path, mode_str, fp)))
251 return NULL;
252
253 /* Disassociate from the input stream */
254 mf->fp = NULL;
255
256 return mf;
257 }
258
259 /*
260 * Converts a FILE * to an mFILE *.
261 * Use this for wrapper functions to turn external prototypes requiring
262 * FILE * as an argument into internal code using mFILE *.
263 */
mfreopen(const char * path,const char * mode_str,FILE * fp)264 mFILE *mfreopen(const char *path, const char *mode_str, FILE *fp) {
265 mFILE *mf;
266 int r = 0, w = 0, a = 0, b = 0, x = 0, mode = 0;
267
268 /* Parse mode:
269 * r = read file contents (if truncated => don't read)
270 * w = write on close
271 * a = position at end of buffer
272 * x = position at same location as the original fp, don't seek on flush
273 * + = for update (read and write)
274 * m = mmap (read only)
275 */
276 if (strchr(mode_str, 'r'))
277 r = 1, mode |= MF_READ;
278 if (strchr(mode_str, 'w'))
279 w = 1, mode |= MF_WRITE | MF_TRUNC;
280 if (strchr(mode_str, 'a'))
281 w = a = 1, mode |= MF_WRITE | MF_APPEND;
282 if (strchr(mode_str, 'b'))
283 b = 1, mode |= MF_BINARY;
284 if (strchr(mode_str, 'x'))
285 x = 1;
286 if (strchr(mode_str, '+')) {
287 w = 1, mode |= MF_READ | MF_WRITE;
288 if (a)
289 r = 1;
290 }
291 #ifdef HAVE_MMAP
292 if (strchr(mode_str, 'm'))
293 if (!w) mode |= MF_MMAP;
294 #endif
295
296 if (r) {
297 mf = mfcreate(NULL, 0);
298 if (NULL == mf) return NULL;
299 if (!(mode & MF_TRUNC)) {
300 #ifdef HAVE_MMAP
301 if (mode & MF_MMAP) {
302 if (mfmmap(mf, fp, path) == -1) {
303 mf->data = NULL;
304 mode &= ~MF_MMAP;
305 }
306 }
307 #endif
308 if (!mf->data) {
309 mf->data = mfload(fp, path, &mf->size, b);
310 if (!mf->data) {
311 free(mf);
312 return NULL;
313 }
314 mf->alloced = mf->size;
315 if (!a)
316 fseek(fp, 0, SEEK_SET);
317 }
318 }
319 } else if (w) {
320 /* Write - initialise the data structures */
321 mf = mfcreate(NULL, 0);
322 if (NULL == mf) return NULL;
323 } else {
324 hts_log_error("Must specify either r, w or a for mode");
325 return NULL;
326 }
327 mf->fp = fp;
328 mf->mode = mode;
329
330 if (x) {
331 mf->mode |= MF_MODEX;
332 }
333
334 if (a) {
335 mf->flush_pos = mf->size;
336 fseek(fp, 0, SEEK_END);
337 }
338
339 return mf;
340 }
341
342 /*
343 * Opens a file. If we have read access (r or a+) then it loads the entire
344 * file into memory. If We have write access then the pathname is stored.
345 * We do not actually write until an mfclose, which then checks this pathname.
346 */
mfopen(const char * path,const char * mode)347 mFILE *mfopen(const char *path, const char *mode) {
348 FILE *fp;
349
350 if (NULL == (fp = fopen(path, mode)))
351 return NULL;
352 return mfreopen(path, mode, fp);
353 }
354
355 /*
356 * Closes an mFILE. If the filename is known (implying write access) then this
357 * also writes the data to disk.
358 *
359 * Stdout is handled by calling mfflush which writes to stdout if appropriate.
360 */
mfclose(mFILE * mf)361 int mfclose(mFILE *mf) {
362 if (!mf)
363 return -1;
364
365 mfflush(mf);
366
367 #ifdef HAVE_MMAP
368 if ((mf->mode & MF_MMAP) && mf->data) {
369 /* Mmaped */
370 munmap(mf->data, mf->size);
371 mf->data = NULL;
372 }
373 #endif
374
375 if (mf->fp)
376 fclose(mf->fp);
377
378 mfdestroy(mf);
379
380 return 0;
381 }
382
383 /*
384 * Closes the file pointer contained within the mFILE without destroying
385 * the in-memory data.
386 *
387 * Attempting to do this on an mmaped buffer is an error.
388 */
mfdetach(mFILE * mf)389 int mfdetach(mFILE *mf) {
390 if (!mf)
391 return -1;
392
393 mfflush(mf);
394 if (mf->mode & MF_MMAP)
395 return -1;
396
397 if (mf->fp) {
398 fclose(mf->fp);
399 mf->fp = NULL;
400 }
401
402 return 0;
403 }
404
405 /*
406 * Destroys an mFILE structure but does not flush or close it
407 */
mfdestroy(mFILE * mf)408 int mfdestroy(mFILE *mf) {
409 if (!mf)
410 return -1;
411
412 if (mf->data)
413 free(mf->data);
414 free(mf);
415
416 return 0;
417 }
418
419 /*
420 * Steals that data out of an mFILE. The mFILE itself will be closed.
421 * It is up to the caller to free the stolen buffer. If size_out is
422 * not NULL, mf->size will be stored in it.
423 * This is more-or-less the opposite of mfcreate().
424 *
425 * Note, we cannot steal the allocated buffer from an mmaped mFILE.
426 */
427
mfsteal(mFILE * mf,size_t * size_out)428 void *mfsteal(mFILE *mf, size_t *size_out) {
429 void *data;
430
431 if (!mf) return NULL;
432
433 data = mf->data;
434
435 if (NULL != size_out) *size_out = mf->size;
436
437 if (mfdetach(mf) != 0)
438 return NULL;
439
440 mf->data = NULL;
441 mfdestroy(mf);
442
443 return data;
444 }
445
446 /*
447 * Seek/tell functions. Nothing more than updating and reporting an
448 * in-memory index. NB we can seek on stdin or stdout even provided we
449 * haven't been flushing.
450 */
mfseek(mFILE * mf,long offset,int whence)451 int mfseek(mFILE *mf, long offset, int whence) {
452 switch (whence) {
453 case SEEK_SET:
454 mf->offset = offset;
455 break;
456 case SEEK_CUR:
457 mf->offset += offset;
458 break;
459 case SEEK_END:
460 mf->offset = mf->size + offset;
461 break;
462 default:
463 errno = EINVAL;
464 return -1;
465 }
466
467 mf->eof = 0;
468 return 0;
469 }
470
/*
 * Reports the current in-memory read/write offset of mf.
 */
long mftell(mFILE *mf) {
    return (long) mf->offset;
}
474
/*
 * Moves mf back to the start of its data and clears the EOF flag.
 */
void mrewind(mFILE *mf) {
    mf->eof = 0;
    mf->offset = 0;
}
479
480 /*
481 * mftruncate is not directly a translation of ftruncate as the latter
482 * takes a file descriptor instead of a FILE *. It performs the analogous
483 * role though.
484 *
485 * If offset is -1 then the file is truncated to be the current file
486 * offset.
487 */
mftruncate(mFILE * mf,long offset)488 void mftruncate(mFILE *mf, long offset) {
489 mf->size = offset != -1 ? offset : mf->offset;
490 if (mf->offset > mf->size)
491 mf->offset = mf->size;
492 }
493
/*
 * Returns the EOF flag of mf: non-zero once a read has run out of data,
 * zero otherwise.
 */
int mfeof(mFILE *mf) {
    const int at_eof = mf->eof;

    return at_eof;
}
497
498 /*
499 * mFILE read/write functions. Basically these turn fread/fwrite syntax
500 * into memcpy statements, with appropriate memory handling for writing.
501 */
mfread(void * ptr,size_t size,size_t nmemb,mFILE * mf)502 size_t mfread(void *ptr, size_t size, size_t nmemb, mFILE *mf) {
503 size_t len;
504 char *cptr = (char *)ptr;
505
506 if (mf == m_channel[0]) init_mstdin();
507
508 if (mf->size <= mf->offset)
509 return 0;
510
511 len = size * nmemb <= mf->size - mf->offset
512 ? size * nmemb
513 : mf->size - mf->offset;
514 if (!size)
515 return 0;
516
517 memcpy(cptr, &mf->data[mf->offset], len);
518 mf->offset += len;
519
520 if (len != size * nmemb) {
521 mf->eof = 1;
522 }
523
524 return len / size;
525 }
526
/*
 * Writes nmemb items of size bytes each from ptr into mf at its current
 * offset (or at the end of the data when in append mode), growing the
 * buffer as needed and advancing the offset.
 *
 * If the offset lies beyond the current end of data (after seeking past
 * the end) the gap is filled with zero bytes, so that uninitialised
 * memory is never flushed to the file.
 *
 * Returns nmemb on success
 *         0 on failure (not opened for writing, out of memory, or a
 *         request too large to represent)
 */
size_t mfwrite(void *ptr, size_t size, size_t nmemb, mFILE *mf) {
    size_t bytes, end;

    if (!(mf->mode & MF_WRITE))
        return 0;

    /* Refuse requests whose byte count would overflow a size_t */
    if (size != 0 && nmemb > (size_t) -1 / size)
        return 0;
    bytes = size * nmemb;

    /* Append mode => forced all writes to end of file */
    if (mf->mode & MF_APPEND)
        mf->offset = mf->size;

    if (bytes > (size_t) -1 - mf->offset)
        return 0;
    end = mf->offset + bytes;

    /* Make sure we have enough room: double until it fits, realloc once */
    if (end > mf->alloced) {
        size_t new_alloced = mf->alloced ? mf->alloced : 1024;
        void *new_data;

        while (new_alloced < end) {
            if (new_alloced > (size_t) -1 / 2) {
                new_alloced = end;   /* cannot double any further */
                break;
            }
            new_alloced *= 2;
        }

        new_data = realloc(mf->data, new_alloced);
        if (NULL == new_data) return 0;
        mf->alloced = new_alloced;
        mf->data = new_data;
    }

    /* Record where we need to reflush from */
    if (mf->offset < mf->flush_pos)
        mf->flush_pos = mf->offset;

    /* Zero the hole between the old end of data and the write position */
    if (mf->offset > mf->size)
        memset(&mf->data[mf->size], 0, mf->offset - mf->size);

    /* Copy the data over; skipped when empty as the buffer may be NULL */
    if (bytes)
        memcpy(&mf->data[mf->offset], ptr, bytes);
    mf->offset += bytes;
    if (mf->size < mf->offset)
        mf->size = mf->offset;

    return nmemb;
}
556
/*
 * Reads the next byte from mf, loading stdin first if mf is the stdin
 * fake.
 *
 * Returns the byte as an unsigned char converted to int on success
 *         -1 when no data is left (the EOF flag is set)
 */
int mfgetc(mFILE *mf) {
    if (mf == m_channel[0]) init_mstdin();

    if (mf->offset >= mf->size) {
        mf->eof = 1;
        return -1;
    }

    return (unsigned char) mf->data[mf->offset++];
}
566
/*
 * Pushes c back into mf by stepping the offset back one byte and storing
 * c there, overwriting whatever the buffer held at that position.
 *
 * Returns c on success
 *         -1 if the offset is already at the start of the data (the EOF
 *         flag is set in that case)
 */
int mungetc(int c, mFILE *mf) {
    if (mf->offset <= 0) {
        mf->eof = 1;
        return -1;
    }

    mf->offset--;
    mf->data[mf->offset] = c;
    return c;
}
576
/*
 * Reads at most size-1 bytes from mf into s, stopping after the first
 * newline (which is kept) or when the data runs out (which sets the EOF
 * flag). The result is always NUL terminated. Reading from the stdin fake
 * triggers the one-off load of stdin first.
 *
 * Returns s if at least one byte was read
 *         NULL otherwise
 */
char *mfgets(char *s, int size, mFILE *mf) {
    int n = 0;

    if (mf == m_channel[0]) init_mstdin();

    s[0] = '\0';
    while (n < size - 1) {
        char ch;

        if (mf->offset >= mf->size) {
            mf->eof = 1;
            break;
        }

        ch = mf->data[mf->offset++];
        s[n++] = ch;
        if (ch == '\n')
            break;
    }

    s[n] = '\0';
    return n > 0 ? s : NULL;
}
596
597 /*
598 * Flushes an mFILE. If this is a real open of a file in write mode then
599 * mFILE->fp will be set. We then write out any new data in mFILE since the
600 * last flush. We cannot tell what may have been modified as we don't keep
601 * track of that, so we typically rewrite out the entire file contents between
602 * the last flush_pos and the end of file.
603 *
604 * For stderr/stdout we also reset the offsets so we cannot modify things
605 * we've already output.
606 */
mfflush(mFILE * mf)607 int mfflush(mFILE *mf) {
608 if (!mf->fp)
609 return 0;
610
611 /* FIXME: only do this when opened in write mode */
612 if (mf == m_channel[1] || mf == m_channel[2]) {
613 if (mf->flush_pos < mf->size) {
614 size_t bytes = mf->size - mf->flush_pos;
615 if (fwrite(mf->data + mf->flush_pos, 1, bytes, mf->fp) < bytes)
616 return -1;
617 if (0 != fflush(mf->fp))
618 return -1;
619 }
620
621 /* Stdout & stderr are non-seekable streams so throw away the data */
622 mf->offset = mf->size = mf->flush_pos = 0;
623 }
624
625 /* only flush when opened in write mode */
626 if (mf->mode & MF_WRITE) {
627 if (mf->flush_pos < mf->size) {
628 size_t bytes = mf->size - mf->flush_pos;
629 if (!(mf->mode & MF_MODEX)) {
630 fseek(mf->fp, mf->flush_pos, SEEK_SET);
631 }
632 if (fwrite(mf->data + mf->flush_pos, 1, bytes, mf->fp) < bytes)
633 return -1;
634 if (0 != fflush(mf->fp))
635 return -1;
636 }
637 if (ftell(mf->fp) != -1 &&
638 ftruncate(fileno(mf->fp), ftell(mf->fp)) == -1)
639 return -1;
640 mf->flush_pos = mf->size;
641 }
642
643 return 0;
644 }
645
646 /*
647 * Converts an mFILE from binary to ascii mode by replacing all
648 * cr-nl with nl.
649 *
650 * Primarily used on windows when we've uncompressed a binary file which
651 * happens to be a text file (eg Experiment File). Previously we would have
652 * seeked back to the start and used _setmode(fileno(fp), _O_TEXT).
653 *
654 * Side effect: resets offset and flush_pos back to the start.
655 */
mfascii(mFILE * mf)656 void mfascii(mFILE *mf) {
657 size_t p1, p2;
658
659 for (p1 = p2 = 1; p1 < mf->size; p1++, p2++) {
660 if (mf->data[p1] == '\n' && mf->data[p1-1] == '\r') {
661 p2--; /* delete the \r */
662 }
663 mf->data[p2] = mf->data[p1];
664 }
665 mf->size = p2;
666
667 mf->offset = mf->flush_pos = 0;
668 }
669