1 // ***************************************************************************
2 // FastaIndex.h (c) 2010 Erik Garrison <erik.garrison@bc.edu>
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 5 February 2010 (EG)
7 // ---------------------------------------------------------------------------
8 
9 #ifndef _FASTA_H
10 #define _FASTA_H
11 
12 #include <map>
13 #include <iostream>
14 #include <fstream>
15 #include <vector>
16 #include <stdint.h>
17 #include <stdio.h>
18 #include <algorithm>
19 #include "LargeFileSupport.h"
20 #include "bedFile.h"
21 #include <sys/stat.h>
22 #include <sys/mman.h>
23 #include "split.h"
24 #include <stdlib.h>
25 #include <ctype.h>
26 #include <unistd.h>
27 
28 using namespace std;
29 
30 class FastaIndexEntry {
31     friend ostream& operator<<(ostream& output, const FastaIndexEntry& e);
32     public:
33         FastaIndexEntry(string name, CHRPOS length, CHRPOS offset,
34 			CHRPOS line_blen, CHRPOS line_len, bool useFullHeader);
35         FastaIndexEntry(void);
36         ~FastaIndexEntry(void);
37         string name;  // sequence name
38         CHRPOS length;  // length of sequence
39         long long offset;  // bytes offset of sequence from start of file
40         CHRPOS line_blen;  // line length in bytes, sequence characters
41         CHRPOS line_len;  // line length including newline
42 	bool useFullHeader;
43         void clear(void);
44 };
45 
46 class FastaIndex : public map<string, FastaIndexEntry> {
47     friend ostream& operator<<(ostream& output, FastaIndex& i);
48     public:
49         FastaIndex(bool useFullHeader);
50         ~FastaIndex(void);
51 	bool useFullHeader;
52         vector<string> sequenceNames;
53         void indexReference(string refName);
54         void readIndexFile(string fname);
55         void writeIndexFile(string fname);
56         ifstream indexFile;
57         FastaIndexEntry entry(string key);
58         bool chromFound(string name);
59         void flushEntryToIndex(FastaIndexEntry& entry);
60         string indexFileExtension(void);
61 };
62 
63 class FastaReference {
64     public:
65         void open(string reffilename, bool usemmap = false,
66 		bool useFullHeader = false);
67         bool usingmmap;
68         string filename;
69         bool usingfullheader;
FastaReference(void)70         FastaReference(void) : usingmmap(false), usingfullheader(false) { }
71         ~FastaReference(void);
72         FILE* file;
73         void* filemm;
74         size_t filesize;
75         FastaIndex* index;
76         vector<FastaIndexEntry> findSequencesStartingWith(string seqnameStart);
77         string getSequence(string seqname);
78         // potentially useful for performance, investigate
79         // void getSequence(string seqname, string& sequence);
80         string getSubSequence(string seqname, CHRPOS start, CHRPOS length);
81         string sequenceNameStartingWith(string seqnameStart);
82         CHRPOS sequenceLength(string seqname);
83 };
84 
85 #endif
86