1 // *************************************************************************** 2 // FastaIndex.h (c) 2010 Erik Garrison <erik.garrison@bc.edu> 3 // Marth Lab, Department of Biology, Boston College 4 // All rights reserved. 5 // --------------------------------------------------------------------------- 6 // Last modified: 5 February 2010 (EG) 7 // --------------------------------------------------------------------------- 8 9 #ifndef _FASTA_H 10 #define _FASTA_H 11 12 #include <map> 13 #include <iostream> 14 #include <fstream> 15 #include <vector> 16 #include <stdint.h> 17 #include <stdio.h> 18 #include <algorithm> 19 #include "LargeFileSupport.h" 20 #include "bedFile.h" 21 #include <sys/stat.h> 22 #include <sys/mman.h> 23 #include "split.h" 24 #include <stdlib.h> 25 #include <ctype.h> 26 #include <unistd.h> 27 28 using namespace std; 29 30 class FastaIndexEntry { 31 friend ostream& operator<<(ostream& output, const FastaIndexEntry& e); 32 public: 33 FastaIndexEntry(string name, CHRPOS length, CHRPOS offset, 34 CHRPOS line_blen, CHRPOS line_len, bool useFullHeader); 35 FastaIndexEntry(void); 36 ~FastaIndexEntry(void); 37 string name; // sequence name 38 CHRPOS length; // length of sequence 39 long long offset; // bytes offset of sequence from start of file 40 CHRPOS line_blen; // line length in bytes, sequence characters 41 CHRPOS line_len; // line length including newline 42 bool useFullHeader; 43 void clear(void); 44 }; 45 46 class FastaIndex : public map<string, FastaIndexEntry> { 47 friend ostream& operator<<(ostream& output, FastaIndex& i); 48 public: 49 FastaIndex(bool useFullHeader); 50 ~FastaIndex(void); 51 bool useFullHeader; 52 vector<string> sequenceNames; 53 void indexReference(string refName); 54 void readIndexFile(string fname); 55 void writeIndexFile(string fname); 56 ifstream indexFile; 57 FastaIndexEntry entry(string key); 58 bool chromFound(string name); 59 void flushEntryToIndex(FastaIndexEntry& entry); 60 string indexFileExtension(void); 61 }; 62 63 class FastaReference { 64 public: 65 void open(string reffilename, bool usemmap = false, 66 bool useFullHeader = false); 67 bool usingmmap; 68 string filename; 69 bool usingfullheader; FastaReference(void)70 FastaReference(void) : usingmmap(false), usingfullheader(false) { } 71 ~FastaReference(void); 72 FILE* file; 73 void* filemm; 74 size_t filesize; 75 FastaIndex* index; 76 vector<FastaIndexEntry> findSequencesStartingWith(string seqnameStart); 77 string getSequence(string seqname); 78 // potentially useful for performance, investigate 79 // void getSequence(string seqname, string& sequence); 80 string getSubSequence(string seqname, CHRPOS start, CHRPOS length); 81 string sequenceNameStartingWith(string seqnameStart); 82 CHRPOS sequenceLength(string seqname); 83 }; 84 85 #endif 86