1 /***************************************************************************** 2 NewChromsweep.h 3 4 (c) 2009 - Aaron Quinlan 5 Hall Laboratory 6 Department of Biochemistry and Molecular Genetics 7 University of Virginia 8 aaronquinlan@gmail.com 9 10 Licenced under the GNU General Public License 2.0 license. 11 ******************************************************************************/ 12 #ifndef NEW_CHROMSWEEP_H 13 #define NEW_CHROMSWEEP_H 14 15 #include <string> 16 #include "BTlist.h" 17 #include "RecordKeyList.h" 18 #include "RecordKeyVector.h" 19 #include <queue> 20 #include <iostream> 21 #include <fstream> 22 #include <stdlib.h> 23 #include "string.h" 24 25 using namespace std; 26 27 class Record; 28 class FileRecordMgr; 29 class ContextIntersect; 30 31 class NewChromSweep { 32 public: 33 34 NewChromSweep(ContextIntersect *context); 35 36 37 virtual ~NewChromSweep(void); 38 virtual bool init(); 39 40 typedef RecordList recListType; 41 typedef const RecordListNode *recListIterType; 42 43 virtual bool next(RecordKeyVector &next); 44 45 // NOTE! You MUST call this method after sweep if you want the 46 // getTotalRecordLength methods to return the whole length of the 47 // record files, rather than just what was used by sweep. 48 void closeOut(bool testChromOrder = false); 49 50 getQueryTotalRecordLength()51 unsigned long getQueryTotalRecordLength() { return _queryRecordsTotalLength; } getDatabaseTotalRecordLength()52 unsigned long getDatabaseTotalRecordLength() { return _databaseRecordsTotalLength; } 53 getQueryTotalRecords()54 unsigned long getQueryTotalRecords() { return _queryTotalRecords; } getDatabaseTotalRecords()55 unsigned long getDatabaseTotalRecords() { return _databaseTotalRecords; } 56 57 58 protected: 59 ContextIntersect *_context; 60 FileRecordMgr *_queryFRM; 61 int _numDBs; //don't really need this stored, but here for code brevity. 62 int _numFiles; //ditto. Just numDBs + num queries, which for now is always 1. 63 vector<FileRecordMgr *> _dbFRMs; 64 65 unsigned long _queryRecordsTotalLength; 66 vector<unsigned long> _dbFileRecordsLength; //each value in this vector have the 67 //length of all records in the corresponding db file. 68 69 unsigned long _databaseRecordsTotalLength; 70 71 unsigned long _queryTotalRecords; 72 unsigned long _databaseTotalRecords; 73 74 75 bool _wasInitialized; 76 77 // a cache of still active features from the database file 78 // typedef enum { BEFORE_QUERY, NEAR_QUERY, AFTER_QUERY, EOF } cacheStatusType; 79 // vector <pair<cacheStatusType, recListType> >_caches; 80 81 vector <recListType>_caches; 82 // the set of hits in the database for the current query 83 // recListType _hits; 84 85 // the current query and db features. 86 Record * _currQueryRec; 87 vector<Record *> _currDbRecs; 88 89 // a cache of the current chrom from the query. used to handle chrom changes. 90 string _currQueryChromName; 91 string _prevQueryChromName; 92 bool _runToQueryEnd; 93 94 95 virtual void masterScan(RecordKeyVector &retList); 96 97 bool nextRecord(bool query, int dbIdx = -1); //true fetches next query record, false fetches next db record. 98 99 virtual void scanCache(int dbIdx, RecordKeyVector &retList); 100 virtual void clearCache(int dbIdx); 101 virtual bool chromChange(int dbIdx, RecordKeyVector &retList, bool wantScan); 102 103 bool dbFinished(int dbIdx); 104 105 bool intersects(const Record *rec1, const Record *rec2) const; 106 107 bool allCachesEmpty(); 108 bool allCurrDBrecsNull(); 109 110 111 // 112 // members and methods for detecting differently 113 // sorted files without a genome file. 114 // 115 116 typedef map<string, int> _orderTrackType; 117 vector<_orderTrackType *> _fileTracks; 118 vector<char*> _filePrevChrom; 119 bool _lexicoDisproven; //whether we've established that any file ISN'T in lexicographical order 120 bool _lexicoAssumed; //whether we've had to try to guess that any file might be in lexicographical order. 121 string _lexicoAssumedChromName; //which chromosome we had to make that guess for. Used in error reporting. 122 int _lexicoAssumedFileIdx; //which file we had to make the guess for. Also for error reporting. 123 bool _testLastQueryRec; 124 125 void testChromOrder(const Record *rec); 126 bool queryChromAfterDbRec(const Record *dbRec); 127 int findChromOrder(const Record *rec); 128 bool verifyChromOrderMismatch(const string & chrom, const string &prevChrom, int skipFile); 129 void testThatAllDbChromsExistInQuery(); 130 bool testLexicoQueryAfterDb(const Record *queryRec, const Record *dbRec); 131 132 133 }; 134 135 #endif /* NewChromSweep_H */ 136