1 /*****************************************************************************
2   NewChromsweep.h
3 
4   (c) 2009 - Aaron Quinlan
5   Hall Laboratory
6   Department of Biochemistry and Molecular Genetics
7   University of Virginia
8   aaronquinlan@gmail.com
9 
  Licensed under the GNU General Public License 2.0.
11 ******************************************************************************/
12 #ifndef NEW_CHROMSWEEP_H
13 #define NEW_CHROMSWEEP_H
14 
15 #include <string>
16 #include "BTlist.h"
17 #include "RecordKeyList.h"
18 #include "RecordKeyVector.h"
19 #include <queue>
20 #include <iostream>
21 #include <fstream>
22 #include <stdlib.h>
23 #include "string.h"
24 
25 using namespace std;
26 
27 class Record;
28 class FileRecordMgr;
29 class ContextIntersect;
30 
31 class NewChromSweep {
32 public:
33 
34     NewChromSweep(ContextIntersect *context);
35 
36 
37     virtual ~NewChromSweep(void);
38     virtual bool init();
39 
40     typedef RecordList recListType;
41     typedef const RecordListNode *recListIterType;
42 
43     virtual bool next(RecordKeyVector &next);
44 
45     // NOTE! You MUST call this method after sweep if you want the
46     // getTotalRecordLength methods to return the whole length of the
47     // record files, rather than just what was used by sweep.
48     void closeOut(bool testChromOrder = false);
49 
50 
getQueryTotalRecordLength()51     unsigned long getQueryTotalRecordLength() { return _queryRecordsTotalLength; }
getDatabaseTotalRecordLength()52     unsigned long getDatabaseTotalRecordLength() { return _databaseRecordsTotalLength; }
53 
getQueryTotalRecords()54     unsigned long getQueryTotalRecords() { return _queryTotalRecords; }
getDatabaseTotalRecords()55     unsigned long getDatabaseTotalRecords() { return _databaseTotalRecords; }
56 
57 
58 protected:
59     ContextIntersect *_context;
60     FileRecordMgr *_queryFRM;
61     int _numDBs; //don't really need this stored, but here for code brevity.
62     int _numFiles; //ditto. Just numDBs + num queries, which for now is always 1.
63     vector<FileRecordMgr *> _dbFRMs;
64 
65      unsigned long _queryRecordsTotalLength;
66     vector<unsigned long> _dbFileRecordsLength; //each value in this vector have the
67     //length of all records in the corresponding db file.
68 
69     unsigned long _databaseRecordsTotalLength;
70 
71     unsigned long _queryTotalRecords;
72     unsigned long _databaseTotalRecords;
73 
74 
75     bool _wasInitialized;
76 
77     // a cache of still active features from the database file
78 //    typedef enum { BEFORE_QUERY, NEAR_QUERY, AFTER_QUERY, EOF } cacheStatusType;
79 //    vector <pair<cacheStatusType, recListType> >_caches;
80 
81     vector <recListType>_caches;
82     // the set of hits in the database for the current query
83 //    recListType _hits;
84 
85     // the current query and db features.
86     Record * _currQueryRec;
87     vector<Record *> _currDbRecs;
88 
89     // a cache of the current chrom from the query. used to handle chrom changes.
90     string _currQueryChromName;
91     string _prevQueryChromName;
92     bool _runToQueryEnd;
93 
94 
95     virtual void masterScan(RecordKeyVector &retList);
96 
97     bool nextRecord(bool query, int dbIdx = -1); //true fetches next query record, false fetches next db record.
98 
99     virtual void scanCache(int dbIdx, RecordKeyVector &retList);
100     virtual void clearCache(int dbIdx);
101     virtual bool chromChange(int dbIdx, RecordKeyVector &retList, bool wantScan);
102 
103     bool dbFinished(int dbIdx);
104 
105     bool intersects(const Record *rec1, const Record *rec2) const;
106 
107     bool allCachesEmpty();
108     bool allCurrDBrecsNull();
109 
110 
111     //
112     // members and methods for detecting differently
113     // sorted files without a genome file.
114     //
115 
116     typedef map<string, int> _orderTrackType;
117     vector<_orderTrackType *> _fileTracks;
118     vector<char*> _filePrevChrom;
119     bool _lexicoDisproven; //whether we've established that any file ISN'T in lexicographical order
120     bool _lexicoAssumed; //whether we've had to try to guess that any file might be in lexicographical order.
121     string _lexicoAssumedChromName; //which chromosome we had to make that guess for. Used in error reporting.
122     int _lexicoAssumedFileIdx; //which file we had to make the guess for. Also for error reporting.
123     bool _testLastQueryRec;
124 
125     void testChromOrder(const Record *rec);
126     bool queryChromAfterDbRec(const Record *dbRec);
127     int findChromOrder(const Record *rec);
128     bool verifyChromOrderMismatch(const string & chrom, const string &prevChrom, int skipFile);
129     void testThatAllDbChromsExistInQuery();
130     bool testLexicoQueryAfterDb(const Record *queryRec, const Record *dbRec);
131 
132 
133 };
134 
#endif /* NEW_CHROMSWEEP_H */
136